author    Jesús <heckyel@hyperbola.info>    2022-04-06 03:37:17 +0800
committer Jesús <heckyel@hyperbola.info>    2022-04-06 03:37:17 +0800
commit    1e5a50b71d8f0eae6007bedc329eecb24bb5aba3 (patch)
tree      a8611cda6596391cb6fb645e1469dcd356b63924
parent    f52fb3bceeb9d22b5106c1796fecec474a0cc138 (diff)
download  hypervideo-1e5a50b71d8f0eae6007bedc329eecb24bb5aba3.tar.lz
          hypervideo-1e5a50b71d8f0eae6007bedc329eecb24bb5aba3.tar.xz
          hypervideo-1e5a50b71d8f0eae6007bedc329eecb24bb5aba3.zip
update from upstream
-rw-r--r--  AUTHORS | 1136
-rw-r--r--  CONTRIBUTORS | 91
-rw-r--r--  MANIFEST.in | 1
-rw-r--r-- [-rwxr-xr-x]  bin/hypervideo | 0
-rw-r--r--  completions/zsh/_hypervideo | 30
-rw-r--r--  devscripts/make_lazy_extractors.py | 13
-rw-r--r--  devscripts/make_supportedsites.py | 8
-rw-r--r--  devscripts/prepare_manpage.py | 90
-rw-r--r-- [-rwxr-xr-x]  hypervideo_dl/YoutubeDL.py | 1772
-rw-r--r--  hypervideo_dl/__init__.py | 856
-rw-r--r-- [-rwxr-xr-x]  hypervideo_dl/__main__.py | 0
-rw-r--r--  hypervideo_dl/aes.py | 60
-rw-r--r--  hypervideo_dl/compat.py | 40
-rw-r--r--  hypervideo_dl/cookies.py | 356
-rw-r--r--  hypervideo_dl/downloader/__init__.py | 17
-rw-r--r--  hypervideo_dl/downloader/common.py | 99
-rw-r--r--  hypervideo_dl/downloader/dash.py | 68
-rw-r--r--  hypervideo_dl/downloader/external.py | 102
-rw-r--r--  hypervideo_dl/downloader/f4m.py | 2
-rw-r--r--  hypervideo_dl/downloader/fc2.py | 41
-rw-r--r--  hypervideo_dl/downloader/fragment.py | 171
-rw-r--r--  hypervideo_dl/downloader/hls.py | 9
-rw-r--r--  hypervideo_dl/downloader/http.py | 130
-rw-r--r--  hypervideo_dl/downloader/ism.py | 4
-rw-r--r--  hypervideo_dl/downloader/mhtml.py | 13
-rw-r--r--  hypervideo_dl/downloader/rtmp.py | 3
-rw-r--r--  hypervideo_dl/downloader/websocket.py | 7
-rw-r--r--  hypervideo_dl/downloader/youtube_live_chat.py | 9
-rw-r--r--  hypervideo_dl/extractor/__init__.py | 21
-rw-r--r--  hypervideo_dl/extractor/abc.py | 67
-rw-r--r--  hypervideo_dl/extractor/abematv.py | 476
-rw-r--r--  hypervideo_dl/extractor/adn.py | 30
-rw-r--r--  hypervideo_dl/extractor/adobeconnect.py | 4
-rw-r--r--  hypervideo_dl/extractor/adobepass.py | 61
-rw-r--r--  hypervideo_dl/extractor/adobetv.py | 3
-rw-r--r--  hypervideo_dl/extractor/afreecatv.py | 120
-rw-r--r--  hypervideo_dl/extractor/aliexpress.py | 2
-rw-r--r--  hypervideo_dl/extractor/aljazeera.py | 87
-rw-r--r--  hypervideo_dl/extractor/allocine.py | 6
-rw-r--r--  hypervideo_dl/extractor/alsace20tv.py | 87
-rw-r--r--  hypervideo_dl/extractor/alura.py | 9
-rw-r--r--  hypervideo_dl/extractor/amazon.py | 53
-rw-r--r--  hypervideo_dl/extractor/animelab.py | 35
-rw-r--r--  hypervideo_dl/extractor/animeondemand.py | 31
-rw-r--r--  hypervideo_dl/extractor/ant1newsgr.py | 143
-rw-r--r--  hypervideo_dl/extractor/anvato.py | 7
-rw-r--r--  hypervideo_dl/extractor/aparat.py | 15
-rw-r--r--  hypervideo_dl/extractor/applepodcasts.py | 48
-rw-r--r--  hypervideo_dl/extractor/archiveorg.py | 512
-rw-r--r--  hypervideo_dl/extractor/arcpublishing.py | 5
-rw-r--r--  hypervideo_dl/extractor/ard.py | 76
-rw-r--r--  hypervideo_dl/extractor/arnes.py | 3
-rw-r--r--  hypervideo_dl/extractor/arte.py | 50
-rw-r--r--  hypervideo_dl/extractor/asiancrush.py | 3
-rw-r--r--  hypervideo_dl/extractor/atresplayer.py | 12
-rw-r--r--  hypervideo_dl/extractor/atvat.py | 6
-rw-r--r--  hypervideo_dl/extractor/audiomack.py | 35
-rw-r--r--  hypervideo_dl/extractor/awaan.py | 5
-rw-r--r--  hypervideo_dl/extractor/azmedien.py | 10
-rw-r--r--  hypervideo_dl/extractor/banbye.py | 153
-rw-r--r--  hypervideo_dl/extractor/bandaichannel.py | 1
-rw-r--r--  hypervideo_dl/extractor/bandcamp.py | 69
-rw-r--r--  hypervideo_dl/extractor/bbc.py | 89
-rw-r--r--  hypervideo_dl/extractor/beeg.py | 123
-rw-r--r--  hypervideo_dl/extractor/bigo.py | 59
-rw-r--r--  hypervideo_dl/extractor/bilibili.py | 405
-rw-r--r--  hypervideo_dl/extractor/biqle.py | 93
-rw-r--r--  hypervideo_dl/extractor/bitwave.py | 2
-rw-r--r--  hypervideo_dl/extractor/blogger.py | 54
-rw-r--r--  hypervideo_dl/extractor/bongacams.py | 2
-rw-r--r--  hypervideo_dl/extractor/br.py | 5
-rw-r--r--  hypervideo_dl/extractor/breitbart.py | 38
-rw-r--r--  hypervideo_dl/extractor/brightcove.py | 40
-rw-r--r--  hypervideo_dl/extractor/cableav.py | 34
-rw-r--r--  hypervideo_dl/extractor/callin.py | 114
-rw-r--r--  hypervideo_dl/extractor/caltrans.py | 41
-rw-r--r--  hypervideo_dl/extractor/cam4.py | 5
-rw-r--r--  hypervideo_dl/extractor/cammodels.py | 2
-rw-r--r--  hypervideo_dl/extractor/canalalpha.py | 98
-rw-r--r--  hypervideo_dl/extractor/canvas.py | 68
-rw-r--r--  hypervideo_dl/extractor/carambatv.py | 3
-rw-r--r--  hypervideo_dl/extractor/cbc.py | 182
-rw-r--r--  hypervideo_dl/extractor/cbs.py | 28
-rw-r--r--  hypervideo_dl/extractor/ccma.py | 13
-rw-r--r--  hypervideo_dl/extractor/cctv.py | 3
-rw-r--r--  hypervideo_dl/extractor/ceskatelevize.py | 130
-rw-r--r--  hypervideo_dl/extractor/chaturbate.py | 2
-rw-r--r--  hypervideo_dl/extractor/chingari.py | 4
-rw-r--r--  hypervideo_dl/extractor/closertotruth.py | 3
-rw-r--r--  hypervideo_dl/extractor/common.py | 468
-rw-r--r--  hypervideo_dl/extractor/corus.py | 1
-rw-r--r--  hypervideo_dl/extractor/coub.py | 3
-rw-r--r--  hypervideo_dl/extractor/cozytv.py | 40
-rw-r--r--  hypervideo_dl/extractor/cpac.py | 148
-rw-r--r--  hypervideo_dl/extractor/crackle.py | 40
-rw-r--r--  hypervideo_dl/extractor/craftsy.py | 71
-rw-r--r--  hypervideo_dl/extractor/crowdbunker.py | 113
-rw-r--r--  hypervideo_dl/extractor/crunchyroll.py | 359
-rw-r--r--  hypervideo_dl/extractor/cspan.py | 52
-rw-r--r--  hypervideo_dl/extractor/ctvnews.py | 5
-rw-r--r--  hypervideo_dl/extractor/curiositystream.py | 84
-rw-r--r--  hypervideo_dl/extractor/cybrary.py | 146
-rw-r--r--  hypervideo_dl/extractor/daftsex.py | 146
-rw-r--r--  hypervideo_dl/extractor/dailymotion.py | 33
-rw-r--r--  hypervideo_dl/extractor/daum.py | 5
-rw-r--r--  hypervideo_dl/extractor/daystar.py | 48
-rw-r--r--  hypervideo_dl/extractor/digitalconcerthall.py | 141
-rw-r--r--  hypervideo_dl/extractor/disney.py | 9
-rw-r--r--  hypervideo_dl/extractor/dispeak.py | 3
-rw-r--r--  hypervideo_dl/extractor/dlive.py | 2
-rw-r--r--  hypervideo_dl/extractor/doodstream.py | 37
-rw-r--r--  hypervideo_dl/extractor/douyutv.py | 2
-rw-r--r--  hypervideo_dl/extractor/dplay.py | 857
-rw-r--r--  hypervideo_dl/extractor/drooble.py | 116
-rw-r--r--  hypervideo_dl/extractor/dropbox.py | 44
-rw-r--r--  hypervideo_dl/extractor/dropout.py | 212
-rw-r--r--  hypervideo_dl/extractor/drtv.py | 18
-rw-r--r--  hypervideo_dl/extractor/dvtv.py | 7
-rw-r--r--  hypervideo_dl/extractor/egghead.py | 1
-rw-r--r--  hypervideo_dl/extractor/ellentube.py | 3
-rw-r--r--  hypervideo_dl/extractor/elonet.py | 85
-rw-r--r--  hypervideo_dl/extractor/engadget.py | 10
-rw-r--r--  hypervideo_dl/extractor/epicon.py | 4
-rw-r--r--  hypervideo_dl/extractor/eroprofile.py | 9
-rw-r--r--  hypervideo_dl/extractor/ertgr.py | 316
-rw-r--r--  hypervideo_dl/extractor/espn.py | 43
-rw-r--r--  hypervideo_dl/extractor/europeantour.py | 37
-rw-r--r--  hypervideo_dl/extractor/euscreen.py | 2
-rw-r--r--  hypervideo_dl/extractor/extractors.py | 366
-rw-r--r--  hypervideo_dl/extractor/facebook.py | 158
-rw-r--r--  hypervideo_dl/extractor/fancode.py | 41
-rw-r--r--  hypervideo_dl/extractor/fc2.py | 201
-rw-r--r--  hypervideo_dl/extractor/filmon.py | 2
-rw-r--r--  hypervideo_dl/extractor/fivetv.py | 3
-rw-r--r--  hypervideo_dl/extractor/flickr.py | 3
-rw-r--r--  hypervideo_dl/extractor/fox.py | 39
-rw-r--r--  hypervideo_dl/extractor/foxgay.py | 3
-rw-r--r--  hypervideo_dl/extractor/fptplay.py | 102
-rw-r--r--  hypervideo_dl/extractor/franceculture.py | 101
-rw-r--r--  hypervideo_dl/extractor/francetv.py | 6
-rw-r--r--  hypervideo_dl/extractor/frontendmasters.py | 13
-rw-r--r--  hypervideo_dl/extractor/fujitv.py | 70
-rw-r--r--  hypervideo_dl/extractor/funimation.py | 25
-rw-r--r--  hypervideo_dl/extractor/funk.py | 2
-rw-r--r--  hypervideo_dl/extractor/gab.py | 89
-rw-r--r--  hypervideo_dl/extractor/gaia.py | 30
-rw-r--r--  hypervideo_dl/extractor/gamejolt.py | 541
-rw-r--r--  hypervideo_dl/extractor/generic.py | 461
-rw-r--r--  hypervideo_dl/extractor/gettr.py | 159
-rw-r--r--  hypervideo_dl/extractor/gfycat.py | 43
-rw-r--r--  hypervideo_dl/extractor/glide.py | 4
-rw-r--r--  hypervideo_dl/extractor/globo.py | 43
-rw-r--r--  hypervideo_dl/extractor/glomex.py | 220
-rw-r--r--  hypervideo_dl/extractor/go.py | 8
-rw-r--r--  hypervideo_dl/extractor/gofile.py | 83
-rw-r--r--  hypervideo_dl/extractor/googlesearch.py | 21
-rw-r--r--  hypervideo_dl/extractor/gronkh.py | 5
-rw-r--r--  hypervideo_dl/extractor/hellporno.py | 3
-rw-r--r--  hypervideo_dl/extractor/hidive.py | 8
-rw-r--r--  hypervideo_dl/extractor/hitbox.py | 2
-rw-r--r--  hypervideo_dl/extractor/hotstar.py | 10
-rw-r--r--  hypervideo_dl/extractor/hrfensehen.py | 10
-rw-r--r--  hypervideo_dl/extractor/hrti.py | 15
-rw-r--r--  hypervideo_dl/extractor/hse.py | 95
-rw-r--r--  hypervideo_dl/extractor/huffpost.py | 3
-rw-r--r--  hypervideo_dl/extractor/huya.py | 137
-rw-r--r--  hypervideo_dl/extractor/imdb.py | 64
-rw-r--r--  hypervideo_dl/extractor/imggaming.py | 22
-rw-r--r--  hypervideo_dl/extractor/infoq.py | 2
-rw-r--r--  hypervideo_dl/extractor/instagram.py | 552
-rw-r--r--  hypervideo_dl/extractor/internazionale.py | 6
-rw-r--r--  hypervideo_dl/extractor/iprima.py | 145
-rw-r--r--  hypervideo_dl/extractor/iqiyi.py | 377
-rw-r--r--  hypervideo_dl/extractor/itprotv.py | 141
-rw-r--r--  hypervideo_dl/extractor/itv.py | 44
-rw-r--r--  hypervideo_dl/extractor/ivideon.py | 2
-rw-r--r--  hypervideo_dl/extractor/iwara.py | 3
-rw-r--r--  hypervideo_dl/extractor/jamendo.py | 2
-rw-r--r--  hypervideo_dl/extractor/joj.py | 3
-rw-r--r--  hypervideo_dl/extractor/kakao.py | 46
-rw-r--r--  hypervideo_dl/extractor/kaltura.py | 11
-rw-r--r--  hypervideo_dl/extractor/keezmovies.py | 3
-rw-r--r--  hypervideo_dl/extractor/kelbyone.py | 84
-rw-r--r--  hypervideo_dl/extractor/kinopoisk.py | 3
-rw-r--r--  hypervideo_dl/extractor/koo.py | 2
-rw-r--r--  hypervideo_dl/extractor/la7.py | 54
-rw-r--r--  hypervideo_dl/extractor/laola1tv.py | 4
-rw-r--r--  hypervideo_dl/extractor/lastfm.py | 129
-rw-r--r--  hypervideo_dl/extractor/lbry.py | 43
-rw-r--r--  hypervideo_dl/extractor/lecturio.py | 9
-rw-r--r--  hypervideo_dl/extractor/lego.py | 7
-rw-r--r--  hypervideo_dl/extractor/limelight.py | 2
-rw-r--r--  hypervideo_dl/extractor/line.py | 112
-rw-r--r--  hypervideo_dl/extractor/linkedin.py | 100
-rw-r--r--  hypervideo_dl/extractor/linuxacademy.py | 9
-rw-r--r--  hypervideo_dl/extractor/litv.py | 23
-rw-r--r--  hypervideo_dl/extractor/livestream.py | 4
-rw-r--r--  hypervideo_dl/extractor/lnkgo.py | 88
-rw-r--r--  hypervideo_dl/extractor/lynda.py | 11
-rw-r--r--  hypervideo_dl/extractor/mainstreaming.py | 219
-rw-r--r--  hypervideo_dl/extractor/mangomolo.py | 2
-rw-r--r--  hypervideo_dl/extractor/manyvids.py | 1
-rw-r--r--  hypervideo_dl/extractor/matchtv.py | 2
-rw-r--r--  hypervideo_dl/extractor/mdr.py | 12
-rw-r--r--  hypervideo_dl/extractor/medaltv.py | 3
-rw-r--r--  hypervideo_dl/extractor/mediaklikk.py | 4
-rw-r--r--  hypervideo_dl/extractor/mediaset.py | 165
-rw-r--r--  hypervideo_dl/extractor/mediasite.py | 11
-rw-r--r--  hypervideo_dl/extractor/megatvcom.py | 173
-rw-r--r--  hypervideo_dl/extractor/mgtv.py | 59
-rw-r--r--  hypervideo_dl/extractor/miaopai.py | 3
-rw-r--r--  hypervideo_dl/extractor/microsoftstream.py | 125
-rw-r--r--  hypervideo_dl/extractor/mildom.py | 336
-rw-r--r--  hypervideo_dl/extractor/minds.py | 3
-rw-r--r--  hypervideo_dl/extractor/mirrativ.py | 83
-rw-r--r--  hypervideo_dl/extractor/mixch.py | 85
-rw-r--r--  hypervideo_dl/extractor/mixcloud.py | 16
-rw-r--r--  hypervideo_dl/extractor/mlssoccer.py | 117
-rw-r--r--  hypervideo_dl/extractor/mojvideo.py | 3
-rw-r--r--  hypervideo_dl/extractor/mtv.py | 17
-rw-r--r--  hypervideo_dl/extractor/muenchentv.py | 2
-rw-r--r--  hypervideo_dl/extractor/murrtube.py | 165
-rw-r--r--  hypervideo_dl/extractor/musescore.py | 8
-rw-r--r--  hypervideo_dl/extractor/musicdex.py | 175
-rw-r--r--  hypervideo_dl/extractor/mxplayer.py | 2
-rw-r--r--  hypervideo_dl/extractor/myspass.py | 63
-rw-r--r--  hypervideo_dl/extractor/n1.py | 22
-rw-r--r--  hypervideo_dl/extractor/nate.py | 124
-rw-r--r--  hypervideo_dl/extractor/naver.py | 7
-rw-r--r--  hypervideo_dl/extractor/nba.py | 12
-rw-r--r--  hypervideo_dl/extractor/nbc.py | 27
-rw-r--r--  hypervideo_dl/extractor/ndr.py | 2
-rw-r--r--  hypervideo_dl/extractor/nebula.py | 368
-rw-r--r--  hypervideo_dl/extractor/neteasemusic.py | 13
-rw-r--r--  hypervideo_dl/extractor/newgrounds.py | 25
-rw-r--r--  hypervideo_dl/extractor/newstube.py | 10
-rw-r--r--  hypervideo_dl/extractor/newsy.py | 51
-rw-r--r--  hypervideo_dl/extractor/nexx.py | 147
-rw-r--r--  hypervideo_dl/extractor/nfb.py | 62
-rw-r--r--  hypervideo_dl/extractor/nfl.py | 2
-rw-r--r--  hypervideo_dl/extractor/nhk.py | 152
-rw-r--r--  hypervideo_dl/extractor/niconico.py | 823
-rw-r--r--  hypervideo_dl/extractor/ninecninemedia.py | 35
-rw-r--r--  hypervideo_dl/extractor/nitter.py | 221
-rw-r--r--  hypervideo_dl/extractor/njpwworld.py | 19
-rw-r--r--  hypervideo_dl/extractor/noco.py | 9
-rw-r--r--  hypervideo_dl/extractor/noodlemagazine.py | 67
-rw-r--r--  hypervideo_dl/extractor/nova.py | 34
-rw-r--r--  hypervideo_dl/extractor/novaplay.py | 4
-rw-r--r--  hypervideo_dl/extractor/npo.py | 4
-rw-r--r--  hypervideo_dl/extractor/npr.py | 3
-rw-r--r--  hypervideo_dl/extractor/nrk.py | 13
-rw-r--r--  hypervideo_dl/extractor/nrl.py | 1
-rw-r--r--  hypervideo_dl/extractor/ntvcojp.py | 27
-rw-r--r--  hypervideo_dl/extractor/nuvid.py | 49
-rw-r--r--  hypervideo_dl/extractor/odnoklassniki.py | 97
-rw-r--r--  hypervideo_dl/extractor/oktoberfesttv.py | 4
-rw-r--r--  hypervideo_dl/extractor/olympics.py | 71
-rw-r--r--  hypervideo_dl/extractor/ondemandkorea.py | 6
-rw-r--r--  hypervideo_dl/extractor/onefootball.py | 51
-rw-r--r--  hypervideo_dl/extractor/onet.py | 7
-rw-r--r--  hypervideo_dl/extractor/opencast.py | 177
-rw-r--r--  hypervideo_dl/extractor/openload.py | 14
-rw-r--r--  hypervideo_dl/extractor/openrec.py | 161
-rw-r--r--  hypervideo_dl/extractor/orf.py | 231
-rw-r--r--  hypervideo_dl/extractor/packtpub.py | 5
-rw-r--r--  hypervideo_dl/extractor/panopto.py | 607
-rw-r--r--  hypervideo_dl/extractor/paramountplus.py | 31
-rw-r--r--  hypervideo_dl/extractor/parliamentliveuk.py | 3
-rw-r--r--  hypervideo_dl/extractor/patreon.py | 12
-rw-r--r--  hypervideo_dl/extractor/pbs.py | 7
-rw-r--r--  hypervideo_dl/extractor/peekvids.py | 81
-rw-r--r--  hypervideo_dl/extractor/peertube.py | 5
-rw-r--r--  hypervideo_dl/extractor/peertv.py | 57
-rw-r--r--  hypervideo_dl/extractor/peloton.py | 1
-rw-r--r--  hypervideo_dl/extractor/periscope.py | 2
-rw-r--r--  hypervideo_dl/extractor/piapro.py | 96
-rw-r--r--  hypervideo_dl/extractor/picarto.py | 4
-rw-r--r--  hypervideo_dl/extractor/piksel.py | 10
-rw-r--r--  hypervideo_dl/extractor/pixivsketch.py | 122
-rw-r--r--  hypervideo_dl/extractor/pladform.py | 26
-rw-r--r--  hypervideo_dl/extractor/planetmarathi.py | 76
-rw-r--r--  hypervideo_dl/extractor/platzi.py | 9
-rw-r--r--  hypervideo_dl/extractor/playplustv.py | 12
-rw-r--r--  hypervideo_dl/extractor/playtvak.py | 2
-rw-r--r--  hypervideo_dl/extractor/playvid.py | 3
-rw-r--r--  hypervideo_dl/extractor/pluralsight.py | 9
-rw-r--r--  hypervideo_dl/extractor/plutotv.py | 7
-rw-r--r--  hypervideo_dl/extractor/pokemon.py | 40
-rw-r--r--  hypervideo_dl/extractor/pokergo.py | 109
-rw-r--r--  hypervideo_dl/extractor/polsatgo.py | 90
-rw-r--r--  hypervideo_dl/extractor/polskieradio.py | 303
-rw-r--r--  hypervideo_dl/extractor/pornez.py | 43
-rw-r--r--  hypervideo_dl/extractor/pornflip.py | 1
-rw-r--r--  hypervideo_dl/extractor/pornhub.py | 16
-rw-r--r--  hypervideo_dl/extractor/projectveritas.py | 2
-rw-r--r--  hypervideo_dl/extractor/prx.py | 431
-rw-r--r--  hypervideo_dl/extractor/radiode.py | 2
-rw-r--r--  hypervideo_dl/extractor/radiokapital.py | 99
-rw-r--r--  hypervideo_dl/extractor/radiozet.py | 51
-rw-r--r--  hypervideo_dl/extractor/radlive.py | 10
-rw-r--r--  hypervideo_dl/extractor/rai.py | 198
-rw-r--r--  hypervideo_dl/extractor/rcti.py | 128
-rw-r--r--  hypervideo_dl/extractor/redbulltv.py | 3
-rw-r--r--  hypervideo_dl/extractor/reddit.py | 86
-rw-r--r--  hypervideo_dl/extractor/redgifs.py | 232
-rw-r--r--  hypervideo_dl/extractor/redtube.py | 35
-rw-r--r--  hypervideo_dl/extractor/rmcdecouverte.py | 1
-rw-r--r--  hypervideo_dl/extractor/rokfin.py | 256
-rw-r--r--  hypervideo_dl/extractor/roosterteeth.py | 208
-rw-r--r--  hypervideo_dl/extractor/rtbf.py | 2
-rw-r--r--  hypervideo_dl/extractor/rtl2.py | 16
-rw-r--r--  hypervideo_dl/extractor/rtnews.py | 199
-rw-r--r--  hypervideo_dl/extractor/rtrfm.py | 67
-rw-r--r--  hypervideo_dl/extractor/rtve.py | 95
-rw-r--r--  hypervideo_dl/extractor/rtvs.py | 74
-rw-r--r--  hypervideo_dl/extractor/rule34video.py | 65
-rw-r--r--  hypervideo_dl/extractor/rumble.py | 17
-rw-r--r--  hypervideo_dl/extractor/rutube.py | 21
-rw-r--r--  hypervideo_dl/extractor/rutv.py | 13
-rw-r--r--  hypervideo_dl/extractor/ruutu.py | 15
-rw-r--r--  hypervideo_dl/extractor/ruv.py | 88
-rw-r--r--  hypervideo_dl/extractor/safari.py | 9
-rw-r--r--  hypervideo_dl/extractor/sbs.py | 17
-rw-r--r--  hypervideo_dl/extractor/scte.py | 9
-rw-r--r--  hypervideo_dl/extractor/senategov.py | 213
-rw-r--r--  hypervideo_dl/extractor/sendtonews.py | 2
-rw-r--r--  hypervideo_dl/extractor/sevenplus.py | 1
-rw-r--r--  hypervideo_dl/extractor/shahid.py | 8
-rw-r--r--  hypervideo_dl/extractor/shemaroome.py | 11
-rw-r--r--  hypervideo_dl/extractor/showroomlive.py | 2
-rw-r--r--  hypervideo_dl/extractor/skeb.py | 143
-rw-r--r--  hypervideo_dl/extractor/sky.py | 28
-rw-r--r--  hypervideo_dl/extractor/skyit.py | 7
-rw-r--r--  hypervideo_dl/extractor/skylinewebcams.py | 2
-rw-r--r--  hypervideo_dl/extractor/skynewsau.py | 2
-rw-r--r--  hypervideo_dl/extractor/slideslive.py | 3
-rw-r--r--  hypervideo_dl/extractor/sonyliv.py | 60
-rw-r--r--  hypervideo_dl/extractor/soundcloud.py | 344
-rw-r--r--  hypervideo_dl/extractor/southpark.py | 17
-rw-r--r--  hypervideo_dl/extractor/sovietscloset.py | 15
-rw-r--r--  hypervideo_dl/extractor/spiegel.py | 2
-rw-r--r--  hypervideo_dl/extractor/sportdeutschland.py | 8
-rw-r--r--  hypervideo_dl/extractor/srgssr.py | 7
-rw-r--r--  hypervideo_dl/extractor/steam.py | 140
-rw-r--r--  hypervideo_dl/extractor/storyfire.py | 17
-rw-r--r--  hypervideo_dl/extractor/streamcz.py | 173
-rw-r--r--  hypervideo_dl/extractor/streamff.py | 31
-rw-r--r--  hypervideo_dl/extractor/stripchat.py | 66
-rw-r--r--  hypervideo_dl/extractor/stv.py | 5
-rw-r--r--  hypervideo_dl/extractor/sunporno.py | 3
-rw-r--r--  hypervideo_dl/extractor/svt.py | 32
-rw-r--r--  hypervideo_dl/extractor/tagesschau.py | 279
-rw-r--r--  hypervideo_dl/extractor/teachable.py | 3
-rw-r--r--  hypervideo_dl/extractor/teamtreehouse.py | 7
-rw-r--r--  hypervideo_dl/extractor/ted.py | 477
-rw-r--r--  hypervideo_dl/extractor/tele5.py | 87
-rw-r--r--  hypervideo_dl/extractor/telebruxelles.py | 2
-rw-r--r--  hypervideo_dl/extractor/telegram.py | 37
-rw-r--r--  hypervideo_dl/extractor/telemundo.py | 5
-rw-r--r--  hypervideo_dl/extractor/telequebec.py | 12
-rw-r--r--  hypervideo_dl/extractor/tennistv.py | 9
-rw-r--r--  hypervideo_dl/extractor/tenplay.py | 44
-rw-r--r--  hypervideo_dl/extractor/tf1.py | 1
-rw-r--r--  hypervideo_dl/extractor/theta.py | 10
-rw-r--r--  hypervideo_dl/extractor/thisav.py | 4
-rw-r--r--  hypervideo_dl/extractor/thisoldhouse.py | 17
-rw-r--r--  hypervideo_dl/extractor/threeqsdn.py | 18
-rw-r--r--  hypervideo_dl/extractor/threespeak.py | 97
-rw-r--r--  hypervideo_dl/extractor/tiktok.py | 449
-rw-r--r--  hypervideo_dl/extractor/toggo.py | 73
-rw-r--r--  hypervideo_dl/extractor/tokentube.py | 12
-rw-r--r--  hypervideo_dl/extractor/tonline.py | 9
-rw-r--r--  hypervideo_dl/extractor/toutv.py | 7
-rw-r--r--  hypervideo_dl/extractor/traileraddict.py | 3
-rw-r--r--  hypervideo_dl/extractor/trovo.py | 43
-rw-r--r--  hypervideo_dl/extractor/trueid.py | 139
-rw-r--r--  hypervideo_dl/extractor/tubitv.py | 20
-rw-r--r--  hypervideo_dl/extractor/tumblr.py | 408
-rw-r--r--  hypervideo_dl/extractor/tunein.py | 2
-rw-r--r--  hypervideo_dl/extractor/turner.py | 2
-rw-r--r--  hypervideo_dl/extractor/tv2.py | 17
-rw-r--r--  hypervideo_dl/extractor/tv2dk.py | 17
-rw-r--r--  hypervideo_dl/extractor/tver.py | 37
-rw-r--r--  hypervideo_dl/extractor/tvnet.py | 7
-rw-r--r--  hypervideo_dl/extractor/tvopengr.py | 128
-rw-r--r--  hypervideo_dl/extractor/tvp.py | 461
-rw-r--r--  hypervideo_dl/extractor/tvplay.py | 114
-rw-r--r--  hypervideo_dl/extractor/tvplayer.py | 2
-rw-r--r--  hypervideo_dl/extractor/twitcasting.py | 166
-rw-r--r--  hypervideo_dl/extractor/twitch.py | 96
-rw-r--r--  hypervideo_dl/extractor/twitter.py | 13
-rw-r--r--  hypervideo_dl/extractor/udemy.py | 9
-rw-r--r--  hypervideo_dl/extractor/uol.py | 1
-rw-r--r--  hypervideo_dl/extractor/urplay.py | 53
-rw-r--r--  hypervideo_dl/extractor/ustream.py | 5
-rw-r--r--  hypervideo_dl/extractor/utreon.py | 2
-rw-r--r--  hypervideo_dl/extractor/varzesh3.py | 3
-rw-r--r--  hypervideo_dl/extractor/veo.py | 47
-rw-r--r--  hypervideo_dl/extractor/veoh.py | 62
-rw-r--r--  hypervideo_dl/extractor/vgtv.py | 6
-rw-r--r--  hypervideo_dl/extractor/vice.py | 1
-rw-r--r--  hypervideo_dl/extractor/videa.py | 9
-rw-r--r--  hypervideo_dl/extractor/videocampus_sachsen.py | 96
-rw-r--r--  hypervideo_dl/extractor/vidio.py | 14
-rw-r--r--  hypervideo_dl/extractor/vidlii.py | 50
-rw-r--r--  hypervideo_dl/extractor/viewlift.py | 189
-rw-r--r--  hypervideo_dl/extractor/viki.py | 30
-rw-r--r--  hypervideo_dl/extractor/vimeo.py | 521
-rw-r--r--  hypervideo_dl/extractor/vimm.py | 69
-rw-r--r--  hypervideo_dl/extractor/vine.py | 3
-rw-r--r--  hypervideo_dl/extractor/viu.py | 226
-rw-r--r--  hypervideo_dl/extractor/vk.py | 118
-rw-r--r--  hypervideo_dl/extractor/vlive.py | 256
-rw-r--r--  hypervideo_dl/extractor/voicy.py | 7
-rw-r--r--  hypervideo_dl/extractor/voot.py | 2
-rw-r--r--  hypervideo_dl/extractor/vrv.py | 67
-rw-r--r--  hypervideo_dl/extractor/vshare.py | 3
-rw-r--r--  hypervideo_dl/extractor/vupload.py | 12
-rw-r--r--  hypervideo_dl/extractor/vyborymos.py | 4
-rw-r--r--  hypervideo_dl/extractor/wakanim.py | 26
-rw-r--r--  hypervideo_dl/extractor/wasdtv.py | 161
-rw-r--r--  hypervideo_dl/extractor/washingtonpost.py | 21
-rw-r--r--  hypervideo_dl/extractor/watchbox.py | 2
-rw-r--r--  hypervideo_dl/extractor/wdr.py | 65
-rw-r--r--  hypervideo_dl/extractor/webcaster.py | 8
-rw-r--r--  hypervideo_dl/extractor/weibo.py | 3
-rw-r--r--  hypervideo_dl/extractor/whowatch.py | 9
-rw-r--r--  hypervideo_dl/extractor/willow.py | 58
-rw-r--r--  hypervideo_dl/extractor/wppilot.py | 177
-rw-r--r--  hypervideo_dl/extractor/xinpianchang.py | 95
-rw-r--r--  hypervideo_dl/extractor/xnxx.py | 5
-rw-r--r--  hypervideo_dl/extractor/xvideos.py | 32
-rw-r--r--  hypervideo_dl/extractor/yahoo.py | 46
-rw-r--r--  hypervideo_dl/extractor/yandexvideo.py | 99
-rw-r--r--  hypervideo_dl/extractor/youjizz.py | 3
-rw-r--r--  hypervideo_dl/extractor/younow.py | 5
-rw-r--r--  hypervideo_dl/extractor/youtube.py | 3631
-rw-r--r--  hypervideo_dl/extractor/zattoo.py | 25
-rw-r--r--  hypervideo_dl/extractor/zdf.py | 61
-rw-r--r--  hypervideo_dl/extractor/zee5.py | 117
-rw-r--r--  hypervideo_dl/extractor/zhihu.py | 4
-rw-r--r--  hypervideo_dl/extractor/zingmp3.py | 159
-rw-r--r--  hypervideo_dl/extractor/zoom.py | 40
-rw-r--r--  hypervideo_dl/jsinterp.py | 492
-rw-r--r--  hypervideo_dl/minicurses.py | 86
-rw-r--r--  hypervideo_dl/options.py | 531
-rw-r--r--  hypervideo_dl/postprocessor/__init__.py | 8
-rw-r--r--  hypervideo_dl/postprocessor/common.py | 44
-rw-r--r--  hypervideo_dl/postprocessor/embedthumbnail.py | 77
-rw-r--r--  hypervideo_dl/postprocessor/exec.py | 21
-rw-r--r--  hypervideo_dl/postprocessor/ffmpeg.py | 519
-rw-r--r--  hypervideo_dl/postprocessor/metadataparser.py | 29
-rw-r--r--  hypervideo_dl/postprocessor/modify_chapters.py | 22
-rw-r--r--  hypervideo_dl/postprocessor/sponskrub.py | 13
-rw-r--r--  hypervideo_dl/postprocessor/sponsorblock.py | 37
-rw-r--r--  hypervideo_dl/utils.py | 2738
-rw-r--r--  hypervideo_dl/version.py | 6
-rw-r--r--  hypervideo_dl/webvtt.py | 8
-rw-r--r--  requirements.txt | 3
-rw-r--r--  setup.py | 6
-rw-r--r--  test/helper.py | 47
-rw-r--r--  test/parameters.json | 2
-rw-r--r--  test/test_InfoExtractor.py | 184
-rw-r--r--  test/test_YoutubeDL.py | 56
-rw-r--r--  test/test_aes.py | 18
-rw-r--r--  test/test_all_urls.py | 1
-rw-r--r--  test/test_cookies.py | 36
-rwxr-xr-x [-rw-r--r--]  test/test_download.py | 2
-rw-r--r--  test/test_netrc.py | 13
-rw-r--r--  test/test_postprocessors.py | 4
-rw-r--r--  test/test_subtitles.py | 4
-rw-r--r--  test/test_utils.py | 214
-rw-r--r--  test/test_verbose_output.py | 16
-rw-r--r--  test/test_youtube_lists.py | 42
475 files changed, 32107 insertions, 11774 deletions
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..33923ec
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,1136 @@
+0l-l0
+0x9fff00
+1-Byte
+23rd
+2ShedsJackson
+3risian
+4a1e2y5
+4rensiker
+50csent
+5moufl
+A Connecticut Princess
+AGSPhoenix
+Aakash Gajjar
+Aarni Koskela
+Aaron Brager
+Aaron Lipinski
+Aaron Wojnowski
+Aaron Zeng
+Abdullah Ibn Fulan
+Abhishek Kedia
+Adam
+Adam Glenn
+Adam Malcontenti-Wilson
+Adam Mesha
+Adam Thalhammer
+Adam Voss
+Adrian Heine né Lang
+Adrian Kretz
+Adrik
+Aidan Rowe
+Alan Yee
+Albert Kim
+Aldo Gunsing
+Aleksandar Topuzovic
+Aleksander Nitecki
+Aleri Kaisattera
+Ales Jirasek
+Alessandro Ghedini
+Alex Merkel
+Alex Monk
+Alex Seiler
+Alex Van't Hof
+Alex Vong
+Alexander Kirk
+Alexander Simon
+Alexander van Gessel
+Alexandre Huot
+Alexandre Macabies
+Alexey Trofimov
+Alf Marius
+Alfonso Solbes
+Ali Irani
+Ali Sherief
+Allan Daemon
+Allan Zhou
+Alpesh Valia
+Amaury Gauthier
+Amish Bhadeshia
+Anand Babu Periasamy
+Anarky
+Anders Einar Hilden
+Andras Elso
+Andre Walker
+Andreas Schmitz
+Andrei Troie
+AndreiArba
+Andrew "Akari" Alexeyew
+Andrew Bottom
+Andrew J. Erickson
+Andrew Morgan
+Andrew Udvare
+AndrewMBL
+Andrey Smirnoff
+AndroKev
+Andrzej Lichnerowicz
+András Veres-Szentkirályi
+Andy Savicki
+Anh Nhan Nguyen
+Aniruddh Joshi
+Aniruddh-J
+Anisse Astier
+Anna Bernardi
+Anssi Hannula
+Anthony Fok
+Anthony J. Bentley
+Anthony Weems
+Anton Larionov
+Anton Novosyolov
+Antti Ajanki
+Arend v. Reinersdorff
+Argn0
+Ariset Llerena
+Arjan Verwer
+Arjun Sreedharan
+Art Zhitnik
+Arvydas Sidorenko
+Ashish Gupta
+Ashutosh Chaudhary
+Atlas Sullivan
+Attila-Mihaly Balazs
+Aurora
+Aurélien Dunand
+Aurélien Grosdidier
+Aurélio A. Heckert
+Austin Adams
+Austin de Coup-Crank
+Awal Garg
+Bagira
+Barbara Miller
+Barbu Paul - Gheorghe
+Bart Kappenburg
+Bastian de Groot
+Batuhan's Unmaintained Account
+Behrooz
+Ben Rog-Wilhelm
+Benedikt Wildenhain
+Benjamin Congdon
+Bepis
+Bernhard M. Wiedemann
+Bjorn Heesakkers
+BlahGeek
+Bob Poekert
+BohwaZ
+Bojidar Qnkov
+Boris Wachtmeister
+Brian Foley
+Brian Marks
+Bricio
+BunnyHelp
+CHJ85
+CXwudi
+Camillo Dell'mour
+Carlos Ramos
+Celthi
+CeruleanSky
+Cédric Luthi
+Charles Chen
+Charlie Le
+ChillingPepper
+Ching Yi, Chan
+Chirantan Ekbote
+Chris Gavin
+Chris Hranj
+Christian Albrecht
+Christian Paul
+Christian Pointner
+Christoph Döpmann
+Christopher Krooss
+Christopher Neugebauer
+Christopher Smith
+Chuck Cho
+Cian Ruane
+CkuT
+Clément DAVID
+Corey Farwell
+Corey Nicholson
+Cory Hall
+Costy Petrisor
+CplPwnies
+Craig Markwardt
+CrypticSignal
+CyberJacob
+Cyril Roelandt
+Cássio Ávila
+DEvmIb
+DaMightyZombie
+Daan van Vugt
+Damiano Amatruda
+Damon Timm
+Dan Church
+Dan Salmon
+Dan Walker
+Dan Weber
+Daniel
+Daniel Bolton
+Daniel Höpfl
+Daniel Peukert
+Daniel Twardowski
+Daniel.Zeng
+Danko Alexeyev
+Dankryn
+Dao Hoang Son
+Dario Guarascio
+DarkZeros
+DarkstaIkers
+Dave
+Dave Loyall
+Dave Vasilevsky
+David
+David Bauer
+David Ben Zakai
+David Caldwell
+David Coppa
+David Development
+David Fabijan
+David Haberthür
+David Powell
+David Rabinowitz
+David Skrundz
+David Triendl
+David Wagner
+Deer-Spangle
+Delon
+Derek Land
+DesweR
+Devin J. Pohly
+Devon Meunier
+Diego Fernando Rodríguez Varón
+DigitalDJ
+Dimitre Liotev
+Dobrosław Żybort
+Dominik
+Dominik Heidler
+Dorian Westacott
+Douglas Su
+DrWursterich
+Dracony
+DroidFreak32
+Duncan
+Duncan Keall
+Déstin Reed
+Eduardo Ferro
+Edward Betts
+Eitan Adler
+Eitan Postavsky
+Elan Ruusamäe
+Elias Probst
+Emanuel Hoogeveen
+Emilien Kenler
+Emmanuel Froissart
+Enes
+EntranceJew
+Entropy
+Eric Wong
+Erik
+Erik Johnson
+Erwin de Haan
+FND
+Fabian Stahl
+Fai
+Fam0r
+Felix S
+Felix Stupp
+Felix Yan
+FestplattenSchnitzel
+Filip B
+Filippo Valsorda
+Finn Petersen
+FireDart
+FliegendeWurst
+FooBarQuaxx
+Founder Fang
+Francesco Frassinelli
+Francois du Toit
+Frans de Jonge
+François Charlier
+François Revol
+Frederic Bournival
+GDR!
+Gabriel Schubiner
+Gaetan Gilbert
+Gary
+Gaurav
+Gautam M
+Genki Sky
+Georg Jaehnig
+George Boyle
+George Brighton
+George Schizas
+Georgi Saev
+Georgi Valkov
+Gergely Imreh
+Giedrius Statkevičius
+Gilles Pietri
+Gino Lisignoli
+Giovanni Visentini
+Giuseppe Fabiano
+Gjorgji Jankovski
+Glenn Slayden
+Gorfiend
+Grabien
+GreyAlien502
+Grom PE
+Grzegorz P
+Grzegorz Ruciński
+Guillem Vela
+Ha Tien Loi
+Hadi0609
+Hakim Boyles
+Han Dai
+HanYOLO
+Hannu Hartikainen
+Hannu Lintala
+Haricharan Padmanaban
+Hendrik Schröter
+Hendrik v. Raven
+Henrik Heimbuerger
+Hirokuni Yano
+Hongjie Dong
+Hormoz K
+Hubert Hirtz
+Hugo Alves De Azevedo
+Huyuumi
+IONECarter
+Idan Kamara
+InfernalUnderling
+Irfan Charania
+Isaac-the-Man
+Ismael Mejia
+Itay Brandes
+Iulian Onofrei
+Ivan Kozik
+J
+J.D. Purcell
+JChris246
+Jack Danger Canty
+Jacob Chapman
+Jacob Kaplan-Moss
+Jai Grimshaw
+Jaime Marquínez Ferrándiz
+Jaime Marquínez Ferrándiz
+Jakub Adam Wieczorek
+Jakub Wilk
+Jalaz Kumar
+JamKage
+Jan 'Yenda' Trmal
+Jan Friesse
+Jan Kratochvil
+Jan Kundrát
+Jan Schär
+Janez Troha
+Jason Normore
+Jason Terk
+Jay
+Jeff Buchbinder
+Jeff Crouse
+Jeff Huffman
+Jeff Smith
+Jelle van der Waa
+Jens Rutschmann
+Jens Timmerman
+Jens Wille
+Jeremie J. Jarosh
+Jertzukka
+Jesse
+Jesse de Zwart
+Jesús
+Jia Rong Yee
+JianxinLi
+Jimbolino
+Jimm Stout
+Joakim Fremstad
+Jody Bruchon
+Joe Frambach
+Joel Potts
+Joel Verhagen
+Joey Adams
+Johan
+Johan K. Jensen
+Johannes Knoedtel
+Johannes N
+John Assael
+John Boehr
+John D
+John Hawkinson
+John Peel
+Johny Mo Swag
+Joost Verdoorn
+Joram Schrijver
+JordanWeatherby
+Joseph Frazier
+Joseph Spiros
+Josh Soref
+Joshua Elsasser
+Joshua Lochner
+Josu Moreno
+Jouke Waleson
+Juan C. Olivares
+Juan Carlos Garcia Segovia
+Juan Francisco Cantero Hurtado
+Juan M
+Juanjo Benages
+Jules-A
+Julien Hadley Jack
+Justin Keogh
+Justin Quan
+Justsoos
+Jérôme Duval
+Kacper Michajłow
+Kagami Hiiragi
+Kai Weber
+Kang Hyojun
+Kareem Moussa
+Kazuma Takahara
+Kegan
+Keith Beckman
+Ken Swenson
+Kevin Deldycke
+Kevin Kwan
+Kevin Ngo
+Kevin O'Connor
+Kevin Velghe
+Kfir Breger
+Khang Nguyen
+KiberInfinity
+Kid
+Kieran O'Reilly
+Kitten King
+Kyle
+Kyu Yeun Kim
+LE
+Laneone
+LangerJan
+Lapinot
+Lars Vierbergen
+Lauren Liberda
+Laurent Raufaste
+Leonardo Amaral
+Leonardo Taccari
+Leslie P. Polzer
+Lesmiscore (Naoya Ozaki)
+Li4ick
+Lionel Elie Mamane
+Liu DongMiao
+Logan B
+Logan Fleur
+Lovius
+Luc Ritchie
+Luca Cherubin
+Luca Steeb
+Lucas
+Lucas M
+Lucas Moura
+Lukas Anzinger
+Lukas Fink
+Lukáš Lalinský
+Léo El Amri
+M.K
+M.Yasoob Khalid
+MAA
+MMM
+MRWITEK
+Magnus Kolstad
+Malte Kiefer
+Mamay Alexander
+Mantas Mikulėnas
+Manu Cornet
+Mao Zedong
+Marcin Cieślak
+Marco Fantauzzo
+Marco Ferragina
+Marco Schuster
+Marek Rusinowski
+Marian Sigler
+Mark Lee
+Mark Oteiza
+Mark Schreiber
+Markus Müller
+Martin Michlmayr
+Martin Polden
+Martin Ström
+Martin Trigaux
+Martin Weinelt
+Marvin Ewald
+Matej Dujava
+Mathias Rav
+Mats
+Matt Broadway
+Matt Crupi
+Matthew Franglen
+Matthew Rayermann
+Matthew Rayfield
+Matthieu Muffato
+Mattias Harrysson
+Mattias Wadman
+Matěj Cepl
+Max
+Max Mehl
+Max Teegen
+MaxReimann
+Mel Shafer
+Meneth32
+Mevious
+Michael Haggerty
+Michael Kaiser
+Michael Klein
+Michael Käufl
+Michael Munch
+Michael Orlitzky
+Michael Pauley
+Michael Smith
+Michael Tilbury
+Michael Walter
+Michal Kubeček
+Michal Čihař
+Mike Fährmann
+MikeCol
+MinePlayersPE
+Miroslav Šedivý
+Mister Hat
+Mitsukarenai
+MobiDotS
+Mohamedh Fazal
+Mohammad Khaled AbouElSherbini
+Mohammad Teimori Pabandi
+Mohammed Yaseen Mowzer
+Moises Lima
+Moritz Patelscheck
+MrDoritos
+MrRawes
+Muratcan Simsek
+N1k145
+NRTICN
+Naglis Jonaitis
+Namnamseo
+Nathan Rossi
+Nehal Patel
+NeroBurner
+Nevar Angelo
+Nick Daniels
+Nicolas Kaiser
+Nicolas SAPA
+Nicolas Évrard
+Nii-90
+Niklas Haas
+Niklas Laxström
+Nikoli
+Nil Admirari
+NotFound
+Odd Stråbø
+OhMyBahGosh
+Ole Ernst
+Oleg Prutz
+Oli Allen
+Oliver Freyermuth
+Olivier Bilodeau
+Ondřej Bárta
+Ondřej Caletka
+Ori Avtalion
+Orn
+Osama Khalid
+Oskar Cieslik
+Oskar Jauch
+P-reducible
+PB
+PC
+PSJay
+PSlava
+Paper
+Parmjit Virk
+Pascal Brax
+Patrice Levesque
+Patrick Dessalle
+Patrick Griffis
+Paul Hartmann
+Paul Henning
+Paul Ivanov
+Paul Wise
+Paul Wrubel
+Pawit Pornkitprasan
+Pccode66
+Pete Hemery
+Peter
+Peter Oettig
+Peter Pitzulo
+Peter Rowlands
+PeterDing
+Petr Kutalek
+Petr Novák
+Petr Vaněk
+Petr Zvoníček
+Phil Kulak
+Philip Huppert
+Philip Xu
+Philipp Hagemeister
+Philipp Stehle
+Phạm Ngọc Quang Nam
+Pierre
+Pierre Fenoll
+Pierre Mdawar
+Pierre Rudloff
+PilzAdam
+PishPosh.McGee
+Pornophage
+Poschi
+Pratyush Singh
+PrinceOfPuppers
+Protuhj
+Puck Meerburg
+Purdea Andrei
+Qijiang Fan
+Quan Hua
+Quentin Rameau
+RPing
+Rafal Borczuch
+Ralf Haring
+Random User
+Raphael Michel
+Rasmus Rendal
+Rastislav Barlik
+Ray Douglass
+Remita Amine
+Reto Kromer
+Reventl0v
+RexYuan
+RiCON
+Ricardo
+Ricardo Constantino
+Ricardo Garcia
+Richard Clamp
+Rob
+Rob van Bekkum
+Robert Smith
+Robin
+Robin Dunn
+Robin Houtevelts
+Robin Neatherway
+Rogério Brito
+Roland Hieber
+Roman Beránek
+Roman Le Négrate
+Roman Sebastian Karwacik
+RomanEmelyanov
+Ronald Ip
+Ronnnny
+Roxedus
+Ruirize
+Ryan Hendrickson
+Ryan Schmidt
+Rémy Léone
+Sahebjot singh
+Saimadhav Heblikar
+Sainyam Kapoor
+Sam
+Samik Some
+Sander
+Sander van den Oever
+Santiago Calcagno
+Scott Leggett
+Seamus Phelan
+Sebastian Blunt
+Sebastian Haas
+Sebastian Leske
+Sematre
+Sen Jiang
+SeonjaeHyeon
+Sergey
+Sergey Alirzaev
+Sergey M․
+Sergio Livi
+Serkora
+Shadab Zafar
+Shai Coleman
+Shaun Walbridge
+Shaya G
+Shrimadhav U K
+Sidney de Koning
+Silvan Mosberger
+Simon Morgan
+Simon W. Jackson
+Singwai Chan
+Sipherdrakon
+SirCipherz
+Slava Shklyaev
+Soebb
+Soneé John
+Sonic
+Stanislav Kupryakhin
+Stanny Nuytkens
+Starsam80
+Stavros Ntentos
+Stefan Pöschel
+Stefan-Gabriel Muscalu
+Steffan Donal
+Stephan
+Stephen Stair
+Steven Gosseling
+Steven Maude
+Sukhbir Singh
+Surkal
+Surya Oktafendri
+SyxbEaEQ2
+TRox1972
+Tailszefox
+Takuya Tsuchida
+Tatsuyuki Ishi
+Teemu Ikonen
+TheRealDude2
+Thijs Vermeir
+Thomas Christlieb
+Thomas Jost
+Thomas van der Berg
+Thor77
+Throaway
+Tianyi Shi
+Till Maas
+Tim
+Tim Broder
+Tim Douglas
+Tim Landscheidt
+Tim Schindler
+Tim Sogard
+Timendum
+Timmy
+TinyToweringTree
+Tithen-Firion
+Tjark Saul
+Toan Nguyen
+Tobias Bell
+Tobias Florek
+Tobias Gruetzmacher
+Tobias Kunze
+Tobias Salzmann
+Todoroki
+Tom
+Tom Gijselinck
+Tom-Oliver Heidel
+Tomáš Čech
+Toni Viemerö
+TotalCaesar659
+Trevor Nelson
+Tristan Waddington
+Tyler Szabo
+Unit 193
+Unknown
+Urgau
+Varun
+Vasyl' Vavrychuk
+Vid
+VietTPham
+Vignesh Venkat
+Vijay Singh
+Viktor Szakats
+Viren Rajput
+Vitaliy Syrchikov
+Vobe
+Vrihub
+Vukkk
+Vítor Galvão
+Wandang
+Wang Jun Tham
+WassimAttar
+Wes
+Will Glynn
+Will Sewell
+Windom
+Witchakorn Kamolpornwijit
+Witold Baryluk
+WolfganP
+Xaver Hellauer
+Xiao Di Guan
+Xie Yanbo
+Xu Cheng
+Xuan Hu (Sean)
+Yakabuff
+Yasoob
+Yen Chi Hsuan
+Your Name
+Yuan Chao
+YuenSzeHong
+Yurii H
+Yuriy Melnyk
+Zach Bruggeman
+Zack Fernandes
+Zenon Mousmoulas
+Zhong Jianxin
+Zirro
+aarubui
+aegamesi
+aeph6Ee0
+aerworker
+ajj8
+alarig
+alimirjamali
+alphapapa
+alxnull
+amigatomte
+anatoly techtonik
+andi
+animelover1984
+anovicecodemonkey
+arza
+ashutosh-mishra
+atomic83
+atomizer
+aviperes
+axelerometer
+aystroganov@gmail.com
+azeem
+bastik
+bato3
+beefchop
+bitraid
+biwubo
+blissland
+bonfy
+bopol
+bpfoley
+bzc6p
+cant-think-of-a-name
+cantandwont
+capital-G
+catboy
+catlover999
+cazulu
+cclauss
+cdarlint
+chaos33
+chaoskagami
+charon2019
+chien-yu
+chio0hai
+chocolateboy
+chris
+ckuu
+cladmi
+clauderains
+cntrl-s
+codelol
+codesparkle
+coletdev
+coletdjnz
+compujo
+comsomisha
+coolsa
+coreynicholson
+corone17
+cpm
+cryptonaut
+cryzed
+cyberfox1691
+cypheron
+d2au
+dalan
+dannyc@omega
+dannycolligan
+danut007ro
+davex25
+denneboomyo
+dequis
+dimqua
+dinesh
+dirkf
+dmsummers
+dodo
+dongmao zhang
+dubber0
+dundua
+dwemthy
+dyn888
+ealgase
+enigmaquip
+epitron
+ericpardee
+exwm
+f4pp3rk1ng
+felix
+fiocfun
+flatgreen
+fluks
+fnord
+foghawk
+forDream
+frenchy1983
+funniray
+gam2046
+gcmalloc
+gdzx
+geauxlo
+geditorit
+git-anony-mouse
+github-actions
+gkoelln
+grimreaper
+gritstub
+guredora
+gustaf
+h-collector
+ha shao
+hakatashi
+hassaanaliw
+hcwhan
+hdclark
+hedii
+helb
+hh0rva1h
+hmlinaric
+hojel
+hrimfaxi
+hseg
+hub2git
+huichen90
+huohuarong
+hurda
+i6t
+ian
+igv
+inondle
+insaneracist
+ipaha
+ischmidt20
+ispedals
+iwconfig
+j
+j54vc1bk
+jahudka
+james
+james mike dupont
+jamiejones
+jfogelman
+jhwgh1968
+jjatria
+jnozsc
+joehillen
+jomo
+julien
+jxu
+k3ns1n
+kaspi
+kayb94
+kaz-us
+kebianizao
+kenavera
+kennell
+kidol
+kikuyan
+kinetoskombi
+king-millez
+kitty
+kkalpakloglou
+knagano
+knapior
+kr4ssi
+krichbanana
+kurumigi
+lazypete365
+light94
+lightmare
+linhua55
+lkho
+llyyr
+logon84
+lorpus
+louie-github
+luboss
+luceatnobis
+lyz-code
+m0viefreak
+mahanstreamer
+main()
+makeworld
+marcwebbie
+marieell
+mars67857
+martin54
+mc2avr
+mcd1992
+megustamucho
+mehq
+mexican porn commits
+midas02
+migbac
+minusf
+mjdubell
+mlindner
+motophil
+mpeter50
+mrBliss
+mrkrossxdx
+mrtnmtth
+mtilbury
+mutantmonkey
+mzbaulhaque
+nawl
+nemunaire
+net
+netanel
+neutric
+newtonelectron
+ngld
+niebles
+nikhil
+nixxo
+nmeum
+nmrugg
+nto
+nulloz
+nyorain
+nyuszika7h
+obeythepenguin@gmail.com
+octotherp
+ofkz
+oittaa
+opusforlife2
+oteng
+ouwou
+ovitei
+ozburo
+pachacamac
+patrickslin
+peugeot
+pgaig
+phaer
+phan-ctrl
+phi
+phiresky
+phlip
+ping
+pingtux
+piplongrun
+pishposhmcgee
+plroman
+pukkandan
+pulpe
+pyed
+pypy
+quinlander
+quyleanh
+raleeper
+random-nick
+rawcoder
+reddraggone9
+reiv
+remis
+renalid
+rhhayward
+rhsmachine
+rigstot
+riking
+rmanola
+robbie
+robin
+rr-
+rrooij
+rubicks
+runningbits
+rupertbaxter2
+ruuk
+rzhxeo
+s0u1h
+sahutd
+satunnainen
+sceext
+schn0sch
+schnusch
+scil
+sh!zeeg
+shirt-dev
+sian1468
+sichuan-pepper
+siddharth
+siikamiika
+skacurt
+slangangular
+slocum
+smed79
+snipem
+sofutru
+sourcerect
+sprhawk
+spvkgn
+squibbysquibby
+ssaqua
+stanoarn
+std-move
+stephen
+stepshal
+steven7851
+striker.sh
+supritkumar
+sxvghd
+t0mm0
+tandy1000
+teemuy
+teesid
+telephono
+tempname
+teridon
+testbonn
+tetra-eder
+tewe
+tfvlrue
+thc202
+theGeekPirate
+theychx
+tiktok
+timethrow
+tinybug
+tippfeler
+tlonic
+tlsssl
+tom
+toniz4
+trasssh
+troywith77
+tsantala
+tsia
+u-spec-png
+user
+utlasidyo
+v-delta
+venth
+vijayanand nandam
+vobe
+vordep
+vvto33
+wankerer
+willbeaufoy
+winwon
+wolfy1339
+xantares
+xarantolus
+xavier
+xbe
+xofe
+xtkoba
+xuhaomin
+xypwn
+xyssy
+yac
+yonaikerlol
+z00nx 0
+zackmark29
+zcanfly
+zejn
+zenerdi0de
+zootedb0t
+zouhair
+zraktvor
+zubearc
+zulaport
+zurfyx
+zx8
+Ákos Sülyi
+虾哥哥
+谭九鼎
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 048d988..8d62c04 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -2,6 +2,7 @@ pukkandan (owner)
shirt-dev (collaborator)
coletdjnz/colethedj (collaborator)
Ashish0804 (collaborator)
+nao20010128nao/Lesmiscore (collaborator)
h-h-h-h
pauldubois98
nixxo
@@ -19,7 +20,6 @@ samiksome
alxnull
FelixFrog
Zocker1999NET
-nao20010128nao
kurumigi
bbepis
animelover1984/horahoradev
@@ -125,3 +125,92 @@ jfogelman
timethrow
sarnoud
Bojidarist
+18928172992817182/gustaf
+nixklai
+smplayer-dev
+Zirro
+CrypticSignal
+flashdagger
+fractalf
+frafra
+kaz-us
+ozburo
+rhendric
+sdomi
+selfisekai
+stanoarn
+0xA7404A/Aurora
+4a1e2y5
+aarubui
+chio0hai
+cntrl-s
+Deer-Spangle
+DEvmIb
+Grabien/MaximVol
+j54vc1bk
+mpeter50
+mrpapersonic
+pabs3
+staubichsauger
+xenova
+Yakabuff
+zulaport
+ehoogeveen-medweb
+PilzAdam
+zmousm
+iw0nderhow
+unit193
+TwoThousandHedgehogs/KathrynElrod
+Jertzukka
+cypheron
+Hyeeji
+bwildenhain
+C0D3D3V
+kebianizao
+Lapin0t
+abdullah-if
+DavidSkrundz
+mkubecek
+raleeper
+YuenSzeHong
+Sematre
+jaller94
+r5d
+julien-hadleyjack
+git-anony-mouse
+mdawar
+trassshhub
+foghawk
+k3ns1n
+teridon
+mozlima
+timendum
+ischmidt20
+CreaValix
+sian1468
+arkamar
+hyano
+KiberInfinity
+tejing1
+Bricio
+lazypete365
+Aniruddh-J
+blackgear
+CplPwnies
+cyberfox1691
+FestplattenSchnitzel
+hatienl0i261299
+iphoting
+jakeogh
+lukasfink1
+lyz-code
+marieell
+mdpauley
+Mipsters
+mxmehl
+ofkz
+P-reducible
+pycabbage
+regarten
+Ronnnny
+schn0sch
diff --git a/MANIFEST.in b/MANIFEST.in
index e43cb87..300ae69 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -5,5 +5,6 @@ include README.md
include completions/*/*
include supportedsites.md
include hypervideo.1
+include requirements.txt
recursive-include devscripts *
recursive-include test *
diff --git a/bin/hypervideo b/bin/hypervideo
index baecdeb..baecdeb 100755..100644
--- a/bin/hypervideo
+++ b/bin/hypervideo
diff --git a/completions/zsh/_hypervideo b/completions/zsh/_hypervideo
new file mode 100644
index 0000000..0a8d491
--- /dev/null
+++ b/completions/zsh/_hypervideo
@@ -0,0 +1,30 @@
+#compdef hypervideo
+
+__hypervideo_dl() {
+ local curcontext="$curcontext" fileopts diropts cur prev
+ typeset -A opt_args
+ fileopts="--download-archive|-a|--batch-file|--load-info-json|--load-info|--cookies|--no-cookies"
+ diropts="--cache-dir"
+ cur=$words[CURRENT]
+ case $cur in
+ :)
+ _arguments '*: :(::ytfavorites ::ytrecommended ::ytsubscriptions ::ytwatchlater ::ythistory)'
+ ;;
+ *)
+ prev=$words[CURRENT-1]
+ if [[ ${prev} =~ ${fileopts} ]]; then
+ _path_files
+ elif [[ ${prev} =~ ${diropts} ]]; then
+ _path_files -/
+ elif [[ ${prev} == "--remux-video" ]]; then
+ _arguments '*: :(mp4 mkv)'
+ elif [[ ${prev} == "--recode-video" ]]; then
+ _arguments '*: :(mp4 flv ogg webm mkv)'
+ else
+				_arguments '*: :(--help --version --ignore-errors --no-abort-on-error --abort-on-error --dump-user-agent --list-extractors --extractor-descriptions --force-generic-extractor --default-search --ignore-config --no-config-locations --config-locations --flat-playlist --no-flat-playlist --live-from-start --no-live-from-start --wait-for-video --no-wait-for-video --mark-watched --no-mark-watched --no-colors --compat-options --proxy --socket-timeout --source-address --force-ipv4 --force-ipv6 --geo-verification-proxy --cn-verification-proxy --geo-bypass --no-geo-bypass --geo-bypass-country --geo-bypass-ip-block --playlist-start --playlist-end --playlist-items --match-title --reject-title --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --match-filters --no-match-filter --no-playlist --yes-playlist --age-limit --download-archive --no-download-archive --max-downloads --break-on-existing --break-on-reject --break-per-input --no-break-per-input --skip-playlist-after-errors --include-ads --no-include-ads --concurrent-fragments --limit-rate --throttled-rate --retries --file-access-retries --fragment-retries --skip-unavailable-fragments --abort-on-unavailable-fragment --keep-fragments --no-keep-fragments --buffer-size --resize-buffer --no-resize-buffer --http-chunk-size --test --playlist-reverse --no-playlist-reverse --playlist-random --xattr-set-filesize --hls-prefer-native --hls-prefer-ffmpeg --hls-use-mpegts --no-hls-use-mpegts --downloader --downloader-args --batch-file --no-batch-file --id --paths --output --output-na-placeholder --autonumber-size --autonumber-start --restrict-filenames --no-restrict-filenames --windows-filenames --no-windows-filenames --trim-filenames --no-overwrites --force-overwrites --no-force-overwrites --continue --no-continue --part --no-part --mtime --no-mtime --write-description --no-write-description --write-info-json --no-write-info-json --write-annotations --no-write-annotations --write-playlist-metafiles --no-write-playlist-metafiles --clean-info-json --no-clean-info-json --write-comments --no-write-comments --load-info-json --cookies --no-cookies --cookies-from-browser --no-cookies-from-browser --cache-dir --no-cache-dir --rm-cache-dir --write-thumbnail --no-write-thumbnail --write-all-thumbnails --list-thumbnails --write-link --write-url-link --write-webloc-link --write-desktop-link --quiet --no-warnings --simulate --no-simulate --ignore-no-formats-error --no-ignore-no-formats-error --skip-download --print --print-to-file --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format --dump-json --dump-single-json --print-json --force-write-archive --newline --no-progress --progress --console-title --progress-template --verbose --dump-pages --write-pages --youtube-print-sig-code --print-traffic --call-home --no-call-home --encoding --legacy-server-connect --no-check-certificates --prefer-insecure --user-agent --referer --add-header --bidi-workaround --sleep-requests --sleep-interval --max-sleep-interval --sleep-subtitles --format --format-sort --format-sort-force --no-format-sort-force --video-multistreams --no-video-multistreams --audio-multistreams --no-audio-multistreams --all-formats --prefer-free-formats --no-prefer-free-formats --check-formats --check-all-formats --no-check-formats --list-formats --list-formats-as-table --list-formats-old --merge-output-format --allow-unplayable-formats --no-allow-unplayable-formats --write-subs --no-write-subs --write-auto-subs --no-write-auto-subs --all-subs --list-subs --sub-format --sub-langs --username --password --twofactor --netrc --netrc-location --video-password --ap-mso --ap-username --ap-password --ap-list-mso --extract-audio --audio-format --audio-quality --remux-video --recode-video --postprocessor-args --keep-video --no-keep-video --post-overwrites --no-post-overwrites --embed-subs --no-embed-subs --embed-thumbnail --no-embed-thumbnail --embed-metadata --no-embed-metadata --embed-chapters --no-embed-chapters --embed-info-json --no-embed-info-json --metadata-from-title --parse-metadata --replace-in-metadata --xattrs --concat-playlist --fixup --prefer-avconv --prefer-ffmpeg --ffmpeg-location --exec --no-exec --exec-before-download --no-exec-before-download --convert-subs --convert-thumbnails --split-chapters --no-split-chapters --remove-chapters --no-remove-chapters --force-keyframes-at-cuts --no-force-keyframes-at-cuts --use-postprocessor --sponsorblock-mark --sponsorblock-remove --sponsorblock-chapter-title --no-sponsorblock --sponsorblock-api --sponskrub --no-sponskrub --sponskrub-cut --no-sponskrub-cut --sponskrub-force --no-sponskrub-force --sponskrub-location --sponskrub-args --extractor-retries --allow-dynamic-mpd --ignore-dynamic-mpd --hls-split-discontinuity --no-hls-split-discontinuity --extractor-args --youtube-include-dash-manifest --youtube-skip-dash-manifest --youtube-include-hls-manifest --youtube-skip-hls-manifest)'
+ fi
+ ;;
+ esac
+}
+
+__hypervideo_dl
\ No newline at end of file
diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py
index 7a38e40..1e22620 100644
--- a/devscripts/make_lazy_extractors.py
+++ b/devscripts/make_lazy_extractors.py
@@ -9,7 +9,7 @@ import sys
sys.path.insert(0, dirn(dirn((os.path.abspath(__file__)))))
-lazy_extractors_filename = sys.argv[1]
+lazy_extractors_filename = sys.argv[1] if len(sys.argv) > 1 else 'hypervideo_dl/extractor/lazy_extractors.py'
if os.path.exists(lazy_extractors_filename):
os.remove(lazy_extractors_filename)
@@ -39,12 +39,6 @@ class {name}({bases}):
_module = '{module}'
'''
-make_valid_template = '''
- @classmethod
- def _make_valid_url(cls):
- return {valid_url!r}
-'''
-
def get_base_name(base):
if base is InfoExtractor:
@@ -61,15 +55,14 @@ def build_lazy_ie(ie, name):
bases=', '.join(map(get_base_name, ie.__bases__)),
module=ie.__module__)
valid_url = getattr(ie, '_VALID_URL', None)
+ if not valid_url and hasattr(ie, '_make_valid_url'):
+ valid_url = ie._make_valid_url()
if valid_url:
s += f' _VALID_URL = {valid_url!r}\n'
if not ie._WORKING:
s += ' _WORKING = False\n'
if ie.suitable.__func__ is not InfoExtractor.suitable.__func__:
s += f'\n{getsource(ie.suitable)}'
- if hasattr(ie, '_make_valid_url'):
- # search extractors
- s += make_valid_template.format(valid_url=ie._make_valid_url())
return s
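
The hunk above changes how search extractors land in the generated lazy-extractor file: rather than emitting a _make_valid_url classmethod into the output (the removed make_valid_template), the build script now calls _make_valid_url() once at generation time and bakes the result into a plain _VALID_URL attribute. A minimal sketch of the new resolution step, using a stand-in class (the class name and its regex are illustrative, not taken from the codebase):

    # Stand-in for a search extractor: no static _VALID_URL, but it can
    # build one from its search key.
    class FakeSearchIE:
        _SEARCH_KEY = 'fakesearch'

        @classmethod
        def _make_valid_url(cls):
            # hypothetical pattern, modelled on how search extractors work
            return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    # Build-time resolution, as in the patch: compute the pattern once...
    valid_url = getattr(FakeSearchIE, '_VALID_URL', None)
    if not valid_url and hasattr(FakeSearchIE, '_make_valid_url'):
        valid_url = FakeSearchIE._make_valid_url()

    # ...and emit it as a plain attribute instead of a classmethod.
    if valid_url:
        print(f'    _VALID_URL = {valid_url!r}')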
diff --git a/devscripts/make_supportedsites.py b/devscripts/make_supportedsites.py
index a079406..9bce04b 100644
--- a/devscripts/make_supportedsites.py
+++ b/devscripts/make_supportedsites.py
@@ -24,11 +24,13 @@ def main():
def gen_ies_md(ies):
for ie in ies:
ie_md = '**{0}**'.format(ie.IE_NAME)
- ie_desc = getattr(ie, 'IE_DESC', None)
- if ie_desc is False:
+ if ie.IE_DESC is False:
continue
- if ie_desc is not None:
+ if ie.IE_DESC is not None:
ie_md += ': {0}'.format(ie.IE_DESC)
+ search_key = getattr(ie, 'SEARCH_KEY', None)
+ if search_key is not None:
+ ie_md += f'; "{ie.SEARCH_KEY}:" prefix'
if not ie.working():
ie_md += ' (Currently broken)'
yield ie_md
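
The added lines make the supported-sites generator read IE_DESC straight off the class and, new in this patch, advertise a search extractor's prefix. A small sketch of the updated generator run against a stand-in extractor (all names below are invented for illustration):

    class DummySearchIE:
        IE_NAME = 'dummy:search'
        IE_DESC = 'Dummy video search'
        SEARCH_KEY = 'dummysearch'

        @classmethod
        def working(cls):
            return True

    def gen_ies_md(ies):
        for ie in ies:
            ie_md = '**{0}**'.format(ie.IE_NAME)
            if ie.IE_DESC is False:  # extractor opts out of the listing
                continue
            if ie.IE_DESC is not None:
                ie_md += ': {0}'.format(ie.IE_DESC)
            search_key = getattr(ie, 'SEARCH_KEY', None)
            if search_key is not None:  # new: show the search prefix
                ie_md += f'; "{ie.SEARCH_KEY}:" prefix'
            if not ie.working():
                ie_md += ' (Currently broken)'
            yield ie_md

    print(next(gen_ies_md([DummySearchIE])))
    # **dummy:search**: Dummy video search; "dummysearch:" prefix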
diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py
index 58090d4..8920df1 100644
--- a/devscripts/prepare_manpage.py
+++ b/devscripts/prepare_manpage.py
@@ -13,12 +13,14 @@ PREFIX = r'''%HYPERVIDEO(1)
# NAME
-youtube\-dl \- download videos from youtube.com or other video platforms
+yt\-dlp \- A youtube-dl fork with additional features and patches
# SYNOPSIS
**hypervideo** \[OPTIONS\] URL [URL...]
+# DESCRIPTION
+
'''
@@ -33,47 +35,63 @@ def main():
with io.open(README_FILE, encoding='utf-8') as f:
readme = f.read()
- readme = re.sub(r'(?s)^.*?(?=# DESCRIPTION)', '', readme)
- readme = re.sub(r'\s+hypervideo \[OPTIONS\] URL \[URL\.\.\.\]', '', readme)
- readme = PREFIX + readme
-
+ readme = filter_excluded_sections(readme)
+ readme = move_sections(readme)
readme = filter_options(readme)
with io.open(outfile, 'w', encoding='utf-8') as outf:
- outf.write(readme)
+ outf.write(PREFIX + readme)
+
+
+def filter_excluded_sections(readme):
+ EXCLUDED_SECTION_BEGIN_STRING = re.escape('<!-- MANPAGE: BEGIN EXCLUDED SECTION -->')
+ EXCLUDED_SECTION_END_STRING = re.escape('<!-- MANPAGE: END EXCLUDED SECTION -->')
+ return re.sub(
+ rf'(?s){EXCLUDED_SECTION_BEGIN_STRING}.+?{EXCLUDED_SECTION_END_STRING}\n',
+ '', readme)
+
+
+def move_sections(readme):
+ MOVE_TAG_TEMPLATE = '<!-- MANPAGE: MOVE "%s" SECTION HERE -->'
+ sections = re.findall(r'(?m)^%s$' % (
+ re.escape(MOVE_TAG_TEMPLATE).replace(r'\%', '%') % '(.+)'), readme)
+
+ for section_name in sections:
+ move_tag = MOVE_TAG_TEMPLATE % section_name
+ if readme.count(move_tag) > 1:
+ raise Exception(f'There is more than one occurrence of "{move_tag}". This is unexpected')
+
+ sections = re.findall(rf'(?sm)(^# {re.escape(section_name)}.+?)(?=^# )', readme)
+ if len(sections) < 1:
+ raise Exception(f'The section {section_name} does not exist')
+ elif len(sections) > 1:
+ raise Exception(f'There are multiple occurrences of section {section_name}, this is unhandled')
+
+ readme = readme.replace(sections[0], '', 1).replace(move_tag, sections[0], 1)
+ return readme
def filter_options(readme):
- ret = ''
- in_options = False
- for line in readme.split('\n'):
- if line.startswith('# '):
- if line[2:].startswith('OPTIONS'):
- in_options = True
- else:
- in_options = False
-
- if in_options:
- if line.lstrip().startswith('-'):
- split = re.split(r'\s{2,}', line.lstrip())
- # Description string may start with `-` as well. If there is
- # only one piece then it's a description bit not an option.
- if len(split) > 1:
- option, description = split
- split_option = option.split(' ')
-
- if not split_option[-1].startswith('-'): # metavar
- option = ' '.join(split_option[:-1] + ['*%s*' % split_option[-1]])
-
- # Pandoc's definition_lists. See http://pandoc.org/README.html
- # for more information.
- ret += '\n%s\n: %s\n' % (option, description)
- continue
- ret += line.lstrip() + '\n'
- else:
- ret += line + '\n'
-
- return ret
+ section = re.search(r'(?sm)^# USAGE AND OPTIONS\n.+?(?=^# )', readme).group(0)
+ options = '# OPTIONS\n'
+ for line in section.split('\n')[1:]:
+ mobj = re.fullmatch(r'''(?x)
+ \s{4}(?P<opt>-(?:,\s|[^\s])+)
+ (?:\s(?P<meta>(?:[^\s]|\s(?!\s))+))?
+ (\s{2,}(?P<desc>.+))?
+ ''', line)
+ if not mobj:
+ options += f'{line.lstrip()}\n'
+ continue
+ option, metavar, description = mobj.group('opt', 'meta', 'desc')
+
+ # Pandoc's definition_lists. See http://pandoc.org/README.html
+ option = f'{option} *{metavar}*' if metavar else option
+ description = f'{description}\n' if description else ''
+ options += f'\n{option}\n: {description}'
+ continue
+
+ return readme.replace(section, options, 1)
if __name__ == '__main__':
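
The rewritten filter_options drops the old line-by-line state machine in favour of one re.fullmatch per line of the USAGE AND OPTIONS section. A quick demonstration of what the regex extracts from a single option line (the sample line mimics the README's layout and is illustrative only):

    import re

    line = '    -f, --format FORMAT              Video format code'
    mobj = re.fullmatch(r'''(?x)
        \s{4}(?P<opt>-(?:,\s|[^\s])+)
        (?:\s(?P<meta>(?:[^\s]|\s(?!\s))+))?
        (\s{2,}(?P<desc>.+))?
    ''', line)
    option, metavar, description = mobj.group('opt', 'meta', 'desc')

    # Pandoc definition-list rendering, exactly as in the patch
    option = f'{option} *{metavar}*' if metavar else option
    description = f'{description}\n' if description else ''
    print(f'\n{option}\n: {description}')
    # prints:
    # -f, --format *FORMAT*
    # : Video format code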
diff --git a/hypervideo_dl/YoutubeDL.py b/hypervideo_dl/YoutubeDL.py
index 5b5a0d7..276f42d 100755..100644
--- a/hypervideo_dl/YoutubeDL.py
+++ b/hypervideo_dl/YoutubeDL.py
@@ -5,7 +5,6 @@ from __future__ import absolute_import, unicode_literals
import collections
import contextlib
-import copy
import datetime
import errno
import fileinput
@@ -28,10 +27,12 @@ import traceback
import random
import unicodedata
+from enum import Enum
from string import ascii_letters
from .compat import (
compat_basestring,
+ compat_brotli,
compat_get_terminal_size,
compat_kwargs,
compat_numeric_types,
@@ -55,9 +56,7 @@ from .utils import (
DEFAULT_OUTTMPL,
determine_ext,
determine_protocol,
- DOT_DESKTOP_LINK_TEMPLATE,
- DOT_URL_LINK_TEMPLATE,
- DOT_WEBLOC_LINK_TEMPLATE,
+ DownloadCancelled,
DownloadError,
encode_compat_str,
encodeFilename,
@@ -66,33 +65,46 @@ from .utils import (
ExistingVideoReached,
expand_path,
ExtractorError,
+ filter_dict,
float_or_none,
format_bytes,
format_field,
+ format_decimal_suffix,
formatSeconds,
GeoRestrictedError,
+ get_domain,
+ has_certifi,
HEADRequest,
+ InAdvancePagedList,
int_or_none,
iri_to_uri,
ISO3166Utils,
+ join_nonempty,
LazyList,
+ LINK_TEMPLATES,
locked_file,
make_dir,
make_HTTPS_handler,
MaxDownloadsReached,
+ merge_headers,
network_exceptions,
+ NO_DEFAULT,
+ number_of_digits,
orderedSet,
OUTTMPL_TYPES,
PagedList,
parse_filesize,
PerRequestProxyHandler,
platform_name,
+ Popen,
+ POSTPROCESS_WHEN,
PostProcessingError,
preferredencoding,
prepend_extension,
- process_communicate_or_kill,
+ ReExtractInfo,
register_socks_protocols,
RejectedVideoReached,
+ remove_terminal_sequences,
render_table,
replace_extension,
SameFileError,
@@ -107,8 +119,7 @@ from .utils import (
strftime_or_none,
subtitles_filename,
supports_terminal_sequences,
- TERMINAL_SEQUENCES,
- ThrottledDownload,
+ timetuple_from_msec,
to_high_limit_path,
traverse_obj,
try_get,
@@ -123,6 +134,7 @@ from .utils import (
YoutubeDLRedirectHandler,
)
from .cache import Cache
+from .minicurses import format_text
from .extractor import (
gen_extractor_classes,
get_info_extractor,
@@ -139,6 +151,7 @@ from .downloader.rtmp import rtmpdump_version
from .postprocessor import (
get_postprocessor,
EmbedThumbnailPP,
+ FFmpegFixupDuplicateMoovPP,
FFmpegFixupDurationPP,
FFmpegFixupM3u8PP,
FFmpegFixupM4aPP,
@@ -192,7 +205,12 @@ class YoutubeDL(object):
verbose: Print additional info to stdout.
quiet: Do not print messages to stdout.
no_warnings: Do not print out anything for warnings.
- forceprint: A list of templates to force print
+ forceprint: A dict with keys WHEN mapped to a list of templates to
+ print to stdout. The allowed keys are video or any of the
+ items in utils.POSTPROCESS_WHEN.
+ For compatibility, a single list is also accepted
+ print_to_file: A dict with keys WHEN (same as forceprint) mapped to
+ a list of tuples with (template, filename)
forceurl: Force printing final URL. (Deprecated)
forcetitle: Force printing title. (Deprecated)
forceid: Force printing ID. (Deprecated)
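
Per the updated docstring, forceprint is now keyed by the stage at which printing happens instead of being a flat list of templates. A hedged usage sketch (the templates and filename below are examples, not defaults):

    ydl_opts = {
        # dict form: WHEN -> list of output templates
        'forceprint': {'video': ['%(title)s - %(id)s']},
        # print_to_file: WHEN -> list of (template, filename) tuples
        'print_to_file': {'after_move': [('%(title)s', 'downloaded.txt')]},
    }

    # For compatibility, the old flat-list form is still accepted:
    legacy_opts = {'forceprint': ['%(title)s - %(id)s']}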
@@ -208,20 +226,26 @@ class YoutubeDL(object):
simulate: Do not download the video files. If unset (or None),
simulate only if listsubtitles, listformats or list_thumbnails is used
format: Video format code. see "FORMAT SELECTION" for more details.
+ You can also pass a function. The function takes 'ctx' as
+ argument and returns the formats to download.
+ See "build_format_selector" for an implementation
allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded.
ignore_no_formats_error: Ignore "No video formats" error. Useful for
extracting metadata even if the video is not actually
available for download (experimental)
- format_sort: How to sort the video formats. see "Sorting Formats"
- for more details.
+ format_sort: A list of fields by which to sort the video formats.
+ See "Sorting Formats" for more details.
format_sort_force: Force the given format_sort. see "Sorting Formats"
for more details.
+ prefer_free_formats: Whether to prefer video formats with free containers
+ over non-free ones of same quality.
allow_multiple_video_streams: Allow multiple video streams to be merged
into a single file
allow_multiple_audio_streams: Allow multiple audio streams to be merged
into a single file
check_formats Whether to test if the formats are downloadable.
- Can be True (check all), False (check none)
+ Can be True (check all), False (check none),
+ 'selected' (check selected formats),
or None (check only if requested by extractor)
paths: Dictionary of output paths. The allowed keys are 'home'
'temp' and the keys of OUTTMPL_TYPES (in utils.py)
@@ -303,13 +327,18 @@ class YoutubeDL(object):
file that is in the archive.
break_on_reject: Stop the download process when encountering a video that
has been filtered out.
+ break_per_url: Whether break_on_reject and break_on_existing
+ should act on each input URL as opposed to for the entire queue
cookiefile: File name where cookies should be read from and dumped to
- cookiesfrombrowser: A tuple containing the name of the browser and the profile
- name/path from where cookies are loaded.
- Eg: ('chrome', ) or (vivaldi, 'default')
- nocheckcertificate:Do not verify SSL certificates
+ cookiesfrombrowser: A tuple containing the name of the browser, the profile
+                       name/path from where cookies are loaded, and the name of the
+ keyring. Eg: ('chrome', ) or ('vivaldi', 'default', 'BASICTEXT')
+ legacyserverconnect: Explicitly allow HTTPS connection to servers that do not
+ support RFC 5746 secure renegotiation
+ nocheckcertificate: Do not verify SSL certificates
prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
At the moment, this is only supported by YouTube.
+ http_headers: A dictionary of custom headers to be used for all requests
proxy: URL of the proxy server to use
geo_verification_proxy: URL of the proxy to use for IP address verification
on geo-restricted sites.
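
The cookiesfrombrowser tuple documented above gains an optional third element naming the keyring used to decrypt the browser's cookie store. A short sketch of the accepted shapes, reusing the docstring's own example values:

    # browser, profile name/path, and keyring; trailing elements optional
    ydl_opts = {'cookiesfrombrowser': ('vivaldi', 'default', 'BASICTEXT')}

    # shortest form: just the browser, using its default profile
    ydl_opts = {'cookiesfrombrowser': ('chrome',)}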
@@ -317,18 +346,21 @@ class YoutubeDL(object):
bidi_workaround: Work around buggy terminals without bidirectional text
support, using fribidi
debug_printtraffic:Print out sent and received HTTP traffic
- include_ads: Download ads as well
+ include_ads: Download ads as well (deprecated)
default_search: Prepend this string if an input url is not valid.
'auto' for elaborate guessing
encoding: Use this encoding instead of the system-specified.
extract_flat: Do not resolve URLs, return the immediate result.
Pass in 'in_playlist' to only show this behavior for
playlist items.
+ wait_for_video: If given, wait for scheduled streams to become available.
+ The value should be a tuple containing the range
+ (min_secs, max_secs) to wait between retries
postprocessors: A list of dictionaries, each with an entry
* key: The name of the postprocessor. See
hypervideo_dl/postprocessor/__init__.py for a list.
- * when: When to run the postprocessor. Can be one of
- pre_process|before_dl|post_process|after_move.
+ * when: When to run the postprocessor. Allowed values are
+ the entries of utils.POSTPROCESS_WHEN
Assumed to be 'post_process' if not given
post_hooks: Deprecated - Register a custom postprocessor instead
A list of functions that get called as the final step
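
For instance, wait_for_video and a postprocessor entry with 'when' could be
passed as follows (FFmpegMetadata is one of the built-in postprocessor keys;
the values are illustrative):

    params = {
        'wait_for_video': (60, 600),  # wait 1-10 minutes between retries
        'postprocessors': [{
            'key': 'FFmpegMetadata',
            'when': 'post_process',   # any entry of utils.POSTPROCESS_WHEN
        }],
    }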
@@ -370,8 +402,7 @@ class YoutubeDL(object):
(with status "started" and "finished") if the processing is successful.
merge_output_format: Extension to use when merging formats.
final_ext: Expected final extension; used to detect when the file was
- already downloaded and converted. "merge_output_format" is
- replaced by this extension when given
+ already downloaded and converted
fixup: Automatically correct known faults of the file.
One of:
- "never": do nothing
@@ -425,7 +456,7 @@ class YoutubeDL(object):
compat_opts: Compatibility options. See "Differences in default behavior".
The following options do not work when used through the API:
filename, abort-on-error, multistreams, no-live-chat, format-sort
- no-clean-infojson, no-playlist-metafiles, no-keep-subs.
+ no-clean-infojson, no-playlist-metafiles, no-keep-subs, no-attach-info-json.
Refer __init__.py for their implementation
progress_template: Dictionary of templates for progress outputs.
Allowed keys are 'download', 'postprocess',
@@ -435,9 +466,9 @@ class YoutubeDL(object):
    The following parameters are not used by YoutubeDL itself; they are used by
the downloader (see hypervideo_dl/downloader/common.py):
nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
- max_filesize, test, noresizebuffer, retries, fragment_retries, continuedl,
- noprogress, xattr_set_filesize, hls_use_mpegts, http_chunk_size,
- external_downloader_args.
+ max_filesize, test, noresizebuffer, retries, file_access_retries, fragment_retries,
+ continuedl, noprogress, xattr_set_filesize, hls_use_mpegts, http_chunk_size,
+ external_downloader_args, concurrent_fragment_downloads.
The following options are used by the post processors:
prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available,
@@ -460,6 +491,7 @@ class YoutubeDL(object):
extractor_args: A dictionary of arguments to be passed to the extractors.
See "EXTRACTOR ARGUMENTS" for details.
Eg: {'youtube': {'skip': ['dash', 'hls']}}
+ mark_watched: Mark videos watched (even with --simulate). Only for YouTube
youtube_include_dash_manifest: Deprecated - Use extractor_args instead.
If True (default), DASH manifests and related
data will be downloaded and processed by extractor.
@@ -482,33 +514,33 @@ class YoutubeDL(object):
'track_number', 'disc_number', 'release_year',
))
+ _format_fields = {
+ # NB: Keep in sync with the docstring of extractor/common.py
+ 'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note',
+ 'width', 'height', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr',
+ 'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx',
+ 'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start',
+ 'preference', 'language', 'language_preference', 'quality', 'source_preference',
+ 'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'downloader_options',
+ 'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time'
+ }
_format_selection_exts = {
'audio': {'m4a', 'mp3', 'ogg', 'aac'},
'video': {'mp4', 'flv', 'webm', '3gp'},
'storyboards': {'mhtml'},
}
- params = None
- _ies = {}
- _pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
- _printed_messages = set()
- _first_webpage_request = True
- _download_retcode = None
- _num_downloads = None
- _playlist_level = 0
- _playlist_urls = set()
- _screen_file = None
-
def __init__(self, params=None, auto_init=True):
"""Create a FileDownloader object with the given options.
@param auto_init Whether to load the default extractors and print header (if verbose).
- Set to 'no_verbose_header' to not ptint the header
+ Set to 'no_verbose_header' to not print the header
"""
if params is None:
params = {}
+ self.params = params
self._ies = {}
self._ies_instances = {}
- self._pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
+ self._pps = {k: [] for k in POSTPROCESS_WHEN}
self._printed_messages = set()
self._first_webpage_request = True
self._post_hooks = []
@@ -516,14 +548,23 @@ class YoutubeDL(object):
self._postprocessor_hooks = []
self._download_retcode = 0
self._num_downloads = 0
- self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
- self._err_file = sys.stderr
- self.params = params
+ self._num_videos = 0
+ self._playlist_level = 0
+ self._playlist_urls = set()
self.cache = Cache(self)
windows_enable_vt_mode()
- # FIXME: This will break if we ever print color to stdout
- self.params['no_color'] = self.params.get('no_color') or not supports_terminal_sequences(self._err_file)
+ self._out_files = {
+ 'error': sys.stderr,
+ 'print': sys.stderr if self.params.get('logtostderr') else sys.stdout,
+ 'console': None if compat_os_name == 'nt' else next(
+ filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None)
+ }
+ self._out_files['screen'] = sys.stderr if self.params.get('quiet') else self._out_files['print']
+ self._allow_colors = {
+ type_: not self.params.get('no_color') and supports_terminal_sequences(self._out_files[type_])
+ for type_ in ('screen', 'error')
+ }
if sys.version_info < (3, 6):
self.report_warning(
@@ -531,10 +572,10 @@ class YoutubeDL(object):
if self.params.get('allow_unplayable_formats'):
self.report_warning(
- f'You have asked for {self._color_text("unplayable formats", "blue")} to be listed/downloaded. '
+ f'You have asked for {self._format_err("UNPLAYABLE", self.Styles.EMPHASIS)} formats to be listed/downloaded. '
'This is a developer option intended for debugging. \n'
' If you experience any issues while using this option, '
- f'{self._color_text("DO NOT", "red")} open a bug report')
+ f'{self._format_err("DO NOT", self.Styles.ERROR)} open a bug report')
def check_deprecated(param, option, suggestion):
if self.params.get(param) is not None:
@@ -550,8 +591,13 @@ class YoutubeDL(object):
check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"')
- for msg in self.params.get('warnings', []):
+ for msg in self.params.get('_warnings', []):
self.report_warning(msg)
+ for msg in self.params.get('_deprecation_warnings', []):
+ self.deprecation_warning(msg)
+
+ if 'list-formats' in self.params.get('compat_opts', []):
+ self.params['listformats_table'] = False
if 'overwrites' not in self.params and self.params.get('nooverwrites') is not None:
# nooverwrites was unnecessarily changed to overwrites
@@ -563,7 +609,14 @@ class YoutubeDL(object):
else:
self.params['nooverwrites'] = not self.params['overwrites']
- if params.get('bidi_workaround', False):
+ self.params.setdefault('forceprint', {})
+ self.params.setdefault('print_to_file', {})
+
+ # Compatibility with older syntax
+ if not isinstance(params['forceprint'], dict):
+ self.params['forceprint'] = {'video': params['forceprint']}
+
+ if self.params.get('bidi_workaround', False):
try:
import pty
master, slave = pty.openpty()
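
The compatibility shim above wraps the older list form of forceprint into the
new per-type dict, roughly (illustrative values):

    YoutubeDL({'forceprint': ['%(title)s']})
    # is now equivalent to
    YoutubeDL({'forceprint': {'video': ['%(title)s']}})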
@@ -575,24 +628,23 @@ class YoutubeDL(object):
sp_kwargs = dict(
stdin=subprocess.PIPE,
stdout=slave,
- stderr=self._err_file)
+ stderr=self._out_files['error'])
try:
- self._output_process = subprocess.Popen(
- ['bidiv'] + width_args, **sp_kwargs
- )
+ self._output_process = Popen(['bidiv'] + width_args, **sp_kwargs)
except OSError:
- self._output_process = subprocess.Popen(
- ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
+ self._output_process = Popen(['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
self._output_channel = os.fdopen(master, 'rb')
except OSError as ose:
if ose.errno == errno.ENOENT:
- self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
+ self.report_warning(
+ 'Could not find fribidi executable, ignoring --bidi-workaround. '
+ 'Make sure that fribidi is an executable file in one of the directories in your $PATH.')
else:
raise
if (sys.platform != 'win32'
and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
- and not params.get('restrictfilenames', False)):
+ and not self.params.get('restrictfilenames', False)):
# Unicode filesystem API will throw errors (#1474, #13027)
self.report_warning(
'Assuming --restrict-filenames since file system encoding '
@@ -604,9 +656,13 @@ class YoutubeDL(object):
# Creating format selector here allows us to catch syntax errors before the extraction
self.format_selector = (
- None if self.params.get('format') is None
+ self.params.get('format') if self.params.get('format') in (None, '-')
+ else self.params['format'] if callable(self.params['format'])
else self.build_format_selector(self.params['format']))
+ # Set http_headers defaults according to std_headers
+ self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {}))
+
self._setup_opener()
if auto_init:
@@ -614,18 +670,21 @@ class YoutubeDL(object):
self.print_debug_header()
self.add_default_info_extractors()
+ hooks = {
+ 'post_hooks': self.add_post_hook,
+ 'progress_hooks': self.add_progress_hook,
+ 'postprocessor_hooks': self.add_postprocessor_hook,
+ }
+ for opt, fn in hooks.items():
+ for ph in self.params.get(opt, []):
+ fn(ph)
+
for pp_def_raw in self.params.get('postprocessors', []):
pp_def = dict(pp_def_raw)
when = pp_def.pop('when', 'post_process')
- pp_class = get_postprocessor(pp_def.pop('key'))
- pp = pp_class(self, **compat_kwargs(pp_def))
- self.add_post_processor(pp, when=when)
-
- for ph in self.params.get('post_hooks', []):
- self.add_post_hook(ph)
-
- for ph in self.params.get('progress_hooks', []):
- self.add_progress_hook(ph)
+ self.add_post_processor(
+ get_postprocessor(pp_def.pop('key'))(self, **compat_kwargs(pp_def)),
+ when=when)
register_socks_protocols()
@@ -633,7 +692,7 @@ class YoutubeDL(object):
"""Preload the archive, if any is specified"""
if fn is None:
return False
- self.write_debug('Loading archive file %r\n' % fn)
+ self.write_debug(f'Loading archive file {fn!r}')
try:
with locked_file(fn, 'r', encoding='utf-8') as archive_file:
for line in archive_file:
@@ -660,7 +719,7 @@ class YoutubeDL(object):
)
self.report_warning(
'Long argument string detected. '
- 'Use -- to separate parameters and URLs, like this:\n%s\n' %
+ 'Use -- to separate parameters and URLs, like this:\n%s' %
args_to_str(correct_argv))
def add_info_extractor(self, ie):
@@ -713,6 +772,9 @@ class YoutubeDL(object):
def add_postprocessor_hook(self, ph):
"""Add the postprocessing progress hook"""
self._postprocessor_hooks.append(ph)
+ for pps in self._pps.values():
+ for pp in pps:
+ pp.add_progress_hook(ph)
def _bidi_workaround(self, message):
if not hasattr(self, '_output_channel'):
@@ -734,14 +796,24 @@ class YoutubeDL(object):
self._printed_messages.add(message)
write_string(message, out=out, encoding=self.params.get('encoding'))
- def to_stdout(self, message, skip_eol=False, quiet=False):
+ def to_stdout(self, message, skip_eol=False, quiet=None):
"""Print message to stdout"""
+ if quiet is not None:
+ self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument quiet. Use "YoutubeDL.to_screen" instead')
+ self._write_string(
+ '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')),
+ self._out_files['print'])
+
+ def to_screen(self, message, skip_eol=False, quiet=None):
+ """Print message to screen if not in quiet mode"""
if self.params.get('logger'):
self.params['logger'].debug(message)
- elif not quiet or self.params.get('verbose'):
- self._write_string(
- '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')),
- self._err_file if quiet else self._screen_file)
+ return
+ if (self.params.get('quiet') if quiet is None else quiet) and not self.params.get('verbose'):
+ return
+ self._write_string(
+ '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')),
+ self._out_files['screen'])
def to_stderr(self, message, only_once=False):
"""Print message to stderr"""
@@ -749,36 +821,34 @@ class YoutubeDL(object):
if self.params.get('logger'):
self.params['logger'].error(message)
else:
- self._write_string('%s\n' % self._bidi_workaround(message), self._err_file, only_once=only_once)
+ self._write_string('%s\n' % self._bidi_workaround(message), self._out_files['error'], only_once=only_once)
+
+ def _send_console_code(self, code):
+ if compat_os_name == 'nt' or not self._out_files['console']:
+ return
+ self._write_string(code, self._out_files['console'])
def to_console_title(self, message):
if not self.params.get('consoletitle', False):
return
+ message = remove_terminal_sequences(message)
if compat_os_name == 'nt':
if ctypes.windll.kernel32.GetConsoleWindow():
# c_wchar_p() might not be necessary if `message` is
# already of type unicode()
ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
- elif 'TERM' in os.environ:
- self._write_string('\033]0;%s\007' % message, self._screen_file)
+ else:
+ self._send_console_code(f'\033]0;{message}\007')
def save_console_title(self):
- if not self.params.get('consoletitle', False):
- return
- if self.params.get('simulate'):
+ if not self.params.get('consoletitle') or self.params.get('simulate'):
return
- if compat_os_name != 'nt' and 'TERM' in os.environ:
- # Save the title on stack
- self._write_string('\033[22;0t', self._screen_file)
+ self._send_console_code('\033[22;0t') # Save the title on stack
def restore_console_title(self):
- if not self.params.get('consoletitle', False):
- return
- if self.params.get('simulate'):
+ if not self.params.get('consoletitle') or self.params.get('simulate'):
return
- if compat_os_name != 'nt' and 'TERM' in os.environ:
- # Restore the title from stack
- self._write_string('\033[23;0t', self._screen_file)
+ self._send_console_code('\033[23;0t') # Restore the title from stack
def __enter__(self):
self.save_console_title()
@@ -790,14 +860,15 @@ class YoutubeDL(object):
if self.params.get('cookiefile') is not None:
self.cookiejar.save(ignore_discard=True, ignore_expires=True)
- def trouble(self, message=None, tb=None):
+ def trouble(self, message=None, tb=None, is_error=True):
"""Determine action to take when a download problem appears.
Depending on if the downloader has been configured to ignore
download errors or not, this method may throw an exception or
not when errors are found, after printing the message.
- tb, if given, is additional traceback information.
+ @param tb If given, is additional traceback information
+        @param is_error Whether to raise an error according to 'ignoreerrors'
"""
if message is not None:
self.to_stderr(message)
@@ -813,6 +884,8 @@ class YoutubeDL(object):
tb = ''.join(tb_data)
if tb:
self.to_stderr(tb)
+ if not is_error:
+ return
if not self.params.get('ignoreerrors'):
if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
exc_info = sys.exc_info()[1].exc_info
@@ -821,15 +894,34 @@ class YoutubeDL(object):
raise DownloadError(message, exc_info)
self._download_retcode = 1
- def to_screen(self, message, skip_eol=False):
- """Print message to stdout if not in quiet mode"""
- self.to_stdout(
- message, skip_eol, quiet=self.params.get('quiet', False))
-
- def _color_text(self, text, color):
- if self.params.get('no_color'):
- return text
- return f'{TERMINAL_SEQUENCES[color.upper()]}{text}{TERMINAL_SEQUENCES["RESET_STYLE"]}'
+ class Styles(Enum):
+ HEADERS = 'yellow'
+ EMPHASIS = 'light blue'
+ ID = 'green'
+ DELIM = 'blue'
+ ERROR = 'red'
+ WARNING = 'yellow'
+ SUPPRESS = 'light black'
+
+ def _format_text(self, handle, allow_colors, text, f, fallback=None, *, test_encoding=False):
+ if test_encoding:
+ original_text = text
+ # handle.encoding can be None. See https://github.com/hypervideo/hypervideo/issues/2711
+ encoding = self.params.get('encoding') or getattr(handle, 'encoding', None) or 'ascii'
+ text = text.encode(encoding, 'ignore').decode(encoding)
+ if fallback is not None and text != original_text:
+ text = fallback
+ if isinstance(f, self.Styles):
+ f = f.value
+ return format_text(text, f) if allow_colors else text if fallback is None else fallback
+
+ def _format_screen(self, *args, **kwargs):
+ return self._format_text(
+ self._out_files['screen'], self._allow_colors['screen'], *args, **kwargs)
+
+ def _format_err(self, *args, **kwargs):
+ return self._format_text(
+ self._out_files['error'], self._allow_colors['error'], *args, **kwargs)
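
A hypothetical call site inside the class, combining the styles and helpers
defined above:

    self.to_screen(self._format_screen('[info] Extracting', self.Styles.EMPHASIS))
    self.to_stderr(self._format_err('ERROR: something went wrong', self.Styles.ERROR))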
def report_warning(self, message, only_once=False):
'''
@@ -841,14 +933,20 @@ class YoutubeDL(object):
else:
if self.params.get('no_warnings'):
return
- self.to_stderr(f'{self._color_text("WARNING:", "yellow")} {message}', only_once)
+ self.to_stderr(f'{self._format_err("WARNING:", self.Styles.WARNING)} {message}', only_once)
+
+ def deprecation_warning(self, message):
+ if self.params.get('logger') is not None:
+ self.params['logger'].warning(f'DeprecationWarning: {message}')
+ else:
+ self.to_stderr(f'{self._format_err("DeprecationWarning:", self.Styles.ERROR)} {message}', True)
- def report_error(self, message, tb=None):
+ def report_error(self, message, *args, **kwargs):
'''
        Does the same as trouble, but prefixes the message with 'ERROR:', colored
in red if stderr is a tty file.
'''
- self.trouble(f'{self._color_text("ERROR:", "red")} {message}', tb)
+ self.trouble(f'{self._format_err("ERROR:", self.Styles.ERROR)} {message}', *args, **kwargs)
def write_debug(self, message, only_once=False):
'''Log debug message or Print message to stderr'''
@@ -874,13 +972,13 @@ class YoutubeDL(object):
except UnicodeEncodeError:
self.to_screen('Deleting existing file')
- def raise_no_formats(self, info, forced=False):
+ def raise_no_formats(self, info, forced=False, *, msg=None):
has_drm = info.get('__has_drm')
- msg = 'This video is DRM protected' if has_drm else 'No video formats found!'
- expected = self.params.get('ignore_no_formats_error')
- if forced or not expected:
+ ignored, expected = self.params.get('ignore_no_formats_error'), bool(msg)
+ msg = msg or has_drm and 'This video is DRM protected' or 'No video formats found!'
+ if forced or not ignored:
raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'],
- expected=has_drm or expected)
+ expected=has_drm or ignored or expected)
else:
self.report_warning(msg)
@@ -945,7 +1043,7 @@ class YoutubeDL(object):
def validate_outtmpl(cls, outtmpl):
''' @return None or Exception object '''
outtmpl = re.sub(
- STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBU]'),
+ STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBUDS]'),
lambda mobj: f'{mobj.group(0)[:-1]}s',
cls._outtmpl_expandpath(outtmpl))
try:
@@ -957,12 +1055,15 @@ class YoutubeDL(object):
@staticmethod
def _copy_infodict(info_dict):
info_dict = dict(info_dict)
- for key in ('__original_infodict', '__postprocessors'):
- info_dict.pop(key, None)
+ info_dict.pop('__postprocessors', None)
return info_dict
- def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None):
- """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict """
+ def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False):
+ """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict
+ @param sanitize Whether to sanitize the output as a filename.
+ For backward compatibility, a function can also be passed
+ """
+
info_dict.setdefault('epoch', int(time.time())) # keep epoch consistent once set
info_dict = self._copy_infodict(info_dict)
@@ -971,19 +1072,20 @@ class YoutubeDL(object):
if info_dict.get('duration', None) is not None
else None)
info_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
+ info_dict['video_autonumber'] = self._num_videos
if info_dict.get('resolution') is None:
info_dict['resolution'] = self.format_resolution(info_dict, default=None)
# For fields playlist_index, playlist_autonumber and autonumber convert all occurrences
# of %(field)s to %(field)0Nd for backward compatibility
field_size_compat_map = {
- 'playlist_index': len(str(info_dict.get('_last_playlist_index') or '')),
- 'playlist_autonumber': len(str(info_dict.get('n_entries') or '')),
+ 'playlist_index': number_of_digits(info_dict.get('_last_playlist_index') or 0),
+ 'playlist_autonumber': number_of_digits(info_dict.get('n_entries') or 0),
'autonumber': self.params.get('autonumber_size') or 5,
}
TMPL_DICT = {}
- EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBU]'))
+ EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBUDS]'))
MATH_FUNCTIONS = {
'+': float.__add__,
'-': float.__sub__,
@@ -991,16 +1093,18 @@ class YoutubeDL(object):
# Field is of the form key1.key2...
# where keys (except first) can be string, int or slice
FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)')
- MATH_FIELD_RE = r'''{field}|{num}'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?')
+ MATH_FIELD_RE = r'''(?:{field}|{num})'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?')
MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys()))
INTERNAL_FORMAT_RE = re.compile(r'''(?x)
(?P<negate>-)?
(?P<fields>{field})
(?P<maths>(?:{math_op}{math_field})*)
(?:>(?P<strf_format>.+?))?
- (?P<alternate>(?<!\\),[^|)]+)?
- (?:\|(?P<default>.*?))?
- $'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE))
+ (?P<remaining>
+ (?P<alternate>(?<!\\),[^|&)]+)?
+ (?:&(?P<replacement>.*?))?
+ (?:\|(?P<default>.*?))?
+ )$'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE))
def _traverse_infodict(k):
k = k.split('.')
@@ -1046,24 +1150,34 @@ class YoutubeDL(object):
na = self.params.get('outtmpl_na_placeholder', 'NA')
+ def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')):
+ return sanitize_filename(str(value), restricted=restricted, is_id=(
+ bool(re.search(r'(^|[_.])id(\.|$)', key))
+ if 'filename-sanitization' in self.params.get('compat_opts', [])
+ else NO_DEFAULT))
+
+ sanitizer = sanitize if callable(sanitize) else filename_sanitizer
+ sanitize = bool(sanitize)
+
def _dumpjson_default(obj):
if isinstance(obj, (set, LazyList)):
return list(obj)
- raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')
+ return repr(obj)
def create_key(outer_mobj):
if not outer_mobj.group('has_key'):
return outer_mobj.group(0)
key = outer_mobj.group('key')
mobj = re.match(INTERNAL_FORMAT_RE, key)
- initial_field = mobj.group('fields').split('.')[-1] if mobj else ''
- value, default = None, na
+ initial_field = mobj.group('fields') if mobj else ''
+ value, replacement, default = None, None, na
while mobj:
mobj = mobj.groupdict()
default = mobj['default'] if mobj['default'] is not None else default
value = get_value(mobj)
+ replacement = mobj['replacement']
if value is None and mobj['alternate']:
- mobj = re.match(INTERNAL_FORMAT_RE, mobj['alternate'][1:])
+ mobj = re.match(INTERNAL_FORMAT_RE, mobj['remaining'][1:])
else:
break
@@ -1071,25 +1185,32 @@ class YoutubeDL(object):
if fmt == 's' and value is not None and key in field_size_compat_map.keys():
fmt = '0{:d}d'.format(field_size_compat_map[key])
- value = default if value is None else value
+ value = default if value is None else value if replacement is None else replacement
+ flags = outer_mobj.group('conversion') or ''
str_fmt = f'{fmt[:-1]}s'
if fmt[-1] == 'l': # list
- delim = '\n' if '#' in (outer_mobj.group('conversion') or '') else ', '
- value, fmt = delim.join(variadic(value)), str_fmt
+ delim = '\n' if '#' in flags else ', '
+ value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt
elif fmt[-1] == 'j': # json
- value, fmt = json.dumps(value, default=_dumpjson_default), str_fmt
+ value, fmt = json.dumps(value, default=_dumpjson_default, indent=4 if '#' in flags else None), str_fmt
elif fmt[-1] == 'q': # quoted
- value, fmt = compat_shlex_quote(str(value)), str_fmt
+ value = map(str, variadic(value) if '#' in flags else [value])
+ value, fmt = ' '.join(map(compat_shlex_quote, value)), str_fmt
elif fmt[-1] == 'B': # bytes
value = f'%{str_fmt}'.encode('utf-8') % str(value).encode('utf-8')
value, fmt = value.decode('utf-8', 'ignore'), 's'
elif fmt[-1] == 'U': # unicode normalized
- opts = outer_mobj.group('conversion') or ''
value, fmt = unicodedata.normalize(
# "+" = compatibility equivalence, "#" = NFD
- 'NF%s%s' % ('K' if '+' in opts else '', 'D' if '#' in opts else 'C'),
+ 'NF%s%s' % ('K' if '+' in flags else '', 'D' if '#' in flags else 'C'),
value), str_fmt
+ elif fmt[-1] == 'D': # decimal suffix
+ num_fmt, fmt = fmt[:-1].replace('#', ''), 's'
+ value = format_decimal_suffix(value, f'%{num_fmt}f%s' if num_fmt else '%d%s',
+ factor=1024 if '#' in flags else 1000)
+ elif fmt[-1] == 'S': # filename sanitization
+ value, fmt = filename_sanitizer(initial_field, value, restricted='#' in flags), str_fmt
elif fmt[-1] == 'c':
if value:
value = str(value)[0]
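
Hedged examples of the template conversions added above (field names and
outputs are illustrative):

    %(title)S                -> title sanitized for use in a filename ('#' flag: restricted mode)
    %(filesize)D             -> decimal suffix, e.g. '10M'; with the '#' flag, 1024-based, e.g. '9Mi'
    %(tags)#l                -> list items joined with newlines instead of ', '
    %(formats)#j             -> JSON pretty-printed with indent=4
    %(uploader&by someone|)s -> the literal replacement text when uploader is set, else ''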
@@ -1106,7 +1227,7 @@ class YoutubeDL(object):
# So we convert it to repr first
value, fmt = repr(value), str_fmt
if fmt[-1] in 'csr':
- value = sanitize(initial_field, value)
+ value = sanitizer(initial_field, value)
key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format'))
TMPL_DICT[key] = value
@@ -1118,38 +1239,42 @@ class YoutubeDL(object):
outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs)
return self.escape_outtmpl(outtmpl) % info_dict
- def _prepare_filename(self, info_dict, tmpl_type='default'):
+ def _prepare_filename(self, info_dict, *, outtmpl=None, tmpl_type=None):
+ assert None in (outtmpl, tmpl_type), 'outtmpl and tmpl_type are mutually exclusive'
+ if outtmpl is None:
+ outtmpl = self.outtmpl_dict.get(tmpl_type or 'default', self.outtmpl_dict['default'])
try:
- sanitize = lambda k, v: sanitize_filename(
- compat_str(v),
- restricted=self.params.get('restrictfilenames'),
- is_id=(k == 'id' or k.endswith('_id')))
- outtmpl = self._outtmpl_expandpath(self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default']))
- filename = self.evaluate_outtmpl(outtmpl, info_dict, sanitize)
+ outtmpl = self._outtmpl_expandpath(outtmpl)
+ filename = self.evaluate_outtmpl(outtmpl, info_dict, True)
+ if not filename:
+ return None
- force_ext = OUTTMPL_TYPES.get(tmpl_type)
- if filename and force_ext is not None:
- filename = replace_extension(filename, force_ext, info_dict.get('ext'))
+ if tmpl_type in ('', 'temp'):
+ final_ext, ext = self.params.get('final_ext'), info_dict.get('ext')
+ if final_ext and ext and final_ext != ext and filename.endswith(f'.{final_ext}'):
+ filename = replace_extension(filename, ext, final_ext)
+ elif tmpl_type:
+ force_ext = OUTTMPL_TYPES[tmpl_type]
+ if force_ext:
+ filename = replace_extension(filename, force_ext, info_dict.get('ext'))
# https://github.com/blackjack4494/youtube-dlc/issues/85
trim_file_name = self.params.get('trim_file_name', False)
if trim_file_name:
- fn_groups = filename.rsplit('.')
- ext = fn_groups[-1]
- sub_ext = ''
- if len(fn_groups) > 2:
- sub_ext = fn_groups[-2]
- filename = '.'.join(filter(None, [fn_groups[0][:trim_file_name], sub_ext, ext]))
+ no_ext, *ext = filename.rsplit('.', 2)
+ filename = join_nonempty(no_ext[:trim_file_name], *ext, delim='.')
return filename
except ValueError as err:
self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
return None
- def prepare_filename(self, info_dict, dir_type='', warn=False):
- """Generate the output filename."""
-
- filename = self._prepare_filename(info_dict, dir_type or 'default')
+ def prepare_filename(self, info_dict, dir_type='', *, outtmpl=None, warn=False):
+ """Generate the output filename"""
+ if outtmpl:
+ assert not dir_type, 'outtmpl and dir_type are mutually exclusive'
+ dir_type = None
+ filename = self._prepare_filename(info_dict, tmpl_type=dir_type, outtmpl=outtmpl)
if not filename and dir_type not in ('', 'temp'):
return ''
@@ -1266,8 +1391,9 @@ class YoutubeDL(object):
temp_id = ie.get_temp_id(url)
if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
- self.to_screen("[%s] %s: has already been recorded in archive" % (
- ie_key, temp_id))
+ self.to_screen(f'[{ie_key}] {temp_id}: has already been recorded in the archive')
+ if self.params.get('break_on_existing', False):
+ raise ExistingVideoReached()
break
return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process)
else:
@@ -1276,30 +1402,76 @@ class YoutubeDL(object):
def __handle_extraction_exceptions(func):
@functools.wraps(func)
def wrapper(self, *args, **kwargs):
- try:
- return func(self, *args, **kwargs)
- except GeoRestrictedError as e:
- msg = e.msg
- if e.countries:
- msg += '\nThis video is available in %s.' % ', '.join(
- map(ISO3166Utils.short2full, e.countries))
- msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
- self.report_error(msg)
- except ExtractorError as e: # An error we somewhat expected
- self.report_error(compat_str(e), e.format_traceback())
- except ThrottledDownload:
- self.to_stderr('\r')
- self.report_warning('The download speed is below throttle limit. Re-extracting data')
- return wrapper(self, *args, **kwargs)
- except (MaxDownloadsReached, ExistingVideoReached, RejectedVideoReached, LazyList.IndexError):
- raise
- except Exception as e:
- if self.params.get('ignoreerrors'):
- self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
- else:
+ while True:
+ try:
+ return func(self, *args, **kwargs)
+ except (DownloadCancelled, LazyList.IndexError, PagedList.IndexError):
raise
+ except ReExtractInfo as e:
+ if e.expected:
+ self.to_screen(f'{e}; Re-extracting data')
+ else:
+ self.to_stderr('\r')
+ self.report_warning(f'{e}; Re-extracting data')
+ continue
+ except GeoRestrictedError as e:
+ msg = e.msg
+ if e.countries:
+ msg += '\nThis video is available in %s.' % ', '.join(
+ map(ISO3166Utils.short2full, e.countries))
+ msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
+ self.report_error(msg)
+ except ExtractorError as e: # An error we somewhat expected
+ self.report_error(str(e), e.format_traceback())
+ except Exception as e:
+ if self.params.get('ignoreerrors'):
+ self.report_error(str(e), tb=encode_compat_str(traceback.format_exc()))
+ else:
+ raise
+ break
return wrapper
+ def _wait_for_video(self, ie_result):
+ if (not self.params.get('wait_for_video')
+ or ie_result.get('_type', 'video') != 'video'
+ or ie_result.get('formats') or ie_result.get('url')):
+ return
+
+ format_dur = lambda dur: '%02d:%02d:%02d' % timetuple_from_msec(dur * 1000)[:-1]
+ last_msg = ''
+
+ def progress(msg):
+ nonlocal last_msg
+ self.to_screen(msg + ' ' * (len(last_msg) - len(msg)) + '\r', skip_eol=True)
+ last_msg = msg
+
+ min_wait, max_wait = self.params.get('wait_for_video')
+ diff = try_get(ie_result, lambda x: x['release_timestamp'] - time.time())
+ if diff is None and ie_result.get('live_status') == 'is_upcoming':
+ diff = round(random.uniform(min_wait, max_wait) if (max_wait and min_wait) else (max_wait or min_wait), 0)
+ self.report_warning('Release time of video is not known')
+ elif (diff or 0) <= 0:
+ self.report_warning('Video should already be available according to extracted info')
+ diff = min(max(diff or 0, min_wait or 0), max_wait or float('inf'))
+ self.to_screen(f'[wait] Waiting for {format_dur(diff)} - Press Ctrl+C to try now')
+
+ wait_till = time.time() + diff
+ try:
+ while True:
+ diff = wait_till - time.time()
+ if diff <= 0:
+ progress('')
+ raise ReExtractInfo('[wait] Wait period ended', expected=True)
+ progress(f'[wait] Remaining time until next attempt: {self._format_screen(format_dur(diff), self.Styles.EMPHASIS)}')
+ time.sleep(1)
+ except KeyboardInterrupt:
+ progress('')
+ raise ReExtractInfo('[wait] Interrupted by user', expected=True)
+ except BaseException as e:
+ if not isinstance(e, ReExtractInfo):
+ self.to_screen('')
+ raise
+
@__handle_extraction_exceptions
def __extract_info(self, url, ie, download, extra_info, process):
ie_result = ie.extract(url)
@@ -1315,6 +1487,7 @@ class YoutubeDL(object):
ie_result.setdefault('original_url', extra_info['original_url'])
self.add_default_extra_info(ie_result, ie, url)
if process:
+ self._wait_for_video(ie_result)
return self.process_ie_result(ie_result, download, extra_info)
else:
return ie_result
@@ -1324,7 +1497,12 @@ class YoutubeDL(object):
self.add_extra_info(ie_result, {
'webpage_url': url,
'original_url': url,
- 'webpage_url_basename': url_basename(url),
+ })
+ webpage_url = ie_result.get('webpage_url')
+ if webpage_url:
+ self.add_extra_info(ie_result, {
+ 'webpage_url_basename': url_basename(webpage_url),
+ 'webpage_url_domain': get_domain(webpage_url),
})
if ie is not None:
self.add_extra_info(ie_result, {
@@ -1358,6 +1536,7 @@ class YoutubeDL(object):
info_copy['id'] = ie.get_temp_id(ie_result['url'])
self.add_default_extra_info(info_copy, ie, ie_result['url'])
self.add_extra_info(info_copy, extra_info)
+ info_copy, _ = self.pre_process(info_copy)
self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True)
if self.params.get('force_write_download_archive', False):
self.record_download_archive(info_copy)
@@ -1376,7 +1555,7 @@ class YoutubeDL(object):
self.write_debug('Additional URLs: "%s"' % '", "'.join(additional_urls))
ie_result['additional_entries'] = [
self.extract_info(
- url, download, extra_info,
+ url, download, extra_info=extra_info,
force_generic_extractor=self.params.get('force_generic_extractor'))
for url in additional_urls
]
@@ -1400,13 +1579,9 @@ class YoutubeDL(object):
if not info:
return info
- force_properties = dict(
- (k, v) for k, v in ie_result.items() if v is not None)
- for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
- if f in force_properties:
- del force_properties[f]
new_result = info.copy()
- new_result.update(force_properties)
+ new_result.update(filter_dict(ie_result, lambda k, v: (
+ v is not None and k not in {'_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'})))
# Extracted info may not be a video result (i.e.
        # info.get('_type', 'video') != video) but rather a URL or
@@ -1431,6 +1606,7 @@ class YoutubeDL(object):
self._playlist_level += 1
self._playlist_urls.add(webpage_url)
+ self._fill_common_fields(ie_result, False)
self._sanitize_thumbnails(ie_result)
try:
return self.__process_playlist(ie_result, download)
@@ -1448,6 +1624,7 @@ class YoutubeDL(object):
'extractor': ie_result['extractor'],
'webpage_url': ie_result['webpage_url'],
'webpage_url_basename': url_basename(ie_result['webpage_url']),
+ 'webpage_url_domain': get_domain(ie_result['webpage_url']),
'extractor_key': ie_result['extractor_key'],
})
return r
@@ -1462,18 +1639,33 @@ class YoutubeDL(object):
def _ensure_dir_exists(self, path):
return make_dir(path, self.report_error)
+ @staticmethod
+ def _playlist_infodict(ie_result, **kwargs):
+ return {
+ **ie_result,
+ 'playlist': ie_result.get('title') or ie_result.get('id'),
+ 'playlist_id': ie_result.get('id'),
+ 'playlist_title': ie_result.get('title'),
+ 'playlist_uploader': ie_result.get('uploader'),
+ 'playlist_uploader_id': ie_result.get('uploader_id'),
+ 'playlist_index': 0,
+ **kwargs,
+ }
+
def __process_playlist(self, ie_result, download):
# We process each entry in the playlist
playlist = ie_result.get('title') or ie_result.get('id')
self.to_screen('[download] Downloading playlist: %s' % playlist)
if 'entries' not in ie_result:
- raise EntryNotInPlaylist()
+ raise EntryNotInPlaylist('There are no entries')
+
+ MissingEntry = object()
incomplete_entries = bool(ie_result.get('requested_entries'))
if incomplete_entries:
- def fill_missing_entries(entries, indexes):
- ret = [None] * max(*indexes)
- for i, entry in zip(indexes, entries):
+ def fill_missing_entries(entries, indices):
+ ret = [MissingEntry] * max(indices)
+ for i, entry in zip(indices, entries):
ret[i - 1] = entry
return ret
ie_result['entries'] = fill_missing_entries(ie_result['entries'], ie_result['requested_entries'])
@@ -1500,23 +1692,27 @@ class YoutubeDL(object):
playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
ie_entries = ie_result['entries']
- msg = (
- 'Downloading %d videos' if not isinstance(ie_entries, list)
- else 'Collected %d videos; downloading %%d of them' % len(ie_entries))
-
if isinstance(ie_entries, list):
+ playlist_count = len(ie_entries)
+ msg = f'Collected {playlist_count} videos; downloading %d of them'
+ ie_result['playlist_count'] = ie_result.get('playlist_count') or playlist_count
+
def get_entry(i):
return ie_entries[i - 1]
else:
- if not isinstance(ie_entries, PagedList):
+ msg = 'Downloading %d videos'
+ if not isinstance(ie_entries, (PagedList, LazyList)):
ie_entries = LazyList(ie_entries)
+ elif isinstance(ie_entries, InAdvancePagedList):
+ if ie_entries._pagesize == 1:
+ playlist_count = ie_entries._pagecount
def get_entry(i):
return YoutubeDL.__handle_extraction_exceptions(
lambda self, i: ie_entries[i - 1]
)(self, i)
- entries = []
+ entries, broken = [], False
items = playlistitems if playlistitems is not None else itertools.count(playliststart)
for i in items:
if i == 0:
@@ -1526,11 +1722,11 @@ class YoutubeDL(object):
entry = None
try:
entry = get_entry(i)
- if entry is None:
+ if entry is MissingEntry:
raise EntryNotInPlaylist()
except (IndexError, EntryNotInPlaylist):
if incomplete_entries:
- raise EntryNotInPlaylist()
+ raise EntryNotInPlaylist(f'Entry {i} cannot be found')
elif not playlistitems:
break
entries.append(entry)
@@ -1538,6 +1734,7 @@ class YoutubeDL(object):
if entry is not None:
self._match_entry(entry, incomplete=True, silent=True)
except (ExistingVideoReached, RejectedVideoReached):
+ broken = True
break
ie_result['entries'] = entries
@@ -1548,23 +1745,22 @@ class YoutubeDL(object):
if entry is not None]
n_entries = len(entries)
- if not playlistitems and (playliststart or playlistend):
+ if not (ie_result.get('playlist_count') or broken or playlistitems or playlistend):
+ ie_result['playlist_count'] = n_entries
+
+ if not playlistitems and (playliststart != 1 or playlistend):
playlistitems = list(range(playliststart, playliststart + n_entries))
ie_result['requested_entries'] = playlistitems
- if self.params.get('allow_playlist_files', True):
- ie_copy = {
- 'playlist': playlist,
- 'playlist_id': ie_result.get('id'),
- 'playlist_title': ie_result.get('title'),
- 'playlist_uploader': ie_result.get('uploader'),
- 'playlist_uploader_id': ie_result.get('uploader_id'),
- 'playlist_index': 0,
- }
- ie_copy.update(dict(ie_result))
-
- if self._write_info_json('playlist', ie_result,
- self.prepare_filename(ie_copy, 'pl_infojson')) is None:
+ _infojson_written = False
+ write_playlist_files = self.params.get('allow_playlist_files', True)
+ if write_playlist_files and self.params.get('list_thumbnails'):
+ self.list_thumbnails(ie_result)
+ if write_playlist_files and not self.params.get('simulate'):
+ ie_copy = self._playlist_infodict(ie_result, n_entries=n_entries)
+ _infojson_written = self._write_info_json(
+ 'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson'))
+ if _infojson_written is None:
return
if self._write_description('playlist', ie_result,
self.prepare_filename(ie_copy, 'pl_description')) is None:
@@ -1594,6 +1790,7 @@ class YoutubeDL(object):
extra = {
'n_entries': n_entries,
'_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries),
+ 'playlist_count': ie_result.get('playlist_count'),
'playlist_index': playlist_index,
'playlist_autonumber': i,
'playlist': playlist,
@@ -1604,6 +1801,7 @@ class YoutubeDL(object):
'extractor': ie_result['extractor'],
'webpage_url': ie_result['webpage_url'],
'webpage_url_basename': url_basename(ie_result['webpage_url']),
+ 'webpage_url_domain': get_domain(ie_result['webpage_url']),
'extractor_key': ie_result['extractor_key'],
}
@@ -1617,10 +1815,17 @@ class YoutubeDL(object):
self.report_error(
'Skipping the remaining entries in playlist "%s" since %d items failed extraction' % (playlist, failures))
break
- # TODO: skip failed (empty) entries?
playlist_results.append(entry_result)
ie_result['entries'] = playlist_results
- self.to_screen('[download] Finished downloading playlist: %s' % playlist)
+
+ # Write the updated info to json
+ if _infojson_written is True and self._write_info_json(
+ 'updated playlist', ie_result,
+ self.prepare_filename(ie_copy, 'pl_infojson'), overwrite=True) is None:
+ return
+
+ ie_result = self.run_all_pps('playlist', ie_result)
+ self.to_screen(f'[download] Finished downloading playlist: {playlist}')
return ie_result
@__handle_extraction_exceptions
@@ -1664,15 +1869,21 @@ class YoutubeDL(object):
'^=': lambda attr, value: attr.startswith(value),
'$=': lambda attr, value: attr.endswith(value),
'*=': lambda attr, value: value in attr,
+ '~=': lambda attr, value: value.search(attr) is not None
}
str_operator_rex = re.compile(r'''(?x)\s*
(?P<key>[a-zA-Z0-9._-]+)\s*
- (?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
- (?P<value>[a-zA-Z0-9._-]+)\s*
+ (?P<negation>!\s*)?(?P<op>%s)\s*(?P<none_inclusive>\?\s*)?
+ (?P<quote>["'])?
+ (?P<value>(?(quote)(?:(?!(?P=quote))[^\\]|\\.)+|[\w.-]+))
+ (?(quote)(?P=quote))\s*
''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
m = str_operator_rex.fullmatch(filter_spec)
if m:
- comparison_value = m.group('value')
+ if m.group('op') == '~=':
+ comparison_value = re.compile(m.group('value'))
+ else:
+ comparison_value = re.sub(r'''\\([\\"'])''', r'\1', m.group('value'))
str_op = STR_OPERATORS[m.group('op')]
if m.group('negation'):
op = lambda attr, value: not str_op(attr, value)
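
Hedged examples of the extended string filters (the field values are
illustrative):

    'best[format_note~=premium]'      # '~=' treats the value as a regular expression
    "best[format_id!='hls-audio-0']"  # values may be quoted, allowing spaces and symbols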
@@ -1689,6 +1900,29 @@ class YoutubeDL(object):
return op(actual_value, comparison_value)
return _filter
+ def _check_formats(self, formats):
+ for f in formats:
+ self.to_screen('[info] Testing format %s' % f['format_id'])
+ path = self.get_output_path('temp')
+ if not self._ensure_dir_exists(f'{path}/'):
+ continue
+ temp_file = tempfile.NamedTemporaryFile(suffix='.tmp', delete=False, dir=path or None)
+ temp_file.close()
+ try:
+ success, _ = self.dl(temp_file.name, f, test=True)
+ except (DownloadError, IOError, OSError, ValueError) + network_exceptions:
+ success = False
+ finally:
+ if os.path.exists(temp_file.name):
+ try:
+ os.remove(temp_file.name)
+ except OSError:
+ self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
+ if success:
+ yield f
+ else:
+ self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])
+
def _default_format_spec(self, info_dict, download=True):
def can_merge():
@@ -1728,7 +1962,7 @@ class YoutubeDL(object):
allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
'video': self.params.get('allow_multiple_video_streams', False)}
- check_formats = self.params.get('check_formats')
+ check_formats = self.params.get('check_formats') == 'selected'
def _parse_filter(tokens):
filter_parts = []
@@ -1873,9 +2107,9 @@ class YoutubeDL(object):
'format_id': '+'.join(filtered('format_id')),
'ext': output_ext,
'protocol': '+'.join(map(determine_protocol, formats_info)),
- 'language': '+'.join(orderedSet(filtered('language'))),
- 'format_note': '+'.join(orderedSet(filtered('format_note'))),
- 'filesize_approx': sum(filtered('filesize', 'filesize_approx')),
+ 'language': '+'.join(orderedSet(filtered('language'))) or None,
+ 'format_note': '+'.join(orderedSet(filtered('format_note'))) or None,
+ 'filesize_approx': sum(filtered('filesize', 'filesize_approx')) or None,
'tbr': sum(filtered('tbr', 'vbr', 'abr')),
}
@@ -1885,6 +2119,7 @@ class YoutubeDL(object):
'height': the_only_video.get('height'),
'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video),
'fps': the_only_video.get('fps'),
+ 'dynamic_range': the_only_video.get('dynamic_range'),
'vcodec': the_only_video.get('vcodec'),
'vbr': the_only_video.get('vbr'),
'stretched_ratio': the_only_video.get('stretched_ratio'),
@@ -1903,26 +2138,7 @@ class YoutubeDL(object):
if not check_formats:
yield from formats
return
- for f in formats:
- self.to_screen('[info] Testing format %s' % f['format_id'])
- temp_file = tempfile.NamedTemporaryFile(
- suffix='.tmp', delete=False,
- dir=self.get_output_path('temp') or None)
- temp_file.close()
- try:
- success, _ = self.dl(temp_file.name, f, test=True)
- except (DownloadError, IOError, OSError, ValueError) + network_exceptions:
- success = False
- finally:
- if os.path.exists(temp_file.name):
- try:
- os.remove(temp_file.name)
- except OSError:
- self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
- if success:
- yield f
- else:
- self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])
+ yield from self._check_formats(formats)
def _build_selector_function(selector):
if isinstance(selector, list): # ,
@@ -1950,8 +2166,7 @@ class YoutubeDL(object):
selector_1, selector_2 = map(_build_selector_function, selector.selector)
def selector_function(ctx):
- for pair in itertools.product(
- selector_1(copy.deepcopy(ctx)), selector_2(copy.deepcopy(ctx))):
+ for pair in itertools.product(selector_1(ctx), selector_2(ctx)):
yield _merge(pair)
elif selector.type == SINGLE: # atom
@@ -1960,7 +2175,7 @@ class YoutubeDL(object):
# TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector
if format_spec == 'all':
def selector_function(ctx):
- yield from _check_formats(ctx['formats'])
+ yield from _check_formats(ctx['formats'][::-1])
elif format_spec == 'mergeall':
def selector_function(ctx):
formats = list(_check_formats(ctx['formats']))
@@ -1972,7 +2187,7 @@ class YoutubeDL(object):
yield merged_format
else:
- format_fallback, format_reverse, format_idx = False, True, 1
+ format_fallback, seperate_fallback, format_reverse, format_idx = False, None, True, 1
mobj = re.match(
r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$',
format_spec)
@@ -1999,6 +2214,7 @@ class YoutubeDL(object):
filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none'
elif format_spec in self._format_selection_exts['video']:
filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' and f.get('vcodec') != 'none'
+ seperate_fallback = lambda f: f.get('ext') == format_spec and f.get('vcodec') != 'none'
elif format_spec in self._format_selection_exts['storyboards']:
filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') == 'none' and f.get('vcodec') == 'none'
else:
@@ -2007,11 +2223,15 @@ class YoutubeDL(object):
def selector_function(ctx):
formats = list(ctx['formats'])
matches = list(filter(filter_f, formats)) if filter_f is not None else formats
- if format_fallback and ctx['incomplete_formats'] and not matches:
- # for extractors with incomplete formats (audio only (soundcloud)
- # or video only (imgur)) best/worst will fallback to
- # best/worst {video,audio}-only format
- matches = formats
+ if not matches:
+ if format_fallback and ctx['incomplete_formats']:
+ # for extractors with incomplete formats (audio only (soundcloud)
+ # or video only (imgur)) best/worst will fallback to
+ # best/worst {video,audio}-only format
+ matches = formats
+ elif seperate_fallback and not ctx['has_merged_format']:
+ # for compatibility with youtube-dl when there is no pre-merged format
+ matches = list(filter(seperate_fallback, formats))
matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1]))
try:
yield matches[format_idx - 1]
@@ -2021,7 +2241,7 @@ class YoutubeDL(object):
filters = [self._build_format_filter(f) for f in selector.filters]
def final_selector(ctx):
- ctx_copy = copy.deepcopy(ctx)
+ ctx_copy = dict(ctx)
for _filter in filters:
ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
return selector_function(ctx_copy)
@@ -2057,11 +2277,7 @@ class YoutubeDL(object):
return _build_selector_function(parsed_selector)
def _calc_headers(self, info_dict):
- res = std_headers.copy()
-
- add_headers = info_dict.get('http_headers')
- if add_headers:
- res.update(add_headers)
+ res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {})
cookies = self._calc_cookies(info_dict)
if cookies:
@@ -2079,51 +2295,106 @@ class YoutubeDL(object):
self.cookiejar.add_cookie_header(pr)
return pr.get_header('Cookie')
+ def _sort_thumbnails(self, thumbnails):
+ thumbnails.sort(key=lambda t: (
+ t.get('preference') if t.get('preference') is not None else -1,
+ t.get('width') if t.get('width') is not None else -1,
+ t.get('height') if t.get('height') is not None else -1,
+ t.get('id') if t.get('id') is not None else '',
+ t.get('url')))
+
def _sanitize_thumbnails(self, info_dict):
thumbnails = info_dict.get('thumbnails')
if thumbnails is None:
thumbnail = info_dict.get('thumbnail')
if thumbnail:
info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
- if thumbnails:
- thumbnails.sort(key=lambda t: (
- t.get('preference') if t.get('preference') is not None else -1,
- t.get('width') if t.get('width') is not None else -1,
- t.get('height') if t.get('height') is not None else -1,
- t.get('id') if t.get('id') is not None else '',
- t.get('url')))
-
- def thumbnail_tester():
- def test_thumbnail(t):
- self.to_screen(f'[info] Testing thumbnail {t["id"]}')
- try:
- self.urlopen(HEADRequest(t['url']))
- except network_exceptions as err:
- self.to_screen(f'[info] Unable to connect to thumbnail {t["id"]} URL {t["url"]!r} - {err}. Skipping...')
- return False
- return True
- return test_thumbnail
-
- for i, t in enumerate(thumbnails):
- if t.get('id') is None:
- t['id'] = '%d' % i
- if t.get('width') and t.get('height'):
- t['resolution'] = '%dx%d' % (t['width'], t['height'])
- t['url'] = sanitize_url(t['url'])
-
- if self.params.get('check_formats'):
- info_dict['thumbnails'] = LazyList(filter(thumbnail_tester(), thumbnails[::-1])).reverse()
- else:
- info_dict['thumbnails'] = thumbnails
+ if not thumbnails:
+ return
+
+ def check_thumbnails(thumbnails):
+ for t in thumbnails:
+ self.to_screen(f'[info] Testing thumbnail {t["id"]}')
+ try:
+ self.urlopen(HEADRequest(t['url']))
+ except network_exceptions as err:
+ self.to_screen(f'[info] Unable to connect to thumbnail {t["id"]} URL {t["url"]!r} - {err}. Skipping...')
+ continue
+ yield t
+
+ self._sort_thumbnails(thumbnails)
+ for i, t in enumerate(thumbnails):
+ if t.get('id') is None:
+ t['id'] = '%d' % i
+ if t.get('width') and t.get('height'):
+ t['resolution'] = '%dx%d' % (t['width'], t['height'])
+ t['url'] = sanitize_url(t['url'])
+
+ if self.params.get('check_formats') is True:
+ info_dict['thumbnails'] = LazyList(check_thumbnails(thumbnails[::-1]), reverse=True)
+ else:
+ info_dict['thumbnails'] = thumbnails
+
+ def _fill_common_fields(self, info_dict, is_video=True):
+ # TODO: move sanitization here
+ if is_video:
+ # playlists are allowed to lack "title"
+ info_dict['fulltitle'] = info_dict.get('title')
+ if 'title' not in info_dict:
+ raise ExtractorError('Missing "title" field in extractor result',
+ video_id=info_dict['id'], ie=info_dict['extractor'])
+ elif not info_dict.get('title'):
+ self.report_warning('Extractor failed to obtain "title". Creating a generic title instead')
+ info_dict['title'] = f'{info_dict["extractor"]} video #{info_dict["id"]}'
+
+ if info_dict.get('duration') is not None:
+ info_dict['duration_string'] = formatSeconds(info_dict['duration'])
+
+ for ts_key, date_key in (
+ ('timestamp', 'upload_date'),
+ ('release_timestamp', 'release_date'),
+ ('modified_timestamp', 'modified_date'),
+ ):
+ if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
+ # Working around out-of-range timestamp values (e.g. negative ones on Windows,
+ # see http://bugs.python.org/issue1646728)
+ try:
+ upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
+ info_dict[date_key] = upload_date.strftime('%Y%m%d')
+ except (ValueError, OverflowError, OSError):
+ pass
+
+ live_keys = ('is_live', 'was_live')
+ live_status = info_dict.get('live_status')
+ if live_status is None:
+ for key in live_keys:
+ if info_dict.get(key) is False:
+ continue
+ if info_dict.get(key):
+ live_status = key
+ break
+ if all(info_dict.get(key) is False for key in live_keys):
+ live_status = 'not_live'
+ if live_status:
+ info_dict['live_status'] = live_status
+ for key in live_keys:
+ if info_dict.get(key) is None:
+ info_dict[key] = (live_status == key)
+
+ # Auto generate title fields corresponding to the *_number fields when missing
+ # in order to always have clean titles. This is very common for TV series.
+ for field in ('chapter', 'season', 'episode'):
+ if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
+ info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
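
Illustration of the live_status inference above (the dicts on the left stand
for extractor results):

    {'is_live': True}                      -> live_status = 'is_live'
    {'is_live': False, 'was_live': True}   -> live_status = 'was_live'
    {'is_live': False, 'was_live': False}  -> live_status = 'not_live'
    {}                                     -> live_status stays unset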
def process_video_result(self, info_dict, download=True):
assert info_dict.get('_type', 'video') == 'video'
+ self._num_videos += 1
if 'id' not in info_dict:
- raise ExtractorError('Missing "id" field in extractor result')
- if 'title' not in info_dict:
- raise ExtractorError('Missing "title" field in extractor result',
- video_id=info_dict['id'], ie=info_dict['extractor'])
+ raise ExtractorError('Missing "id" field in extractor result', ie=info_dict['extractor'])
+ elif not info_dict.get('id'):
+ raise ExtractorError('Extractor failed to obtain "id"', ie=info_dict['extractor'])
def report_force_conversion(field, field_not, conversion):
self.report_warning(
@@ -2147,6 +2418,8 @@ class YoutubeDL(object):
sanitize_string_field(info_dict, 'id')
sanitize_numeric_fields(info_dict)
+ if (info_dict.get('duration') or 0) <= 0 and info_dict.pop('duration', None):
+ self.report_warning('"duration" field is negative, there is an error in extractor')
if 'playlist' not in info_dict:
# It isn't part of a playlist
@@ -2165,44 +2438,7 @@ class YoutubeDL(object):
if info_dict.get('display_id') is None and 'id' in info_dict:
info_dict['display_id'] = info_dict['id']
- if info_dict.get('duration') is not None:
- info_dict['duration_string'] = formatSeconds(info_dict['duration'])
-
- for ts_key, date_key in (
- ('timestamp', 'upload_date'),
- ('release_timestamp', 'release_date'),
- ):
- if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
- # Working around out-of-range timestamp values (e.g. negative ones on Windows,
- # see http://bugs.python.org/issue1646728)
- try:
- upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
- info_dict[date_key] = upload_date.strftime('%Y%m%d')
- except (ValueError, OverflowError, OSError):
- pass
-
- live_keys = ('is_live', 'was_live')
- live_status = info_dict.get('live_status')
- if live_status is None:
- for key in live_keys:
- if info_dict.get(key) is False:
- continue
- if info_dict.get(key):
- live_status = key
- break
- if all(info_dict.get(key) is False for key in live_keys):
- live_status = 'not_live'
- if live_status:
- info_dict['live_status'] = live_status
- for key in live_keys:
- if info_dict.get(key) is None:
- info_dict[key] = (live_status == key)
-
- # Auto generate title fields corresponding to the *_number fields when missing
- # in order to always have clean titles. This is very common for TV series.
- for field in ('chapter', 'season', 'episode'):
- if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
- info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
+ self._fill_common_fields(info_dict)
for cc_kind in ('subtitles', 'automatic_captions'):
cc = info_dict.get(cc_kind)
@@ -2220,7 +2456,6 @@ class YoutubeDL(object):
info_dict['requested_subtitles'] = self.process_subtitles(
info_dict['id'], subtitles, automatic_captions)
- # We now pick which formats have to be downloaded
if info_dict.get('formats') is None:
# There's only one format available
formats = [info_dict]
@@ -2230,6 +2465,21 @@ class YoutubeDL(object):
info_dict['__has_drm'] = any(f.get('has_drm') for f in formats)
if not self.params.get('allow_unplayable_formats'):
formats = [f for f in formats if not f.get('has_drm')]
+ if info_dict['__has_drm'] and all(
+ f.get('acodec') == f.get('vcodec') == 'none' for f in formats):
+ self.report_warning(
+ 'This video is DRM protected and only images are available for download. '
+ 'Use --list-formats to see them')
+
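+ # Live streams are downloaded from the current time unless --live-from-start was passed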
+ get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start'))
+ if not get_from_start:
+ info_dict['title'] += ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
+ if info_dict.get('is_live') and formats:
+ formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start]
+ if get_from_start and not formats:
+ self.raise_no_formats(info_dict, msg=(
+ '--live-from-start is passed, but there are no formats that can be downloaded from the start. '
+ 'If you want to download from the current time, use --no-live-from-start'))
if not formats:
self.raise_no_formats(info_dict)
@@ -2292,6 +2542,10 @@ class YoutubeDL(object):
format['resolution'] = self.format_resolution(format, default=None)
if format.get('dynamic_range') is None and format.get('vcodec') != 'none':
format['dynamic_range'] = 'SDR'
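+ # tbr is in KBit/s: duration * tbr * (1024 / 8) approximates the filesize in bytes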
+ if (info_dict.get('duration') and format.get('tbr')
+ and not format.get('filesize') and not format.get('filesize_approx')):
+ format['filesize_approx'] = info_dict['duration'] * format['tbr'] * (1024 / 8)
+
# Add HTTP headers, so that external programs can use them from the
# json output
full_format_info = info_dict.copy()
@@ -2301,7 +2555,8 @@ class YoutubeDL(object):
if '__x_forwarded_for_ip' in info_dict:
del info_dict['__x_forwarded_for_ip']
- # TODO Central sorting goes here
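+ # Formats are listed worst to best; reverse so the best ones are checked (lazily) first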
+ if self.params.get('check_formats') is True:
+ formats = LazyList(self._check_formats(formats[::-1]), reverse=True)
if not formats or formats[0] is not info_dict:
# only set the 'formats' field if the original info_dict lists them
@@ -2312,20 +2567,27 @@ class YoutubeDL(object):
info_dict, _ = self.pre_process(info_dict)
+ if self._match_entry(info_dict, incomplete=self._format_fields) is not None:
+ return info_dict
+
+ self.post_extract(info_dict)
+ info_dict, _ = self.pre_process(info_dict, 'after_filter')
+
+ # The pre-processors may have modified the formats
+ formats = info_dict.get('formats', [info_dict])
+
+ list_only = self.params.get('simulate') is None and (
+ self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles'))
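+ # A format selector of '-' means formats are chosen interactively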
+ interactive_format_selection = not list_only and self.format_selector == '-'
if self.params.get('list_thumbnails'):
self.list_thumbnails(info_dict)
- if self.params.get('listformats'):
- if not info_dict.get('formats') and not info_dict.get('url'):
- self.to_screen('%s has no formats' % info_dict['id'])
- else:
- self.list_formats(info_dict)
if self.params.get('listsubtitles'):
if 'automatic_captions' in info_dict:
self.list_subtitles(
info_dict['id'], automatic_captions, 'automatic captions')
self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
- list_only = self.params.get('simulate') is None and (
- self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles'))
+ if self.params.get('listformats') or interactive_format_selection:
+ self.list_formats(info_dict)
if list_only:
# Without this printing, -F --print-json will not work
self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True)
@@ -2337,55 +2599,72 @@ class YoutubeDL(object):
self.write_debug('Default format spec: %s' % req_format)
format_selector = self.build_format_selector(req_format)
- # While in format selection we may need to have an access to the original
- # format set in order to calculate some metrics or do some processing.
- # For now we need to be able to guess whether original formats provided
- # by extractor are incomplete or not (i.e. whether extractor provides only
- # video-only or audio-only formats) for proper formats selection for
- # extractors with such incomplete formats (see
- # https://github.com/ytdl-org/youtube-dl/pull/5556).
- # Since formats may be filtered during format selection and may not match
- # the original formats the results may be incorrect. Thus original formats
- # or pre-calculated metrics should be passed to format selection routines
- # as well.
- # We will pass a context object containing all necessary additional data
- # instead of just formats.
- # This fixes incorrect format selection issue (see
- # https://github.com/ytdl-org/youtube-dl/issues/10083).
- incomplete_formats = (
- # All formats are video-only or
- all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
- # all formats are audio-only
- or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
-
- ctx = {
- 'formats': formats,
- 'incomplete_formats': incomplete_formats,
- }
+ while True:
+ if interactive_format_selection:
+ req_format = input(
+ self._format_screen('\nEnter format selector: ', self.Styles.EMPHASIS))
+ try:
+ format_selector = self.build_format_selector(req_format)
+ except SyntaxError as err:
+ self.report_error(err, tb=False, is_error=False)
+ continue
+
+ formats_to_download = list(format_selector({
+ 'formats': formats,
+ 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
+ 'incomplete_formats': (
+ # All formats are video-only or
+ all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
+ # all formats are audio-only
+ or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)),
+ }))
+ if interactive_format_selection and not formats_to_download:
+ self.report_error('Requested format is not available', tb=False, is_error=False)
+ continue
+ break
- formats_to_download = list(format_selector(ctx))
if not formats_to_download:
if not self.params.get('ignore_no_formats_error'):
- raise ExtractorError('Requested format is not available', expected=True,
- video_id=info_dict['id'], ie=info_dict['extractor'])
- else:
- self.report_warning('Requested format is not available')
- # Process what we can, even without any available formats.
- self.process_info(dict(info_dict))
- elif download:
- self.to_screen(
- '[info] %s: Downloading %d format(s): %s' % (
- info_dict['id'], len(formats_to_download),
- ", ".join([f['format_id'] for f in formats_to_download])))
- for fmt in formats_to_download:
- new_info = dict(info_dict)
- # Save a reference to the original info_dict so that it can be modified in process_info if needed
- new_info['__original_infodict'] = info_dict
+ raise ExtractorError(
+ 'Requested format is not available. Use --list-formats for a list of available formats',
+ expected=True, video_id=info_dict['id'], ie=info_dict['extractor'])
+ self.report_warning('Requested format is not available')
+ # Process what we can, even without any available formats.
+ formats_to_download = [{}]
+
+ best_format = formats_to_download[-1]
+ if download:
+ if best_format:
+ self.to_screen(
+ f'[info] {info_dict["id"]}: Downloading {len(formats_to_download)} format(s): '
+ + ', '.join([f['format_id'] for f in formats_to_download]))
+ max_downloads_reached = False
+ for i, fmt in enumerate(formats_to_download):
+ formats_to_download[i] = new_info = self._copy_infodict(info_dict)
new_info.update(fmt)
- self.process_info(new_info)
- # We update the info dict with the best quality format (backwards compatibility)
- if formats_to_download:
- info_dict.update(formats_to_download[-1])
+ try:
+ self.process_info(new_info)
+ except MaxDownloadsReached:
+ max_downloads_reached = True
+ # Remove copied info
+ for key, val in tuple(new_info.items()):
+ if info_dict.get(key) == val:
+ new_info.pop(key)
+ if max_downloads_reached:
+ break
+
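+ # Record in the archive only if every downloaded format agreed; 'ignore' entries abstain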
+ write_archive = set(f.get('__write_download_archive', False) for f in formats_to_download)
+ assert write_archive.issubset({True, False, 'ignore'})
+ if True in write_archive and False not in write_archive:
+ self.record_download_archive(info_dict)
+
+ info_dict['requested_downloads'] = formats_to_download
+ info_dict = self.run_all_pps('after_video', info_dict)
+ if max_downloads_reached:
+ raise MaxDownloadsReached()
+
+ # We update the info dict with the selected best quality format (backwards compatibility)
+ info_dict.update(best_format)
return info_dict
def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
@@ -2411,12 +2690,15 @@ class YoutubeDL(object):
# given in subtitleslangs. See https://github.com/hypervideo/hypervideo/issues/1041
requested_langs = []
for lang_re in self.params.get('subtitleslangs'):
- if lang_re == 'all':
- requested_langs.extend(all_sub_langs)
- continue
discard = lang_re[0] == '-'
if discard:
lang_re = lang_re[1:]
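+ # 'all' adds every available language, while '-all' resets the selection so far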
+ if lang_re == 'all':
+ if discard:
+ requested_langs = []
+ else:
+ requested_langs.extend(all_sub_langs)
+ continue
current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs)
if discard:
for lang in current_langs:
@@ -2456,6 +2738,34 @@ class YoutubeDL(object):
subs[lang] = f
return subs
+ def _forceprint(self, key, info_dict):
+ if info_dict is None:
+ return
+ info_copy = info_dict.copy()
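+ # Expose the rendered tables as pseudo-fields for use in print templates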
+ info_copy['formats_table'] = self.render_formats_table(info_dict)
+ info_copy['thumbnails_table'] = self.render_thumbnails_table(info_dict)
+ info_copy['subtitles_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('subtitles'))
+ info_copy['automatic_captions_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('automatic_captions'))
+
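+ # 'field=' expands to 'field = %(field)r', and a bare 'field' to '%(field)s'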
+ def format_tmpl(tmpl):
+ mobj = re.match(r'\w+(=?)$', tmpl)
+ if mobj and mobj.group(1):
+ return f'{tmpl[:-1]} = %({tmpl[:-1]})r'
+ elif mobj:
+ return f'%({tmpl})s'
+ return tmpl
+
+ for tmpl in self.params['forceprint'].get(key, []):
+ self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy))
+
+ for tmpl, file_tmpl in self.params['print_to_file'].get(key, []):
+ filename = self.prepare_filename(info_dict, outtmpl=file_tmpl)
+ tmpl = format_tmpl(tmpl)
+ self.to_screen(f'[info] Writing {tmpl!r} to: {filename}')
+ if self._ensure_dir_exists(filename):
+ with io.open(filename, 'a', encoding='utf-8') as f:
+ f.write(self.evaluate_outtmpl(tmpl, info_copy) + '\n')
+
def __forced_printings(self, info_dict, filename, incomplete):
def print_mandatory(field, actual_field=None):
if actual_field is None:
@@ -2475,18 +2785,14 @@ class YoutubeDL(object):
if info_dict.get('requested_formats') is not None:
# For RTMP URLs, also include the playpath
info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
- elif 'url' in info_dict:
+ elif info_dict.get('url'):
info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')
- if self.params.get('forceprint') or self.params.get('forcejson'):
+ if (self.params.get('forcejson')
+ or self.params['forceprint'].get('video')
+ or self.params['print_to_file'].get('video')):
self.post_extract(info_dict)
- for tmpl in self.params.get('forceprint', []):
- mobj = re.match(r'\w+(=?)$', tmpl)
- if mobj and mobj.group(1):
- tmpl = f'{tmpl[:-1]} = %({tmpl[:-1]})s'
- elif mobj:
- tmpl = '%({})s'.format(tmpl)
- self.to_stdout(self.evaluate_outtmpl(tmpl, info_dict))
+ self._forceprint('video', info_dict)
print_mandatory('title')
print_mandatory('id')
@@ -2524,33 +2830,44 @@ class YoutubeDL(object):
if not test:
for ph in self._progress_hooks:
fd.add_progress_hook(ph)
- urls = '", "'.join([f['url'] for f in info.get('requested_formats', [])] or [info['url']])
+ urls = '", "'.join(
+ (f['url'].split(',')[0] + ',<data>' if f['url'].startswith('data:') else f['url'])
+ for f in info.get('requested_formats', []) or [info])
self.write_debug('Invoking downloader on "%s"' % urls)
- new_info = copy.deepcopy(self._copy_infodict(info))
+ # Note: Ideally, info should be deep-copied so that hooks cannot modify it.
+ # But it may contain objects that are not deep-copyable
+ new_info = self._copy_infodict(info)
if new_info.get('http_headers') is None:
new_info['http_headers'] = self._calc_headers(new_info)
return fd.download(name, new_info, subtitle)
- def process_info(self, info_dict):
- """Process a single resolved IE result."""
+ def existing_file(self, filepaths, *, default_overwrite=True):
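+ # Return the first existing file; when overwriting, delete them all and return None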
+ existing_files = list(filter(os.path.exists, orderedSet(filepaths)))
+ if existing_files and not self.params.get('overwrites', default_overwrite):
+ return existing_files[0]
- assert info_dict.get('_type', 'video') == 'video'
+ for file in existing_files:
+ self.report_file_delete(file)
+ os.remove(file)
+ return None
- max_downloads = self.params.get('max_downloads')
- if max_downloads is not None:
- if self._num_downloads >= int(max_downloads):
- raise MaxDownloadsReached()
+ def process_info(self, info_dict):
+ """Process a single resolved IE result. (Modifies it in-place)"""
- # TODO: backward compatibility, to be removed
- info_dict['fulltitle'] = info_dict['title']
+ assert info_dict.get('_type', 'video') == 'video'
+ original_infodict = info_dict
if 'format' not in info_dict and 'ext' in info_dict:
info_dict['format'] = info_dict['ext']
+ # This is mostly just for backward compatibility of process_info
+ # As a side-effect, this allows for format-specific filters
if self._match_entry(info_dict) is not None:
+ info_dict['__write_download_archive'] = 'ignore'
return
+ # Does nothing under normal operation - for backward compatibility of process_info
self.post_extract(info_dict)
self._num_downloads += 1
@@ -2563,9 +2880,7 @@ class YoutubeDL(object):
self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))
if self.params.get('simulate'):
- if self.params.get('force_write_download_archive', False):
- self.record_download_archive(info_dict)
- # Do nothing else if in simulate mode
+ info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
return
if full_filename is None:
@@ -2593,6 +2908,8 @@ class YoutubeDL(object):
infofn = self.prepare_filename(info_dict, 'infojson')
_infojson_written = self._write_info_json('video', info_dict, infofn)
if _infojson_written:
+ info_dict['infojson_filename'] = infofn
+ # For backward compatibility, even though it was a private field
info_dict['__infojson_filename'] = infofn
elif _infojson_written is None:
return
@@ -2620,91 +2937,79 @@ class YoutubeDL(object):
return
# Write internet shortcut files
- url_link = webloc_link = desktop_link = False
- if self.params.get('writelink', False):
- if sys.platform == "darwin": # macOS.
- webloc_link = True
- elif sys.platform.startswith("linux"):
- desktop_link = True
- else: # if sys.platform in ['win32', 'cygwin']:
- url_link = True
- if self.params.get('writeurllink', False):
- url_link = True
- if self.params.get('writewebloclink', False):
- webloc_link = True
- if self.params.get('writedesktoplink', False):
- desktop_link = True
-
- if url_link or webloc_link or desktop_link:
- if 'webpage_url' not in info_dict:
- self.report_error('Cannot write internet shortcut file because the "webpage_url" field is missing in the media information')
- return
- ascii_url = iri_to_uri(info_dict['webpage_url'])
-
- def _write_link_file(extension, template, newline, embed_filename):
- linkfn = replace_extension(full_filename, extension, info_dict.get('ext'))
+ def _write_link_file(link_type):
+ url = try_get(info_dict['webpage_url'], iri_to_uri)
+ if not url:
+ self.report_warning(
+ f'Cannot write internet shortcut file because the actual URL of "{info_dict["webpage_url"]}" is unknown')
+ return True
+ linkfn = replace_extension(self.prepare_filename(info_dict, 'link'), link_type, info_dict.get('ext'))
+ if not self._ensure_dir_exists(encodeFilename(linkfn)):
+ return False
if self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)):
- self.to_screen('[info] Internet shortcut is already present')
- else:
- try:
- self.to_screen('[info] Writing internet shortcut to: ' + linkfn)
- with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', newline=newline) as linkfile:
- template_vars = {'url': ascii_url}
- if embed_filename:
- template_vars['filename'] = linkfn[:-(len(extension) + 1)]
- linkfile.write(template % template_vars)
- except (OSError, IOError):
- self.report_error('Cannot write internet shortcut ' + linkfn)
- return False
+ self.to_screen(f'[info] Internet shortcut (.{link_type}) is already present')
+ return True
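+ # Windows .url shortcuts need CRLF line endings; .webloc and .desktop use LF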
+ try:
+ self.to_screen(f'[info] Writing internet shortcut (.{link_type}) to: {linkfn}')
+ with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8',
+ newline='\r\n' if link_type == 'url' else '\n') as linkfile:
+ template_vars = {'url': url}
+ if link_type == 'desktop':
+ template_vars['filename'] = linkfn[:-(len(link_type) + 1)]
+ linkfile.write(LINK_TEMPLATES[link_type] % template_vars)
+ except (OSError, IOError):
+ self.report_error(f'Cannot write internet shortcut {linkfn}')
+ return False
return True
- if url_link:
- if not _write_link_file('url', DOT_URL_LINK_TEMPLATE, '\r\n', embed_filename=False):
- return
- if webloc_link:
- if not _write_link_file('webloc', DOT_WEBLOC_LINK_TEMPLATE, '\n', embed_filename=False):
- return
- if desktop_link:
- if not _write_link_file('desktop', DOT_DESKTOP_LINK_TEMPLATE, '\n', embed_filename=True):
+ write_links = {
+ 'url': self.params.get('writeurllink'),
+ 'webloc': self.params.get('writewebloclink'),
+ 'desktop': self.params.get('writedesktoplink'),
+ }
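+ # --write-link picks the shortcut type matching the current platform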
+ if self.params.get('writelink'):
+ link_type = ('webloc' if sys.platform == 'darwin'
+ else 'desktop' if sys.platform.startswith('linux')
+ else 'url')
+ write_links[link_type] = True
+
+ if any(should_write and not _write_link_file(link_type)
+ for link_type, should_write in write_links.items()):
+ return
+
+ def replace_info_dict(new_info):
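+ # Replace the contents in-place so existing references to info_dict stay valid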
+ nonlocal info_dict
+ if new_info == info_dict:
return
+ info_dict.clear()
+ info_dict.update(new_info)
try:
- info_dict, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
+ new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
+ replace_info_dict(new_info)
except PostProcessingError as err:
self.report_error('Preprocessing: %s' % str(err))
return
- must_record_download_archive = False
- if self.params.get('skip_download', False):
+ if self.params.get('skip_download'):
info_dict['filepath'] = temp_filename
info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
info_dict['__files_to_move'] = files_to_move
- info_dict = self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict)
+ replace_info_dict(self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict))
+ info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
else:
# Download
info_dict.setdefault('__postprocessors', [])
try:
- def existing_file(*filepaths):
+ def existing_video_file(*filepaths):
ext = info_dict.get('ext')
- final_ext = self.params.get('final_ext', ext)
- existing_files = []
- for file in orderedSet(filepaths):
- if final_ext != ext:
- converted = replace_extension(file, final_ext, ext)
- if os.path.exists(encodeFilename(converted)):
- existing_files.append(converted)
- if os.path.exists(encodeFilename(file)):
- existing_files.append(file)
-
- if not existing_files or self.params.get('overwrites', False):
- for file in orderedSet(existing_files):
- self.report_file_delete(file)
- os.remove(encodeFilename(file))
- return None
-
- info_dict['ext'] = os.path.splitext(existing_files[0])[1][1:]
- return existing_files[0]
+ converted = lambda file: replace_extension(file, self.params.get('final_ext') or ext, ext)
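+ # Interleave converted (final_ext) and original names so converted files are found first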
+ file = self.existing_file(itertools.chain(*zip(map(converted, filepaths), filepaths)),
+ default_overwrite=False)
+ if file:
+ info_dict['ext'] = os.path.splitext(file)[1][1:]
+ return file
success = True
if info_dict.get('requested_formats') is not None:
@@ -2758,30 +3063,39 @@ class YoutubeDL(object):
# Ensure filename always has a correct extension for successful merge
full_filename = correct_ext(full_filename)
temp_filename = correct_ext(temp_filename)
- dl_filename = existing_file(full_filename, temp_filename)
+ dl_filename = existing_video_file(full_filename, temp_filename)
info_dict['__real_download'] = False
+ downloaded = []
+ merger = FFmpegMergerPP(self)
+
+ fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
if dl_filename is not None:
self.report_file_already_downloaded(dl_filename)
- elif get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-'):
+ elif fd:
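+ # FFmpegFD downloads and merges all formats in one step; other downloaders need one temp file per format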
+ for f in requested_formats if fd != FFmpegFD else []:
+ f['filepath'] = fname = prepend_extension(
+ correct_ext(temp_filename, info_dict['ext']),
+ 'f%s' % f['format_id'], info_dict['ext'])
+ downloaded.append(fname)
info_dict['url'] = '\n'.join(f['url'] for f in requested_formats)
success, real_download = self.dl(temp_filename, info_dict)
info_dict['__real_download'] = real_download
else:
- downloaded = []
- merger = FFmpegMergerPP(self)
if self.params.get('allow_unplayable_formats'):
self.report_warning(
'You have requested merging of multiple formats '
'while also allowing unplayable formats to be downloaded. '
'The formats won\'t be merged to prevent data corruption.')
elif not merger.available:
- self.report_warning(
- 'You have requested merging of multiple formats but ffmpeg is not installed. '
- 'The formats won\'t be merged.')
+ msg = 'You have requested merging of multiple formats but ffmpeg is not installed'
+ if not self.params.get('ignoreerrors'):
+ self.report_error(f'{msg}. Aborting due to --abort-on-error')
+ return
+ self.report_warning(f'{msg}. The formats won\'t be merged')
if temp_filename == '-':
- reason = ('using a downloader other than ffmpeg' if FFmpegFD.can_merge_formats(info_dict)
+ reason = ('using a downloader other than ffmpeg' if FFmpegFD.can_merge_formats(info_dict, self.params)
else 'but the formats are incompatible for simultaneous download' if merger.available
else 'but ffmpeg is not installed')
self.report_warning(
@@ -2803,17 +3117,18 @@ class YoutubeDL(object):
partial_success, real_download = self.dl(fname, new_info)
info_dict['__real_download'] = info_dict['__real_download'] or real_download
success = success and partial_success
- if merger.available and not self.params.get('allow_unplayable_formats'):
- info_dict['__postprocessors'].append(merger)
- info_dict['__files_to_merge'] = downloaded
- # Even if there were no downloads, it is being merged only now
- info_dict['__real_download'] = True
- else:
- for file in downloaded:
- files_to_move[file] = None
+
+ if downloaded and merger.available and not self.params.get('allow_unplayable_formats'):
+ info_dict['__postprocessors'].append(merger)
+ info_dict['__files_to_merge'] = downloaded
+ # Even if nothing was downloaded, the merge itself only happens now
+ info_dict['__real_download'] = True
+ else:
+ for file in downloaded:
+ files_to_move[file] = None
else:
# Just a single file
- dl_filename = existing_file(full_filename, temp_filename)
+ dl_filename = existing_video_file(full_filename, temp_filename)
if dl_filename is None or dl_filename == temp_filename:
# dl_filename == temp_filename could mean that the file was partially downloaded with --no-part.
# So we should try to resume the download
@@ -2877,14 +3192,20 @@ class YoutubeDL(object):
downloader = get_suitable_downloader(info_dict, self.params) if 'protocol' in info_dict else None
downloader = downloader.__name__ if downloader else None
- ffmpeg_fixup(info_dict.get('requested_formats') is None and downloader == 'HlsFD',
- 'malformed AAC bitstream detected', FFmpegFixupM3u8PP)
- ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed timestamps detected', FFmpegFixupTimestampPP)
- ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed duration detected', FFmpegFixupDurationPP)
+
+ if info_dict.get('requested_formats') is None: # Not necessary when merging formats
+ ffmpeg_fixup(downloader == 'HlsFD',
+ 'Possible MPEG-TS in MP4 container or malformed AAC timestamps',
+ FFmpegFixupM3u8PP)
+ ffmpeg_fixup(info_dict.get('is_live') and downloader == 'DashSegmentsFD',
+ 'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP)
+
+ ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'Malformed timestamps detected', FFmpegFixupTimestampPP)
+ ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'Malformed duration detected', FFmpegFixupDurationPP)
fixup()
try:
- info_dict = self.post_process(dl_filename, info_dict, files_to_move)
+ replace_info_dict(self.post_process(dl_filename, info_dict, files_to_move))
except PostProcessingError as err:
self.report_error('Postprocessing: %s' % str(err))
return
@@ -2894,16 +3215,41 @@ class YoutubeDL(object):
except Exception as err:
self.report_error('post hooks: %s' % str(err))
return
- must_record_download_archive = True
+ info_dict['__write_download_archive'] = True
+
+ if self.params.get('force_write_download_archive'):
+ info_dict['__write_download_archive'] = True
+
+ # Make sure the info_dict was modified in-place
+ assert info_dict is original_infodict
- if must_record_download_archive or self.params.get('force_write_download_archive', False):
- self.record_download_archive(info_dict)
max_downloads = self.params.get('max_downloads')
if max_downloads is not None and self._num_downloads >= int(max_downloads):
raise MaxDownloadsReached()
+ def __download_wrapper(self, func):
+ @functools.wraps(func)
+ def wrapper(*args, **kwargs):
+ try:
+ res = func(*args, **kwargs)
+ except UnavailableVideoError as e:
+ self.report_error(e)
+ except MaxDownloadsReached as e:
+ self.to_screen(f'[info] {e}')
+ raise
+ except DownloadCancelled as e:
+ self.to_screen(f'[info] {e}')
+ if not self.params.get('break_per_url'):
+ raise
+ else:
+ if self.params.get('dump_single_json', False):
+ self.post_extract(res)
+ self.to_stdout(json.dumps(self.sanitize_info(res)))
+ return wrapper
+
def download(self, url_list):
"""Download a given list of URLs."""
+ url_list = variadic(url_list) # Passing a single URL is a common mistake
outtmpl = self.outtmpl_dict['default']
if (len(url_list) > 1
and outtmpl != '-'
@@ -2912,25 +3258,8 @@ class YoutubeDL(object):
raise SameFileError(outtmpl)
for url in url_list:
- try:
- # It also downloads the videos
- res = self.extract_info(
- url, force_generic_extractor=self.params.get('force_generic_extractor', False))
- except UnavailableVideoError:
- self.report_error('unable to download video')
- except MaxDownloadsReached:
- self.to_screen('[info] Maximum number of downloads reached')
- raise
- except ExistingVideoReached:
- self.to_screen('[info] Encountered a video that is already in the archive, stopping due to --break-on-existing')
- raise
- except RejectedVideoReached:
- self.to_screen('[info] Encountered a video that did not match filter, stopping due to --break-on-reject')
- raise
- else:
- if self.params.get('dump_single_json', False):
- self.post_extract(res)
- self.to_stdout(json.dumps(self.sanitize_info(res)))
+ self.__download_wrapper(self.extract_info)(
+ url, force_generic_extractor=self.params.get('force_generic_extractor', False))
return self._download_retcode
@@ -2941,11 +3270,13 @@ class YoutubeDL(object):
# FileInput doesn't have a read method, we can't call json.load
info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True))
try:
- self.process_ie_result(info, download=True)
- except (DownloadError, EntryNotInPlaylist, ThrottledDownload):
+ self.__download_wrapper(self.process_ie_result)(info, download=True)
+ except (DownloadError, EntryNotInPlaylist, ReExtractInfo) as e:
+ if not isinstance(e, EntryNotInPlaylist):
+ self.to_stderr('\r')
webpage_url = info.get('webpage_url')
if webpage_url is not None:
- self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
+ self.report_warning(f'The info failed to download: {e}; trying with URL {webpage_url}')
return self.download([webpage_url])
else:
raise
@@ -2957,22 +3288,26 @@ class YoutubeDL(object):
if info_dict is None:
return info_dict
info_dict.setdefault('epoch', int(time.time()))
- remove_keys = {'__original_infodict'} # Always remove this since this may contain a copy of the entire dict
- keep_keys = ['_type'], # Always keep this to facilitate load-info-json
+ info_dict.setdefault('_type', 'video')
+
if remove_private_keys:
- remove_keys |= {
- 'requested_formats', 'requested_subtitles', 'requested_entries',
- 'filepath', 'entries', 'original_url', 'playlist_autonumber',
+ reject = lambda k, v: v is None or (k.startswith('_') and k != '_type') or k in {
+ 'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries',
+ 'entries', 'filepath', 'infojson_filename', 'original_url', 'playlist_autonumber',
}
- empty_values = (None, {}, [], set(), tuple())
- reject = lambda k, v: k not in keep_keys and (
- k.startswith('_') or k in remove_keys or v in empty_values)
else:
- reject = lambda k, v: k in remove_keys
- filter_fn = lambda obj: (
- list(map(filter_fn, obj)) if isinstance(obj, (LazyList, list, tuple, set))
- else obj if not isinstance(obj, dict)
- else dict((k, filter_fn(v)) for k, v in obj.items() if not reject(k, v)))
+ reject = lambda k, v: False
+
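+ # Coerce values to JSON-serializable types; anything exotic is stringified with repr()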
+ def filter_fn(obj):
+ if isinstance(obj, dict):
+ return {k: filter_fn(v) for k, v in obj.items() if not reject(k, v)}
+ elif isinstance(obj, (list, tuple, set, LazyList)):
+ return list(map(filter_fn, obj))
+ elif obj is None or isinstance(obj, (str, int, float, bool)):
+ return obj
+ else:
+ return repr(obj)
+
return filter_fn(info_dict)
@staticmethod
@@ -2980,6 +3315,19 @@ class YoutubeDL(object):
''' Alias of sanitize_info for backward compatibility '''
return YoutubeDL.sanitize_info(info_dict, actually_filter)
+ @staticmethod
+ def post_extract(info_dict):
+ def actual_post_extract(info_dict):
+ if info_dict.get('_type') in ('playlist', 'multi_video'):
+ for video_dict in info_dict.get('entries', {}):
+ actual_post_extract(video_dict or {})
+ return
+
+ post_extractor = info_dict.pop('__post_extractor', None) or (lambda: {})
+ info_dict.update(post_extractor())
+
+ actual_post_extract(info_dict or {})
+
def run_pp(self, pp, infodict):
files_to_delete = []
if '__files_to_move' not in infodict:
@@ -3009,45 +3357,26 @@ class YoutubeDL(object):
del infodict['__files_to_move'][old_filename]
return infodict
- @staticmethod
- def post_extract(info_dict):
- def actual_post_extract(info_dict):
- if info_dict.get('_type') in ('playlist', 'multi_video'):
- for video_dict in info_dict.get('entries', {}):
- actual_post_extract(video_dict or {})
- return
-
- post_extractor = info_dict.get('__post_extractor') or (lambda: {})
- extra = post_extractor().items()
- info_dict.update(extra)
- info_dict.pop('__post_extractor', None)
-
- original_infodict = info_dict.get('__original_infodict') or {}
- original_infodict.update(extra)
- original_infodict.pop('__post_extractor', None)
-
- actual_post_extract(info_dict or {})
+ def run_all_pps(self, key, info, *, additional_pps=None):
+ self._forceprint(key, info)
+ for pp in (additional_pps or []) + self._pps[key]:
+ info = self.run_pp(pp, info)
+ return info
def pre_process(self, ie_info, key='pre_process', files_to_move=None):
info = dict(ie_info)
info['__files_to_move'] = files_to_move or {}
- for pp in self._pps[key]:
- info = self.run_pp(pp, info)
+ info = self.run_all_pps(key, info)
return info, info.pop('__files_to_move', None)
- def post_process(self, filename, ie_info, files_to_move=None):
+ def post_process(self, filename, info, files_to_move=None):
"""Run all the postprocessors on the given file."""
- info = dict(ie_info)
info['filepath'] = filename
info['__files_to_move'] = files_to_move or {}
-
- for pp in ie_info.get('__postprocessors', []) + self._pps['post_process']:
- info = self.run_pp(pp, info)
+ info = self.run_all_pps('post_process', info, additional_pps=info.get('__postprocessors'))
info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
del info['__files_to_move']
- for pp in self._pps['after_move']:
- info = self.run_pp(pp, info)
- return info
+ return self.run_all_pps('after_move', info)
def _make_archive_id(self, info_dict):
video_id = info_dict.get('id')
@@ -3086,41 +3415,46 @@ class YoutubeDL(object):
return
vid_id = self._make_archive_id(info_dict)
assert vid_id
+ self.write_debug(f'Adding to archive: {vid_id}')
with locked_file(fn, 'a', encoding='utf-8') as archive_file:
archive_file.write(vid_id + '\n')
self.archive.add(vid_id)
@staticmethod
def format_resolution(format, default='unknown'):
- is_images = format.get('vcodec') == 'none' and format.get('acodec') == 'none'
if format.get('vcodec') == 'none' and format.get('acodec') != 'none':
return 'audio only'
if format.get('resolution') is not None:
return format['resolution']
if format.get('width') and format.get('height'):
- res = '%dx%d' % (format['width'], format['height'])
+ return '%dx%d' % (format['width'], format['height'])
elif format.get('height'):
- res = '%sp' % format['height']
+ return '%sp' % format['height']
elif format.get('width'):
- res = '%dx?' % format['width']
- elif is_images:
- return 'images'
- else:
- return default
- return f'{res} images' if is_images else res
+ return '%dx?' % format['width']
+ return default
+
+ def _list_format_headers(self, *headers):
+ if self.params.get('listformats_table', True) is not False:
+ return [self._format_screen(header, self.Styles.HEADERS) for header in headers]
+ return headers
def _format_note(self, fdict):
res = ''
if fdict.get('ext') in ['f4f', 'f4m']:
- res += '(unsupported) '
+ res += '(unsupported)'
if fdict.get('language'):
if res:
res += ' '
- res += '[%s] ' % fdict['language']
+ res += '[%s]' % fdict['language']
if fdict.get('format_note') is not None:
- res += fdict['format_note'] + ' '
+ if res:
+ res += ' '
+ res += fdict['format_note']
if fdict.get('tbr') is not None:
- res += '%4dk ' % fdict['tbr']
+ if res:
+ res += ', '
+ res += '%4dk' % fdict['tbr']
if fdict.get('container') is not None:
if res:
res += ', '
@@ -3165,83 +3499,97 @@ class YoutubeDL(object):
res += '~' + format_bytes(fdict['filesize_approx'])
return res
- def list_formats(self, info_dict):
+ def render_formats_table(self, info_dict):
+ if not info_dict.get('formats') and not info_dict.get('url'):
+ return None
+
formats = info_dict.get('formats', [info_dict])
- new_format = (
- 'list-formats' not in self.params.get('compat_opts', [])
- and self.params.get('listformats_table', True) is not False)
- if new_format:
+ if self.params.get('listformats_table', True) is False:
table = [
[
format_field(f, 'format_id'),
format_field(f, 'ext'),
self.format_resolution(f),
- format_field(f, 'fps', '%d'),
- format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''),
- '|',
- format_field(f, 'filesize', ' %s', func=format_bytes) + format_field(f, 'filesize_approx', '~%s', func=format_bytes),
- format_field(f, 'tbr', '%4dk'),
- shorten_protocol_name(f.get('protocol', '').replace("native", "n")),
- '|',
- format_field(f, 'vcodec', default='unknown').replace('none', ''),
- format_field(f, 'vbr', '%4dk'),
- format_field(f, 'acodec', default='unknown').replace('none', ''),
- format_field(f, 'abr', '%3dk'),
- format_field(f, 'asr', '%5dHz'),
- ', '.join(filter(None, (
- 'UNSUPPORTED' if f.get('ext') in ('f4f', 'f4m') else '',
- format_field(f, 'language', '[%s]'),
- format_field(f, 'format_note'),
- format_field(f, 'container', ignore=(None, f.get('ext'))),
- ))),
+ self._format_note(f)
] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
- header_line = ['ID', 'EXT', 'RESOLUTION', 'FPS', 'HDR', '|', ' FILESIZE', ' TBR', 'PROTO',
- '|', 'VCODEC', ' VBR', 'ACODEC', ' ABR', ' ASR', 'MORE INFO']
- else:
- table = [
- [
- format_field(f, 'format_id'),
- format_field(f, 'ext'),
- self.format_resolution(f),
- self._format_note(f)]
- for f in formats
- if f.get('preference') is None or f['preference'] >= -1000]
- header_line = ['format code', 'extension', 'resolution', 'note']
-
- self.to_screen(
- '[info] Available formats for %s:' % info_dict['id'])
- self.to_stdout(render_table(
- header_line, table, delim=new_format, extraGap=(0 if new_format else 1), hideEmpty=new_format))
-
- def list_thumbnails(self, info_dict):
- thumbnails = list(info_dict.get('thumbnails'))
+ return render_table(['format code', 'extension', 'resolution', 'note'], table, extra_gap=1)
+
+ delim = self._format_screen('\u2502', self.Styles.DELIM, '|', test_encoding=True)
+ table = [
+ [
+ self._format_screen(format_field(f, 'format_id'), self.Styles.ID),
+ format_field(f, 'ext'),
+ format_field(f, func=self.format_resolution, ignore=('audio only', 'images')),
+ format_field(f, 'fps', '\t%d'),
+ format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''),
+ delim,
+ format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes),
+ format_field(f, 'tbr', '\t%dk'),
+ shorten_protocol_name(f.get('protocol', '')),
+ delim,
+ format_field(f, 'vcodec', default='unknown').replace(
+ 'none', 'images' if f.get('acodec') == 'none'
+ else self._format_screen('audio only', self.Styles.SUPPRESS)),
+ format_field(f, 'vbr', '\t%dk'),
+ format_field(f, 'acodec', default='unknown').replace(
+ 'none', '' if f.get('vcodec') == 'none'
+ else self._format_screen('video only', self.Styles.SUPPRESS)),
+ format_field(f, 'abr', '\t%dk'),
+ format_field(f, 'asr', '\t%dHz'),
+ join_nonempty(
+ self._format_screen('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None,
+ format_field(f, 'language', '[%s]'),
+ join_nonempty(format_field(f, 'format_note'),
+ format_field(f, 'container', ignore=(None, f.get('ext'))),
+ delim=', '),
+ delim=' '),
+ ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
+ header_line = self._list_format_headers(
+ 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', delim, '\tFILESIZE', '\tTBR', 'PROTO',
+ delim, 'VCODEC', '\tVBR', 'ACODEC', '\tABR', '\tASR', 'MORE INFO')
+
+ return render_table(
+ header_line, table, hide_empty=True,
+ delim=self._format_screen('\u2500', self.Styles.DELIM, '-', test_encoding=True))
+
+ def render_thumbnails_table(self, info_dict):
+ thumbnails = list(info_dict.get('thumbnails') or [])
if not thumbnails:
- self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
- return
-
- self.to_screen(
- '[info] Thumbnails for %s:' % info_dict['id'])
- self.to_stdout(render_table(
- ['ID', 'width', 'height', 'URL'],
- [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
-
- def list_subtitles(self, video_id, subtitles, name='subtitles'):
- if not subtitles:
- self.to_screen('%s has no %s' % (video_id, name))
- return
- self.to_screen(
- 'Available %s for %s:' % (name, video_id))
+ return None
+ return render_table(
+ self._list_format_headers('ID', 'Width', 'Height', 'URL'),
+ [[t.get('id'), t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])
+ def render_subtitles_table(self, video_id, subtitles):
def _row(lang, formats):
exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats)))
if len(set(names)) == 1:
names = [] if names[0] == 'unknown' else names[:1]
return [lang, ', '.join(names), ', '.join(exts)]
- self.to_stdout(render_table(
- ['Language', 'Name', 'Formats'],
+ if not subtitles:
+ return None
+ return render_table(
+ self._list_format_headers('Language', 'Name', 'Formats'),
[_row(lang, formats) for lang, formats in subtitles.items()],
- hideEmpty=True))
+ hide_empty=True)
+
+ def __list_table(self, video_id, name, func, *args):
+ table = func(*args)
+ if not table:
+ self.to_screen(f'{video_id} has no {name}')
+ return
+ self.to_screen(f'[info] Available {name} for {video_id}:')
+ self.to_stdout(table)
+
+ def list_formats(self, info_dict):
+ self.__list_table(info_dict['id'], 'formats', self.render_formats_table, info_dict)
+
+ def list_thumbnails(self, info_dict):
+ self.__list_table(info_dict['id'], 'thumbnails', self.render_thumbnails_table, info_dict)
+
+ def list_subtitles(self, video_id, subtitles, name='subtitles'):
+ self.__list_table(video_id, name, self.render_subtitles_table, video_id, subtitles)
def urlopen(self, req):
""" Start an HTTP download """
@@ -3252,45 +3600,61 @@ class YoutubeDL(object):
def print_debug_header(self):
if not self.params.get('verbose'):
return
- get_encoding = lambda stream: getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__)
- encoding_str = (
- '[debug] Encodings: locale %s, fs %s, stdout %s, stderr %s, pref %s\n' % (
- locale.getpreferredencoding(),
- sys.getfilesystemencoding(),
- get_encoding(self._screen_file), get_encoding(self._err_file),
- self.get_encoding()))
+
+ def get_encoding(stream):
+ ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__))
+ if not supports_terminal_sequences(stream):
+ from .compat import WINDOWS_VT_MODE
+ ret += ' (No VT)' if WINDOWS_VT_MODE is False else ' (No ANSI)'
+ return ret
+
+ encoding_str = 'Encodings: locale %s, fs %s, out %s, err %s, pref %s' % (
+ locale.getpreferredencoding(),
+ sys.getfilesystemencoding(),
+ get_encoding(self._out_files['screen']), get_encoding(self._out_files['error']),
+ self.get_encoding())
logger = self.params.get('logger')
if logger:
write_debug = lambda msg: logger.debug(f'[debug] {msg}')
write_debug(encoding_str)
else:
- write_debug = lambda msg: self._write_string(f'[debug] {msg}')
- write_string(encoding_str, encoding=None)
-
- write_debug('hypervideo version %s%s\n' % (__version__, '' if source == 'unknown' else f' ({source})'))
- if _LAZY_LOADER:
- write_debug('Lazy loading extractors enabled\n')
+ write_string(f'[debug] {encoding_str}\n', encoding=None)
+ write_debug = lambda msg: self._write_string(f'[debug] {msg}\n')
+
+ source = detect_variant()
+ write_debug(join_nonempty(
+ 'hypervideo version', __version__,
+ f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '',
+ '' if source == 'unknown' else f'({source})',
+ delim=' '))
+ if not _LAZY_LOADER:
+ if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'):
+ write_debug('Lazy loading extractors is forcibly disabled')
+ else:
+ write_debug('Lazy loading extractors is disabled')
if plugin_extractors or plugin_postprocessors:
- write_debug('Plugins: %s\n' % [
+ write_debug('Plugins: %s' % [
'%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}')
for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())])
if self.params.get('compat_opts'):
- write_debug('Compatibility options: %s\n' % ', '.join(self.params.get('compat_opts')))
- try:
- sp = subprocess.Popen(
- ['git', 'rev-parse', '--short', 'HEAD'],
- stdout=subprocess.PIPE, stderr=subprocess.PIPE,
- cwd=os.path.dirname(os.path.abspath(__file__)))
- out, err = process_communicate_or_kill(sp)
- out = out.decode().strip()
- if re.match('[0-9a-f]+', out):
- write_debug('Git HEAD: %s\n' % out)
- except Exception:
+ write_debug('Compatibility options: %s' % ', '.join(self.params.get('compat_opts')))
+
+ if source == 'source':
try:
- sys.exc_clear()
+ sp = Popen(
+ ['git', 'rev-parse', '--short', 'HEAD'],
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+ cwd=os.path.dirname(os.path.abspath(__file__)))
+ out, err = sp.communicate_or_kill()
+ out = out.decode().strip()
+ if re.match('[0-9a-f]+', out):
+ write_debug('Git HEAD: %s' % out)
except Exception:
- pass
+ try:
+ sys.exc_clear()
+ except Exception:
+ pass
def python_implementation():
impl_name = platform.python_implementation()
@@ -3298,46 +3662,49 @@ class YoutubeDL(object):
return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
return impl_name
- write_debug('Python version %s (%s %s) - %s\n' % (
+ write_debug('Python version %s (%s %s) - %s' % (
platform.python_version(),
python_implementation(),
platform.architecture()[0],
platform_name()))
- exe_versions = FFmpegPostProcessor.get_versions(self)
+ exe_versions, ffmpeg_features = FFmpegPostProcessor.get_versions_and_features(self)
+ ffmpeg_features = {key for key, val in ffmpeg_features.items() if val}
+ if ffmpeg_features:
+ exe_versions['ffmpeg'] += ' (%s)' % ','.join(ffmpeg_features)
+
exe_versions['rtmpdump'] = rtmpdump_version()
exe_versions['phantomjs'] = PhantomJSwrapper._version()
exe_str = ', '.join(
f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v
) or 'none'
- write_debug('exe versions: %s\n' % exe_str)
+ write_debug('exe versions: %s' % exe_str)
from .downloader.websocket import has_websockets
from .postprocessor.embedthumbnail import has_mutagen
- from .cookies import SQLITE_AVAILABLE, KEYRING_AVAILABLE
+ from .cookies import SQLITE_AVAILABLE, SECRETSTORAGE_AVAILABLE
- lib_str = ', '.join(sorted(filter(None, (
+ lib_str = join_nonempty(
+ compat_brotli and compat_brotli.__name__,
+ has_certifi and 'certifi',
compat_pycrypto_AES and compat_pycrypto_AES.__name__.split('.')[0],
- has_websockets and 'websockets',
+ SECRETSTORAGE_AVAILABLE and 'secretstorage',
has_mutagen and 'mutagen',
SQLITE_AVAILABLE and 'sqlite',
- KEYRING_AVAILABLE and 'keyring',
- )))) or 'none'
- write_debug('Optional libraries: %s\n' % lib_str)
- write_debug('ANSI escape support: stdout = %s, stderr = %s\n' % (
- supports_terminal_sequences(self._screen_file),
- supports_terminal_sequences(self._err_file)))
+ has_websockets and 'websockets',
+ delim=', ') or 'none'
+ write_debug('Optional libraries: %s' % lib_str)
proxy_map = {}
for handler in self._opener.handlers:
if hasattr(handler, 'proxies'):
proxy_map.update(handler.proxies)
- write_debug('Proxy map: ' + compat_str(proxy_map) + '\n')
+ write_debug(f'Proxy map: {proxy_map}')
- if self.params.get('call_home', False):
+ # Not implemented
+ if False and self.params.get('call_home'):
ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
- write_debug('Public IP address: %s\n' % ipaddr)
- return
+ write_debug('Public IP address: %s' % ipaddr)
latest_version = self.urlopen(
'https://yt-dl.org/latest/version').read().decode('utf-8')
if version_tuple(latest_version) > version_tuple(__version__):
@@ -3410,8 +3777,10 @@ class YoutubeDL(object):
encoding = preferredencoding()
return encoding
- def _write_info_json(self, label, ie_result, infofn):
- ''' Write infojson and returns True = written, False = skip, None = error '''
+ def _write_info_json(self, label, ie_result, infofn, overwrite=None):
+ ''' Write infojson and return True = written, 'exists' = already exists, False = skip, None = error '''
+ if overwrite is None:
+ overwrite = self.params.get('overwrites', True)
if not self.params.get('writeinfojson'):
return False
elif not infofn:
@@ -3419,16 +3788,17 @@ class YoutubeDL(object):
return False
elif not self._ensure_dir_exists(infofn):
return None
- elif not self.params.get('overwrites', True) and os.path.exists(infofn):
+ elif not overwrite and os.path.exists(infofn):
self.to_screen(f'[info] {label.title()} metadata is already present')
- else:
- self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}')
- try:
- write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn)
- except (OSError, IOError):
- self.report_error(f'Cannot write {label} metadata to JSON file {infofn}')
- return None
- return True
+ return 'exists'
+
+ self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}')
+ try:
+ write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn)
+ return True
+ except (OSError, IOError):
+ self.report_error(f'Cannot write {label} metadata to JSON file {infofn}')
+ return None
def _write_description(self, label, ie_result, descfn):
''' Write description and return True = written, False = skip, None = error '''
@@ -3471,10 +3841,11 @@ class YoutubeDL(object):
sub_format = sub_info['ext']
sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext'))
- if not self.params.get('overwrites', True) and os.path.exists(sub_filename):
+ existing_sub = self.existing_file((sub_filename_final, sub_filename))
+ if existing_sub:
self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present')
- sub_info['filepath'] = sub_filename
- ret.append((sub_filename, sub_filename_final))
+ sub_info['filepath'] = existing_sub
+ ret.append((existing_sub, sub_filename_final))
continue
self.to_screen(f'[info] Writing video subtitles to: {sub_filename}')
@@ -3497,9 +3868,13 @@ class YoutubeDL(object):
self.dl(sub_filename, sub_copy, subtitle=True)
sub_info['filepath'] = sub_filename
ret.append((sub_filename, sub_filename_final))
- except (ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
- self.report_warning(f'Unable to download video subtitles for {sub_lang!r}: {err}')
- continue
+ except (DownloadError, ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
+ msg = f'Unable to download video subtitles for {sub_lang!r}: {err}'
+ if self.params.get('ignoreerrors') is not True: # False or 'only_download'
+ if not self.params.get('ignoreerrors'):
+ self.report_error(msg)
+ raise DownloadError(msg)
+ self.report_warning(msg)
return ret
def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None):
@@ -3516,26 +3891,29 @@ class YoutubeDL(object):
self.write_debug(f'Skipping writing {label} thumbnail')
return ret
- for t in thumbnails[::-1]:
+ for idx, t in list(enumerate(thumbnails))[::-1]:
thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg')
- thumb_display_id = f'{label} thumbnail' + (f' {t["id"]}' if multiple else '')
+ thumb_display_id = f'{label} thumbnail {t["id"]}'
thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))
- if not self.params.get('overwrites', True) and os.path.exists(thumb_filename):
- ret.append((thumb_filename, thumb_filename_final))
- t['filepath'] = thumb_filename
- self.to_screen(f'[info] {thumb_display_id.title()} is already present')
+ existing_thumb = self.existing_file((thumb_filename_final, thumb_filename))
+ if existing_thumb:
+ self.to_screen('[info] %s is already present' % (
+ thumb_display_id if multiple else f'{label} thumbnail').capitalize())
+ t['filepath'] = existing_thumb
+ ret.append((existing_thumb, thumb_filename_final))
else:
self.to_screen(f'[info] Downloading {thumb_display_id} ...')
try:
- uf = self.urlopen(t['url'])
+ uf = self.urlopen(sanitized_Request(t['url'], headers=t.get('http_headers', {})))
self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}')
with open(encodeFilename(thumb_filename), 'wb') as thumbf:
shutil.copyfileobj(uf, thumbf)
ret.append((thumb_filename, thumb_filename_final))
t['filepath'] = thumb_filename
except network_exceptions as err:
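+ # Drop the failed thumbnail so later steps do not reference a missing file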
+ thumbnails.pop(idx)
self.report_warning(f'Unable to download {thumb_display_id}: {err}')
if ret and not write_all:
break
diff --git a/hypervideo_dl/__init__.py b/hypervideo_dl/__init__.py
index d8b7de5..dc53a9e 100644
--- a/hypervideo_dl/__init__.py
+++ b/hypervideo_dl/__init__.py
@@ -11,32 +11,33 @@ import random
import re
import sys
-from .options import (
- parseOpts,
-)
+from .options import parseOpts
from .compat import (
compat_getpass,
+ compat_os_name,
compat_shlex_quote,
workaround_optparse_bug9161,
)
-from .cookies import SUPPORTED_BROWSERS
+from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS
from .utils import (
DateRange,
decodeOption,
+ DownloadCancelled,
DownloadError,
- error_to_compat_str,
- ExistingVideoReached,
expand_path,
+ float_or_none,
+ GeoUtils,
+ int_or_none,
match_filter_func,
- MaxDownloadsReached,
+ NO_DEFAULT,
parse_duration,
preferredencoding,
read_batch_urls,
- RejectedVideoReached,
render_table,
SameFileError,
setproctitle,
std_headers,
+ traverse_obj,
write_string,
)
from .downloader import (
@@ -57,215 +58,68 @@ from .postprocessor import (
from .YoutubeDL import YoutubeDL
-def _real_main(argv=None):
- # Compatibility fixes for Windows
- if sys.platform == 'win32':
- # https://github.com/ytdl-org/youtube-dl/issues/820
- codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None)
-
- workaround_optparse_bug9161()
-
- setproctitle('hypervideo')
-
- parser, opts, args = parseOpts(argv)
- warnings = []
-
- # Set user agent
- if opts.user_agent is not None:
- std_headers['User-Agent'] = opts.user_agent
-
- # Set referer
- if opts.referer is not None:
- std_headers['Referer'] = opts.referer
-
- # Custom HTTP headers
- std_headers.update(opts.headers)
-
- # Dump user agent
- if opts.dump_user_agent:
- write_string(std_headers['User-Agent'] + '\n', out=sys.stdout)
- sys.exit(0)
-
+def get_urls(urls, batchfile, verbose):
# Batch file verification
batch_urls = []
- if opts.batchfile is not None:
+ if batchfile is not None:
try:
- if opts.batchfile == '-':
+ if batchfile == '-':
+ write_string('Reading URLs from stdin - EOF (%s) to end:\n' % (
+ 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'))
batchfd = sys.stdin
else:
batchfd = io.open(
- expand_path(opts.batchfile),
+ expand_path(batchfile),
'r', encoding='utf-8', errors='ignore')
batch_urls = read_batch_urls(batchfd)
- if opts.verbose:
+ if verbose:
write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n')
except IOError:
- sys.exit('ERROR: batch file %s could not be read' % opts.batchfile)
- all_urls = batch_urls + [url.strip() for url in args] # batch_urls are already striped in read_batch_urls
+ sys.exit('ERROR: batch file %s could not be read' % batchfile)
_enc = preferredencoding()
- all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls]
+ return [
+ url.strip().decode(_enc, 'ignore') if isinstance(url, bytes) else url.strip()
+ for url in batch_urls + urls]
+
+def print_extractor_information(opts, urls):
if opts.list_extractors:
for ie in list_extractors(opts.age_limit):
write_string(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie.working() else '') + '\n', out=sys.stdout)
- matchedUrls = [url for url in all_urls if ie.suitable(url)]
+ matchedUrls = [url for url in urls if ie.suitable(url)]
for mu in matchedUrls:
write_string(' ' + mu + '\n', out=sys.stdout)
- sys.exit(0)
- if opts.list_extractor_descriptions:
+ elif opts.list_extractor_descriptions:
for ie in list_extractors(opts.age_limit):
if not ie.working():
continue
- desc = getattr(ie, 'IE_DESC', ie.IE_NAME)
- if desc is False:
+ if ie.IE_DESC is False:
continue
- if hasattr(ie, 'SEARCH_KEY'):
+ desc = ie.IE_DESC or ie.IE_NAME
+ if getattr(ie, 'SEARCH_KEY', None) is not None:
_SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow')
_COUNTS = ('', '5', '10', 'all')
- desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES))
+ desc += f'; "{ie.SEARCH_KEY}:" prefix (Example: "{ie.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(_SEARCHES)}")'
write_string(desc + '\n', out=sys.stdout)
- sys.exit(0)
- if opts.ap_list_mso:
+ elif opts.ap_list_mso:
table = [[mso_id, mso_info['name']] for mso_id, mso_info in MSO_INFO.items()]
write_string('Supported TV Providers:\n' + render_table(['mso', 'mso name'], table) + '\n', out=sys.stdout)
- sys.exit(0)
-
- # Conflicting, missing and erroneous options
- if opts.usenetrc and (opts.username is not None or opts.password is not None):
- parser.error('using .netrc conflicts with giving username/password')
- if opts.password is not None and opts.username is None:
- parser.error('account username missing\n')
- if opts.ap_password is not None and opts.ap_username is None:
- parser.error('TV Provider account username missing\n')
- if opts.autonumber_size is not None:
- if opts.autonumber_size <= 0:
- parser.error('auto number size must be positive')
- if opts.autonumber_start is not None:
- if opts.autonumber_start < 0:
- parser.error('auto number start must be positive or 0')
- if opts.username is not None and opts.password is None:
- opts.password = compat_getpass('Type account password and press [Return]: ')
- if opts.ap_username is not None and opts.ap_password is None:
- opts.ap_password = compat_getpass('Type TV provider account password and press [Return]: ')
- if opts.ratelimit is not None:
- numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
- if numeric_limit is None:
- parser.error('invalid rate limit specified')
- opts.ratelimit = numeric_limit
- if opts.throttledratelimit is not None:
- numeric_limit = FileDownloader.parse_bytes(opts.throttledratelimit)
- if numeric_limit is None:
- parser.error('invalid rate limit specified')
- opts.throttledratelimit = numeric_limit
- if opts.min_filesize is not None:
- numeric_limit = FileDownloader.parse_bytes(opts.min_filesize)
- if numeric_limit is None:
- parser.error('invalid min_filesize specified')
- opts.min_filesize = numeric_limit
- if opts.max_filesize is not None:
- numeric_limit = FileDownloader.parse_bytes(opts.max_filesize)
- if numeric_limit is None:
- parser.error('invalid max_filesize specified')
- opts.max_filesize = numeric_limit
- if opts.sleep_interval is not None:
- if opts.sleep_interval < 0:
- parser.error('sleep interval must be positive or 0')
- if opts.max_sleep_interval is not None:
- if opts.max_sleep_interval < 0:
- parser.error('max sleep interval must be positive or 0')
- if opts.sleep_interval is None:
- parser.error('min sleep interval must be specified, use --min-sleep-interval')
- if opts.max_sleep_interval < opts.sleep_interval:
- parser.error('max sleep interval must be greater than or equal to min sleep interval')
else:
- opts.max_sleep_interval = opts.sleep_interval
- if opts.sleep_interval_subtitles is not None:
- if opts.sleep_interval_subtitles < 0:
- parser.error('subtitles sleep interval must be positive or 0')
- if opts.sleep_interval_requests is not None:
- if opts.sleep_interval_requests < 0:
- parser.error('requests sleep interval must be positive or 0')
- if opts.ap_mso and opts.ap_mso not in MSO_INFO:
- parser.error('Unsupported TV Provider, use --ap-list-mso to get a list of supported TV Providers')
- if opts.overwrites: # --yes-overwrites implies --no-continue
- opts.continue_dl = False
- if opts.concurrent_fragment_downloads <= 0:
- raise ValueError('Concurrent fragments must be positive')
+ return False
+ return True
- def parse_retries(retries, name=''):
- if retries in ('inf', 'infinite'):
- parsed_retries = float('inf')
- else:
- try:
- parsed_retries = int(retries)
- except (TypeError, ValueError):
- parser.error('invalid %sretry count specified' % name)
- return parsed_retries
- if opts.retries is not None:
- opts.retries = parse_retries(opts.retries)
- if opts.fragment_retries is not None:
- opts.fragment_retries = parse_retries(opts.fragment_retries, 'fragment ')
- if opts.extractor_retries is not None:
- opts.extractor_retries = parse_retries(opts.extractor_retries, 'extractor ')
- if opts.buffersize is not None:
- numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize)
- if numeric_buffersize is None:
- parser.error('invalid buffer size specified')
- opts.buffersize = numeric_buffersize
- if opts.http_chunk_size is not None:
- numeric_chunksize = FileDownloader.parse_bytes(opts.http_chunk_size)
- if not numeric_chunksize:
- parser.error('invalid http chunk size specified')
- opts.http_chunk_size = numeric_chunksize
- if opts.playliststart <= 0:
- raise ValueError('Playlist start must be positive')
- if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart:
- raise ValueError('Playlist end must be greater than playlist start')
- if opts.extractaudio:
- if opts.audioformat not in ['best'] + list(FFmpegExtractAudioPP.SUPPORTED_EXTS):
- parser.error('invalid audio format specified')
- if opts.audioquality:
- opts.audioquality = opts.audioquality.strip('k').strip('K')
- if not opts.audioquality.isdigit():
- parser.error('invalid audio quality specified')
- if opts.recodevideo is not None:
- opts.recodevideo = opts.recodevideo.replace(' ', '')
- if not re.match(FFmpegVideoConvertorPP.FORMAT_RE, opts.recodevideo):
- parser.error('invalid video remux format specified')
- if opts.remuxvideo is not None:
- opts.remuxvideo = opts.remuxvideo.replace(' ', '')
- if not re.match(FFmpegVideoRemuxerPP.FORMAT_RE, opts.remuxvideo):
- parser.error('invalid video remux format specified')
- if opts.convertsubtitles is not None:
- if opts.convertsubtitles not in FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS:
- parser.error('invalid subtitle format specified')
- if opts.convertthumbnails is not None:
- if opts.convertthumbnails not in FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS:
- parser.error('invalid thumbnail format specified')
-
- if opts.cookiesfrombrowser is not None:
- opts.cookiesfrombrowser = [
- part.strip() or None for part in opts.cookiesfrombrowser.split(':', 1)]
- if opts.cookiesfrombrowser[0].lower() not in SUPPORTED_BROWSERS:
- parser.error('unsupported browser specified for cookies')
-
- if opts.date is not None:
- date = DateRange.day(opts.date)
- else:
- date = DateRange(opts.dateafter, opts.datebefore)
-
- compat_opts = opts.compat_opts
+def set_compat_opts(opts):
def _unused_compat_opt(name):
- if name not in compat_opts:
+ if name not in opts.compat_opts:
return False
- compat_opts.discard(name)
- compat_opts.update(['*%s' % name])
+ opts.compat_opts.discard(name)
+ opts.compat_opts.update(['*%s' % name])
return True
def set_default_compat(compat_name, opt_name, default=True, remove_compat=True):
attr = getattr(opts, opt_name)
- if compat_name in compat_opts:
+ if compat_name in opts.compat_opts:
if attr is None:
setattr(opts, opt_name, not default)
return True
@@ -280,54 +134,204 @@ def _real_main(argv=None):
set_default_compat('abort-on-error', 'ignoreerrors', 'only_download')
set_default_compat('no-playlist-metafiles', 'allow_playlist_files')
set_default_compat('no-clean-infojson', 'clean_infojson')
- if 'format-sort' in compat_opts:
+ if 'no-attach-info-json' in opts.compat_opts:
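+        # The compat option is redundant if --embed-info-json/--no-embed-info-json was passed explicitly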
+ if opts.embed_infojson:
+ _unused_compat_opt('no-attach-info-json')
+ else:
+ opts.embed_infojson = False
+ if 'format-sort' in opts.compat_opts:
opts.format_sort.extend(InfoExtractor.FormatSort.ytdl_default)
_video_multistreams_set = set_default_compat('multistreams', 'allow_multiple_video_streams', False, remove_compat=False)
_audio_multistreams_set = set_default_compat('multistreams', 'allow_multiple_audio_streams', False, remove_compat=False)
if _video_multistreams_set is False and _audio_multistreams_set is False:
_unused_compat_opt('multistreams')
- outtmpl_default = opts.outtmpl.get('default')
- if 'filename' in compat_opts:
- if outtmpl_default is None:
- outtmpl_default = '%(title)s-%(id)s.%(ext)s'
- opts.outtmpl.update({'default': outtmpl_default})
+ if 'filename' in opts.compat_opts:
+ if opts.outtmpl.get('default') is None:
+ opts.outtmpl.update({'default': '%(title)s-%(id)s.%(ext)s'})
else:
_unused_compat_opt('filename')
+
+def validate_options(opts):
+ def validate(cndn, name, value=None, msg=None):
+ if cndn:
+ return True
+ raise ValueError((msg or 'invalid {name} "{value}" given').format(name=name, value=value))
+
+ def validate_in(name, value, items, msg=None):
+ return validate(value is None or value in items, name, value, msg)
+
+ def validate_regex(name, value, regex):
+ return validate(value is None or re.match(regex, value), name, value)
+
+ def validate_positive(name, value, strict=False):
+ return validate(value is None or value > 0 or (not strict and value == 0),
+ name, value, '{name} "{value}" must be positive' + ('' if strict else ' or 0'))
+
+ def validate_minmax(min_val, max_val, min_name, max_name=None):
+ if max_val is None or min_val is None or max_val >= min_val:
+ return
+ if not max_name:
+ min_name, max_name = f'min {min_name}', f'max {min_name}'
+        raise ValueError(f'{max_name} "{max_val}" must be greater than or equal to {min_name} "{min_val}"')
+
+ # Usernames and passwords
+ validate(not opts.usenetrc or (opts.username is None and opts.password is None),
+ '.netrc', msg='using {name} conflicts with giving username/password')
+ validate(opts.password is None or opts.username is not None, 'account username', msg='{name} missing')
+ validate(opts.ap_password is None or opts.ap_username is not None,
+ 'TV Provider account username', msg='{name} missing')
+ validate_in('TV Provider', opts.ap_mso, MSO_INFO,
+ 'Unsupported {name} "{value}", use --ap-list-mso to get a list of supported TV Providers')
+
+ # Numbers
+ validate_positive('autonumber start', opts.autonumber_start)
+ validate_positive('autonumber size', opts.autonumber_size, True)
+ validate_positive('concurrent fragments', opts.concurrent_fragment_downloads, True)
+ validate_positive('playlist start', opts.playliststart, True)
+ if opts.playlistend != -1:
+ validate_minmax(opts.playliststart, opts.playlistend, 'playlist start', 'playlist end')
+
+ # Time ranges
+ validate_positive('subtitles sleep interval', opts.sleep_interval_subtitles)
+ validate_positive('requests sleep interval', opts.sleep_interval_requests)
+ validate_positive('sleep interval', opts.sleep_interval)
+ validate_positive('max sleep interval', opts.max_sleep_interval)
+ if opts.sleep_interval is None:
+ validate(
+ opts.max_sleep_interval is None, 'min sleep interval',
+ msg='{name} must be specified; use --min-sleep-interval')
+ elif opts.max_sleep_interval is None:
+ opts.max_sleep_interval = opts.sleep_interval
+ else:
+ validate_minmax(opts.sleep_interval, opts.max_sleep_interval, 'sleep interval')
+
+ if opts.wait_for_video is not None:
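+        # --wait-for-video accepts MIN[-MAX]; the appended None lets max_wait default to None when only MIN is given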
+ min_wait, max_wait, *_ = map(parse_duration, opts.wait_for_video.split('-', 1) + [None])
+ validate(min_wait is not None and not (max_wait is None and '-' in opts.wait_for_video),
+ 'time range to wait for video', opts.wait_for_video)
+ validate_minmax(min_wait, max_wait, 'time range to wait for video')
+ opts.wait_for_video = (min_wait, max_wait)
+
+ # Format sort
+ for f in opts.format_sort:
+ validate_regex('format sorting', f, InfoExtractor.FormatSort.regex)
+
+ # Postprocessor formats
+ validate_in('audio format', opts.audioformat, ['best'] + list(FFmpegExtractAudioPP.SUPPORTED_EXTS))
+ validate_in('subtitle format', opts.convertsubtitles, FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS)
+ validate_in('thumbnail format', opts.convertthumbnails, FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS)
+ if opts.recodevideo is not None:
+ opts.recodevideo = opts.recodevideo.replace(' ', '')
+ validate_regex('video recode format', opts.recodevideo, FFmpegVideoConvertorPP.FORMAT_RE)
+ if opts.remuxvideo is not None:
+ opts.remuxvideo = opts.remuxvideo.replace(' ', '')
+ validate_regex('video remux format', opts.remuxvideo, FFmpegVideoRemuxerPP.FORMAT_RE)
+ if opts.audioquality:
+ opts.audioquality = opts.audioquality.strip('k').strip('K')
+ # int_or_none prevents inf, nan
+ validate_positive('audio quality', int_or_none(float_or_none(opts.audioquality), default=0))
+
+ # Retries
+ def parse_retries(name, value):
+ if value is None:
+ return None
+ elif value in ('inf', 'infinite'):
+ return float('inf')
+ try:
+ return int(value)
+ except (TypeError, ValueError):
+ validate(False, f'{name} retry count', value)
+
+ opts.retries = parse_retries('download', opts.retries)
+ opts.fragment_retries = parse_retries('fragment', opts.fragment_retries)
+ opts.extractor_retries = parse_retries('extractor', opts.extractor_retries)
+ opts.file_access_retries = parse_retries('file access', opts.file_access_retries)
+
+ # Bytes
+ def parse_bytes(name, value):
+ if value is None:
+ return None
+ numeric_limit = FileDownloader.parse_bytes(value)
+        validate(numeric_limit is not None, name, value)
+ return numeric_limit
+
+ opts.ratelimit = parse_bytes('rate limit', opts.ratelimit)
+ opts.throttledratelimit = parse_bytes('throttled rate limit', opts.throttledratelimit)
+ opts.min_filesize = parse_bytes('min filesize', opts.min_filesize)
+ opts.max_filesize = parse_bytes('max filesize', opts.max_filesize)
+ opts.buffersize = parse_bytes('buffer size', opts.buffersize)
+ opts.http_chunk_size = parse_bytes('http chunk size', opts.http_chunk_size)
+
+ # Output templates
def validate_outtmpl(tmpl, msg):
err = YoutubeDL.validate_outtmpl(tmpl)
if err:
- parser.error('invalid %s %r: %s' % (msg, tmpl, error_to_compat_str(err)))
+ raise ValueError(f'invalid {msg} "{tmpl}": {err}')
for k, tmpl in opts.outtmpl.items():
validate_outtmpl(tmpl, f'{k} output template')
- opts.forceprint = opts.forceprint or []
- for tmpl in opts.forceprint or []:
- validate_outtmpl(tmpl, 'print template')
+ for type_, tmpl_list in opts.forceprint.items():
+ for tmpl in tmpl_list:
+ validate_outtmpl(tmpl, f'{type_} print template')
+ for type_, tmpl_list in opts.print_to_file.items():
+ for tmpl, file in tmpl_list:
+ validate_outtmpl(tmpl, f'{type_} print to file template')
+ validate_outtmpl(file, f'{type_} print to file filename')
validate_outtmpl(opts.sponsorblock_chapter_title, 'SponsorBlock chapter title')
for k, tmpl in opts.progress_template.items():
k = f'{k[:-6]} console title' if '-title' in k else f'{k} progress'
validate_outtmpl(tmpl, f'{k} template')
- if opts.extractaudio and not opts.keepvideo and opts.format is None:
- opts.format = 'bestaudio/best'
-
- if outtmpl_default is not None and not os.path.splitext(outtmpl_default)[1] and opts.extractaudio:
- parser.error('Cannot download a video and extract audio into the same'
- ' file! Use "{0}.%(ext)s" instead of "{0}" as the output'
- ' template'.format(outtmpl_default))
-
- for f in opts.format_sort:
- if re.match(InfoExtractor.FormatSort.regex, f) is None:
- parser.error('invalid format sort string "%s" specified' % f)
-
+ outtmpl_default = opts.outtmpl.get('default')
+ if outtmpl_default == '':
+ opts.skip_download = None
+ del opts.outtmpl['default']
+ if outtmpl_default and not os.path.splitext(outtmpl_default)[1] and opts.extractaudio:
+ raise ValueError(
+ 'Cannot download a video and extract audio into the same file! '
+ f'Use "{outtmpl_default}.%(ext)s" instead of "{outtmpl_default}" as the output template')
+
+ # Remove chapters
+ remove_chapters_patterns, opts.remove_ranges = [], []
+ for regex in opts.remove_chapters or []:
+ if regex.startswith('*'):
+ dur = list(map(parse_duration, regex[1:].split('-')))
+ if len(dur) == 2 and all(t is not None for t in dur):
+ opts.remove_ranges.append(tuple(dur))
+ continue
+ raise ValueError(f'invalid --remove-chapters time range "{regex}". Must be of the form *start-end')
+ try:
+ remove_chapters_patterns.append(re.compile(regex))
+ except re.error as err:
+ raise ValueError(f'invalid --remove-chapters regex "{regex}" - {err}')
+ opts.remove_chapters = remove_chapters_patterns
+
+ # Cookies from browser
+ if opts.cookiesfrombrowser:
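+        # Accepted syntax: BROWSER[+KEYRING][:PROFILE], e.g. "chrome+gnomekeyring:Profile 1"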
+ mobj = re.match(r'(?P<name>[^+:]+)(\s*\+\s*(?P<keyring>[^:]+))?(\s*:(?P<profile>.+))?', opts.cookiesfrombrowser)
+ if mobj is None:
+ raise ValueError(f'invalid cookies from browser arguments: {opts.cookiesfrombrowser}')
+ browser_name, keyring, profile = mobj.group('name', 'keyring', 'profile')
+ browser_name = browser_name.lower()
+ if browser_name not in SUPPORTED_BROWSERS:
+ raise ValueError(f'unsupported browser specified for cookies: "{browser_name}". '
+ f'Supported browsers are: {", ".join(sorted(SUPPORTED_BROWSERS))}')
+ if keyring is not None:
+ keyring = keyring.upper()
+ if keyring not in SUPPORTED_KEYRINGS:
+ raise ValueError(f'unsupported keyring specified for cookies: "{keyring}". '
+ f'Supported keyrings are: {", ".join(sorted(SUPPORTED_KEYRINGS))}')
+ opts.cookiesfrombrowser = (browser_name, profile, keyring)
+
+ # MetadataParser
def metadataparser_actions(f):
if isinstance(f, str):
cmd = '--parse-metadata %s' % compat_shlex_quote(f)
try:
actions = [MetadataFromFieldPP.to_action(f)]
except Exception as err:
- parser.error(f'{cmd} is invalid; {err}')
+ raise ValueError(f'{cmd} is invalid; {err}')
else:
cmd = '--replace-in-metadata %s' % ' '.join(map(compat_shlex_quote, f))
actions = ((MetadataParserPP.Actions.REPLACE, x, *f[1:]) for x in f[0].split(','))
@@ -336,242 +340,296 @@ def _real_main(argv=None):
try:
MetadataParserPP.validate_action(*action)
except Exception as err:
- parser.error(f'{cmd} is invalid; {err}')
+ raise ValueError(f'{cmd} is invalid; {err}')
yield action
- if opts.parse_metadata is None:
- opts.parse_metadata = []
+ parse_metadata = opts.parse_metadata or []
if opts.metafromtitle is not None:
- opts.parse_metadata.append('title:%s' % opts.metafromtitle)
- opts.parse_metadata = list(itertools.chain(*map(metadataparser_actions, opts.parse_metadata)))
+ parse_metadata.append('title:%s' % opts.metafromtitle)
+ opts.parse_metadata = list(itertools.chain(*map(metadataparser_actions, parse_metadata)))
- any_getting = opts.forceprint or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json
- any_printing = opts.print_json
- download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive
+ # Other options
+ geo_bypass_code = opts.geo_bypass_ip_block or opts.geo_bypass_country
+ if geo_bypass_code is not None:
+ try:
+ GeoUtils.random_ipv4(geo_bypass_code)
+ except Exception:
+ raise ValueError('unsupported geo-bypass country or ip-block')
- # If JSON is not printed anywhere, but comments are requested, save it to file
- printing_json = opts.dumpjson or opts.print_json or opts.dump_single_json
- if opts.getcomments and not printing_json:
- opts.writeinfojson = True
+ opts.match_filter = match_filter_func(opts.match_filter)
+
+ if opts.download_archive is not None:
+ opts.download_archive = expand_path(opts.download_archive)
+
+ if opts.user_agent is not None:
+ opts.headers.setdefault('User-Agent', opts.user_agent)
+ if opts.referer is not None:
+ opts.headers.setdefault('Referer', opts.referer)
if opts.no_sponsorblock:
- opts.sponsorblock_mark = set()
- opts.sponsorblock_remove = set()
- sponsorblock_query = opts.sponsorblock_mark | opts.sponsorblock_remove
+ opts.sponsorblock_mark = opts.sponsorblock_remove = set()
+
+ warnings, deprecation_warnings = [], []
+
+ # Common mistake: -f best
+ if opts.format == 'best':
+ warnings.append('.\n '.join((
+ '"-f best" selects the best pre-merged format which is often not the best option',
+ 'To let hypervideo download and merge the best available formats, simply do not pass any format selection',
+ 'If you know what you are doing and want only the best pre-merged format, use "-f b" instead to suppress this warning')))
+
+ # --(postprocessor/downloader)-args without name
+ def report_args_compat(name, value, key1, key2=None):
+ if key1 in value and key2 not in value:
+ warnings.append(f'{name} arguments given without specifying name. The arguments will be given to all {name}s')
+ return True
+ return False
+
+ report_args_compat('external downloader', opts.external_downloader_args, 'default')
+ if report_args_compat('post-processor', opts.postprocessor_args, 'default-compat', 'default'):
+ opts.postprocessor_args['default'] = opts.postprocessor_args.pop('default-compat')
+ opts.postprocessor_args.setdefault('sponskrub', [])
+
+ def report_conflict(arg1, opt1, arg2='--allow-unplayable-formats', opt2='allow_unplayable_formats',
+ val1=NO_DEFAULT, val2=NO_DEFAULT, default=False):
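+        # If the blocking option (opt2/val2) is set, warn that arg1 is ignored and reset opt1 to its default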
+ if val2 is NO_DEFAULT:
+ val2 = getattr(opts, opt2)
+ if not val2:
+ return
+
+ if val1 is NO_DEFAULT:
+ val1 = getattr(opts, opt1)
+ if val1:
+ warnings.append(f'{arg1} is ignored since {arg2} was given')
+ setattr(opts, opt1, default)
+
+ # Conflicting options
+ report_conflict('--dateafter', 'dateafter', '--date', 'date', default=None)
+ report_conflict('--datebefore', 'datebefore', '--date', 'date', default=None)
+ report_conflict('--exec-before-download', 'exec_before_dl_cmd', '"--exec before_dl:"', 'exec_cmd', opts.exec_cmd.get('before_dl'))
+ report_conflict('--id', 'useid', '--output', 'outtmpl', val2=opts.outtmpl.get('default'))
+ report_conflict('--remux-video', 'remuxvideo', '--recode-video', 'recodevideo')
+ report_conflict('--sponskrub', 'sponskrub', '--remove-chapters', 'remove_chapters')
+ report_conflict('--sponskrub', 'sponskrub', '--sponsorblock-mark', 'sponsorblock_mark')
+ report_conflict('--sponskrub', 'sponskrub', '--sponsorblock-remove', 'sponsorblock_remove')
+ report_conflict('--sponskrub-cut', 'sponskrub_cut', '--split-chapter', 'split_chapters', val1=opts.sponskrub and opts.sponskrub_cut)
+
+ # Conflicts with --allow-unplayable-formats
+ report_conflict('--add-metadata', 'addmetadata')
+ report_conflict('--embed-chapters', 'addchapters')
+ report_conflict('--embed-info-json', 'embed_infojson')
+ report_conflict('--embed-subs', 'embedsubtitles')
+ report_conflict('--embed-thumbnail', 'embedthumbnail')
+ report_conflict('--extract-audio', 'extractaudio')
+ report_conflict('--fixup', 'fixup', val1=(opts.fixup or '').lower() in ('', 'never', 'ignore'), default='never')
+ report_conflict('--recode-video', 'recodevideo')
+ report_conflict('--remove-chapters', 'remove_chapters', default=[])
+ report_conflict('--remux-video', 'remuxvideo')
+ report_conflict('--sponskrub', 'sponskrub')
+ report_conflict('--sponsorblock-remove', 'sponsorblock_remove', default=set())
+ report_conflict('--xattrs', 'xattrs')
+
+ # Fully deprecated options
+ def report_deprecation(val, old, new=None):
+ if not val:
+ return
+ deprecation_warnings.append(
+ f'{old} is deprecated and may be removed in a future version. Use {new} instead' if new
+ else f'{old} is deprecated and may not work as expected')
+
+ report_deprecation(opts.sponskrub, '--sponskrub', '--sponsorblock-mark or --sponsorblock-remove')
+ report_deprecation(not opts.prefer_ffmpeg, '--prefer-avconv', 'ffmpeg')
+ # report_deprecation(opts.include_ads, '--include-ads') # We may re-implement this in future
+ # report_deprecation(opts.call_home, '--call-home') # We may re-implement this in future
+ # report_deprecation(opts.writeannotations, '--write-annotations') # It's just that no website has it
+
+ # Dependent options
+ opts.date = DateRange.day(opts.date) if opts.date else DateRange(opts.dateafter, opts.datebefore)
+
+ if opts.exec_before_dl_cmd:
+ opts.exec_cmd['before_dl'] = opts.exec_before_dl_cmd
+
+ if opts.useid: # --id is not deprecated in youtube-dl
+ opts.outtmpl['default'] = '%(id)s.%(ext)s'
+
+ if opts.overwrites: # --force-overwrites implies --no-continue
+ opts.continue_dl = False
if (opts.addmetadata or opts.sponsorblock_mark) and opts.addchapters is None:
+ # Add chapters when adding metadata or marking sponsors
opts.addchapters = True
- opts.remove_chapters = opts.remove_chapters or []
-
- def report_conflict(arg1, arg2):
- warnings.append('%s is ignored since %s was given' % (arg2, arg1))
-
- if (opts.remove_chapters or sponsorblock_query) and opts.sponskrub is not False:
- if opts.sponskrub:
- if opts.remove_chapters:
- report_conflict('--remove-chapters', '--sponskrub')
- if opts.sponsorblock_mark:
- report_conflict('--sponsorblock-mark', '--sponskrub')
- if opts.sponsorblock_remove:
- report_conflict('--sponsorblock-remove', '--sponskrub')
- opts.sponskrub = False
- if opts.sponskrub_cut and opts.split_chapters and opts.sponskrub is not False:
- report_conflict('--split-chapter', '--sponskrub-cut')
- opts.sponskrub_cut = False
-
- if opts.remuxvideo and opts.recodevideo:
- report_conflict('--recode-video', '--remux-video')
- opts.remuxvideo = False
-
- if opts.allow_unplayable_formats:
- if opts.extractaudio:
- report_conflict('--allow-unplayable-formats', '--extract-audio')
- opts.extractaudio = False
- if opts.remuxvideo:
- report_conflict('--allow-unplayable-formats', '--remux-video')
- opts.remuxvideo = False
- if opts.recodevideo:
- report_conflict('--allow-unplayable-formats', '--recode-video')
- opts.recodevideo = False
- if opts.addmetadata:
- report_conflict('--allow-unplayable-formats', '--add-metadata')
- opts.addmetadata = False
- if opts.embedsubtitles:
- report_conflict('--allow-unplayable-formats', '--embed-subs')
- opts.embedsubtitles = False
- if opts.embedthumbnail:
- report_conflict('--allow-unplayable-formats', '--embed-thumbnail')
- opts.embedthumbnail = False
- if opts.xattrs:
- report_conflict('--allow-unplayable-formats', '--xattrs')
- opts.xattrs = False
- if opts.fixup and opts.fixup.lower() not in ('never', 'ignore'):
- report_conflict('--allow-unplayable-formats', '--fixup')
- opts.fixup = 'never'
- if opts.remove_chapters:
- report_conflict('--allow-unplayable-formats', '--remove-chapters')
- opts.remove_chapters = []
- if opts.sponsorblock_remove:
- report_conflict('--allow-unplayable-formats', '--sponsorblock-remove')
- opts.sponsorblock_remove = set()
- if opts.sponskrub:
- report_conflict('--allow-unplayable-formats', '--sponskrub')
- opts.sponskrub = False
-
- # PostProcessors
- postprocessors = list(opts.add_postprocessors)
- if sponsorblock_query:
- postprocessors.append({
- 'key': 'SponsorBlock',
- 'categories': sponsorblock_query,
- 'api': opts.sponsorblock_api,
- # Run this immediately after extraction is complete
- 'when': 'pre_process'
- })
+
+ if opts.extractaudio and not opts.keepvideo and opts.format is None:
+ # Do not unnecessarily download audio
+ opts.format = 'bestaudio/best'
+
+ if opts.getcomments and opts.writeinfojson is None:
+ # If JSON is not printed anywhere, but comments are requested, save it to file
+        if not (opts.dumpjson or opts.print_json or opts.dump_single_json):
+ opts.writeinfojson = True
+
+ if opts.allsubtitles and not (opts.embedsubtitles or opts.writeautomaticsub):
+        # --all-subs implies --write-subs unless --write-auto-subs was given or subs are embedded (which sets it later)
+ opts.writesubtitles = True
+
+ if opts.addmetadata and opts.embed_infojson is None:
+ # If embedding metadata and infojson is present, embed it
+ opts.embed_infojson = 'if_exists'
+
+ # Ask for passwords
+ if opts.username is not None and opts.password is None:
+ opts.password = compat_getpass('Type account password and press [Return]: ')
+ if opts.ap_username is not None and opts.ap_password is None:
+ opts.ap_password = compat_getpass('Type TV provider account password and press [Return]: ')
+
+ return warnings, deprecation_warnings
+
+
+def get_postprocessors(opts):
+ yield from opts.add_postprocessors
+
if opts.parse_metadata:
- postprocessors.append({
+ yield {
'key': 'MetadataParser',
'actions': opts.parse_metadata,
- # Run this immediately after extraction is complete
'when': 'pre_process'
- })
+ }
+ sponsorblock_query = opts.sponsorblock_mark | opts.sponsorblock_remove
+ if sponsorblock_query:
+ yield {
+ 'key': 'SponsorBlock',
+ 'categories': sponsorblock_query,
+ 'api': opts.sponsorblock_api,
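+            # 'after_filter' runs once a video has passed --match-filter, avoiding needless API queries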
+ 'when': 'after_filter'
+ }
if opts.convertsubtitles:
- postprocessors.append({
+ yield {
'key': 'FFmpegSubtitlesConvertor',
'format': opts.convertsubtitles,
- # Run this before the actual video download
'when': 'before_dl'
- })
+ }
if opts.convertthumbnails:
- postprocessors.append({
+ yield {
'key': 'FFmpegThumbnailsConvertor',
'format': opts.convertthumbnails,
- # Run this before the actual video download
'when': 'before_dl'
- })
- # Must be after all other before_dl
- if opts.exec_before_dl_cmd:
- postprocessors.append({
- 'key': 'Exec',
- 'exec_cmd': opts.exec_before_dl_cmd,
- 'when': 'before_dl'
- })
+ }
if opts.extractaudio:
- postprocessors.append({
+ yield {
'key': 'FFmpegExtractAudio',
'preferredcodec': opts.audioformat,
'preferredquality': opts.audioquality,
'nopostoverwrites': opts.nopostoverwrites,
- })
+ }
if opts.remuxvideo:
- postprocessors.append({
+ yield {
'key': 'FFmpegVideoRemuxer',
'preferedformat': opts.remuxvideo,
- })
+ }
if opts.recodevideo:
- postprocessors.append({
+ yield {
'key': 'FFmpegVideoConvertor',
'preferedformat': opts.recodevideo,
- })
+ }
# If ModifyChapters is going to remove chapters, subtitles must already be in the container.
if opts.embedsubtitles:
- already_have_subtitle = opts.writesubtitles and 'no-keep-subs' not in compat_opts
- postprocessors.append({
+ keep_subs = 'no-keep-subs' not in opts.compat_opts
+ yield {
'key': 'FFmpegEmbedSubtitle',
# already_have_subtitle = True prevents the file from being deleted after embedding
- 'already_have_subtitle': already_have_subtitle
- })
- if not opts.writeautomaticsub and 'no-keep-subs' not in compat_opts:
+ 'already_have_subtitle': opts.writesubtitles and keep_subs
+ }
+ if not opts.writeautomaticsub and keep_subs:
opts.writesubtitles = True
- # --all-sub automatically sets --write-sub if --write-auto-sub is not given
- # this was the old behaviour if only --all-sub was given.
- if opts.allsubtitles and not opts.writeautomaticsub:
- opts.writesubtitles = True
+
# ModifyChapters must run before FFmpegMetadataPP
- remove_chapters_patterns, remove_ranges = [], []
- for regex in opts.remove_chapters:
- if regex.startswith('*'):
- dur = list(map(parse_duration, regex[1:].split('-')))
- if len(dur) == 2 and all(t is not None for t in dur):
- remove_ranges.append(tuple(dur))
- continue
- parser.error(f'invalid --remove-chapters time range {regex!r}. Must be of the form ?start-end')
- try:
- remove_chapters_patterns.append(re.compile(regex))
- except re.error as err:
- parser.error(f'invalid --remove-chapters regex {regex!r} - {err}')
if opts.remove_chapters or sponsorblock_query:
- postprocessors.append({
+ yield {
'key': 'ModifyChapters',
- 'remove_chapters_patterns': remove_chapters_patterns,
+ 'remove_chapters_patterns': opts.remove_chapters,
'remove_sponsor_segments': opts.sponsorblock_remove,
- 'remove_ranges': remove_ranges,
+ 'remove_ranges': opts.remove_ranges,
'sponsorblock_chapter_title': opts.sponsorblock_chapter_title,
'force_keyframes': opts.force_keyframes_at_cuts
- })
+ }
# FFmpegMetadataPP should be run after FFmpegVideoConvertorPP and
# FFmpegExtractAudioPP as containers before conversion may not support
# metadata (3gp, webm, etc.)
# By default ffmpeg preserves metadata applicable for both
# source and target containers. From this point the container won't change,
# so metadata can be added here.
- if opts.addmetadata or opts.addchapters:
- postprocessors.append({
+ if opts.addmetadata or opts.addchapters or opts.embed_infojson:
+ yield {
'key': 'FFmpegMetadata',
'add_chapters': opts.addchapters,
'add_metadata': opts.addmetadata,
- })
- # Note: Deprecated
+ 'add_infojson': opts.embed_infojson,
+ }
+ # Deprecated
# This should be above EmbedThumbnail since sponskrub removes the thumbnail attachment
# but must be below EmbedSubtitle and FFmpegMetadata
# See https://github.com/hypervideo/hypervideo/issues/204 , https://github.com/faissaloo/SponSkrub/issues/29
# If opts.sponskrub is None, sponskrub is used, but it silently fails if the executable can't be found
if opts.sponskrub is not False:
- postprocessors.append({
+ yield {
'key': 'SponSkrub',
'path': opts.sponskrub_path,
'args': opts.sponskrub_args,
'cut': opts.sponskrub_cut,
'force': opts.sponskrub_force,
'ignoreerror': opts.sponskrub is None,
- })
+ '_from_cli': True,
+ }
if opts.embedthumbnail:
- already_have_thumbnail = opts.writethumbnail or opts.write_all_thumbnails
- postprocessors.append({
+ yield {
'key': 'EmbedThumbnail',
# already_have_thumbnail = True prevents the file from being deleted after embedding
- 'already_have_thumbnail': already_have_thumbnail
- })
- if not already_have_thumbnail:
+ 'already_have_thumbnail': opts.writethumbnail
+ }
+ if not opts.writethumbnail:
opts.writethumbnail = True
opts.outtmpl['pl_thumbnail'] = ''
if opts.split_chapters:
- postprocessors.append({
+ yield {
'key': 'FFmpegSplitChapters',
'force_keyframes': opts.force_keyframes_at_cuts,
- })
+ }
# XAttrMetadataPP should be run after post-processors that may change file contents
if opts.xattrs:
- postprocessors.append({'key': 'XAttrMetadata'})
- # Exec must be the last PP
- if opts.exec_cmd:
- postprocessors.append({
+ yield {'key': 'XAttrMetadata'}
+ if opts.concat_playlist != 'never':
+ yield {
+ 'key': 'FFmpegConcat',
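+            # Unless --concat-playlist is "always", only playlists marked as multi_video are concatenated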
+ 'only_multi_video': opts.concat_playlist != 'always',
+ 'when': 'playlist',
+ }
+ # Exec must be the last PP of each category
+ for when, exec_cmd in opts.exec_cmd.items():
+ yield {
'key': 'Exec',
- 'exec_cmd': opts.exec_cmd,
- # Run this only after the files have been moved to their final locations
- 'when': 'after_move'
- })
+ 'exec_cmd': exec_cmd,
+ 'when': when,
+ }
- def report_args_compat(arg, name):
- warnings.append('%s given without specifying name. The arguments will be given to all %s' % (arg, name))
- if 'default' in opts.external_downloader_args:
- report_args_compat('--downloader-args', 'external downloaders')
+def parse_options(argv=None):
+ """ @returns (parser, opts, urls, ydl_opts) """
+ parser, opts, urls = parseOpts(argv)
+ urls = get_urls(urls, opts.batchfile, opts.verbose)
- if 'default-compat' in opts.postprocessor_args and 'default' not in opts.postprocessor_args:
- report_args_compat('--post-processor-args', 'post-processors')
- opts.postprocessor_args.setdefault('sponskrub', [])
- opts.postprocessor_args['default'] = opts.postprocessor_args['default-compat']
+ set_compat_opts(opts)
+ try:
+ warnings, deprecation_warnings = validate_options(opts)
+ except ValueError as err:
+ parser.error(f'{err}\n')
+
+ postprocessors = list(get_postprocessors(opts))
+
+ any_getting = (any(opts.forceprint.values()) or opts.dumpjson or opts.dump_single_json
+ or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail
+ or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration)
+
+ any_printing = opts.print_json
final_ext = (
opts.recodevideo if opts.recodevideo in FFmpegVideoConvertorPP.SUPPORTED_EXTS
@@ -579,11 +637,7 @@ def _real_main(argv=None):
else opts.audioformat if (opts.extractaudio and opts.audioformat != 'best')
else None)
- match_filter = (
- None if opts.match_filter is None
- else match_filter_func(opts.match_filter))
-
- ydl_opts = {
+ return parser, opts, urls, {
'usenetrc': opts.usenetrc,
'netrc_location': opts.netrc_location,
'username': opts.username,
@@ -604,6 +658,7 @@ def _real_main(argv=None):
'forcefilename': opts.getfilename,
'forceformat': opts.getformat,
'forceprint': opts.forceprint,
+ 'print_to_file': opts.print_to_file,
'forcejson': opts.dumpjson or opts.print_json,
'dump_single_json': opts.dump_single_json,
'force_write_download_archive': opts.force_write_download_archive,
@@ -632,6 +687,7 @@ def _real_main(argv=None):
'throttledratelimit': opts.throttledratelimit,
'overwrites': opts.overwrites,
'retries': opts.retries,
+ 'file_access_retries': opts.file_access_retries,
'fragment_retries': opts.fragment_retries,
'extractor_retries': opts.extractor_retries,
'skip_unavailable_fragments': opts.skip_unavailable_fragments,
@@ -649,7 +705,7 @@ def _real_main(argv=None):
'playlistreverse': opts.playlist_reverse,
'playlistrandom': opts.playlist_random,
'noplaylist': opts.noplaylist,
- 'logtostderr': outtmpl_default == '-',
+ 'logtostderr': opts.outtmpl.get('default') == '-',
'consoletitle': opts.consoletitle,
'nopart': opts.nopart,
'updatetime': opts.updatetime,
@@ -659,8 +715,8 @@ def _real_main(argv=None):
'allow_playlist_files': opts.allow_playlist_files,
'clean_infojson': opts.clean_infojson,
'getcomments': opts.getcomments,
- 'writethumbnail': opts.writethumbnail,
- 'write_all_thumbnails': opts.write_all_thumbnails,
+ 'writethumbnail': opts.writethumbnail is True,
+ 'write_all_thumbnails': opts.writethumbnail == 'all',
'writelink': opts.writelink,
'writeurllink': opts.writeurllink,
'writewebloclink': opts.writewebloclink,
@@ -685,18 +741,21 @@ def _real_main(argv=None):
'max_filesize': opts.max_filesize,
'min_views': opts.min_views,
'max_views': opts.max_views,
- 'daterange': date,
+ 'daterange': opts.date,
'cachedir': opts.cachedir,
'youtube_print_sig_code': opts.youtube_print_sig_code,
'age_limit': opts.age_limit,
- 'download_archive': download_archive_fn,
+ 'download_archive': opts.download_archive,
'break_on_existing': opts.break_on_existing,
'break_on_reject': opts.break_on_reject,
+ 'break_per_url': opts.break_per_url,
'skip_playlist_after_errors': opts.skip_playlist_after_errors,
'cookiefile': opts.cookiefile,
'cookiesfrombrowser': opts.cookiesfrombrowser,
+ 'legacyserverconnect': opts.legacy_server_connect,
'nocheckcertificate': opts.no_check_certificate,
'prefer_insecure': opts.prefer_insecure,
+ 'http_headers': opts.headers,
'proxy': opts.proxy,
'socket_timeout': opts.socket_timeout,
'bidi_workaround': opts.bidi_workaround,
@@ -710,6 +769,8 @@ def _real_main(argv=None):
'youtube_include_hls_manifest': opts.youtube_include_hls_manifest,
'encoding': opts.encoding,
'extract_flat': opts.extract_flat,
+ 'live_from_start': opts.live_from_start,
+ 'wait_for_video': opts.wait_for_video,
'mark_watched': opts.mark_watched,
'merge_output_format': opts.merge_output_format,
'final_ext': final_ext,
@@ -725,7 +786,7 @@ def _real_main(argv=None):
'list_thumbnails': opts.list_thumbnails,
'playlist_items': opts.playlist_items,
'xattr_set_filesize': opts.xattr_set_filesize,
- 'match_filter': match_filter,
+ 'match_filter': opts.match_filter,
'no_color': opts.no_color,
'ffmpeg_location': opts.ffmpeg_location,
'hls_prefer_native': opts.hls_prefer_native,
@@ -738,12 +799,35 @@ def _real_main(argv=None):
'geo_bypass': opts.geo_bypass,
'geo_bypass_country': opts.geo_bypass_country,
'geo_bypass_ip_block': opts.geo_bypass_ip_block,
- 'warnings': warnings,
- 'compat_opts': compat_opts,
+ '_warnings': warnings,
+ '_deprecation_warnings': deprecation_warnings,
+ 'compat_opts': opts.compat_opts,
}
+
+def _real_main(argv=None):
+ # Compatibility fixes for Windows
+ if sys.platform == 'win32':
+ # https://github.com/ytdl-org/youtube-dl/issues/820
+ codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None)
+
+ workaround_optparse_bug9161()
+
+ setproctitle('hypervideo')
+
+ parser, opts, all_urls, ydl_opts = parse_options(argv)
+
+ # Dump user agent
+ if opts.dump_user_agent:
+ ua = traverse_obj(opts.headers, 'User-Agent', casesense=False, default=std_headers['User-Agent'])
+ write_string(f'{ua}\n', out=sys.stdout)
+ sys.exit(0)
+
+ if print_extractor_information(opts, all_urls):
+ sys.exit(0)
+
with YoutubeDL(ydl_opts) as ydl:
- actual_use = len(all_urls) or opts.load_info_filename
+ actual_use = all_urls or opts.load_info_filename
# Remove cache dir
if opts.rm_cachedir:
@@ -761,7 +845,7 @@ def _real_main(argv=None):
retcode = ydl.download_with_info_file(expand_path(opts.load_info_filename))
else:
retcode = ydl.download(all_urls)
- except (MaxDownloadsReached, ExistingVideoReached, RejectedVideoReached):
+ except DownloadCancelled:
ydl.to_screen('Aborting remaining downloads')
retcode = 101
@@ -773,15 +857,21 @@ def main(argv=None):
_real_main(argv)
except DownloadError:
sys.exit(1)
- except SameFileError:
- sys.exit('ERROR: fixed output name but more than one file to download')
+ except SameFileError as e:
+ sys.exit(f'ERROR: {e}')
except KeyboardInterrupt:
sys.exit('\nERROR: Interrupted by user')
- except BrokenPipeError:
+ except BrokenPipeError as e:
# https://docs.python.org/3/library/signal.html#note-on-sigpipe
devnull = os.open(os.devnull, os.O_WRONLY)
os.dup2(devnull, sys.stdout.fileno())
- sys.exit(r'\nERROR: {err}')
+ sys.exit(f'\nERROR: {e}')
-__all__ = ['main', 'YoutubeDL', 'gen_extractors', 'list_extractors']
+__all__ = [
+ 'main',
+ 'YoutubeDL',
+ 'parse_options',
+ 'gen_extractors',
+ 'list_extractors',
+]
diff --git a/hypervideo_dl/__main__.py b/hypervideo_dl/__main__.py
index 49765e4..49765e4 100755..100644
--- a/hypervideo_dl/__main__.py
+++ b/hypervideo_dl/__main__.py
diff --git a/hypervideo_dl/aes.py b/hypervideo_dl/aes.py
index 60cdeb7..b37f0dd 100644
--- a/hypervideo_dl/aes.py
+++ b/hypervideo_dl/aes.py
@@ -2,8 +2,15 @@ from __future__ import unicode_literals
from math import ceil
-from .compat import compat_b64decode, compat_pycrypto_AES
-from .utils import bytes_to_intlist, intlist_to_bytes
+from .compat import (
+ compat_b64decode,
+ compat_ord,
+ compat_pycrypto_AES,
+)
+from .utils import (
+ bytes_to_intlist,
+ intlist_to_bytes,
+)
if compat_pycrypto_AES:
@@ -25,9 +32,55 @@ else:
return intlist_to_bytes(aes_gcm_decrypt_and_verify(*map(bytes_to_intlist, (data, key, tag, nonce))))
+def unpad_pkcs7(data):
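+    # PKCS#7: the value of the last byte is the number of padding bytes to strip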
+ return data[:-compat_ord(data[-1])]
+
+
BLOCK_SIZE_BYTES = 16
+def aes_ecb_encrypt(data, key, iv=None):
+ """
+ Encrypt with aes in ECB mode
+
+ @param {int[]} data cleartext
+ @param {int[]} key 16/24/32-Byte cipher key
+ @param {int[]} iv Unused for this mode
+ @returns {int[]} encrypted data
+ """
+ expanded_key = key_expansion(key)
+ block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES))
+
+ encrypted_data = []
+ for i in range(block_count):
+ block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]
+ encrypted_data += aes_encrypt(block, expanded_key)
+ encrypted_data = encrypted_data[:len(data)]
+
+ return encrypted_data
+
+
+def aes_ecb_decrypt(data, key, iv=None):
+ """
+ Decrypt with aes in ECB mode
+
+    @param {int[]} data ciphertext
+ @param {int[]} key 16/24/32-Byte cipher key
+ @param {int[]} iv Unused for this mode
+ @returns {int[]} decrypted data
+ """
+ expanded_key = key_expansion(key)
+ block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES))
+
+    decrypted_data = []
+    for i in range(block_count):
+        block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]
+        decrypted_data += aes_decrypt(block, expanded_key)
+    decrypted_data = decrypted_data[:len(data)]
+
+    return decrypted_data
+
+
def aes_ctr_decrypt(data, key, iv):
"""
Decrypt with aes in counter mode
@@ -464,5 +517,6 @@ __all__ = [
'aes_encrypt',
'aes_gcm_decrypt_and_verify',
'aes_gcm_decrypt_and_verify_bytes',
- 'key_expansion'
+ 'key_expansion',
+ 'unpad_pkcs7',
]
diff --git a/hypervideo_dl/compat.py b/hypervideo_dl/compat.py
index 5e0e5d8..bdea14c 100644
--- a/hypervideo_dl/compat.py
+++ b/hypervideo_dl/compat.py
@@ -2,6 +2,7 @@
import asyncio
import base64
+import collections
import ctypes
import getpass
import html
@@ -19,6 +20,7 @@ import shlex
import shutil
import socket
import struct
+import subprocess
import sys
import tokenize
import urllib
@@ -132,6 +134,16 @@ except AttributeError:
asyncio.run = compat_asyncio_run
+try: # >= 3.7
+ asyncio.tasks.all_tasks
+except AttributeError:
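+    # Python < 3.7: fall back to the Task.all_tasks classmethod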
+ asyncio.tasks.all_tasks = asyncio.tasks.Task.all_tasks
+
+try:
+ import websockets as compat_websockets
+except ImportError:
+ compat_websockets = None
+
# Python 3.8+ does not honor %HOME% on windows, but this breaks compatibility with youtube-dl
# See https://github.com/hypervideo/hypervideo/issues/792
# https://docs.python.org/3/library/os.path.html#os.path.expanduser
@@ -158,25 +170,45 @@ except ImportError:
except ImportError:
compat_pycrypto_AES = None
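+# Prefer the CFFI-based brotlicffi, falling back to the brotli C extension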
+try:
+ import brotlicffi as compat_brotli
+except ImportError:
+ try:
+ import brotli as compat_brotli
+ except ImportError:
+ compat_brotli = None
+
+WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
+
def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
if compat_os_name != 'nt':
return
- os.system('')
+ global WINDOWS_VT_MODE
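+    # Spawning a hidden console process enables VT escape processing for the current console (same trick as os.system(''))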
+ startupinfo = subprocess.STARTUPINFO()
+ startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
+ try:
+ subprocess.Popen('', shell=True, startupinfo=startupinfo)
+ WINDOWS_VT_MODE = True
+ except Exception:
+ pass
# Deprecated
compat_basestring = str
compat_chr = chr
+compat_filter = filter
compat_input = input
compat_integer_types = (int, )
compat_kwargs = lambda kwargs: kwargs
+compat_map = map
compat_numeric_types = (int, float, complex)
compat_str = str
compat_xpath = lambda xpath: xpath
compat_zip = zip
+compat_collections_abc = collections.abc
compat_HTMLParser = html.parser.HTMLParser
compat_HTTPError = urllib.error.HTTPError
compat_Struct = struct.Struct
@@ -223,6 +255,7 @@ compat_xml_parse_error = etree.ParseError
# Set public objects
__all__ = [
+ 'WINDOWS_VT_MODE',
'compat_HTMLParseError',
'compat_HTMLParser',
'compat_HTTPError',
@@ -232,7 +265,9 @@ __all__ = [
'compat_asyncio_run',
'compat_b64decode',
'compat_basestring',
+ 'compat_brotli',
'compat_chr',
+ 'compat_collections_abc',
'compat_cookiejar',
'compat_cookiejar_Cookie',
'compat_cookies',
@@ -242,6 +277,7 @@ __all__ = [
'compat_etree_fromstring',
'compat_etree_register_namespace',
'compat_expanduser',
+ 'compat_filter',
'compat_get_terminal_size',
'compat_getenv',
'compat_getpass',
@@ -253,6 +289,7 @@ __all__ = [
'compat_integer_types',
'compat_itertools_count',
'compat_kwargs',
+ 'compat_map',
'compat_numeric_types',
'compat_ord',
'compat_os_name',
@@ -284,6 +321,7 @@ __all__ = [
'compat_urllib_response',
'compat_urlparse',
'compat_urlretrieve',
+ 'compat_websockets',
'compat_xml_parse_error',
'compat_xpath',
'compat_zip',
diff --git a/hypervideo_dl/cookies.py b/hypervideo_dl/cookies.py
index 38fbdfa..f963729 100644
--- a/hypervideo_dl/cookies.py
+++ b/hypervideo_dl/cookies.py
@@ -1,3 +1,4 @@
+import contextlib
import ctypes
import json
import os
@@ -7,17 +8,22 @@ import subprocess
import sys
import tempfile
from datetime import datetime, timedelta, timezone
+from enum import Enum, auto
from hashlib import pbkdf2_hmac
-from .aes import aes_cbc_decrypt_bytes, aes_gcm_decrypt_and_verify_bytes
+from .aes import (
+ aes_cbc_decrypt_bytes,
+ aes_gcm_decrypt_and_verify_bytes,
+ unpad_pkcs7,
+)
from .compat import (
compat_b64decode,
compat_cookiejar_Cookie,
)
from .utils import (
- bug_reports_message,
+ error_to_str,
expand_path,
- process_communicate_or_kill,
+ Popen,
YoutubeDLCookieJar,
)
@@ -31,19 +37,16 @@ except ImportError:
try:
- import keyring
- KEYRING_AVAILABLE = True
- KEYRING_UNAVAILABLE_REASON = f'due to unknown reasons{bug_reports_message()}'
+ import secretstorage
+ SECRETSTORAGE_AVAILABLE = True
except ImportError:
- KEYRING_AVAILABLE = False
- KEYRING_UNAVAILABLE_REASON = (
- 'as the `keyring` module is not installed. '
- 'Please install by running `python3 -m pip install keyring`. '
- 'Depending on your platform, additional packages may be required '
- 'to access the keyring; see https://pypi.org/project/keyring')
+ SECRETSTORAGE_AVAILABLE = False
+ SECRETSTORAGE_UNAVAILABLE_REASON = (
+ 'as the `secretstorage` module is not installed. '
+ 'Please install by running `python3 -m pip install secretstorage`.')
except Exception as _err:
- KEYRING_AVAILABLE = False
- KEYRING_UNAVAILABLE_REASON = 'as the `keyring` module could not be initialized: %s' % _err
+ SECRETSTORAGE_AVAILABLE = False
+ SECRETSTORAGE_UNAVAILABLE_REASON = f'as the `secretstorage` module could not be initialized. {_err}'
CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'}
@@ -74,8 +77,8 @@ class YDLLogger:
def load_cookies(cookie_file, browser_specification, ydl):
cookie_jars = []
if browser_specification is not None:
- browser_name, profile = _parse_browser_specification(*browser_specification)
- cookie_jars.append(extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl)))
+ browser_name, profile, keyring = _parse_browser_specification(*browser_specification)
+ cookie_jars.append(extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl), keyring=keyring))
if cookie_file is not None:
cookie_file = expand_path(cookie_file)
@@ -87,13 +90,13 @@ def load_cookies(cookie_file, browser_specification, ydl):
return _merge_cookie_jars(cookie_jars)
-def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger()):
+def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger(), *, keyring=None):
if browser_name == 'firefox':
return _extract_firefox_cookies(profile, logger)
elif browser_name == 'safari':
return _extract_safari_cookies(profile, logger)
elif browser_name in CHROMIUM_BASED_BROWSERS:
- return _extract_chrome_cookies(browser_name, profile, logger)
+ return _extract_chrome_cookies(browser_name, profile, keyring, logger)
else:
raise ValueError('unknown browser: {}'.format(browser_name))
@@ -117,7 +120,7 @@ def _extract_firefox_cookies(profile, logger):
raise FileNotFoundError('could not find firefox cookies database in {}'.format(search_root))
logger.debug('Extracting cookies from: "{}"'.format(cookie_database_path))
- with tempfile.TemporaryDirectory(prefix='youtube_dl') as tmpdir:
+ with tempfile.TemporaryDirectory(prefix='hypervideo_dl') as tmpdir:
cursor = None
try:
cursor = _open_database_copy(cookie_database_path, tmpdir)
@@ -207,7 +210,7 @@ def _get_chromium_based_browser_settings(browser_name):
}
-def _extract_chrome_cookies(browser_name, profile, logger):
+def _extract_chrome_cookies(browser_name, profile, keyring, logger):
logger.info('Extracting cookies from {}'.format(browser_name))
if not SQLITE_AVAILABLE:
@@ -234,9 +237,9 @@ def _extract_chrome_cookies(browser_name, profile, logger):
raise FileNotFoundError('could not find {} cookies database in "{}"'.format(browser_name, search_root))
logger.debug('Extracting cookies from: "{}"'.format(cookie_database_path))
- decryptor = get_cookie_decryptor(config['browser_dir'], config['keyring_name'], logger)
+ decryptor = get_cookie_decryptor(config['browser_dir'], config['keyring_name'], logger, keyring=keyring)
- with tempfile.TemporaryDirectory(prefix='youtube_dl') as tmpdir:
+ with tempfile.TemporaryDirectory(prefix='hypervideo_dl') as tmpdir:
cursor = None
try:
cursor = _open_database_copy(cookie_database_path, tmpdir)
@@ -247,6 +250,7 @@ def _extract_chrome_cookies(browser_name, profile, logger):
'expires_utc, {} FROM cookies'.format(secure_column))
jar = YoutubeDLCookieJar()
failed_cookies = 0
+ unencrypted_cookies = 0
for host_key, name, value, encrypted_value, path, expires_utc, is_secure in cursor.fetchall():
host_key = host_key.decode('utf-8')
name = name.decode('utf-8')
@@ -258,6 +262,8 @@ def _extract_chrome_cookies(browser_name, profile, logger):
if value is None:
failed_cookies += 1
continue
+ else:
+ unencrypted_cookies += 1
cookie = compat_cookiejar_Cookie(
version=0, name=name, value=value, port=None, port_specified=False,
@@ -270,6 +276,9 @@ def _extract_chrome_cookies(browser_name, profile, logger):
else:
failed_message = ''
logger.info('Extracted {} cookies from {}{}'.format(len(jar), browser_name, failed_message))
+ counts = decryptor.cookie_counts.copy()
+ counts['unencrypted'] = unencrypted_cookies
+ logger.debug('cookie version breakdown: {}'.format(counts))
return jar
finally:
if cursor is not None:
@@ -305,10 +314,14 @@ class ChromeCookieDecryptor:
def decrypt(self, encrypted_value):
raise NotImplementedError
+ @property
+ def cookie_counts(self):
+ raise NotImplementedError
+
-def get_cookie_decryptor(browser_root, browser_keyring_name, logger):
+def get_cookie_decryptor(browser_root, browser_keyring_name, logger, *, keyring=None):
if sys.platform in ('linux', 'linux2'):
- return LinuxChromeCookieDecryptor(browser_keyring_name, logger)
+ return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring)
elif sys.platform == 'darwin':
return MacChromeCookieDecryptor(browser_keyring_name, logger)
elif sys.platform == 'win32':
@@ -319,13 +332,12 @@ def get_cookie_decryptor(browser_root, browser_keyring_name, logger):
class LinuxChromeCookieDecryptor(ChromeCookieDecryptor):
- def __init__(self, browser_keyring_name, logger):
+ def __init__(self, browser_keyring_name, logger, *, keyring=None):
self._logger = logger
self._v10_key = self.derive_key(b'peanuts')
- if KEYRING_AVAILABLE:
- self._v11_key = self.derive_key(_get_linux_keyring_password(browser_keyring_name))
- else:
- self._v11_key = None
+ password = _get_linux_keyring_password(browser_keyring_name, keyring, logger)
+ self._v11_key = None if password is None else self.derive_key(password)
+ self._cookie_counts = {'v10': 0, 'v11': 0, 'other': 0}
@staticmethod
def derive_key(password):
@@ -333,20 +345,27 @@ class LinuxChromeCookieDecryptor(ChromeCookieDecryptor):
# https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_linux.cc
return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1, key_length=16)
+ @property
+ def cookie_counts(self):
+ return self._cookie_counts
+
def decrypt(self, encrypted_value):
version = encrypted_value[:3]
ciphertext = encrypted_value[3:]
if version == b'v10':
+ self._cookie_counts['v10'] += 1
return _decrypt_aes_cbc(ciphertext, self._v10_key, self._logger)
elif version == b'v11':
+ self._cookie_counts['v11'] += 1
if self._v11_key is None:
- self._logger.warning(f'cannot decrypt cookie {KEYRING_UNAVAILABLE_REASON}', only_once=True)
+ self._logger.warning('cannot decrypt v11 cookies: no key found', only_once=True)
return None
return _decrypt_aes_cbc(ciphertext, self._v11_key, self._logger)
else:
+ self._cookie_counts['other'] += 1
return None
@@ -355,6 +374,7 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor):
self._logger = logger
password = _get_mac_keyring_password(browser_keyring_name, logger)
self._v10_key = None if password is None else self.derive_key(password)
+ self._cookie_counts = {'v10': 0, 'other': 0}
@staticmethod
def derive_key(password):
@@ -362,11 +382,16 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor):
# https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm
return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1003, key_length=16)
+ @property
+ def cookie_counts(self):
+ return self._cookie_counts
+
def decrypt(self, encrypted_value):
version = encrypted_value[:3]
ciphertext = encrypted_value[3:]
if version == b'v10':
+ self._cookie_counts['v10'] += 1
if self._v10_key is None:
self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True)
return None
@@ -374,6 +399,7 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor):
return _decrypt_aes_cbc(ciphertext, self._v10_key, self._logger)
else:
+ self._cookie_counts['other'] += 1
# other prefixes are considered 'old data' which were stored as plaintext
# https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm
return encrypted_value
@@ -383,12 +409,18 @@ class WindowsChromeCookieDecryptor(ChromeCookieDecryptor):
def __init__(self, browser_root, logger):
self._logger = logger
self._v10_key = _get_windows_v10_key(browser_root, logger)
+ self._cookie_counts = {'v10': 0, 'other': 0}
+
+ @property
+ def cookie_counts(self):
+ return self._cookie_counts
def decrypt(self, encrypted_value):
version = encrypted_value[:3]
ciphertext = encrypted_value[3:]
if version == b'v10':
+ self._cookie_counts['v10'] += 1
if self._v10_key is None:
self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True)
return None
@@ -408,6 +440,7 @@ class WindowsChromeCookieDecryptor(ChromeCookieDecryptor):
return _decrypt_aes_gcm(ciphertext, self._v10_key, nonce, authentication_tag, self._logger)
else:
+ self._cookie_counts['other'] += 1
# any other prefix means the data is DPAPI encrypted
# https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_win.cc
return _decrypt_windows_dpapi(encrypted_value, self._logger).decode('utf-8')
@@ -422,7 +455,10 @@ def _extract_safari_cookies(profile, logger):
cookies_path = os.path.expanduser('~/Library/Cookies/Cookies.binarycookies')
if not os.path.isfile(cookies_path):
- raise FileNotFoundError('could not find safari cookies database')
+ logger.debug('Trying secondary cookie location')
+ cookies_path = os.path.expanduser('~/Library/Containers/com.apple.Safari/Data/Library/Cookies/Cookies.binarycookies')
+ if not os.path.isfile(cookies_path):
+ raise FileNotFoundError('could not find safari cookies database')
with open(cookies_path, 'rb') as f:
cookies_data = f.read()
@@ -577,42 +613,220 @@ def parse_safari_cookies(data, jar=None, logger=YDLLogger()):
return jar
-def _get_linux_keyring_password(browser_keyring_name):
- password = keyring.get_password('{} Keys'.format(browser_keyring_name),
- '{} Safe Storage'.format(browser_keyring_name))
- if password is None:
- # this sometimes occurs in KDE because chrome does not check hasEntry and instead
- # just tries to read the value (which kwallet returns "") whereas keyring checks hasEntry
- # to verify this:
- # dbus-monitor "interface='org.kde.KWallet'" "type=method_return"
- # while starting chrome.
- # this may be a bug as the intended behaviour is to generate a random password and store
- # it, but that doesn't matter here.
- password = ''
- return password.encode('utf-8')
+class _LinuxDesktopEnvironment(Enum):
+ """
+ https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util.h
+ DesktopEnvironment
+ """
+ OTHER = auto()
+ CINNAMON = auto()
+ GNOME = auto()
+ KDE = auto()
+ PANTHEON = auto()
+ UNITY = auto()
+ XFCE = auto()
-def _get_mac_keyring_password(browser_keyring_name, logger):
- if KEYRING_AVAILABLE:
- logger.debug('using keyring to obtain password')
- password = keyring.get_password('{} Safe Storage'.format(browser_keyring_name), browser_keyring_name)
- return password.encode('utf-8')
+class _LinuxKeyring(Enum):
+ """
+ https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_util_linux.h
+ SelectedLinuxBackend
+ """
+ KWALLET = auto()
+ GNOMEKEYRING = auto()
+ BASICTEXT = auto()
+
+
+SUPPORTED_KEYRINGS = _LinuxKeyring.__members__.keys()
+
+
+def _get_linux_desktop_environment(env):
+ """
+ https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util.cc
+ GetDesktopEnvironment
+ """
+ xdg_current_desktop = env.get('XDG_CURRENT_DESKTOP', None)
+ desktop_session = env.get('DESKTOP_SESSION', None)
+ if xdg_current_desktop is not None:
+ xdg_current_desktop = xdg_current_desktop.split(':')[0].strip()
+
+ if xdg_current_desktop == 'Unity':
+ if desktop_session is not None and 'gnome-fallback' in desktop_session:
+ return _LinuxDesktopEnvironment.GNOME
+ else:
+ return _LinuxDesktopEnvironment.UNITY
+ elif xdg_current_desktop == 'GNOME':
+ return _LinuxDesktopEnvironment.GNOME
+ elif xdg_current_desktop == 'X-Cinnamon':
+ return _LinuxDesktopEnvironment.CINNAMON
+ elif xdg_current_desktop == 'KDE':
+ return _LinuxDesktopEnvironment.KDE
+ elif xdg_current_desktop == 'Pantheon':
+ return _LinuxDesktopEnvironment.PANTHEON
+ elif xdg_current_desktop == 'XFCE':
+ return _LinuxDesktopEnvironment.XFCE
+ elif desktop_session is not None:
+ if desktop_session in ('mate', 'gnome'):
+ return _LinuxDesktopEnvironment.GNOME
+ elif 'kde' in desktop_session:
+ return _LinuxDesktopEnvironment.KDE
+ elif 'xfce' in desktop_session:
+ return _LinuxDesktopEnvironment.XFCE
else:
- logger.debug('using find-generic-password to obtain password')
- proc = subprocess.Popen(['security', 'find-generic-password',
- '-w', # write password to stdout
- '-a', browser_keyring_name, # match 'account'
- '-s', '{} Safe Storage'.format(browser_keyring_name)], # match 'service'
- stdout=subprocess.PIPE,
- stderr=subprocess.DEVNULL)
- try:
- stdout, stderr = process_communicate_or_kill(proc)
- if stdout[-1:] == b'\n':
- stdout = stdout[:-1]
- return stdout
- except BaseException as e:
- logger.warning(f'exception running find-generic-password: {type(e).__name__}({e})')
- return None
+ if 'GNOME_DESKTOP_SESSION_ID' in env:
+ return _LinuxDesktopEnvironment.GNOME
+ elif 'KDE_FULL_SESSION' in env:
+ return _LinuxDesktopEnvironment.KDE
+ return _LinuxDesktopEnvironment.OTHER
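+    # illustrative behaviour of the detection above (env values hypothetical):
+    #   _get_linux_desktop_environment({'XDG_CURRENT_DESKTOP': 'KDE:plasma'}) -> KDE
+    #   _get_linux_desktop_environment({})                                    -> OTHER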
+
+
+def _choose_linux_keyring(logger):
+ """
+ https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_util_linux.cc
+ SelectBackend
+ """
+ desktop_environment = _get_linux_desktop_environment(os.environ)
+ logger.debug('detected desktop environment: {}'.format(desktop_environment.name))
+ if desktop_environment == _LinuxDesktopEnvironment.KDE:
+ linux_keyring = _LinuxKeyring.KWALLET
+ elif desktop_environment == _LinuxDesktopEnvironment.OTHER:
+ linux_keyring = _LinuxKeyring.BASICTEXT
+ else:
+ linux_keyring = _LinuxKeyring.GNOMEKEYRING
+ return linux_keyring
+
+
+def _get_kwallet_network_wallet(logger):
+ """ The name of the wallet used to store network passwords.
+
+ https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/kwallet_dbus.cc
+ KWalletDBus::NetworkWallet
+ which does a dbus call to the following function:
+ https://api.kde.org/frameworks/kwallet/html/classKWallet_1_1Wallet.html
+ Wallet::NetworkWallet
+ """
+ default_wallet = 'kdewallet'
+ try:
+ proc = Popen([
+ 'dbus-send', '--session', '--print-reply=literal',
+ '--dest=org.kde.kwalletd5',
+ '/modules/kwalletd5',
+ 'org.kde.KWallet.networkWallet'
+ ], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
+
+ stdout, stderr = proc.communicate_or_kill()
+ if proc.returncode != 0:
+ logger.warning('failed to read NetworkWallet')
+ return default_wallet
+ else:
+ network_wallet = stdout.decode('utf-8').strip()
+ logger.debug('NetworkWallet = "{}"'.format(network_wallet))
+ return network_wallet
+ except Exception as e:
+ logger.warning('exception while obtaining NetworkWallet: {}'.format(e))
+ return default_wallet
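+    # the dbus call above is equivalent to the shell command
+    #   dbus-send --session --print-reply=literal --dest=org.kde.kwalletd5 \
+    #       /modules/kwalletd5 org.kde.KWallet.networkWallet
+    # which typically prints the network wallet name, e.g. "kdewallet"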
+
+
+def _get_kwallet_password(browser_keyring_name, logger):
+ logger.debug('using kwallet-query to obtain password from kwallet')
+
+ if shutil.which('kwallet-query') is None:
+        logger.error('kwallet-query command not found. KWallet and kwallet-query '
+                     'must be installed to read from KWallet. kwallet-query should be '
+                     'included in the kwallet package for your distribution')
+ return b''
+
+ network_wallet = _get_kwallet_network_wallet(logger)
+
+ try:
+ proc = Popen([
+ 'kwallet-query',
+ '--read-password', '{} Safe Storage'.format(browser_keyring_name),
+ '--folder', '{} Keys'.format(browser_keyring_name),
+ network_wallet
+ ], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
+
+ stdout, stderr = proc.communicate_or_kill()
+ if proc.returncode != 0:
+ logger.error('kwallet-query failed with return code {}. Please consult '
+ 'the kwallet-query man page for details'.format(proc.returncode))
+ return b''
+ else:
+ if stdout.lower().startswith(b'failed to read'):
+ logger.debug('failed to read password from kwallet. Using empty string instead')
+                # this sometimes occurs in KDE because chrome does not check hasEntry and instead
+                # just tries to read the value (for which kwallet returns ""), whereas kwallet-query
+ # checks hasEntry. To verify this:
+ # dbus-monitor "interface='org.kde.KWallet'" "type=method_return"
+ # while starting chrome.
+ # this may be a bug as the intended behaviour is to generate a random password and store
+ # it, but that doesn't matter here.
+ return b''
+ else:
+ logger.debug('password found')
+ if stdout[-1:] == b'\n':
+ stdout = stdout[:-1]
+ return stdout
+ except Exception as e:
+ logger.warning(f'exception running kwallet-query: {error_to_str(e)}')
+ return b''
+
+
+def _get_gnome_keyring_password(browser_keyring_name, logger):
+ if not SECRETSTORAGE_AVAILABLE:
+ logger.error('secretstorage not available {}'.format(SECRETSTORAGE_UNAVAILABLE_REASON))
+ return b''
+    # the Gnome keyring does not seem to organise keys in the same way as KWallet;
+    # using `dbus-monitor` during startup, it can be observed that chromium lists all keys
+    # and presumably searches for its key in the list. It appears that we must do the same.
+ # https://github.com/jaraco/keyring/issues/556
+ with contextlib.closing(secretstorage.dbus_init()) as con:
+ col = secretstorage.get_default_collection(con)
+ for item in col.get_all_items():
+ if item.get_label() == '{} Safe Storage'.format(browser_keyring_name):
+ return item.get_secret()
+ else:
+ logger.error('failed to read from keyring')
+ return b''
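+        # note: the for/else above takes the error branch only when the loop
+        # completes without finding a matching label (i.e. without returning)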
+
+
+def _get_linux_keyring_password(browser_keyring_name, keyring, logger):
+ # note: chrome/chromium can be run with the following flags to determine which keyring backend
+ # it has chosen to use
+ # chromium --enable-logging=stderr --v=1 2>&1 | grep key_storage_
+ # Chromium supports a flag: --password-store=<basic|gnome|kwallet> so the automatic detection
+ # will not be sufficient in all cases.
+
+ keyring = _LinuxKeyring[keyring] if keyring else _choose_linux_keyring(logger)
+ logger.debug(f'Chosen keyring: {keyring.name}')
+
+ if keyring == _LinuxKeyring.KWALLET:
+ return _get_kwallet_password(browser_keyring_name, logger)
+ elif keyring == _LinuxKeyring.GNOMEKEYRING:
+ return _get_gnome_keyring_password(browser_keyring_name, logger)
+ elif keyring == _LinuxKeyring.BASICTEXT:
+ # when basic text is chosen, all cookies are stored as v10 (so no keyring password is required)
+ return None
+ assert False, f'Unknown keyring {keyring}'
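+    # illustrative call: on a KDE session with no explicit keyring override,
+    # _get_linux_keyring_password('Chromium', None, logger) auto-selects
+    # KWALLET and shells out to kwallet-query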
+
+
+def _get_mac_keyring_password(browser_keyring_name, logger):
+ logger.debug('using find-generic-password to obtain password from OSX keychain')
+ try:
+ proc = Popen(
+ ['security', 'find-generic-password',
+ '-w', # write password to stdout
+ '-a', browser_keyring_name, # match 'account'
+ '-s', '{} Safe Storage'.format(browser_keyring_name)], # match 'service'
+ stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
+
+ stdout, stderr = proc.communicate_or_kill()
+ if stdout[-1:] == b'\n':
+ stdout = stdout[:-1]
+ return stdout
+ except Exception as e:
+ logger.warning(f'exception running find-generic-password: {error_to_str(e)}')
+ return None
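+    # shell equivalent of the lookup above (browser name 'Chrome' illustrative):
+    #   security find-generic-password -w -a Chrome -s 'Chrome Safe Storage'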
def _get_windows_v10_key(browser_root, logger):
@@ -620,7 +834,7 @@ def _get_windows_v10_key(browser_root, logger):
if path is None:
logger.error('could not find local state file')
return None
- with open(path, 'r') as f:
+ with open(path, 'r', encoding='utf8') as f:
data = json.load(f)
try:
base64_key = data['os_crypt']['encrypted_key']
@@ -640,10 +854,9 @@ def pbkdf2_sha1(password, salt, iterations, key_length):
def _decrypt_aes_cbc(ciphertext, key, logger, initialization_vector=b' ' * 16):
- plaintext = aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector)
- padding_length = plaintext[-1]
+ plaintext = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector))
try:
- return plaintext[:-padding_length].decode('utf-8')
+ return plaintext.decode('utf-8')
except UnicodeDecodeError:
logger.warning('failed to decrypt cookie (AES-CBC) because UTF-8 decoding failed. Possibly the key is wrong?', only_once=True)
return None
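+    # PKCS#7 sketch: the last byte of the plaintext encodes the padding length,
+    # so unpadding (what unpad_pkcs7 is assumed to do) amounts to data[:-data[-1]],
+    # e.g. b'cookie\x02\x02' -> b'cookie'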
@@ -736,10 +949,11 @@ def _is_path(value):
return os.path.sep in value
-def _parse_browser_specification(browser_name, profile=None):
- browser_name = browser_name.lower()
+def _parse_browser_specification(browser_name, profile=None, keyring=None):
if browser_name not in SUPPORTED_BROWSERS:
raise ValueError(f'unsupported browser: "{browser_name}"')
+ if keyring not in (None, *SUPPORTED_KEYRINGS):
+ raise ValueError(f'unsupported keyring: "{keyring}"')
if profile is not None and _is_path(profile):
profile = os.path.expanduser(profile)
- return browser_name, profile
+ return browser_name, profile, keyring
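+
+
+# e.g. (values illustrative):
+#   _parse_browser_specification('chrome', '~/.config/google-chrome/Default', 'KWALLET')
+#   -> ('chrome', '/home/<user>/.config/google-chrome/Default', 'KWALLET')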
diff --git a/hypervideo_dl/downloader/__init__.py b/hypervideo_dl/downloader/__init__.py
index 2449c74..96d484d 100644
--- a/hypervideo_dl/downloader/__init__.py
+++ b/hypervideo_dl/downloader/__init__.py
@@ -12,10 +12,15 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N
info_copy = info_dict.copy()
info_copy['to_stdout'] = to_stdout
- downloaders = [_get_suitable_downloader(info_copy, proto, params, default)
- for proto in (protocol or info_copy['protocol']).split('+')]
+ protocols = (protocol or info_copy['protocol']).split('+')
+ downloaders = [_get_suitable_downloader(info_copy, proto, params, default) for proto in protocols]
+
if set(downloaders) == {FFmpegFD} and FFmpegFD.can_merge_formats(info_copy, params):
return FFmpegFD
+ elif (set(downloaders) == {DashSegmentsFD}
+ and not (to_stdout and len(protocols) > 1)
+ and set(protocols) == {'http_dash_segments_generator'}):
+ return DashSegmentsFD
elif len(downloaders) == 1:
return downloaders[0]
return None
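+    # e.g. a live DASH format whose protocol is
+    # 'http_dash_segments_generator+http_dash_segments_generator' resolves every
+    # component to DashSegmentsFD, which then handles the merge itself (unless
+    # writing multiple protocols to stdout)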
@@ -25,6 +30,7 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N
from .common import FileDownloader
from .dash import DashSegmentsFD
from .f4m import F4mFD
+from .fc2 import FC2LiveFD
from .hls import HlsFD
from .http import HttpFD
from .rtmp import RtmpFD
@@ -41,6 +47,7 @@ from .external import (
PROTOCOL_MAP = {
'rtmp': RtmpFD,
+ 'rtmpe': RtmpFD,
'rtmp_ffmpeg': FFmpegFD,
'm3u8_native': HlsFD,
'm3u8': FFmpegFD,
@@ -48,9 +55,11 @@ PROTOCOL_MAP = {
'rtsp': RtspFD,
'f4m': F4mFD,
'http_dash_segments': DashSegmentsFD,
+ 'http_dash_segments_generator': DashSegmentsFD,
'ism': IsmFD,
'mhtml': MhtmlFD,
'niconico_dmc': NiconicoDmcFD,
+ 'fc2_live': FC2LiveFD,
'websocket_frag': WebSocketFragmentFD,
'youtube_live_chat': YoutubeLiveChatFD,
'youtube_live_chat_replay': YoutubeLiveChatFD,
@@ -62,6 +71,7 @@ def shorten_protocol_name(proto, simplify=False):
'm3u8_native': 'm3u8_n',
'rtmp_ffmpeg': 'rtmp_f',
'http_dash_segments': 'dash',
+ 'http_dash_segments_generator': 'dash_g',
'niconico_dmc': 'dmc',
'websocket_frag': 'WSfrag',
}
@@ -70,6 +80,7 @@ def shorten_protocol_name(proto, simplify=False):
'https': 'http',
'ftps': 'ftp',
'm3u8_native': 'm3u8',
+ 'http_dash_segments_generator': 'dash',
'rtmp_ffmpeg': 'rtmp',
'm3u8_frag_urls': 'm3u8',
'dash_frag_urls': 'dash',
@@ -108,7 +119,7 @@ def _get_suitable_downloader(info_dict, protocol, params, default):
return FFmpegFD
elif (external_downloader or '').lower() == 'native':
return HlsFD
- elif get_suitable_downloader(
+ elif protocol == 'm3u8_native' and get_suitable_downloader(
info_dict, params, None, protocol='m3u8_frag_urls', to_stdout=info_dict['to_stdout']):
return HlsFD
elif params.get('hls_prefer_native') is True:
diff --git a/hypervideo_dl/downloader/common.py b/hypervideo_dl/downloader/common.py
index 27ca2cd..7cef3e8 100644
--- a/hypervideo_dl/downloader/common.py
+++ b/hypervideo_dl/downloader/common.py
@@ -4,14 +4,17 @@ import os
import re
import time
import random
+import errno
from ..utils import (
decodeArgument,
encodeFilename,
error_to_compat_str,
format_bytes,
+ sanitize_open,
shell_quote,
timeconvert,
+ timetuple_from_msec,
)
from ..minicurses import (
MultilineLogger,
@@ -38,6 +41,7 @@ class FileDownloader(object):
ratelimit: Download speed limit, in bytes/sec.
throttledratelimit: Assume the download is being throttled below this speed (bytes/sec)
retries: Number of times to retry for HTTP error 5xx
+ file_access_retries: Number of times to retry on file access error
buffersize: Size of download buffer in bytes.
noresizebuffer: Do not automatically resize the download buffer.
continuedl: Try to continue downloads if possible.
@@ -75,14 +79,12 @@ class FileDownloader(object):
@staticmethod
def format_seconds(seconds):
- (mins, secs) = divmod(seconds, 60)
- (hours, mins) = divmod(mins, 60)
- if hours > 99:
+ time = timetuple_from_msec(seconds * 1000)
+ if time.hours > 99:
return '--:--:--'
- if hours == 0:
- return '%02d:%02d' % (mins, secs)
- else:
- return '%02d:%02d:%02d' % (hours, mins, secs)
+ if not time.hours:
+ return '%02d:%02d' % time[1:-1]
+ return '%02d:%02d:%02d' % time[:-1]
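+    # e.g. format_seconds(75) -> '01:15'; format_seconds(3675) -> '01:01:15';
+    # anything at or above 100 hours renders as '--:--:--'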
@staticmethod
def calc_percent(byte_counter, data_len):
@@ -94,6 +96,8 @@ class FileDownloader(object):
def format_percent(percent):
if percent is None:
return '---.-%'
+ elif percent == 100:
+ return '100%'
return '%6s' % ('%3.1f%%' % percent)
@staticmethod
@@ -155,7 +159,7 @@ class FileDownloader(object):
return int(round(number * multiplier))
def to_screen(self, *args, **kargs):
- self.ydl.to_stdout(*args, quiet=self.params.get('quiet'), **kargs)
+ self.ydl.to_screen(*args, quiet=self.params.get('quiet'), **kargs)
def to_stderr(self, message):
self.ydl.to_stderr(message)
@@ -206,13 +210,41 @@ class FileDownloader(object):
def ytdl_filename(self, filename):
return filename + '.ytdl'
+ def wrap_file_access(action, *, fatal=False):
+ def outer(func):
+ def inner(self, *args, **kwargs):
+ file_access_retries = self.params.get('file_access_retries', 0)
+ retry = 0
+ while True:
+ try:
+ return func(self, *args, **kwargs)
+ except (IOError, OSError) as err:
+ retry = retry + 1
+ if retry > file_access_retries or err.errno not in (errno.EACCES, errno.EINVAL):
+ if not fatal:
+ self.report_error(f'unable to {action} file: {err}')
+ return
+ raise
+ self.to_screen(
+ f'[download] Unable to {action} file due to file access error. '
+ f'Retrying (attempt {retry} of {self.format_retries(file_access_retries)}) ...')
+ time.sleep(0.01)
+ return inner
+ return outer
+
+ @wrap_file_access('open', fatal=True)
+ def sanitize_open(self, filename, open_mode):
+ return sanitize_open(filename, open_mode)
+
+ @wrap_file_access('remove')
+ def try_remove(self, filename):
+ os.remove(filename)
+
+ @wrap_file_access('rename')
def try_rename(self, old_filename, new_filename):
if old_filename == new_filename:
return
- try:
- os.replace(old_filename, new_filename)
- except (IOError, OSError) as err:
- self.report_error(f'unable to rename file: {err}')
+ os.replace(old_filename, new_filename)
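+    # with file_access_retries set (exposed as --file-access-retries), each
+    # method wrapped above is retried on EACCES/EINVAL with a 10 ms sleep
+    # between attempts, e.g. when another process briefly locks the file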
def try_utime(self, filename, last_modified_hdr):
"""Try to set the last-modified time of the given file."""
@@ -245,14 +277,32 @@ class FileDownloader(object):
elif self.ydl.params.get('logger'):
self._multiline = MultilineLogger(self.ydl.params['logger'], lines)
elif self.params.get('progress_with_newline'):
- self._multiline = BreaklineStatusPrinter(self.ydl._screen_file, lines)
+ self._multiline = BreaklineStatusPrinter(self.ydl._out_files['screen'], lines)
else:
- self._multiline = MultilinePrinter(self.ydl._screen_file, lines, not self.params.get('quiet'))
+ self._multiline = MultilinePrinter(self.ydl._out_files['screen'], lines, not self.params.get('quiet'))
+ self._multiline.allow_colors = self._multiline._HAVE_FULLCAP and not self.params.get('no_color')
def _finish_multiline_status(self):
self._multiline.end()
- def _report_progress_status(self, s):
+ _progress_styles = {
+ 'downloaded_bytes': 'light blue',
+ 'percent': 'light blue',
+ 'eta': 'yellow',
+ 'speed': 'green',
+ 'elapsed': 'bold white',
+ 'total_bytes': '',
+ 'total_bytes_estimate': '',
+ }
+
+ def _report_progress_status(self, s, default_template):
+ for name, style in self._progress_styles.items():
+ name = f'_{name}_str'
+ if name not in s:
+ continue
+ s[name] = self._format_progress(s[name], style)
+ s['_default_template'] = default_template % s
+
progress_dict = s.copy()
progress_dict.pop('info_dict')
progress_dict = {'info': s['info_dict'], 'progress': progress_dict}
@@ -265,6 +315,10 @@ class FileDownloader(object):
progress_template.get('download-title') or 'hypervideo %(progress._default_template)s',
progress_dict))
+ def _format_progress(self, *args, **kwargs):
+ return self.ydl._format_text(
+ self._multiline.stream, self._multiline.allow_colors, *args, **kwargs)
+
def report_progress(self, s):
if s['status'] == 'finished':
if self.params.get('noprogress'):
@@ -277,8 +331,7 @@ class FileDownloader(object):
s['_elapsed_str'] = self.format_seconds(s['elapsed'])
msg_template += ' in %(_elapsed_str)s'
s['_percent_str'] = self.format_percent(100)
- s['_default_template'] = msg_template % s
- self._report_progress_status(s)
+ self._report_progress_status(s, msg_template)
return
if s['status'] != 'downloading':
@@ -287,7 +340,7 @@ class FileDownloader(object):
if s.get('eta') is not None:
s['_eta_str'] = self.format_eta(s['eta'])
else:
- s['_eta_str'] = 'Unknown ETA'
+ s['_eta_str'] = 'Unknown'
if s.get('total_bytes') and s.get('downloaded_bytes') is not None:
s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes'])
@@ -319,9 +372,12 @@ class FileDownloader(object):
else:
msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s'
else:
- msg_template = '%(_percent_str)s % at %(_speed_str)s ETA %(_eta_str)s'
- s['_default_template'] = msg_template % s
- self._report_progress_status(s)
+ msg_template = '%(_percent_str)s at %(_speed_str)s ETA %(_eta_str)s'
+ if s.get('fragment_index') and s.get('fragment_count'):
+ msg_template += ' (frag %(fragment_index)s/%(fragment_count)s)'
+ elif s.get('fragment_index'):
+ msg_template += ' (frag %(fragment_index)s)'
+ self._report_progress_status(s, msg_template)
def report_resuming_byte(self, resume_len):
"""Report attempt to resume at given byte."""
@@ -372,6 +428,7 @@ class FileDownloader(object):
'status': 'finished',
'total_bytes': os.path.getsize(encodeFilename(filename)),
}, info_dict)
+ self._finish_multiline_status()
return True, False
if subtitle is False:
diff --git a/hypervideo_dl/downloader/dash.py b/hypervideo_dl/downloader/dash.py
index 6444ad6..a845ee7 100644
--- a/hypervideo_dl/downloader/dash.py
+++ b/hypervideo_dl/downloader/dash.py
@@ -1,4 +1,5 @@
from __future__ import unicode_literals
+import time
from ..downloader import get_suitable_downloader
from .fragment import FragmentFD
@@ -15,27 +16,53 @@ class DashSegmentsFD(FragmentFD):
FD_NAME = 'dashsegments'
def real_download(self, filename, info_dict):
- if info_dict.get('is_live'):
+ if info_dict.get('is_live') and set(info_dict['protocol'].split('+')) != {'http_dash_segments_generator'}:
self.report_error('Live DASH videos are not supported')
- fragment_base_url = info_dict.get('fragment_base_url')
- fragments = info_dict['fragments'][:1] if self.params.get(
- 'test', False) else info_dict['fragments']
-
+ real_start = time.time()
real_downloader = get_suitable_downloader(
info_dict, self.params, None, protocol='dash_frag_urls', to_stdout=(filename == '-'))
- ctx = {
- 'filename': filename,
- 'total_frags': len(fragments),
- }
+ requested_formats = [{**info_dict, **fmt} for fmt in info_dict.get('requested_formats', [])]
+ args = []
+ for fmt in requested_formats or [info_dict]:
+ try:
+ fragment_count = 1 if self.params.get('test') else len(fmt['fragments'])
+ except TypeError:
+ fragment_count = None
+ ctx = {
+ 'filename': fmt.get('filepath') or filename,
+ 'live': 'is_from_start' if fmt.get('is_from_start') else fmt.get('is_live'),
+ 'total_frags': fragment_count,
+ }
+
+ if real_downloader:
+ self._prepare_external_frag_download(ctx)
+ else:
+ self._prepare_and_start_frag_download(ctx, fmt)
+ ctx['start'] = real_start
+
+ fragments_to_download = self._get_fragments(fmt, ctx)
+
+ if real_downloader:
+ self.to_screen(
+ '[%s] Fragment downloads will be delegated to %s' % (self.FD_NAME, real_downloader.get_basename()))
+ info_dict['fragments'] = list(fragments_to_download)
+ fd = real_downloader(self.ydl, self.params)
+ return fd.real_download(filename, info_dict)
+
+ args.append([ctx, fragments_to_download, fmt])
- if real_downloader:
- self._prepare_external_frag_download(ctx)
- else:
- self._prepare_and_start_frag_download(ctx, info_dict)
+ return self.download_and_append_fragments_multiple(*args)
+
+ def _resolve_fragments(self, fragments, ctx):
+ fragments = fragments(ctx) if callable(fragments) else fragments
+ return [next(iter(fragments))] if self.params.get('test') else fragments
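+        # note: fmt['fragments'] may be a list or a callable returning a fragment
+        # iterator (the http_dash_segments_generator protocol passes a generator
+        # for live streams); in test mode only the first fragment is downloaded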
+
+ def _get_fragments(self, fmt, ctx):
+ fragment_base_url = fmt.get('fragment_base_url')
+ fragments = self._resolve_fragments(fmt['fragments'], ctx)
- fragments_to_download = []
frag_index = 0
for i, fragment in enumerate(fragments):
frag_index += 1
@@ -46,17 +73,8 @@ class DashSegmentsFD(FragmentFD):
assert fragment_base_url
fragment_url = urljoin(fragment_base_url, fragment['path'])
- fragments_to_download.append({
+ yield {
'frag_index': frag_index,
'index': i,
'url': fragment_url,
- })
-
- if real_downloader:
- self.to_screen(
- '[%s] Fragment downloads will be delegated to %s' % (self.FD_NAME, real_downloader.get_basename()))
- info_dict['fragments'] = fragments_to_download
- fd = real_downloader(self.ydl, self.params)
- return fd.real_download(filename, info_dict)
-
- return self.download_and_append_fragments(ctx, fragments_to_download, info_dict)
+ }
diff --git a/hypervideo_dl/downloader/external.py b/hypervideo_dl/downloader/external.py
index 74adb05..b99dc37 100644
--- a/hypervideo_dl/downloader/external.py
+++ b/hypervideo_dl/downloader/external.py
@@ -13,17 +13,18 @@ from ..compat import (
)
from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS
from ..utils import (
+ classproperty,
cli_option,
cli_valueless_option,
cli_bool_option,
_configuration_args,
+ determine_ext,
encodeFilename,
encodeArgument,
handle_youtubedl_headers,
check_executable,
- is_outdated_version,
- process_communicate_or_kill,
- sanitize_open,
+ Popen,
+ remove_end,
)
@@ -73,17 +74,23 @@ class ExternalFD(FragmentFD):
def get_basename(cls):
return cls.__name__[:-2].lower()
+ @classproperty
+ def EXE_NAME(cls):
+ return cls.get_basename()
+
@property
def exe(self):
- return self.get_basename()
+ return self.EXE_NAME
@classmethod
def available(cls, path=None):
- path = check_executable(path or cls.get_basename(), [cls.AVAILABLE_OPT])
- if path:
- cls.exe = path
- return path
- return False
+ path = check_executable(
+ cls.EXE_NAME if path in (None, cls.get_basename()) else path,
+ [cls.AVAILABLE_OPT])
+ if not path:
+ return False
+ cls.exe = path
+ return path
@classmethod
def supports(cls, info_dict):
@@ -106,7 +113,7 @@ class ExternalFD(FragmentFD):
def _configuration_args(self, keys=None, *args, **kwargs):
return _configuration_args(
- self.get_basename(), self.params.get('external_downloader_args'), self.get_basename(),
+ self.get_basename(), self.params.get('external_downloader_args'), self.EXE_NAME,
keys, *args, **kwargs)
def _call_downloader(self, tmpfilename, info_dict):
@@ -116,9 +123,8 @@ class ExternalFD(FragmentFD):
self._debug_cmd(cmd)
if 'fragments' not in info_dict:
- p = subprocess.Popen(
- cmd, stderr=subprocess.PIPE)
- _, stderr = process_communicate_or_kill(p)
+ p = Popen(cmd, stderr=subprocess.PIPE)
+ _, stderr = p.communicate_or_kill()
if p.returncode != 0:
self.to_stderr(stderr.decode('utf-8', 'replace'))
return p.returncode
@@ -128,9 +134,8 @@ class ExternalFD(FragmentFD):
count = 0
while count <= fragment_retries:
- p = subprocess.Popen(
- cmd, stderr=subprocess.PIPE)
- _, stderr = process_communicate_or_kill(p)
+ p = Popen(cmd, stderr=subprocess.PIPE)
+ _, stderr = p.communicate_or_kill()
if p.returncode == 0:
break
# TODO: Decide whether to retry based on error code
@@ -147,23 +152,23 @@ class ExternalFD(FragmentFD):
return -1
decrypt_fragment = self.decrypter(info_dict)
- dest, _ = sanitize_open(tmpfilename, 'wb')
+ dest, _ = self.sanitize_open(tmpfilename, 'wb')
for frag_index, fragment in enumerate(info_dict['fragments']):
fragment_filename = '%s-Frag%d' % (tmpfilename, frag_index)
try:
- src, _ = sanitize_open(fragment_filename, 'rb')
- except IOError:
+ src, _ = self.sanitize_open(fragment_filename, 'rb')
+ except IOError as err:
if skip_unavailable_fragments and frag_index > 1:
- self.to_screen('[%s] Skipping fragment %d ...' % (self.get_basename(), frag_index))
+ self.report_skip_fragment(frag_index, err)
continue
- self.report_error('Unable to open fragment %d' % frag_index)
+ self.report_error(f'Unable to open fragment {frag_index}; {err}')
return -1
dest.write(decrypt_fragment(fragment, src.read()))
src.close()
if not self.params.get('keep_fragments', False):
- os.remove(encodeFilename(fragment_filename))
+ self.try_remove(encodeFilename(fragment_filename))
dest.close()
- os.remove(encodeFilename('%s.frag.urls' % tmpfilename))
+ self.try_remove(encodeFilename('%s.frag.urls' % tmpfilename))
return 0
@@ -171,7 +176,7 @@ class CurlFD(ExternalFD):
AVAILABLE_OPT = '-V'
def _make_cmd(self, tmpfilename, info_dict):
- cmd = [self.exe, '--location', '-o', tmpfilename]
+ cmd = [self.exe, '--location', '-o', tmpfilename, '--compressed']
if info_dict.get('http_headers') is not None:
for key, val in info_dict['http_headers'].items():
cmd += ['--header', '%s: %s' % (key, val)]
@@ -199,8 +204,8 @@ class CurlFD(ExternalFD):
self._debug_cmd(cmd)
# curl writes the progress to stderr so don't capture it.
- p = subprocess.Popen(cmd)
- process_communicate_or_kill(p)
+ p = Popen(cmd)
+ p.communicate_or_kill()
return p.returncode
@@ -221,7 +226,7 @@ class WgetFD(ExternalFD):
AVAILABLE_OPT = '--version'
def _make_cmd(self, tmpfilename, info_dict):
- cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies']
+ cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies', '--compression=auto']
if info_dict.get('http_headers') is not None:
for key, val in info_dict['http_headers'].items():
cmd += ['--header', '%s: %s' % (key, val)]
@@ -232,7 +237,10 @@ class WgetFD(ExternalFD):
retry[1] = '0'
cmd += retry
cmd += self._option('--bind-address', 'source_address')
- cmd += self._option('--proxy', 'proxy')
+ proxy = self.params.get('proxy')
+ if proxy:
+ for var in ('http_proxy', 'https_proxy'):
+ cmd += ['--execute', '%s=%s' % (var, proxy)]
cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate')
cmd += self._configuration_args()
cmd += ['--', info_dict['url']]
@@ -255,7 +263,7 @@ class Aria2cFD(ExternalFD):
def _make_cmd(self, tmpfilename, info_dict):
cmd = [self.exe, '-c',
'--console-log-level=warn', '--summary-interval=0', '--download-result=hide',
- '--file-allocation=none', '-x16', '-j16', '-s16']
+ '--http-accept-gzip=true', '--file-allocation=none', '-x16', '-j16', '-s16']
if 'fragments' in info_dict:
cmd += ['--allow-overwrite=true', '--allow-piece-length-change=true']
else:
@@ -269,6 +277,7 @@ class Aria2cFD(ExternalFD):
cmd += self._option('--all-proxy', 'proxy')
cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=')
cmd += self._bool_option('--remote-time', 'updatetime', 'true', 'false', '=')
+ cmd += self._bool_option('--show-console-readout', 'noprogress', 'false', 'true', '=')
cmd += self._configuration_args()
# aria2c strips out spaces from the beginning/end of filenames and paths.
@@ -293,7 +302,7 @@ class Aria2cFD(ExternalFD):
for frag_index, fragment in enumerate(info_dict['fragments']):
fragment_filename = '%s-Frag%d' % (os.path.basename(tmpfilename), frag_index)
url_list.append('%s\n\tout=%s' % (fragment['url'], fragment_filename))
- stream, _ = sanitize_open(url_list_file, 'wb')
+ stream, _ = self.sanitize_open(url_list_file, 'wb')
stream.write('\n'.join(url_list).encode('utf-8'))
stream.close()
cmd += ['-i', url_list_file]
@@ -304,10 +313,7 @@ class Aria2cFD(ExternalFD):
class HttpieFD(ExternalFD):
AVAILABLE_OPT = '--version'
-
- @classmethod
- def available(cls, path=None):
- return ExternalFD.available(cls, path or 'http')
+ EXE_NAME = 'http'
def _make_cmd(self, tmpfilename, info_dict):
cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']]
@@ -446,8 +452,7 @@ class FFmpegFD(ExternalFD):
if info_dict.get('requested_formats') or protocol == 'http_dash_segments':
for (i, fmt) in enumerate(info_dict.get('requested_formats') or [info_dict]):
stream_number = fmt.get('manifest_stream_number', 0)
- a_or_v = 'a' if fmt.get('acodec') != 'none' else 'v'
- args.extend(['-map', f'{i}:{a_or_v}:{stream_number}'])
+ args.extend(['-map', f'{i}:{stream_number}'])
if self.params.get('test', False):
args += ['-fs', compat_str(self._TEST_FILE_SIZE)]
@@ -461,12 +466,21 @@ class FFmpegFD(ExternalFD):
args += ['-f', 'mpegts']
else:
args += ['-f', 'mp4']
- if (ffpp.basename == 'ffmpeg' and is_outdated_version(ffpp._versions['ffmpeg'], '3.2', False)) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')):
+ if (ffpp.basename == 'ffmpeg' and ffpp._features.get('needs_adtstoasc')) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')):
args += ['-bsf:a', 'aac_adtstoasc']
elif protocol == 'rtmp':
args += ['-f', 'flv']
elif ext == 'mp4' and tmpfilename == '-':
args += ['-f', 'mpegts']
+ elif ext == 'unknown_video':
+ ext = determine_ext(remove_end(tmpfilename, '.part'))
+ if ext == 'unknown_video':
+ self.report_warning(
+ 'The video format is unknown and cannot be downloaded by ffmpeg. '
+ 'Explicitly set the extension in the filename to attempt download in that format')
+ else:
+ self.report_warning(f'The video format is unknown. Trying to download as {ext} according to the filename')
+ args += ['-f', EXT_TO_OUT_FORMATS.get(ext, ext)]
else:
args += ['-f', EXT_TO_OUT_FORMATS.get(ext, ext)]
@@ -476,7 +490,7 @@ class FFmpegFD(ExternalFD):
args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True))
self._debug_cmd(args)
- proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env)
+ proc = Popen(args, stdin=subprocess.PIPE, env=env)
if url in ('-', 'pipe:'):
self.on_process_started(proc, proc.stdin)
try:
@@ -488,7 +502,7 @@ class FFmpegFD(ExternalFD):
# streams). Note that Windows is not affected and produces playable
# files (see https://github.com/ytdl-org/youtube-dl/issues/8300).
if isinstance(e, KeyboardInterrupt) and sys.platform != 'win32' and url not in ('-', 'pipe:'):
- process_communicate_or_kill(proc, b'q')
+ proc.communicate_or_kill(b'q')
else:
proc.kill()
proc.wait()
@@ -500,11 +514,13 @@ class AVconvFD(FFmpegFD):
pass
-_BY_NAME = dict(
- (klass.get_basename(), klass)
+_BY_NAME = {
+ klass.get_basename(): klass
for name, klass in globals().items()
if name.endswith('FD') and name not in ('ExternalFD', 'FragmentFD')
-)
+}
+
+_BY_EXE = {klass.EXE_NAME: klass for klass in _BY_NAME.values()}
def list_external_downloaders():
@@ -516,4 +532,4 @@ def get_external_downloader(external_downloader):
        downloader. """
# Drop .exe extension on Windows
bn = os.path.splitext(os.path.basename(external_downloader))[0]
- return _BY_NAME.get(bn)
+ return _BY_NAME.get(bn, _BY_EXE.get(bn))
diff --git a/hypervideo_dl/downloader/f4m.py b/hypervideo_dl/downloader/f4m.py
index 9da2776..0008b7c 100644
--- a/hypervideo_dl/downloader/f4m.py
+++ b/hypervideo_dl/downloader/f4m.py
@@ -366,7 +366,7 @@ class F4mFD(FragmentFD):
ctx = {
'filename': filename,
'total_frags': total_frags,
- 'live': live,
+ 'live': bool(live),
}
self._prepare_frag_download(ctx)
diff --git a/hypervideo_dl/downloader/fc2.py b/hypervideo_dl/downloader/fc2.py
new file mode 100644
index 0000000..157bcf2
--- /dev/null
+++ b/hypervideo_dl/downloader/fc2.py
@@ -0,0 +1,41 @@
+from __future__ import division, unicode_literals
+
+import threading
+
+from .common import FileDownloader
+from .external import FFmpegFD
+
+
+class FC2LiveFD(FileDownloader):
+ """
+    Downloads FC2 live without being stopped.
+    Note: this is not part of the public API and may be removed without notice.
+    DO NOT USE
+ """
+
+ def real_download(self, filename, info_dict):
+ ws = info_dict['ws']
+
+ heartbeat_lock = threading.Lock()
+ heartbeat_state = [None, 1]
+
+ def heartbeat():
+ try:
+ heartbeat_state[1] += 1
+ ws.send('{"name":"heartbeat","arguments":{},"id":%d}' % heartbeat_state[1])
+ except Exception:
+ self.to_screen('[fc2:live] Heartbeat failed')
+
+ with heartbeat_lock:
+ heartbeat_state[0] = threading.Timer(30, heartbeat)
+        heartbeat_state[0].daemon = True  # use the public property rather than the private attribute
+ heartbeat_state[0].start()
+
+ heartbeat()
+
+ new_info_dict = info_dict.copy()
+ new_info_dict.update({
+ 'ws': None,
+ 'protocol': 'live_ffmpeg',
+ })
+ return FFmpegFD(self.ydl, self.params or {}).download(filename, new_info_dict)
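+
+
+# note: heartbeat() above re-arms itself -- each call sends one keepalive frame
+# over the websocket and schedules the next call 30 seconds later, so the
+# stream stays open while FFmpegFD performs the actual download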
diff --git a/hypervideo_dl/downloader/fragment.py b/hypervideo_dl/downloader/fragment.py
index 57068db..a991c6d 100644
--- a/hypervideo_dl/downloader/fragment.py
+++ b/hypervideo_dl/downloader/fragment.py
@@ -1,9 +1,10 @@
from __future__ import division, unicode_literals
+import http.client
+import json
+import math
import os
import time
-import json
-from math import ceil
try:
import concurrent.futures
@@ -13,8 +14,9 @@ except ImportError:
from .common import FileDownloader
from .http import HttpFD
-from ..aes import aes_cbc_decrypt_bytes
+from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
from ..compat import (
+ compat_os_name,
compat_urllib_error,
compat_struct_pack,
)
@@ -22,8 +24,8 @@ from ..utils import (
DownloadError,
error_to_compat_str,
encodeFilename,
- sanitize_open,
sanitized_Request,
+ traverse_obj,
)
@@ -31,6 +33,10 @@ class HttpQuietDownloader(HttpFD):
def to_screen(self, *args, **kargs):
pass
+ def report_retry(self, err, count, retries):
+ super().to_screen(
+ f'[download] Got server HTTP error: {err}. Retrying (attempt {count} of {self.format_retries(retries)}) ...')
+
class FragmentFD(FileDownloader):
"""
@@ -44,6 +50,7 @@ class FragmentFD(FileDownloader):
Skip unavailable fragments (DASH and hlsnative only)
keep_fragments: Keep downloaded fragments on disk after downloading is
finished
+ concurrent_fragment_downloads: The number of threads to use for native hls and dash downloads
_no_ytdl_file: Don't use .ytdl file
For each incomplete fragment download hypervideo keeps on disk a special
@@ -72,8 +79,9 @@ class FragmentFD(FileDownloader):
'\r[download] Got server HTTP error: %s. Retrying fragment %d (attempt %d of %s) ...'
% (error_to_compat_str(err), frag_index, count, self.format_retries(retries)))
- def report_skip_fragment(self, frag_index):
- self.to_screen('[download] Skipping fragment %d ...' % frag_index)
+ def report_skip_fragment(self, frag_index, err=None):
+ err = f' {err};' if err else ''
+ self.to_screen(f'[download]{err} Skipping fragment {frag_index:d} ...')
def _prepare_url(self, info_dict, url):
headers = info_dict.get('http_headers')
@@ -84,11 +92,11 @@ class FragmentFD(FileDownloader):
self._start_frag_download(ctx, info_dict)
def __do_ytdl_file(self, ctx):
- return not ctx['live'] and not ctx['tmpfilename'] == '-' and not self.params.get('_no_ytdl_file')
+ return ctx['live'] is not True and ctx['tmpfilename'] != '-' and not self.params.get('_no_ytdl_file')
def _read_ytdl_file(self, ctx):
assert 'ytdl_corrupt' not in ctx
- stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r')
+ stream, _ = self.sanitize_open(self.ytdl_filename(ctx['filename']), 'r')
try:
ytdl_data = json.loads(stream.read())
ctx['fragment_index'] = ytdl_data['downloader']['current_fragment']['index']
@@ -100,7 +108,7 @@ class FragmentFD(FileDownloader):
stream.close()
def _write_ytdl_file(self, ctx):
- frag_index_stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'w')
+ frag_index_stream, _ = self.sanitize_open(self.ytdl_filename(ctx['filename']), 'w')
try:
downloader = {
'current_fragment': {
@@ -125,14 +133,19 @@ class FragmentFD(FileDownloader):
}
success = ctx['dl'].download(fragment_filename, fragment_info_dict)
if not success:
- return False, None
+ return False
if fragment_info_dict.get('filetime'):
ctx['fragment_filetime'] = fragment_info_dict.get('filetime')
ctx['fragment_filename_sanitized'] = fragment_filename
- return True, self._read_fragment(ctx)
+ return True
def _read_fragment(self, ctx):
- down, frag_sanitized = sanitize_open(ctx['fragment_filename_sanitized'], 'rb')
+ try:
+ down, frag_sanitized = self.sanitize_open(ctx['fragment_filename_sanitized'], 'rb')
+ except FileNotFoundError:
+ if ctx.get('live'):
+ return None
+ raise
ctx['fragment_filename_sanitized'] = frag_sanitized
frag_content = down.read()
down.close()
@@ -146,7 +159,7 @@ class FragmentFD(FileDownloader):
if self.__do_ytdl_file(ctx):
self._write_ytdl_file(ctx)
if not self.params.get('keep_fragments', False):
- os.remove(encodeFilename(ctx['fragment_filename_sanitized']))
+ self.try_remove(encodeFilename(ctx['fragment_filename_sanitized']))
del ctx['fragment_filename_sanitized']
def _prepare_frag_download(self, ctx):
@@ -165,8 +178,8 @@ class FragmentFD(FileDownloader):
dl = HttpQuietDownloader(
self.ydl,
{
- 'continuedl': True,
- 'quiet': True,
+ 'continuedl': self.params.get('continuedl', True),
+ 'quiet': self.params.get('quiet'),
'noprogress': True,
'ratelimit': self.params.get('ratelimit'),
'retries': self.params.get('retries', 0),
@@ -208,7 +221,7 @@ class FragmentFD(FileDownloader):
self._write_ytdl_file(ctx)
assert ctx['fragment_index'] == 0
- dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode)
+ dest_stream, tmpfilename = self.sanitize_open(tmpfilename, open_mode)
ctx.update({
'dl': dl,
@@ -236,6 +249,7 @@ class FragmentFD(FileDownloader):
start = time.time()
ctx.update({
'started': start,
+ 'fragment_started': start,
# Amount of fragment's bytes downloaded by the time of the previous
# frag progress hook invocation
'prev_frag_downloaded_bytes': 0,
@@ -266,6 +280,9 @@ class FragmentFD(FileDownloader):
ctx['fragment_index'] = state['fragment_index']
state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes']
ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes']
+ ctx['speed'] = state['speed'] = self.calc_speed(
+ ctx['fragment_started'], time_now, frag_total_bytes)
+ ctx['fragment_started'] = time.time()
ctx['prev_frag_downloaded_bytes'] = 0
else:
frag_downloaded_bytes = s['downloaded_bytes']
@@ -274,8 +291,8 @@ class FragmentFD(FileDownloader):
state['eta'] = self.calc_eta(
start, time_now, estimated_size - resume_len,
state['downloaded_bytes'] - resume_len)
- state['speed'] = s.get('speed') or ctx.get('speed')
- ctx['speed'] = state['speed']
+ ctx['speed'] = state['speed'] = self.calc_speed(
+ ctx['fragment_started'], time_now, frag_downloaded_bytes)
ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes
self._hook_progress(state, info_dict)
@@ -288,7 +305,7 @@ class FragmentFD(FileDownloader):
if self.__do_ytdl_file(ctx):
ytdl_filename = encodeFilename(self.ytdl_filename(ctx['filename']))
if os.path.isfile(ytdl_filename):
- os.remove(ytdl_filename)
+ self.try_remove(ytdl_filename)
elapsed = time.time() - ctx['started']
if ctx['tmpfilename'] == '-':
@@ -355,9 +372,7 @@ class FragmentFD(FileDownloader):
# not what it decrypts to.
if self.params.get('test', False):
return frag_content
- padding_len = 16 - (len(frag_content) % 16)
- decrypted_data = aes_cbc_decrypt_bytes(frag_content + bytes([padding_len] * padding_len), decrypt_info['KEY'], iv)
- return decrypted_data[:-decrypted_data[-1]]
+ return unpad_pkcs7(aes_cbc_decrypt_bytes(frag_content, decrypt_info['KEY'], iv))
return decrypt_fragment
@@ -366,64 +381,105 @@ class FragmentFD(FileDownloader):
@params (ctx1, fragments1, info_dict1), (ctx2, fragments2, info_dict2), ...
all args must be either tuple or list
'''
+ interrupt_trigger = [True]
max_progress = len(args)
if max_progress == 1:
return self.download_and_append_fragments(*args[0], pack_func=pack_func, finish_func=finish_func)
- max_workers = self.params.get('concurrent_fragment_downloads', max_progress)
- self._prepare_multiline_status(max_progress)
+ max_workers = self.params.get('concurrent_fragment_downloads', 1)
+ if max_progress > 1:
+ self._prepare_multiline_status(max_progress)
+ is_live = any(traverse_obj(args, (..., 2, 'is_live'), default=[]))
def thread_func(idx, ctx, fragments, info_dict, tpe):
ctx['max_progress'] = max_progress
ctx['progress_idx'] = idx
- return self.download_and_append_fragments(ctx, fragments, info_dict, pack_func=pack_func, finish_func=finish_func, tpe=tpe)
+ return self.download_and_append_fragments(
+ ctx, fragments, info_dict, pack_func=pack_func, finish_func=finish_func,
+ tpe=tpe, interrupt_trigger=interrupt_trigger)
class FTPE(concurrent.futures.ThreadPoolExecutor):
# has to stop this or it's going to wait on the worker thread itself
def __exit__(self, exc_type, exc_val, exc_tb):
pass
+ if compat_os_name == 'nt':
+ def future_result(future):
+ while True:
+ try:
+ return future.result(0.1)
+ except KeyboardInterrupt:
+ raise
+ except concurrent.futures.TimeoutError:
+ continue
+ else:
+ def future_result(future):
+ return future.result()
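+        # (a bare future.result() on Windows blocks Ctrl+C from being delivered,
+        # hence the 0.1 s polling variant in the 'nt' branch above)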
+
+ def interrupt_trigger_iter(fg):
+ for f in fg:
+ if not interrupt_trigger[0]:
+ break
+ yield f
+
spins = []
for idx, (ctx, fragments, info_dict) in enumerate(args):
- tpe = FTPE(ceil(max_workers / max_progress))
- job = tpe.submit(thread_func, idx, ctx, fragments, info_dict, tpe)
+ tpe = FTPE(math.ceil(max_workers / max_progress))
+ job = tpe.submit(thread_func, idx, ctx, interrupt_trigger_iter(fragments), info_dict, tpe)
spins.append((tpe, job))
result = True
for tpe, job in spins:
try:
- result = result and job.result()
+ result = result and future_result(job)
+ except KeyboardInterrupt:
+ interrupt_trigger[0] = False
finally:
tpe.shutdown(wait=True)
+ if not interrupt_trigger[0] and not is_live:
+ raise KeyboardInterrupt()
+    # we expect the user wants to stop and DOES want the preceding postprocessors to run;
+    # so we return an intermediate result here instead of raising KeyboardInterrupt on live
return result
- def download_and_append_fragments(self, ctx, fragments, info_dict, *, pack_func=None, finish_func=None, tpe=None):
+ def download_and_append_fragments(
+ self, ctx, fragments, info_dict, *, pack_func=None, finish_func=None,
+ tpe=None, interrupt_trigger=None):
+ if not interrupt_trigger:
+ interrupt_trigger = (True, )
+
fragment_retries = self.params.get('fragment_retries', 0)
- is_fatal = (lambda idx: idx == 0) if self.params.get('skip_unavailable_fragments', True) else (lambda _: True)
+ is_fatal = (
+ ((lambda _: False) if info_dict.get('is_live') else (lambda idx: idx == 0))
+ if self.params.get('skip_unavailable_fragments', True) else (lambda _: True))
+
if not pack_func:
pack_func = lambda frag_content, _: frag_content
def download_fragment(fragment, ctx):
+ if not interrupt_trigger[0]:
+ return
+
frag_index = ctx['fragment_index'] = fragment['frag_index']
+ ctx['last_error'] = None
headers = info_dict.get('http_headers', {}).copy()
byte_range = fragment.get('byte_range')
if byte_range:
headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'] - 1)
# Never skip the first fragment
- fatal = is_fatal(fragment.get('index') or (frag_index - 1))
- count, frag_content = 0, None
+ fatal, count = is_fatal(fragment.get('index') or (frag_index - 1)), 0
while count <= fragment_retries:
try:
- success, frag_content = self._download_fragment(ctx, fragment['url'], info_dict, headers)
- if not success:
- return False, frag_index
- break
- except compat_urllib_error.HTTPError as err:
+ if self._download_fragment(ctx, fragment['url'], info_dict, headers):
+ break
+ return
+ except (compat_urllib_error.HTTPError, http.client.IncompleteRead) as err:
# Unavailable (possibly temporary) fragments may be served.
# First we try to retry then either skip or abort.
# See https://github.com/ytdl-org/youtube-dl/issues/10165,
# https://github.com/ytdl-org/youtube-dl/issues/10448).
count += 1
+ ctx['last_error'] = err
if count <= fragment_retries:
self.report_retry_fragment(err, frag_index, count, fragment_retries)
except DownloadError:
@@ -433,49 +489,46 @@ class FragmentFD(FileDownloader):
break
raise
- if count > fragment_retries:
- if not fatal:
- return False, frag_index
+ if count > fragment_retries and fatal:
ctx['dest_stream'].close()
self.report_error('Giving up after %s fragment retries' % fragment_retries)
- return False, frag_index
- return frag_content, frag_index
def append_fragment(frag_content, frag_index, ctx):
- if not frag_content:
- if not is_fatal(frag_index - 1):
- self.report_skip_fragment(frag_index)
- return True
- else:
- ctx['dest_stream'].close()
- self.report_error(
- 'fragment %s not found, unable to continue' % frag_index)
- return False
- self._append_fragment(ctx, pack_func(frag_content, frag_index))
+ if frag_content:
+ self._append_fragment(ctx, pack_func(frag_content, frag_index))
+ elif not is_fatal(frag_index - 1):
+ self.report_skip_fragment(frag_index, 'fragment not found')
+ else:
+ ctx['dest_stream'].close()
+ self.report_error(f'fragment {frag_index} not found, unable to continue')
+ return False
return True
decrypt_fragment = self.decrypter(info_dict)
- max_workers = self.params.get('concurrent_fragment_downloads', 1)
+ max_workers = math.ceil(
+ self.params.get('concurrent_fragment_downloads', 1) / ctx.get('max_progress', 1))
if can_threaded_download and max_workers > 1:
def _download_fragment(fragment):
ctx_copy = ctx.copy()
- frag_content, frag_index = download_fragment(fragment, ctx_copy)
- return fragment, frag_content, frag_index, ctx_copy.get('fragment_filename_sanitized')
+ download_fragment(fragment, ctx_copy)
+ return fragment, fragment['frag_index'], ctx_copy.get('fragment_filename_sanitized')
            self.report_warning('The download speed shown is only for one thread. This is a known issue and patches are welcome')
with tpe or concurrent.futures.ThreadPoolExecutor(max_workers) as pool:
- for fragment, frag_content, frag_index, frag_filename in pool.map(_download_fragment, fragments):
+ for fragment, frag_index, frag_filename in pool.map(_download_fragment, fragments):
ctx['fragment_filename_sanitized'] = frag_filename
ctx['fragment_index'] = frag_index
- result = append_fragment(decrypt_fragment(fragment, frag_content), frag_index, ctx)
+ result = append_fragment(decrypt_fragment(fragment, self._read_fragment(ctx)), frag_index, ctx)
if not result:
return False
else:
for fragment in fragments:
- frag_content, frag_index = download_fragment(fragment, ctx)
- result = append_fragment(decrypt_fragment(fragment, frag_content), frag_index, ctx)
+ if not interrupt_trigger[0]:
+ break
+ download_fragment(fragment, ctx)
+ result = append_fragment(decrypt_fragment(fragment, self._read_fragment(ctx)), fragment['frag_index'], ctx)
if not result:
return False
diff --git a/hypervideo_dl/downloader/hls.py b/hypervideo_dl/downloader/hls.py
index ef8a81b..f3f32b5 100644
--- a/hypervideo_dl/downloader/hls.py
+++ b/hypervideo_dl/downloader/hls.py
@@ -77,6 +77,15 @@ class HlsFD(FragmentFD):
message = ('The stream has AES-128 encryption and neither ffmpeg nor pycryptodome are available; '
'Decryption will be performed natively, but will be extremely slow')
if not can_download:
+ has_drm = re.search('|'.join([
+ r'#EXT-X-FAXS-CM:', # Adobe Flash Access
+ r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
+ ]), s)
+ if has_drm and not self.params.get('allow_unplayable_formats'):
+ self.report_error(
+ 'This video is DRM protected; Try selecting another format with --format or '
+ 'add --check-formats to automatically fallback to the next best format')
+ return False
message = message or 'Unsupported features have been detected'
fd = FFmpegFD(self.ydl, self.params)
self.report_warning(f'{message}; extraction will be delegated to {fd.get_basename()}')
diff --git a/hypervideo_dl/downloader/http.py b/hypervideo_dl/downloader/http.py
index 2e95bb9..591a9b0 100644
--- a/hypervideo_dl/downloader/http.py
+++ b/hypervideo_dl/downloader/http.py
@@ -1,29 +1,30 @@
from __future__ import unicode_literals
-import errno
import os
-import socket
+import ssl
import time
import random
-import re
from .common import FileDownloader
from ..compat import (
- compat_str,
compat_urllib_error,
+ compat_http_client
)
from ..utils import (
ContentTooShortError,
encodeFilename,
int_or_none,
- sanitize_open,
+ parse_http_range,
sanitized_Request,
ThrottledDownload,
+ try_call,
write_xattr,
XAttrMetadataError,
XAttrUnavailableError,
)
+RESPONSE_READ_EXCEPTIONS = (TimeoutError, ConnectionError, ssl.SSLError, compat_http_client.HTTPException)
+
class HttpFD(FileDownloader):
def real_download(self, filename, info_dict):
@@ -54,11 +55,11 @@ class HttpFD(FileDownloader):
ctx.open_mode = 'wb'
ctx.resume_len = 0
- ctx.data_len = None
ctx.block_size = self.params.get('buffersize', 1024)
ctx.start_time = time.time()
- ctx.chunk_size = None
- throttle_start = None
+
+ # parse given Range
+ req_start, req_end, _ = parse_http_range(headers.get('Range'))
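+        # assumed helper behaviour (sketch): parse_http_range('bytes=500-999')
+        # -> (500, 999, None); an open-ended 'bytes=500-' -> (500, None, None)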
if self.params.get('continuedl', True):
# Establish possible resume length
@@ -81,43 +82,50 @@ class HttpFD(FileDownloader):
class NextFragment(Exception):
pass
- def set_range(req, start, end):
- range_header = 'bytes=%d-' % start
- if end:
- range_header += compat_str(end)
- req.add_header('Range', range_header)
-
def establish_connection():
ctx.chunk_size = (random.randint(int(chunk_size * 0.95), chunk_size)
if not is_test and chunk_size else chunk_size)
if ctx.resume_len > 0:
range_start = ctx.resume_len
+ if req_start is not None:
+ # offset the beginning of Range to be within request
+ range_start += req_start
if ctx.is_resume:
self.report_resuming_byte(ctx.resume_len)
ctx.open_mode = 'ab'
+ elif req_start is not None:
+ range_start = req_start
elif ctx.chunk_size > 0:
range_start = 0
else:
range_start = None
ctx.is_resume = False
- range_end = range_start + ctx.chunk_size - 1 if ctx.chunk_size else None
- if range_end and ctx.data_len is not None and range_end >= ctx.data_len:
- range_end = ctx.data_len - 1
- has_range = range_start is not None
- ctx.has_range = has_range
+
+ if ctx.chunk_size:
+ chunk_aware_end = range_start + ctx.chunk_size - 1
+ # we're not allowed to download outside Range
+ range_end = chunk_aware_end if req_end is None else min(chunk_aware_end, req_end)
+ elif req_end is not None:
+ # there's no need for chunked downloads, so download until the end of Range
+ range_end = req_end
+ else:
+ range_end = None
+
+ if try_call(lambda: range_start > range_end):
+ ctx.resume_len = 0
+ ctx.open_mode = 'wb'
+ raise RetryDownload(Exception(f'Conflicting range. (start={range_start} > end={range_end})'))
+
+ if try_call(lambda: range_end >= ctx.content_len):
+ range_end = ctx.content_len - 1
+
request = sanitized_Request(url, request_data, headers)
+ has_range = range_start is not None
if has_range:
- set_range(request, range_start, range_end)
+ request.add_header('Range', f'bytes={int(range_start)}-{int_or_none(range_end) or ""}')
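+            # e.g. with range_start=0 and range_end=None this sends the
+            # open-ended header 'Range: bytes=0-'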
# Establish connection
try:
- try:
- ctx.data = self.ydl.urlopen(request)
- except (compat_urllib_error.URLError, ) as err:
- # reason may not be available, e.g. for urllib2.HTTPError on python 2.6
- reason = getattr(err, 'reason', None)
- if isinstance(reason, socket.timeout):
- raise RetryDownload(err)
- raise err
+ ctx.data = self.ydl.urlopen(request)
# When trying to resume, Content-Range HTTP header of response has to be checked
# to match the value of requested Range HTTP header. This is due to a webservers
# that don't support resuming and serve a whole file with no Content-Range
@@ -125,31 +133,27 @@ class HttpFD(FileDownloader):
# https://github.com/ytdl-org/youtube-dl/issues/6057#issuecomment-126129799)
if has_range:
content_range = ctx.data.headers.get('Content-Range')
- if content_range:
- content_range_m = re.search(r'bytes (\d+)-(\d+)?(?:/(\d+))?', content_range)
+ content_range_start, content_range_end, content_len = parse_http_range(content_range)
+ if content_range_start is not None and range_start == content_range_start:
# Content-Range is present and matches requested Range, resume is possible
- if content_range_m:
- if range_start == int(content_range_m.group(1)):
- content_range_end = int_or_none(content_range_m.group(2))
- content_len = int_or_none(content_range_m.group(3))
- accept_content_len = (
- # Non-chunked download
- not ctx.chunk_size
- # Chunked download and requested piece or
- # its part is promised to be served
- or content_range_end == range_end
- or content_len < range_end)
- if accept_content_len:
- ctx.data_len = content_len
- return
+ accept_content_len = (
+ # Non-chunked download
+ not ctx.chunk_size
+ # Chunked download and requested piece or
+ # its part is promised to be served
+ or content_range_end == range_end
+ or content_len < range_end)
+ if accept_content_len:
+ ctx.content_len = content_len
+ ctx.data_len = min(content_len, req_end or content_len) - (req_start or 0)
+ return
# Content-Range is either not present or invalid. Assuming remote webserver is
# trying to send the whole file, resume is not possible, so wiping the local file
# and performing entire redownload
self.report_unable_to_resume()
ctx.resume_len = 0
ctx.open_mode = 'wb'
- ctx.data_len = int_or_none(ctx.data.info().get('Content-length', None))
- return
+ ctx.data_len = ctx.content_len = int_or_none(ctx.data.info().get('Content-length', None))
except (compat_urllib_error.HTTPError, ) as err:
if err.code == 416:
# Unable to resume (requested range not satisfiable)
@@ -191,14 +195,16 @@ class HttpFD(FileDownloader):
# Unexpected HTTP error
raise
raise RetryDownload(err)
- except socket.error as err:
- if err.errno != errno.ECONNRESET:
- # Connection reset is no problem, just retry
+ except compat_urllib_error.URLError as err:
+ if isinstance(err.reason, ssl.CertificateError):
raise
raise RetryDownload(err)
+ # In urllib.request.AbstractHTTPHandler, the response is partially read on request.
+ # Any errors that occur during this will not be wrapped by URLError
+ except RESPONSE_READ_EXCEPTIONS as err:
+ raise RetryDownload(err)
def download():
- nonlocal throttle_start
data_len = ctx.data.info().get('Content-length', None)
# Range HTTP header may be ignored/unsupported by a webserver
@@ -241,16 +247,8 @@ class HttpFD(FileDownloader):
try:
# Download and write
data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter))
- # socket.timeout is a subclass of socket.error but may not have
- # errno set
- except socket.timeout as e:
- retry(e)
- except socket.error as e:
- # SSLError on python 2 (inherits socket.error) may have
- # no errno set but this error message
- if e.errno in (errno.ECONNRESET, errno.ETIMEDOUT) or getattr(e, 'message', None) == 'The read operation timed out':
- retry(e)
- raise
+ except RESPONSE_READ_EXCEPTIONS as err:
+ retry(err)
byte_counter += len(data_block)
@@ -261,7 +259,7 @@ class HttpFD(FileDownloader):
# Open destination file just in time
if ctx.stream is None:
try:
- ctx.stream, ctx.tmpfilename = sanitize_open(
+ ctx.stream, ctx.tmpfilename = self.sanitize_open(
ctx.tmpfilename, ctx.open_mode)
assert ctx.stream is not None
ctx.filename = self.undo_temp_name(ctx.tmpfilename)
@@ -321,16 +319,16 @@ class HttpFD(FileDownloader):
if speed and speed < (self.params.get('throttledratelimit') or 0):
# The speed must stay below the limit for 3 seconds
# This prevents raising error when the speed temporarily goes down
- if throttle_start is None:
- throttle_start = now
- elif now - throttle_start > 3:
+ if ctx.throttle_start is None:
+ ctx.throttle_start = now
+ elif now - ctx.throttle_start > 3:
if ctx.stream is not None and ctx.tmpfilename != '-':
ctx.stream.close()
raise ThrottledDownload()
elif speed:
- throttle_start = None
+ ctx.throttle_start = None
- if not is_test and ctx.chunk_size and ctx.data_len is not None and byte_counter < ctx.data_len:
+ if not is_test and ctx.chunk_size and ctx.content_len is not None and byte_counter < ctx.content_len:
ctx.resume_len = byte_counter
# ctx.block_size = block_size
raise NextFragment()
diff --git a/hypervideo_dl/downloader/ism.py b/hypervideo_dl/downloader/ism.py
index 09516ab..4d5618c 100644
--- a/hypervideo_dl/downloader/ism.py
+++ b/hypervideo_dl/downloader/ism.py
@@ -263,9 +263,11 @@ class IsmFD(FragmentFD):
count = 0
while count <= fragment_retries:
try:
- success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)
+ success = self._download_fragment(ctx, segment['url'], info_dict)
if not success:
return False
+ frag_content = self._read_fragment(ctx)
+
if not extra_state['ism_track_written']:
tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd'])
info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0]
diff --git a/hypervideo_dl/downloader/mhtml.py b/hypervideo_dl/downloader/mhtml.py
index f0f4dc6..c8332c0 100644
--- a/hypervideo_dl/downloader/mhtml.py
+++ b/hypervideo_dl/downloader/mhtml.py
@@ -114,8 +114,8 @@ body > figure > img {
fragment_base_url = info_dict.get('fragment_base_url')
fragments = info_dict['fragments'][:1] if self.params.get(
'test', False) else info_dict['fragments']
- title = info_dict['title']
- origin = info_dict['webpage_url']
+ title = info_dict.get('title', info_dict['format_id'])
+ origin = info_dict.get('webpage_url', info_dict['url'])
ctx = {
'filename': filename,
@@ -166,10 +166,15 @@ body > figure > img {
if (i + 1) <= ctx['fragment_index']:
continue
- fragment_url = urljoin(fragment_base_url, fragment['path'])
- success, frag_content = self._download_fragment(ctx, fragment_url, info_dict)
+ fragment_url = fragment.get('url')
+ if not fragment_url:
+ assert fragment_base_url
+ fragment_url = urljoin(fragment_base_url, fragment['path'])
+
+ success = self._download_fragment(ctx, fragment_url, info_dict)
if not success:
continue
+ frag_content = self._read_fragment(ctx)
mime_type = b'image/jpeg'
if frag_content.startswith(b'\x89PNG\r\n\x1a\n'):
diff --git a/hypervideo_dl/downloader/rtmp.py b/hypervideo_dl/downloader/rtmp.py
index 6dca647..90f1acf 100644
--- a/hypervideo_dl/downloader/rtmp.py
+++ b/hypervideo_dl/downloader/rtmp.py
@@ -12,6 +12,7 @@ from ..utils import (
encodeFilename,
encodeArgument,
get_exe_version,
+ Popen,
)
@@ -26,7 +27,7 @@ class RtmpFD(FileDownloader):
start = time.time()
resume_percent = None
resume_downloaded_data_len = None
- proc = subprocess.Popen(args, stderr=subprocess.PIPE)
+ proc = Popen(args, stderr=subprocess.PIPE)
cursor_in_new_line = True
proc_stderr_closed = False
try:
diff --git a/hypervideo_dl/downloader/websocket.py b/hypervideo_dl/downloader/websocket.py
index 0882220..58e2bce 100644
--- a/hypervideo_dl/downloader/websocket.py
+++ b/hypervideo_dl/downloader/websocket.py
@@ -5,9 +5,12 @@ import threading
try:
import websockets
- has_websockets = True
-except ImportError:
+except (ImportError, SyntaxError):
+ # websockets 3.10 on python 3.6 causes SyntaxError
+ # See https://github.com/hypervideo/hypervideo/issues/2633
has_websockets = False
+else:
+ has_websockets = True
from .common import FileDownloader
from .external import FFmpegFD
diff --git a/hypervideo_dl/downloader/youtube_live_chat.py b/hypervideo_dl/downloader/youtube_live_chat.py
index ef4205e..dd21ac8 100644
--- a/hypervideo_dl/downloader/youtube_live_chat.py
+++ b/hypervideo_dl/downloader/youtube_live_chat.py
@@ -22,6 +22,9 @@ class YoutubeLiveChatFD(FragmentFD):
def real_download(self, filename, info_dict):
video_id = info_dict['video_id']
self.to_screen('[%s] Downloading live chat' % self.FD_NAME)
+ if not self.params.get('skip_download') and info_dict['protocol'] == 'youtube_live_chat':
+ self.report_warning('Live chat download runs until the livestream ends. '
+ 'If you wish to download the video simultaneously, run a separate hypervideo instance')
fragment_retries = self.params.get('fragment_retries', 0)
test = self.params.get('test', False)
@@ -112,9 +115,10 @@ class YoutubeLiveChatFD(FragmentFD):
count = 0
while count <= fragment_retries:
try:
- success, raw_fragment = dl_fragment(url, request_data, headers)
+ success = dl_fragment(url, request_data, headers)
if not success:
return False, None, None, None
+ raw_fragment = self._read_fragment(ctx)
try:
data = ie.extract_yt_initial_data(video_id, raw_fragment.decode('utf-8', 'replace'))
except RegexNotFoundError:
@@ -142,9 +146,10 @@ class YoutubeLiveChatFD(FragmentFD):
self._prepare_and_start_frag_download(ctx, info_dict)
- success, raw_fragment = dl_fragment(info_dict['url'])
+ success = dl_fragment(info_dict['url'])
if not success:
return False
+ raw_fragment = self._read_fragment(ctx)
try:
data = ie.extract_yt_initial_data(video_id, raw_fragment.decode('utf-8', 'replace'))
except RegexNotFoundError:
diff --git a/hypervideo_dl/extractor/__init__.py b/hypervideo_dl/extractor/__init__.py
index 198c4ae..b354842 100644
--- a/hypervideo_dl/extractor/__init__.py
+++ b/hypervideo_dl/extractor/__init__.py
@@ -1,14 +1,15 @@
-from __future__ import unicode_literals
+import os
from ..utils import load_plugins
-try:
- from .lazy_extractors import *
- from .lazy_extractors import _ALL_CLASSES
- _LAZY_LOADER = True
- _PLUGIN_CLASSES = {}
-except ImportError:
- _LAZY_LOADER = False
+_LAZY_LOADER = False
+if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'):
+ try:
+ from .lazy_extractors import *
+ from .lazy_extractors import _ALL_CLASSES
+ _LAZY_LOADER = True
+ except ImportError:
+ pass
if not _LAZY_LOADER:
from .extractors import *
@@ -19,8 +20,8 @@ if not _LAZY_LOADER:
]
_ALL_CLASSES.append(GenericIE)
- _PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals())
- _ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES
+_PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals())
+_ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES
def gen_extractor_classes():
diff --git a/hypervideo_dl/extractor/abc.py b/hypervideo_dl/extractor/abc.py
index 3e20216..6fe195e 100644
--- a/hypervideo_dl/extractor/abc.py
+++ b/hypervideo_dl/extractor/abc.py
@@ -8,6 +8,7 @@ import time
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
+ dict_get,
ExtractorError,
js_to_json,
int_or_none,
@@ -212,7 +213,7 @@ class ABCIViewIE(InfoExtractor):
'hdnea': token,
})
- for sd in ('720', 'sd', 'sd-low'):
+ for sd in ('1080', '720', 'sd', 'sd-low'):
sd_url = try_get(
stream, lambda x: x['streams']['hls'][sd], compat_str)
if not sd_url:
@@ -233,8 +234,6 @@ class ABCIViewIE(InfoExtractor):
}]
is_live = video_params.get('livestream') == '1'
- if is_live:
- title = self._live_title(title)
return {
'id': video_id,
@@ -255,3 +254,65 @@ class ABCIViewIE(InfoExtractor):
'subtitles': subtitles,
'is_live': is_live,
}
+
+
+class ABCIViewShowSeriesIE(InfoExtractor):
+ IE_NAME = 'abc.net.au:iview:showseries'
+ _VALID_URL = r'https?://iview\.abc\.net\.au/show/(?P<id>[^/]+)(?:/series/\d+)?$'
+ _GEO_COUNTRIES = ['AU']
+
+ _TESTS = [{
+ 'url': 'https://iview.abc.net.au/show/upper-middle-bogan',
+ 'info_dict': {
+ 'id': '124870-1',
+ 'title': 'Series 1',
+ 'description': 'md5:93119346c24a7c322d446d8eece430ff',
+ 'series': 'Upper Middle Bogan',
+ 'season': 'Series 1',
+ 'thumbnail': r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.*\.jpg$'
+ },
+ 'playlist_count': 8,
+ }, {
+ 'url': 'https://iview.abc.net.au/show/upper-middle-bogan',
+ 'info_dict': {
+ 'id': 'CO1108V001S00',
+ 'ext': 'mp4',
+ 'title': 'Series 1 Ep 1 I\'m A Swan',
+ 'description': 'md5:7b676758c1de11a30b79b4d301e8da93',
+ 'series': 'Upper Middle Bogan',
+ 'uploader_id': 'abc1',
+ 'upload_date': '20210630',
+ 'timestamp': 1625036400,
+ },
+ 'params': {
+ 'noplaylist': True,
+ 'skip_download': 'm3u8',
+ },
+ }]
+
+ def _real_extract(self, url):
+ show_id = self._match_id(url)
+ webpage = self._download_webpage(url, show_id)
+ webpage_data = self._search_regex(
+ r'window\.__INITIAL_STATE__\s*=\s*[\'"](.+?)[\'"]\s*;',
+ webpage, 'initial state')
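+ # the state is embedded as a quoted JS string literal: HTML-unescape it,
+ # then undo the backslash escapes before parsing as JSON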
+ video_data = self._parse_json(
+ unescapeHTML(webpage_data).encode('utf-8').decode('unicode_escape'), show_id)
+ video_data = video_data['route']['pageData']['_embedded']
+
+ highlight = try_get(video_data, lambda x: x['highlightVideo']['shareUrl'])
+ if not self._yes_playlist(show_id, bool(highlight), video_label='highlight video'):
+ return self.url_result(highlight, ie=ABCIViewIE.ie_key())
+
+ series = video_data['selectedSeries']
+ return {
+ '_type': 'playlist',
+ 'entries': [self.url_result(episode['shareUrl'])
+ for episode in series['_embedded']['videoEpisodes']],
+ 'id': series.get('id'),
+ 'title': dict_get(series, ('title', 'displaySubtitle')),
+ 'description': series.get('description'),
+ 'series': dict_get(series, ('showTitle', 'displayTitle')),
+ 'season': dict_get(series, ('title', 'displaySubtitle')),
+ 'thumbnail': series.get('thumbnail'),
+ }
diff --git a/hypervideo_dl/extractor/abematv.py b/hypervideo_dl/extractor/abematv.py
new file mode 100644
index 0000000..27b7d86
--- /dev/null
+++ b/hypervideo_dl/extractor/abematv.py
@@ -0,0 +1,476 @@
+import io
+import json
+import time
+import hashlib
+import hmac
+import re
+import struct
+from base64 import urlsafe_b64encode
+from binascii import unhexlify
+
+from .common import InfoExtractor
+from ..aes import aes_ecb_decrypt
+from ..compat import (
+ compat_urllib_response,
+ compat_urllib_parse_urlparse,
+ compat_urllib_request,
+)
+from ..utils import (
+ ExtractorError,
+ decode_base,
+ int_or_none,
+ random_uuidv4,
+ request_to_url,
+ time_seconds,
+ update_url_query,
+ traverse_obj,
+ intlist_to_bytes,
+ bytes_to_intlist,
+ urljoin,
+)
+
+
+# NOTE: the network handler related code below is a temporary measure until the network stack overhaul PRs are merged (#2861/#2862)
+
+def add_opener(ydl, handler):
+ ''' Add a handler to the opener used for opening URLs (e.g. by _download_webpage) '''
+ # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
+ # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
+ assert isinstance(ydl._opener, compat_urllib_request.OpenerDirector)
+ ydl._opener.add_handler(handler)
+
+
+def remove_opener(ydl, handler):
+ '''
+ Remove handler(s) for opening URLs
+ @param handler Either the handler object itself or a handler type.
+ Specifying a handler type removes every handler for which isinstance returns True.
+ '''
+ # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
+ # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
+ opener = ydl._opener
+ assert isinstance(ydl._opener, compat_urllib_request.OpenerDirector)
+ if isinstance(handler, (type, tuple)):
+ find_cp = lambda x: isinstance(x, handler)
+ else:
+ find_cp = lambda x: x is handler
+
+ removed = []
+ for meth in dir(handler):
+ if meth in ["redirect_request", "do_open", "proxy_open"]:
+ # oops, coincidental match
+ continue
+
+ i = meth.find("_")
+ protocol = meth[:i]
+ condition = meth[i + 1:]
+
+ if condition.startswith("error"):
+ j = condition.find("_") + i + 1
+ kind = meth[j + 1:]
+ try:
+ kind = int(kind)
+ except ValueError:
+ pass
+ lookup = opener.handle_error.get(protocol, {})
+ opener.handle_error[protocol] = lookup
+ elif condition == "open":
+ kind = protocol
+ lookup = opener.handle_open
+ elif condition == "response":
+ kind = protocol
+ lookup = opener.process_response
+ elif condition == "request":
+ kind = protocol
+ lookup = opener.process_request
+ else:
+ continue
+
+ handlers = lookup.setdefault(kind, [])
+ if handlers:
+ # collect matching handlers before they are filtered out of the list
+ removed.extend(x for x in handlers if find_cp(x))
+ handlers[:] = [x for x in handlers if not find_cp(x)]
+
+ if removed:
+ for x in opener.handlers:
+ if find_cp(x):
+ x.add_parent(None)
+ opener.handlers[:] = [x for x in opener.handlers if not find_cp(x)]
+
+
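+# Illustrative usage of the two helpers above (a sketch, mirroring how the
+# extractor below calls them; `ydl` is the YoutubeDL instance reached via an
+# extractor's self._downloader, `ie` an AbemaTVIE instance):
+#   add_opener(ydl, AbemaLicenseHandler(ie))   # install abematv-license:// support
+#   remove_opener(ydl, AbemaLicenseHandler)    # drop all handlers of that type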
+class AbemaLicenseHandler(compat_urllib_request.BaseHandler):
+ handler_order = 499
+ STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
+ HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E'
+
+ def __init__(self, ie: 'AbemaTVIE'):
+ # the protocol that this should really handle is 'abematv-license://'
+ # abematv_license_open is just a placeholder for development purposes
+ # ref. https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/urllib/request.py#L510
+ setattr(self, 'abematv-license_open', getattr(self, 'abematv_license_open'))
+ self.ie = ie
+
+ def _get_videokey_from_ticket(self, ticket):
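+ # Flow below (descriptive only): POST the ticket to the license server,
+ # decode the returned 'k' (base-58, custom alphabet) into a 128-bit value,
+ # derive the AES key as HMAC-SHA256(unhexlify(HKEY), cid + device id),
+ # then AES-128-ECB-decrypt that value to obtain the video key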
+ to_show = self.ie._downloader.params.get('verbose', False)
+ media_token = self.ie._get_media_token(to_show=to_show)
+
+ license_response = self.ie._download_json(
+ 'https://license.abema.io/abematv-hls', None, note='Requesting playback license' if to_show else False,
+ query={'t': media_token},
+ data=json.dumps({
+ 'kv': 'a',
+ 'lt': ticket
+ }).encode('utf-8'),
+ headers={
+ 'Content-Type': 'application/json',
+ })
+
+ res = decode_base(license_response['k'], self.STRTABLE)
+ encvideokey = bytes_to_intlist(struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff))
+
+ h = hmac.new(
+ unhexlify(self.HKEY),
+ (license_response['cid'] + self.ie._DEVICE_ID).encode('utf-8'),
+ digestmod=hashlib.sha256)
+ enckey = bytes_to_intlist(h.digest())
+
+ return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey))
+
+ def abematv_license_open(self, url):
+ url = request_to_url(url)
+ ticket = compat_urllib_parse_urlparse(url).netloc
+ response_data = self._get_videokey_from_ticket(ticket)
+ return compat_urllib_response.addinfourl(io.BytesIO(response_data), headers={
+ 'Content-Length': len(response_data),
+ }, url=url, code=200)
+
+
+class AbemaTVBaseIE(InfoExtractor):
+ def _extract_breadcrumb_list(self, webpage, video_id):
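+ # scans the page for a JSON-LD BreadcrumbList and returns its item names,
+ # e.g. ['Home', 'Anime', '<series>', '<episode>'] (shape inferred from the
+ # traversal below); returns [] when no such list is found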
+ for jld in re.finditer(
+ r'(?is)</span></li></ul><script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
+ webpage):
+ jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
+ if jsonld:
+ if jsonld.get('@type') != 'BreadcrumbList':
+ continue
+ trav = traverse_obj(jsonld, ('itemListElement', ..., 'name'))
+ if trav:
+ return trav
+ return []
+
+
+class AbemaTVIE(AbemaTVBaseIE):
+ _VALID_URL = r'https?://abema\.tv/(?P<type>now-on-air|video/episode|channels/.+?/slots)/(?P<id>[^?/]+)'
+ _NETRC_MACHINE = 'abematv'
+ _TESTS = [{
+ 'url': 'https://abema.tv/video/episode/194-25_s2_p1',
+ 'info_dict': {
+ 'id': '194-25_s2_p1',
+ 'title': '第1話 「チーズケーキ」 「モーニング再び」',
+ 'series': '異世界食堂2',
+ 'series_number': 2,
+ 'episode': '第1話 「チーズケーキ」 「モーニング再び」',
+ 'episode_number': 1,
+ },
+ 'skip': 'expired',
+ }, {
+ 'url': 'https://abema.tv/channels/anime-live2/slots/E8tvAnMJ7a9a5d',
+ 'info_dict': {
+ 'id': 'E8tvAnMJ7a9a5d',
+ 'title': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
+ 'series': 'ゆるキャン△ SEASON2',
+ 'episode': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
+ 'series_number': 2,
+ 'episode_number': 1,
+ 'description': 'md5:9c5a3172ae763278f9303922f0ea5b17',
+ },
+ 'skip': 'expired',
+ }, {
+ 'url': 'https://abema.tv/video/episode/87-877_s1282_p31047',
+ 'info_dict': {
+ 'id': 'E8tvAnMJ7a9a5d',
+ 'title': '第5話『光射す』',
+ 'description': 'md5:56d4fc1b4f7769ded5f923c55bb4695d',
+ 'thumbnail': r're:https://hayabusa\.io/.+',
+ 'series': '相棒',
+ 'episode': '第5話『光射す』',
+ },
+ 'skip': 'expired',
+ }, {
+ 'url': 'https://abema.tv/now-on-air/abema-anime',
+ 'info_dict': {
+ 'id': 'abema-anime',
+ # this varies
+ # 'title': '女子高生の無駄づかい 全話一挙【無料ビデオ72時間】',
+ 'description': 'md5:55f2e61f46a17e9230802d7bcc913d5f',
+ 'is_live': True,
+ },
+ 'skip': 'Not supported until hypervideo implements native live downloader OR AbemaTV can start a local HTTP server',
+ }]
+ _USERTOKEN = None
+ _DEVICE_ID = None
+ _TIMETABLE = None
+ _MEDIATOKEN = None
+
+ _SECRETKEY = b'v+Gjs=25Aw5erR!J8ZuvRrCx*rGswhB&qdHd_SYerEWdU&a?3DzN9BRbp5KwY4hEmcj5#fykMjJ=AuWz5GSMY-d@H7DMEh3M@9n2G552Us$$k9cD=3TxwWe86!x#Zyhe'
+
+ def _generate_aks(self, deviceid):
+ deviceid = deviceid.encode('utf-8')
+ # add 1 hour and then drop minute and secs
+ ts_1hour = int((time_seconds(hours=9) // 3600 + 1) * 3600)
+ time_struct = time.gmtime(ts_1hour)
+ ts_1hour_str = str(ts_1hour).encode('utf-8')
+
+ tmp = None
+
+ def mix_once(nonce):
+ nonlocal tmp
+ h = hmac.new(self._SECRETKEY, digestmod=hashlib.sha256)
+ h.update(nonce)
+ tmp = h.digest()
+
+ def mix_tmp(count):
+ nonlocal tmp
+ for i in range(count):
+ mix_once(tmp)
+
+ def mix_twist(nonce):
+ nonlocal tmp
+ mix_once(urlsafe_b64encode(tmp).rstrip(b'=') + nonce)
+
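+ # mixing schedule (descriptive): chain HMAC-SHA256 digests over the secret
+ # key, the device id and the hour-aligned timestamp; the repeat counts vary
+ # with the current month/day/hour, so the resulting key rotates hourly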
+ mix_once(self._SECRETKEY)
+ mix_tmp(time_struct.tm_mon)
+ mix_twist(deviceid)
+ mix_tmp(time_struct.tm_mday % 5)
+ mix_twist(ts_1hour_str)
+ mix_tmp(time_struct.tm_hour % 5)
+
+ return urlsafe_b64encode(tmp).rstrip(b'=').decode('utf-8')
+
+ def _get_device_token(self):
+ if self._USERTOKEN:
+ return self._USERTOKEN
+
+ self._DEVICE_ID = random_uuidv4()
+ aks = self._generate_aks(self._DEVICE_ID)
+ user_data = self._download_json(
+ 'https://api.abema.io/v1/users', None, note='Authorizing',
+ data=json.dumps({
+ 'deviceId': self._DEVICE_ID,
+ 'applicationKeySecret': aks,
+ }).encode('utf-8'),
+ headers={
+ 'Content-Type': 'application/json',
+ })
+ self._USERTOKEN = user_data['token']
+
+ # don't allow the handler to be added more than once, even though this path is guarded
+ remove_opener(self._downloader, AbemaLicenseHandler)
+ add_opener(self._downloader, AbemaLicenseHandler(self))
+
+ return self._USERTOKEN
+
+ def _get_media_token(self, invalidate=False, to_show=True):
+ if not invalidate and self._MEDIATOKEN:
+ return self._MEDIATOKEN
+
+ self._MEDIATOKEN = self._download_json(
+ 'https://api.abema.io/v1/media/token', None, note='Fetching media token' if to_show else False,
+ query={
+ 'osName': 'android',
+ 'osVersion': '6.0.1',
+ 'osLang': 'ja_JP',
+ 'osTimezone': 'Asia/Tokyo',
+ 'appId': 'tv.abema',
+ 'appVersion': '3.27.1'
+ }, headers={
+ 'Authorization': 'bearer ' + self._get_device_token()
+ })['token']
+
+ return self._MEDIATOKEN
+
+ def _perform_login(self, username, password):
+ if '@' in username: # don't strictly check whether it's an email address
+ ep, method = 'user/email', 'email'
+ else:
+ ep, method = 'oneTimePassword', 'userId'
+
+ login_response = self._download_json(
+ f'https://api.abema.io/v1/auth/{ep}', None, note='Logging in',
+ data=json.dumps({
+ method: username,
+ 'password': password
+ }).encode('utf-8'), headers={
+ 'Authorization': 'bearer ' + self._get_device_token(),
+ 'Origin': 'https://abema.tv',
+ 'Referer': 'https://abema.tv/',
+ 'Content-Type': 'application/json',
+ })
+
+ self._USERTOKEN = login_response['token']
+ self._get_media_token(True)
+
+ def _real_extract(self, url):
+ # starting a download using an infojson from this extractor is undefined behavior,
+ # and will never be fixed; you must trigger downloads by directly specifying the URL
+ # (unless a way to hook in before downloading is added for extractors)
+ video_id, video_type = self._match_valid_url(url).group('id', 'type')
+ headers = {
+ 'Authorization': 'Bearer ' + self._get_device_token(),
+ }
+ video_type = video_type.split('/')[-1]
+
+ webpage = self._download_webpage(url, video_id)
+ canonical_url = self._search_regex(
+ r'<link\s+rel="canonical"\s*href="(.+?)"', webpage, 'canonical URL',
+ default=url)
+ info = self._search_json_ld(webpage, video_id, default={})
+
+ title = self._search_regex(
+ r'<span\s*class=".+?EpisodeTitleBlock__title">(.+?)</span>', webpage, 'title', default=None)
+ if not title:
+ jsonld = None
+ for jld in re.finditer(
+ r'(?is)<span\s*class="com-m-Thumbnail__image">(?:</span>)?<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
+ webpage):
+ jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
+ if jsonld:
+ break
+ if jsonld:
+ title = jsonld.get('caption')
+ if not title and video_type == 'now-on-air':
+ if not self._TIMETABLE:
+ # cache the timetable because it can grow to 5 MiB in size (!!)
+ self._TIMETABLE = self._download_json(
+ 'https://api.abema.io/v1/timetable/dataSet?debug=false', video_id,
+ headers=headers)
+ now = time_seconds(hours=9)
+ for slot in self._TIMETABLE.get('slots', []):
+ if slot.get('channelId') != video_id:
+ continue
+ if slot['startAt'] <= now and now < slot['endAt']:
+ title = slot['title']
+ break
+
+ # read breadcrumb on top of page
+ breadcrumb = self._extract_breadcrumb_list(webpage, video_id)
+ if breadcrumb:
+ # breadcrumb list translates to: (example is 1st test for this IE)
+ # Home > Anime (genre) > Isekai Shokudo 2 (series name) > Episode 1 "Cheese cakes" "Morning again" (episode title)
+ # hence this works
+ info['series'] = breadcrumb[-2]
+ info['episode'] = breadcrumb[-1]
+ if not title:
+ title = info['episode']
+
+ description = self._html_search_regex(
+ (r'<p\s+class="com-video-EpisodeDetailsBlock__content"><span\s+class=".+?">(.+?)</span></p><div',
+ r'<span\s+class=".+?SlotSummary.+?">(.+?)</span></div><div',),
+ webpage, 'description', default=None, group=1)
+ if not description:
+ og_desc = self._html_search_meta(
+ ('description', 'og:description', 'twitter:description'), webpage)
+ if og_desc:
+ description = re.sub(r'''(?sx)
+ ^(.+?)(?:
+ アニメの動画を無料で見るならABEMA!| # anime
+ 等、.+ # applies for most of categories
+ )?
+ ''', r'\1', og_desc)
+
+ # canonical URL may contain series and episode number
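+ # e.g. .../194-25_s2_p1 -> series 2, episode 1 (cf. the first test case above)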
+ mobj = re.search(r's(\d+)_p(\d+)$', canonical_url)
+ if mobj:
+ seri = int_or_none(mobj.group(1), default=float('inf'))
+ epis = int_or_none(mobj.group(2), default=float('inf'))
+ info['series_number'] = seri if seri < 100 else None
+ # some anime, like Detective Conan (though not available on AbemaTV),
+ # have more than 1000 episodes (1026 as of 2021/11/15)
+ info['episode_number'] = epis if epis < 2000 else None
+
+ is_live, m3u8_url = False, None
+ if video_type == 'now-on-air':
+ is_live = True
+ channel_url = 'https://api.abema.io/v1/channels'
+ if video_id == 'news-global':
+ channel_url = update_url_query(channel_url, {'division': '1'})
+ onair_channels = self._download_json(channel_url, video_id)
+ for ch in onair_channels['channels']:
+ if video_id == ch['id']:
+ m3u8_url = ch['playback']['hls']
+ break
+ else:
+ raise ExtractorError(f'Cannot find on-air {video_id} channel.', expected=True)
+ elif video_type == 'episode':
+ api_response = self._download_json(
+ f'https://api.abema.io/v1/video/programs/{video_id}', video_id,
+ note='Checking playability',
+ headers=headers)
+ ondemand_types = traverse_obj(api_response, ('terms', ..., 'onDemandType'), default=[])
+ if 3 not in ondemand_types:
+ # cannot acquire decryption key for these streams
+ self.report_warning('This is a premium-only stream')
+
+ m3u8_url = f'https://vod-abematv.akamaized.net/program/{video_id}/playlist.m3u8'
+ elif video_type == 'slots':
+ api_response = self._download_json(
+ f'https://api.abema.io/v1/media/slots/{video_id}', video_id,
+ note='Checking playability',
+ headers=headers)
+ if not traverse_obj(api_response, ('slot', 'flags', 'timeshiftFree'), default=False):
+ self.report_warning('This is a premium-only stream')
+
+ m3u8_url = f'https://vod-abematv.akamaized.net/slot/{video_id}/playlist.m3u8'
+ else:
+ raise ExtractorError('Unreachable')
+
+ if is_live:
+ self.report_warning('This is a livestream; hypervideo does not support downloading it natively, and FFmpeg cannot handle m3u8 manifests from AbemaTV')
+ self.report_warning('Please consider using Streamlink to download these streams (https://github.com/streamlink/streamlink)')
+ formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4', live=is_live)
+
+ info.update({
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'formats': formats,
+ 'is_live': is_live,
+ })
+ return info
+
+
+class AbemaTVTitleIE(AbemaTVBaseIE):
+ _VALID_URL = r'https?://abema\.tv/video/title/(?P<id>[^?/]+)'
+
+ _TESTS = [{
+ 'url': 'https://abema.tv/video/title/90-1597',
+ 'info_dict': {
+ 'id': '90-1597',
+ 'title': 'シャッフルアイランド',
+ },
+ 'playlist_mincount': 2,
+ }, {
+ 'url': 'https://abema.tv/video/title/193-132',
+ 'info_dict': {
+ 'id': '193-132',
+ 'title': '真心が届く~僕とスターのオフィス・ラブ!?~',
+ },
+ 'playlist_mincount': 16,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ playlist_title, breadcrumb = None, self._extract_breadcrumb_list(webpage, video_id)
+ if breadcrumb:
+ playlist_title = breadcrumb[-1]
+
+ playlist = [
+ self.url_result(urljoin('https://abema.tv/', mobj.group(1)))
+ for mobj in re.finditer(r'<li\s*class=".+?EpisodeList.+?"><a\s*href="(/[^"]+?)"', webpage)]
+
+ return self.playlist_result(playlist, playlist_title=playlist_title, playlist_id=video_id)
diff --git a/hypervideo_dl/extractor/adn.py b/hypervideo_dl/extractor/adn.py
index a55ebbc..fca6e60 100644
--- a/hypervideo_dl/extractor/adn.py
+++ b/hypervideo_dl/extractor/adn.py
@@ -8,13 +8,13 @@ import os
import random
from .common import InfoExtractor
-from ..aes import aes_cbc_decrypt
+from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
from ..compat import (
compat_HTTPError,
compat_b64decode,
- compat_ord,
)
from ..utils import (
+ ass_subtitles_timecode,
bytes_to_intlist,
bytes_to_long,
ExtractorError,
@@ -68,10 +68,6 @@ class ADNIE(InfoExtractor):
'end': 4,
}
- @staticmethod
- def _ass_subtitles_timecode(seconds):
- return '%01d:%02d:%02d.%02d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 100)
-
def _get_subtitles(self, sub_url, video_id):
if not sub_url:
return None
@@ -87,14 +83,11 @@ class ADNIE(InfoExtractor):
return None
# http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
- dec_subtitles = intlist_to_bytes(aes_cbc_decrypt(
- bytes_to_intlist(compat_b64decode(enc_subtitles[24:])),
- bytes_to_intlist(binascii.unhexlify(self._K + 'ab9f52f5baae7c72')),
- bytes_to_intlist(compat_b64decode(enc_subtitles[:24]))
- ))
- subtitles_json = self._parse_json(
- dec_subtitles[:-compat_ord(dec_subtitles[-1])].decode(),
- None, fatal=False)
+ dec_subtitles = unpad_pkcs7(aes_cbc_decrypt_bytes(
+ compat_b64decode(enc_subtitles[24:]),
+ binascii.unhexlify(self._K + 'ab9f52f5baae7c72'),
+ compat_b64decode(enc_subtitles[:24])))
+ subtitles_json = self._parse_json(dec_subtitles.decode(), None, fatal=False)
if not subtitles_json:
return None
@@ -117,8 +110,8 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
continue
alignment = self._POS_ALIGN_MAP.get(position_align, 2) + self._LINE_ALIGN_MAP.get(line_align, 0)
ssa += os.linesep + 'Dialogue: Marked=0,%s,%s,Default,,0,0,0,,%s%s' % (
- self._ass_subtitles_timecode(start),
- self._ass_subtitles_timecode(end),
+ ass_subtitles_timecode(start),
+ ass_subtitles_timecode(end),
'{\\a%d}' % alignment if alignment != 2 else '',
text.replace('\n', '\\N').replace('<i>', '{\\i1}').replace('</i>', '{\\i0}'))
@@ -133,10 +126,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
}])
return subtitles
- def _real_initialize(self):
- username, password = self._get_login_info()
- if not username:
- return
+ def _perform_login(self, username, password):
try:
access_token = (self._download_json(
self._API_BASE_URL + 'authentication/login', None,
diff --git a/hypervideo_dl/extractor/adobeconnect.py b/hypervideo_dl/extractor/adobeconnect.py
index 728549e..e2e6f93 100644
--- a/hypervideo_dl/extractor/adobeconnect.py
+++ b/hypervideo_dl/extractor/adobeconnect.py
@@ -14,7 +14,7 @@ class AdobeConnectIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
qs = compat_parse_qs(self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url').split('?')[1])
is_live = qs.get('isLive', ['false'])[0] == 'true'
formats = []
@@ -31,7 +31,7 @@ class AdobeConnectIE(InfoExtractor):
return {
'id': video_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'formats': formats,
'is_live': is_live,
}
diff --git a/hypervideo_dl/extractor/adobepass.py b/hypervideo_dl/extractor/adobepass.py
index 9378c33..5d98301 100644
--- a/hypervideo_dl/extractor/adobepass.py
+++ b/hypervideo_dl/extractor/adobepass.py
@@ -39,8 +39,8 @@ MSO_INFO = {
},
'RCN': {
'name': 'RCN',
- 'username_field': 'UserName',
- 'password_field': 'UserPassword',
+ 'username_field': 'username',
+ 'password_field': 'password',
},
'Rogers': {
'name': 'Rogers',
@@ -1345,6 +1345,11 @@ MSO_INFO = {
'username_field': 'username',
'password_field': 'password',
},
+ 'Suddenlink': {
+ 'name': 'Suddenlink',
+ 'username_field': 'username',
+ 'password_field': 'password',
+ },
}
@@ -1636,6 +1641,58 @@ class AdobePassIE(InfoExtractor):
query=hidden_data)
post_form(mvpd_confirm_page_res, 'Confirming Login')
+ elif mso_id == 'Suddenlink':
+ # Suddenlink is similar to SlingTV in using a tab history count and a meta refresh,
+ # but it also does a dynamic redirect using JavaScript that has to be followed as well
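+ # rough flow, mirroring the steps below: continue form -> first bookend
+ # (history_val=1) -> optional ajaxurl/TryAuth hop -> login form ->
+ # association redirect -> final bookend (history_val=3) -> confirmation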
+ first_bookend_page, urlh = post_form(
+ provider_redirect_page_res, 'Pressing Continue...')
+
+ hidden_data = self._hidden_inputs(first_bookend_page)
+ hidden_data['history_val'] = 1
+
+ provider_login_redirect_page_res = self._download_webpage_handle(
+ urlh.geturl(), video_id, 'Sending First Bookend',
+ query=hidden_data)
+
+ provider_login_redirect_page, urlh = provider_login_redirect_page_res
+
+ # Some website partners seem not to have the extra ajaxurl redirect step,
+ # so check whether we already have the login prompt
+ if 'id="password" type="password" name="password"' in provider_login_redirect_page:
+ provider_login_page_res = provider_login_redirect_page_res
+ else:
+ provider_tryauth_url = self._html_search_regex(
+ r'url:\s*[\'"]([^\'"]+)', provider_login_redirect_page, 'ajaxurl')
+ provider_tryauth_page = self._download_webpage(
+ provider_tryauth_url, video_id, 'Submitting TryAuth',
+ query=hidden_data)
+
+ provider_login_page_res = self._download_webpage_handle(
+ f'https://authorize.suddenlink.net/saml/module.php/authSynacor/login.php?AuthState={provider_tryauth_page}',
+ video_id, 'Getting Login Page',
+ query=hidden_data)
+
+ provider_association_redirect, urlh = post_form(
+ provider_login_page_res, 'Logging in', {
+ mso_info['username_field']: username,
+ mso_info['password_field']: password
+ })
+
+ provider_refresh_redirect_url = extract_redirect_url(
+ provider_association_redirect, url=urlh.geturl())
+
+ last_bookend_page, urlh = self._download_webpage_handle(
+ provider_refresh_redirect_url, video_id,
+ 'Downloading Auth Association Redirect Page')
+
+ hidden_data = self._hidden_inputs(last_bookend_page)
+ hidden_data['history_val'] = 3
+
+ mvpd_confirm_page_res = self._download_webpage_handle(
+ urlh.geturl(), video_id, 'Sending Final Bookend',
+ query=hidden_data)
+
+ post_form(mvpd_confirm_page_res, 'Confirming Login')
else:
# Some providers (e.g. DIRECTV NOW) have another meta refresh
# based redirect that should be followed.
diff --git a/hypervideo_dl/extractor/adobetv.py b/hypervideo_dl/extractor/adobetv.py
index 12b8192..3cfa1ff 100644
--- a/hypervideo_dl/extractor/adobetv.py
+++ b/hypervideo_dl/extractor/adobetv.py
@@ -9,6 +9,7 @@ from ..utils import (
float_or_none,
int_or_none,
ISO639Utils,
+ join_nonempty,
OnDemandPagedList,
parse_duration,
str_or_none,
@@ -263,7 +264,7 @@ class AdobeTVVideoIE(AdobeTVBaseIE):
continue
formats.append({
'filesize': int_or_none(source.get('kilobytes') or None, invscale=1000),
- 'format_id': '-'.join(filter(None, [source.get('format'), source.get('label')])),
+ 'format_id': join_nonempty(source.get('format'), source.get('label')),
'height': int_or_none(source.get('height') or None),
'tbr': int_or_none(source.get('bitrate') or None),
'width': int_or_none(source.get('width') or None),
diff --git a/hypervideo_dl/extractor/afreecatv.py b/hypervideo_dl/extractor/afreecatv.py
index 063872b..77f0e3c 100644
--- a/hypervideo_dl/extractor/afreecatv.py
+++ b/hypervideo_dl/extractor/afreecatv.py
@@ -10,7 +10,11 @@ from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
+ qualities,
+ traverse_obj,
unified_strdate,
+ unified_timestamp,
+ update_url_query,
url_or_none,
urlencode_postdata,
xpath_text,
@@ -28,7 +32,7 @@ class AfreecaTVIE(InfoExtractor):
/app/(?:index|read_ucc_bbs)\.cgi|
/player/[Pp]layer\.(?:swf|html)
)\?.*?\bnTitleNo=|
- vod\.afreecatv\.com/PLAYER/STATION/
+ vod\.afreecatv\.com/(PLAYER/STATION|player)/
)
(?P<id>\d+)
'''
@@ -166,6 +170,9 @@ class AfreecaTVIE(InfoExtractor):
}, {
'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030',
'only_matching': True,
+ }, {
+ 'url': 'http://vod.afreecatv.com/player/15055030',
+ 'only_matching': True,
}]
@staticmethod
@@ -177,14 +184,7 @@ class AfreecaTVIE(InfoExtractor):
video_key['part'] = int(m.group('part'))
return video_key
- def _real_initialize(self):
- self._login()
-
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
login_form = {
'szWork': 'login',
'szType': 'json',
@@ -380,3 +380,105 @@ class AfreecaTVIE(InfoExtractor):
})
return info
+
+
+class AfreecaTVLiveIE(AfreecaTVIE):
+
+ IE_NAME = 'afreecatv:live'
+ _VALID_URL = r'https?://play\.afreeca(?:tv)?\.com/(?P<id>[^/]+)(?:/(?P<bno>\d+))?'
+ _TESTS = [{
+ 'url': 'https://play.afreecatv.com/pyh3646/237852185',
+ 'info_dict': {
+ 'id': '237852185',
+ 'ext': 'mp4',
+ 'title': '【 우루과이 오늘은 무슨일이? 】',
+ 'uploader': '박진우[JINU]',
+ 'uploader_id': 'pyh3646',
+ 'timestamp': 1640661495,
+ 'is_live': True,
+ },
+ 'skip': 'Livestream has ended',
+ }, {
+ 'url': 'http://play.afreeca.com/pyh3646/237852185',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://play.afreeca.com/pyh3646',
+ 'only_matching': True,
+ }]
+
+ _LIVE_API_URL = 'https://live.afreecatv.com/afreeca/player_live_api.php'
+
+ _QUALITIES = ('sd', 'hd', 'hd2k', 'original')
+
+ def _real_extract(self, url):
+ broadcaster_id, broadcast_no = self._match_valid_url(url).group('id', 'bno')
+ password = self.get_param('videopassword')
+
+ info = self._download_json(self._LIVE_API_URL, broadcaster_id, fatal=False,
+ data=urlencode_postdata({'bid': broadcaster_id})) or {}
+ channel_info = info.get('CHANNEL') or {}
+ broadcaster_id = channel_info.get('BJID') or broadcaster_id
+ broadcast_no = channel_info.get('BNO') or broadcast_no
+ password_protected = channel_info.get('BPWD')
+ if not broadcast_no:
+ raise ExtractorError(f'Unable to extract broadcast number ({broadcaster_id} may not be live)', expected=True)
+ if password_protected == 'Y' and password is None:
+ raise ExtractorError(
+ 'This livestream is protected by a password, use the --video-password option',
+ expected=True)
+
+ formats = []
+ quality_key = qualities(self._QUALITIES)
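+ # for each quality: request an access token (AID) from the live API, resolve
+ # the CDN playlist URL via broad_stream_assign.html, then attach the token
+ # as the `aid` query parameter on the returned view_url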
+ for quality_str in self._QUALITIES:
+ params = {
+ 'bno': broadcast_no,
+ 'stream_type': 'common',
+ 'type': 'aid',
+ 'quality': quality_str,
+ }
+ if password is not None:
+ params['pwd'] = password
+ aid_response = self._download_json(
+ self._LIVE_API_URL, broadcast_no, fatal=False,
+ data=urlencode_postdata(params),
+ note=f'Downloading access token for {quality_str} stream',
+ errnote=f'Unable to download access token for {quality_str} stream')
+ aid = traverse_obj(aid_response, ('CHANNEL', 'AID'))
+ if not aid:
+ continue
+
+ stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.afreecatv.com'
+ stream_info = self._download_json(
+ f'{stream_base_url}/broad_stream_assign.html', broadcast_no, fatal=False,
+ query={
+ 'return_type': channel_info.get('CDN', 'gcp_cdn'),
+ 'broad_key': f'{broadcast_no}-common-{quality_str}-hls',
+ },
+ note=f'Downloading metadata for {quality_str} stream',
+ errnote=f'Unable to download metadata for {quality_str} stream') or {}
+
+ if stream_info.get('view_url'):
+ formats.append({
+ 'format_id': quality_str,
+ 'url': update_url_query(stream_info['view_url'], {'aid': aid}),
+ 'ext': 'mp4',
+ 'protocol': 'm3u8',
+ 'quality': quality_key(quality_str),
+ })
+
+ self._sort_formats(formats)
+
+ station_info = self._download_json(
+ 'https://st.afreecatv.com/api/get_station_status.php', broadcast_no,
+ query={'szBjId': broadcaster_id}, fatal=False,
+ note='Downloading channel metadata', errnote='Unable to download channel metadata') or {}
+
+ return {
+ 'id': broadcast_no,
+ 'title': channel_info.get('TITLE') or station_info.get('station_title'),
+ 'uploader': channel_info.get('BJNICK') or station_info.get('station_name'),
+ 'uploader_id': broadcaster_id,
+ 'timestamp': unified_timestamp(station_info.get('broad_start')),
+ 'formats': formats,
+ 'is_live': True,
+ }
diff --git a/hypervideo_dl/extractor/aliexpress.py b/hypervideo_dl/extractor/aliexpress.py
index 6f241e6..9722fe9 100644
--- a/hypervideo_dl/extractor/aliexpress.py
+++ b/hypervideo_dl/extractor/aliexpress.py
@@ -18,7 +18,7 @@ class AliExpressLiveIE(InfoExtractor):
'id': '2800002704436634',
'ext': 'mp4',
'title': 'CASIMA7.22',
- 'thumbnail': r're:http://.*\.jpg',
+ 'thumbnail': r're:https?://.*\.jpg',
'uploader': 'CASIMA Official Store',
'timestamp': 1500717600,
'upload_date': '20170722',
diff --git a/hypervideo_dl/extractor/aljazeera.py b/hypervideo_dl/extractor/aljazeera.py
index e829b45..7bcdb7a 100644
--- a/hypervideo_dl/extractor/aljazeera.py
+++ b/hypervideo_dl/extractor/aljazeera.py
@@ -1,55 +1,86 @@
+# coding: utf-8
from __future__ import unicode_literals
import json
from .common import InfoExtractor
+from ..utils import (
+ try_get,
+)
class AlJazeeraIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?P<type>program/[^/]+|(?:feature|video)s)/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)'
+ _VALID_URL = r'https?://(?P<base>\w+\.aljazeera\.\w+)/(?P<type>programs?/[^/]+|(?:feature|video|new)s)?/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)'
_TESTS = [{
- 'url': 'https://www.aljazeera.com/program/episode/2014/9/19/deliverance',
+ 'url': 'https://balkans.aljazeera.net/videos/2021/11/6/pojedini-domovi-u-sarajevu-jos-pod-vodom-mjestanima-se-dostavlja-hrana',
'info_dict': {
- 'id': '3792260579001',
+ 'id': '6280641530001',
'ext': 'mp4',
- 'title': 'The Slum - Episode 1: Deliverance',
- 'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.',
- 'uploader_id': '665003303001',
- 'timestamp': 1411116829,
- 'upload_date': '20140919',
- },
- 'add_ie': ['BrightcoveNew'],
- 'skip': 'Not accessible from Travis CI server',
- }, {
- 'url': 'https://www.aljazeera.com/videos/2017/5/11/sierra-leone-709-carat-diamond-to-be-auctioned-off',
- 'only_matching': True,
+ 'title': 'Pojedini domovi u Sarajevu još pod vodom, mještanima se dostavlja hrana',
+ 'timestamp': 1636219149,
+ 'description': 'U sarajevskim naseljima Rajlovac i Reljevo stambeni objekti, ali i industrijska postrojenja i dalje su pod vodom.',
+ 'upload_date': '20211106',
+ }
}, {
- 'url': 'https://www.aljazeera.com/features/2017/8/21/transforming-pakistans-buses-into-art',
- 'only_matching': True,
+ 'url': 'https://balkans.aljazeera.net/videos/2021/11/6/djokovic-usao-u-finale-mastersa-u-parizu',
+ 'info_dict': {
+ 'id': '6280654936001',
+ 'ext': 'mp4',
+ 'title': 'Đoković ušao u finale Mastersa u Parizu',
+ 'timestamp': 1636221686,
+ 'description': 'Novak Đoković je u polufinalu Mastersa u Parizu nakon preokreta pobijedio Poljaka Huberta Hurkacza.',
+ 'upload_date': '20211106',
+ },
}]
- BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
+ BRIGHTCOVE_URL_RE = r'https?://players\.brightcove\.net/(?P<account>\d+)/(?P<player_id>[a-zA-Z0-9]+)_(?P<embed>[^/]+)/index\.html\?videoId=(?P<id>\d+)'
def _real_extract(self, url):
- post_type, name = self._match_valid_url(url).groups()
+ base, post_type, id = self._match_valid_url(url).groups()
+ wp = {
+ 'balkans.aljazeera.net': 'ajb',
+ 'chinese.aljazeera.net': 'chinese',
+ 'mubasher.aljazeera.net': 'ajm',
+ }.get(base) or 'aje'
post_type = {
'features': 'post',
'program': 'episode',
+ 'programs': 'episode',
'videos': 'video',
+ 'news': 'news',
}[post_type.split('/')[0]]
video = self._download_json(
- 'https://www.aljazeera.com/graphql', name, query={
+ f'https://{base}/graphql', id, query={
+ 'wp-site': wp,
'operationName': 'ArchipelagoSingleArticleQuery',
'variables': json.dumps({
- 'name': name,
+ 'name': id,
'postType': post_type,
}),
}, headers={
- 'wp-site': 'aje',
- })['data']['article']['video']
- video_id = video['id']
- account_id = video.get('accountId') or '665003303001'
- player_id = video.get('playerId') or 'BkeSH5BDb'
- return self.url_result(
- self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id),
- 'BrightcoveNew', video_id)
+ 'wp-site': wp,
+ })
+ video = try_get(video, lambda x: x['data']['article']['video']) or {}
+ video_id = video.get('id')
+ account = video.get('accountId') or '911432371001'
+ player_id = video.get('playerId') or 'csvTfAlKW'
+ embed = 'default'
+
+ if video_id is None:
+ webpage = self._download_webpage(url, id)
+
+ account, player_id, embed, video_id = self._search_regex(self.BRIGHTCOVE_URL_RE, webpage, 'video id',
+ group=(1, 2, 3, 4), default=(None, None, None, None))
+
+ if video_id is None:
+ return {
+ '_type': 'url_transparent',
+ 'url': url,
+ 'ie_key': 'Generic'
+ }
+
+ return {
+ '_type': 'url_transparent',
+ 'url': f'https://players.brightcove.net/{account}/{player_id}_{embed}/index.html?videoId={video_id}',
+ 'ie_key': 'BrightcoveNew'
+ }
diff --git a/hypervideo_dl/extractor/allocine.py b/hypervideo_dl/extractor/allocine.py
index cd533ac..403a277 100644
--- a/hypervideo_dl/extractor/allocine.py
+++ b/hypervideo_dl/extractor/allocine.py
@@ -7,6 +7,7 @@ from ..utils import (
int_or_none,
qualities,
remove_end,
+ strip_or_none,
try_get,
unified_timestamp,
url_basename,
@@ -102,10 +103,7 @@ class AllocineIE(InfoExtractor):
video_id = display_id
media_data = self._download_json(
'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id)
- title = remove_end(
- self._html_search_regex(
- r'(?s)<title>(.+?)</title>', webpage, 'title').strip(),
- ' - AlloCiné')
+ title = remove_end(strip_or_none(self._html_extract_title(webpage)), ' - AlloCiné')
for key, value in media_data['video'].items():
if not key.endswith('Path'):
continue
diff --git a/hypervideo_dl/extractor/alsace20tv.py b/hypervideo_dl/extractor/alsace20tv.py
new file mode 100644
index 0000000..4aae6fe
--- /dev/null
+++ b/hypervideo_dl/extractor/alsace20tv.py
@@ -0,0 +1,87 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ dict_get,
+ get_element_by_class,
+ int_or_none,
+ unified_strdate,
+ url_or_none,
+)
+
+
+class Alsace20TVBaseIE(InfoExtractor):
+ def _extract_video(self, video_id, url=None):
+ info = self._download_json(
+ 'https://www.alsace20.tv/visionneuse/visio_v9_js.php?key=%s&habillage=0&mode=html' % (video_id, ),
+ video_id) or {}
+ title = info.get('titre')
+
+ formats = []
+ for res, fmt_url in (info.get('files') or {}).items():
+ formats.extend(
+ self._extract_smil_formats(fmt_url, video_id, fatal=False)
+ if '/smil:_' in fmt_url
+ else self._extract_mpd_formats(fmt_url, video_id, mpd_id=res, fatal=False))
+ self._sort_formats(formats)
+
+ webpage = (url and self._download_webpage(url, video_id, fatal=False)) or ''
+ thumbnail = url_or_none(dict_get(info, ('image', 'preview', )) or self._og_search_thumbnail(webpage))
+ upload_date = self._search_regex(r'/(\d{6})_', thumbnail, 'upload_date', default=None)
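+ # the 6 captured digits appear to encode YYMMDD (e.g. 220203 -> 2022-02-03)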
+ upload_date = unified_strdate('20%s-%s-%s' % (upload_date[:2], upload_date[2:4], upload_date[4:])) if upload_date else None
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'description': clean_html(get_element_by_class('wysiwyg', webpage)),
+ 'upload_date': upload_date,
+ 'thumbnail': thumbnail,
+ 'duration': int_or_none(self._og_search_property('video:duration', webpage) if webpage else None),
+ 'view_count': int_or_none(info.get('nb_vues')),
+ }
+
+
+class Alsace20TVIE(Alsace20TVBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?alsace20\.tv/(?:[\w-]+/)+[\w-]+-(?P<id>[\w]+)'
+ _TESTS = [{
+ 'url': 'https://www.alsace20.tv/VOD/Actu/JT/Votre-JT-jeudi-3-fevrier-lyNHCXpYJh.html',
+ 'info_dict': {
+ 'id': 'lyNHCXpYJh',
+ 'ext': 'mp4',
+ 'description': 'md5:fc0bc4a0692d3d2dba4524053de4c7b7',
+ 'title': 'Votre JT du jeudi 3 février',
+ 'upload_date': '20220203',
+ 'thumbnail': r're:https?://.+\.jpg',
+ 'duration': 1073,
+ 'view_count': int,
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ return self._extract_video(video_id, url)
+
+
+class Alsace20TVEmbedIE(Alsace20TVBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?alsace20\.tv/emb/(?P<id>[\w]+)'
+ _TESTS = [{
+ 'url': 'https://www.alsace20.tv/emb/lyNHCXpYJh',
+ # 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb',
+ 'info_dict': {
+ 'id': 'lyNHCXpYJh',
+ 'ext': 'mp4',
+ 'title': 'Votre JT du jeudi 3 février',
+ 'upload_date': '20220203',
+ 'thumbnail': r're:https?://.+\.jpg',
+ 'view_count': int,
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ return self._extract_video(video_id)
diff --git a/hypervideo_dl/extractor/alura.py b/hypervideo_dl/extractor/alura.py
index f5325de..d2e2df2 100644
--- a/hypervideo_dl/extractor/alura.py
+++ b/hypervideo_dl/extractor/alura.py
@@ -74,14 +74,7 @@ class AluraIE(InfoExtractor):
"formats": formats
}
- def _real_initialize(self):
- self._login()
-
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
- pass
+ def _perform_login(self, username, password):
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login popup')
diff --git a/hypervideo_dl/extractor/amazon.py b/hypervideo_dl/extractor/amazon.py
new file mode 100644
index 0000000..07b1b18
--- /dev/null
+++ b/hypervideo_dl/extractor/amazon.py
@@ -0,0 +1,53 @@
+# coding: utf-8
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class AmazonStoreIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/(?:[^/]+/)?(?:dp|gp/product)/(?P<id>[^/&#$?]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/',
+ 'info_dict': {
+ 'id': 'B098XNCHLD',
+ 'title': 'md5:5f3194dbf75a8dcfc83079bd63a2abed',
+ },
+ 'playlist_mincount': 1,
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'A1F83G8C2ARO7P',
+ 'ext': 'mp4',
+ 'title': 'mcdodo usb c cable 100W 5a',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ }]
+ }, {
+ 'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3',
+ 'info_dict': {
+ 'id': 'B0863TXGM3',
+ 'title': 'md5:b0bde4881d3cfd40d63af19f7898b8ff',
+ },
+ 'playlist_mincount': 4,
+ }, {
+ 'url': 'https://www.amazon.com/dp/B0845NXCXF/',
+ 'info_dict': {
+ 'id': 'B0845NXCXF',
+ 'title': 'md5:2145cd4e3c7782f1ee73649a3cff1171',
+ },
+ 'playlist_mincount': 1,
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ data_json = self._parse_json(self._html_search_regex(r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'(.*)\'\)', webpage, 'data'), id)
+ entries = [{
+ 'id': video['marketPlaceID'],
+ 'url': video['url'],
+ 'title': video.get('title'),
+ 'thumbnail': video.get('thumbUrl') or video.get('thumb'),
+ 'duration': video.get('durationSeconds'),
+ 'height': int_or_none(video.get('videoHeight')),
+ 'width': int_or_none(video.get('videoWidth')),
+ } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')]
+ return self.playlist_result(entries, playlist_id=id, playlist_title=data_json['title'])
diff --git a/hypervideo_dl/extractor/animelab.py b/hypervideo_dl/extractor/animelab.py
index 4fb7ee4..1c2cc47 100644
--- a/hypervideo_dl/extractor/animelab.py
+++ b/hypervideo_dl/extractor/animelab.py
@@ -15,25 +15,21 @@ from ..compat import compat_HTTPError
class AnimeLabBaseIE(InfoExtractor):
- _LOGIN_REQUIRED = True
_LOGIN_URL = 'https://www.animelab.com/login'
_NETRC_MACHINE = 'animelab'
+ _LOGGED_IN = False
- def _login(self):
- def is_logged_in(login_webpage):
- return 'Sign In' not in login_webpage
+ def _is_logged_in(self, login_page=None):
+ if not self._LOGGED_IN:
+ if not login_page:
+ login_page = self._download_webpage(self._LOGIN_URL, None, 'Downloading login page')
+ AnimeLabBaseIE._LOGGED_IN = 'Sign In' not in login_page
+ return self._LOGGED_IN
- login_page = self._download_webpage(
- self._LOGIN_URL, None, 'Downloading login page')
-
- # Check if already logged in
- if is_logged_in(login_page):
+ def _perform_login(self, username, password):
+ if self._is_logged_in():
return
- (username, password) = self._get_login_info()
- if username is None and self._LOGIN_REQUIRED:
- self.raise_login_required('Login is required to access any AnimeLab content')
-
login_form = {
'email': username,
'password': password,
@@ -47,17 +43,14 @@ class AnimeLabBaseIE(InfoExtractor):
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
raise ExtractorError('Unable to log in (wrong credentials?)', expected=True)
- else:
- raise
+ raise
- # if login was successful
- if is_logged_in(response):
- return
-
- raise ExtractorError('Unable to login (cannot verify if logged in)')
+ if not self._is_logged_in(response):
+ raise ExtractorError('Unable to login (cannot verify if logged in)')
def _real_initialize(self):
- self._login()
+ if not self._is_logged_in():
+ self.raise_login_required('Login is required to access any AnimeLab content')
class AnimeLabIE(AnimeLabBaseIE):
diff --git a/hypervideo_dl/extractor/animeondemand.py b/hypervideo_dl/extractor/animeondemand.py
index 54e097d..2e674d5 100644
--- a/hypervideo_dl/extractor/animeondemand.py
+++ b/hypervideo_dl/extractor/animeondemand.py
@@ -8,6 +8,7 @@ from ..utils import (
determine_ext,
extract_attributes,
ExtractorError,
+ join_nonempty,
url_or_none,
urlencode_postdata,
urljoin,
@@ -52,11 +53,7 @@ class AnimeOnDemandIE(InfoExtractor):
'only_matching': True,
}]
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
@@ -92,9 +89,6 @@ class AnimeOnDemandIE(InfoExtractor):
raise ExtractorError('Unable to login: %s' % error, expected=True)
raise ExtractorError('Unable to log in')
- def _real_initialize(self):
- self._login()
-
def _real_extract(self, url):
anime_id = self._match_id(url)
@@ -140,15 +134,8 @@ class AnimeOnDemandIE(InfoExtractor):
kind = self._search_regex(
r'videomaterialurl/\d+/([^/]+)/',
playlist_url, 'media kind', default=None)
- format_id_list = []
- if lang:
- format_id_list.append(lang)
- if kind:
- format_id_list.append(kind)
- if not format_id_list and num is not None:
- format_id_list.append(compat_str(num))
- format_id = '-'.join(format_id_list)
- format_note = ', '.join(filter(None, (kind, lang_note)))
+ format_id = join_nonempty(lang, kind) if lang or kind else str(num)
+ format_note = join_nonempty(kind, lang_note, delim=', ')
item_id_list = []
if format_id:
item_id_list.append(format_id)
@@ -195,12 +182,10 @@ class AnimeOnDemandIE(InfoExtractor):
if not file_:
continue
ext = determine_ext(file_)
- format_id_list = [lang, kind]
- if ext == 'm3u8':
- format_id_list.append('hls')
- elif source.get('type') == 'video/dash' or ext == 'mpd':
- format_id_list.append('dash')
- format_id = '-'.join(filter(None, format_id_list))
+ format_id = join_nonempty(
+ lang, kind,
+ 'hls' if ext == 'm3u8' else None,
+ 'dash' if source.get('type') == 'video/dash' or ext == 'mpd' else None)
if ext == 'm3u8':
file_formats = self._extract_m3u8_formats(
file_, video_id, 'mp4',
diff --git a/hypervideo_dl/extractor/ant1newsgr.py b/hypervideo_dl/extractor/ant1newsgr.py
new file mode 100644
index 0000000..1075b46
--- /dev/null
+++ b/hypervideo_dl/extractor/ant1newsgr.py
@@ -0,0 +1,143 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import urllib.parse
+
+from .common import InfoExtractor
+from ..utils import (
+ HEADRequest,
+ ExtractorError,
+ determine_ext,
+ scale_thumbnails_to_max_format_width,
+ unescapeHTML,
+)
+
+
+class Ant1NewsGrBaseIE(InfoExtractor):
+ def _download_and_extract_api_data(self, video_id, netloc, cid=None):
+ url = f'{self.http_scheme()}//{netloc}{self._API_PATH}'
+ info = self._download_json(url, video_id, query={'cid': cid or video_id})
+ try:
+ source = info['url']
+ except KeyError:
+ raise ExtractorError('no source found for %s' % video_id)
+ formats, subs = (self._extract_m3u8_formats_and_subtitles(source, video_id, 'mp4')
+ if determine_ext(source) == 'm3u8' else ([{'url': source}], {}))
+ self._sort_formats(formats)
+ thumbnails = scale_thumbnails_to_max_format_width(
+ formats, [{'url': info['thumb']}], r'(?<=/imgHandler/)\d+')
+ return {
+ 'id': video_id,
+ 'title': info.get('title'),
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ 'subtitles': subs,
+ }
+
+
+class Ant1NewsGrWatchIE(Ant1NewsGrBaseIE):
+ IE_NAME = 'ant1newsgr:watch'
+ IE_DESC = 'ant1news.gr videos'
+ _VALID_URL = r'https?://(?P<netloc>(?:www\.)?ant1news\.gr)/watch/(?P<id>\d+)/'
+ _API_PATH = '/templates/data/player'
+
+ _TESTS = [{
+ 'url': 'https://www.ant1news.gr/watch/1506168/ant1-news-09112021-stis-18-45',
+ 'md5': '95925e6b32106754235f2417e0d2dfab',
+ 'info_dict': {
+ 'id': '1506168',
+ 'ext': 'mp4',
+ 'title': 'md5:0ad00fa66ecf8aa233d26ab0dba7514a',
+ 'description': 'md5:18665af715a6dcfeac1d6153a44f16b0',
+ 'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/26d46bf6-8158-4f02-b197-7096c714b2de.jpg',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id, netloc = self._match_valid_url(url).group('id', 'netloc')
+ webpage = self._download_webpage(url, video_id)
+ info = self._download_and_extract_api_data(video_id, netloc)
+ info['description'] = self._og_search_description(webpage)
+ return info
+
+
+class Ant1NewsGrArticleIE(Ant1NewsGrBaseIE):
+ IE_NAME = 'ant1newsgr:article'
+ IE_DESC = 'ant1news.gr articles'
+ _VALID_URL = r'https?://(?:www\.)?ant1news\.gr/[^/]+/article/(?P<id>\d+)/'
+
+ _TESTS = [{
+ 'url': 'https://www.ant1news.gr/afieromata/article/549468/o-tzeims-mpont-sta-meteora-oi-apeiles-kai-o-xesikomos-ton-kalogeron',
+ 'md5': '294f18331bb516539d72d85a82887dcc',
+ 'info_dict': {
+ 'id': '_xvg/m_cmbatw=',
+ 'ext': 'mp4',
+ 'title': 'md5:a93e8ecf2e4073bfdffcb38f59945411',
+ 'timestamp': 1603092840,
+ 'upload_date': '20201019',
+ 'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/756206d2-d640-40e2-b201-3555abdfc0db.jpg',
+ },
+ }, {
+ 'url': 'https://ant1news.gr/Society/article/620286/symmoria-anilikon-dikigoros-thymaton-ithelan-na-toys-apoteleiosoyn',
+ 'info_dict': {
+ 'id': '620286',
+ 'title': 'md5:91fe569e952e4d146485740ae927662b',
+ },
+ 'playlist_mincount': 2,
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle')
+ embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage))
+ if not embed_urls:
+ raise ExtractorError('no videos found for %s' % video_id, expected=True)
+ return self.playlist_from_matches(
+ embed_urls, video_id, info.get('title'), ie=Ant1NewsGrEmbedIE.ie_key(),
+ video_kwargs={'url_transparent': True, 'timestamp': info.get('timestamp')})
+
+
+class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
+ IE_NAME = 'ant1newsgr:embed'
+ IE_DESC = 'ant1news.gr embedded videos'
+ _BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player'
+ _VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)'
+ _API_PATH = '/news/templates/data/jsonPlayer'
+
+ _TESTS = [{
+ 'url': 'https://www.antenna.gr/templates/pages/player?cid=3f_li_c_az_jw_y_u=&w=670&h=377',
+ 'md5': 'dfc58c3a11a5a9aad2ba316ed447def3',
+ 'info_dict': {
+ 'id': '3f_li_c_az_jw_y_u=',
+ 'ext': 'mp4',
+ 'title': 'md5:a30c93332455f53e1e84ae0724f0adf7',
+ 'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/bbe31201-3f09-4a4e-87f5-8ad2159fffe2.jpg',
+ },
+ }]
+
+ @classmethod
+ def _extract_urls(cls, webpage):
+ _EMBED_URL_RE = rf'{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
+ _EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_EMBED_URL_RE})(?P=_q1)'
+ for mobj in re.finditer(_EMBED_RE, webpage):
+ url = unescapeHTML(mobj.group('url'))
+ if not cls.suitable(url):
+ continue
+ yield url
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ canonical_url = self._request_webpage(
+ HEADRequest(url), video_id,
+ note='Resolve canonical player URL',
+ errnote='Could not resolve canonical player URL').geturl()
+ _, netloc, _, _, query, _ = urllib.parse.urlparse(canonical_url)
+ cid = urllib.parse.parse_qs(query)['cid'][0]
+
+ return self._download_and_extract_api_data(video_id, netloc, cid=cid)
diff --git a/hypervideo_dl/extractor/anvato.py b/hypervideo_dl/extractor/anvato.py
index b82f0b5..686d453 100644
--- a/hypervideo_dl/extractor/anvato.py
+++ b/hypervideo_dl/extractor/anvato.py
@@ -16,6 +16,7 @@ from ..utils import (
determine_ext,
intlist_to_bytes,
int_or_none,
+ join_nonempty,
strip_jsonp,
unescapeHTML,
unsmuggle_url,
@@ -303,13 +304,13 @@ class AnvatoIE(InfoExtractor):
tbr = int_or_none(published_url.get('kbps'))
a_format = {
'url': video_url,
- 'format_id': ('-'.join(filter(None, ['http', published_url.get('cdn_name')]))).lower(),
- 'tbr': tbr if tbr != 0 else None,
+ 'format_id': join_nonempty('http', published_url.get('cdn_name')).lower(),
+ 'tbr': tbr or None,
}
if media_format == 'm3u8' and tbr is not None:
a_format.update({
- 'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])),
+ 'format_id': join_nonempty('hls', tbr),
'ext': 'mp4',
})
elif media_format == 'm3u8-variant' or ext == 'm3u8':
diff --git a/hypervideo_dl/extractor/aparat.py b/hypervideo_dl/extractor/aparat.py
index da06a3c..1057233 100644
--- a/hypervideo_dl/extractor/aparat.py
+++ b/hypervideo_dl/extractor/aparat.py
@@ -33,19 +33,22 @@ class AparatIE(InfoExtractor):
'only_matching': True,
}]
+ def _parse_options(self, webpage, video_id, fatal=True):
+ return self._parse_json(self._search_regex(
+ r'options\s*=\s*({.+?})\s*;', webpage, 'options', default='{}'), video_id)
+
def _real_extract(self, url):
video_id = self._match_id(url)
- # Provides more metadata
+ # If available, provides more metadata
webpage = self._download_webpage(url, video_id, fatal=False)
+ options = self._parse_options(webpage, video_id, fatal=False)
- if not webpage:
+ if not options:
webpage = self._download_webpage(
'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id,
- video_id)
-
- options = self._parse_json(self._search_regex(
- r'options\s*=\s*({.+?})\s*;', webpage, 'options'), video_id)
+ video_id, 'Downloading embed webpage')
+ options = self._parse_options(webpage, video_id)
formats = []
for sources in (options.get('multiSRC') or []):
diff --git a/hypervideo_dl/extractor/applepodcasts.py b/hypervideo_dl/extractor/applepodcasts.py
index 6a74de7..9139ff7 100644
--- a/hypervideo_dl/extractor/applepodcasts.py
+++ b/hypervideo_dl/extractor/applepodcasts.py
@@ -3,7 +3,9 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
+ clean_html,
clean_podcast_url,
+ get_element_by_class,
int_or_none,
parse_iso8601,
try_get,
@@ -14,16 +16,17 @@ class ApplePodcastsIE(InfoExtractor):
_VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)'
_TESTS = [{
'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
- 'md5': 'df02e6acb11c10e844946a39e7222b08',
+ 'md5': '41dc31cd650143e530d9423b6b5a344f',
'info_dict': {
'id': '1000482637777',
'ext': 'mp3',
'title': '207 - Whitney Webb Returns',
- 'description': 'md5:13a73bade02d2e43737751e3987e1399',
+ 'description': 'md5:75ef4316031df7b41ced4e7b987f79c6',
'upload_date': '20200705',
- 'timestamp': 1593921600,
- 'duration': 6425,
+ 'timestamp': 1593932400,
+ 'duration': 6454,
'series': 'The Tim Dillon Show',
+ 'thumbnail': 're:.+[.](png|jpe?g|webp)',
}
}, {
'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
@@ -39,24 +42,47 @@ class ApplePodcastsIE(InfoExtractor):
def _real_extract(self, url):
episode_id = self._match_id(url)
webpage = self._download_webpage(url, episode_id)
- ember_data = self._parse_json(self._search_regex(
- r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
- webpage, 'ember data'), episode_id)
- ember_data = ember_data.get(episode_id) or ember_data
- episode = ember_data['data']['attributes']
+ episode_data = {}
+ ember_data = {}
+        # New page type as of 2021-11
+ amp_data = self._parse_json(self._search_regex(
+ r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<',
+ webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {}
+ amp_data = try_get(amp_data,
+ lambda a: self._parse_json(
+ next(a[x] for x in iter(a) if episode_id in x),
+ episode_id),
+ dict) or {}
+ amp_data = amp_data.get('d') or []
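+        # 'd' holds a flat list of typed records; pick out the podcast-episodes record matching this episode id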
+ episode_data = try_get(
+ amp_data,
+ lambda a: next(x for x in a
+ if x['type'] == 'podcast-episodes' and x['id'] == episode_id),
+ dict)
+ if not episode_data:
+            # Try the pre-2021-11 page type; TODO: consider deleting if no longer used
+ ember_data = self._parse_json(self._search_regex(
+ r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
+ webpage, 'ember data'), episode_id) or {}
+ ember_data = ember_data.get(episode_id) or ember_data
+ episode_data = try_get(ember_data, lambda x: x['data'], dict)
+ episode = episode_data['attributes']
description = episode.get('description') or {}
series = None
- for inc in (ember_data.get('included') or []):
+ for inc in (amp_data or ember_data.get('included') or []):
if inc.get('type') == 'media/podcast':
series = try_get(inc, lambda x: x['attributes']['name'])
+ series = series or clean_html(get_element_by_class('podcast-header__identity', webpage))
return {
'id': episode_id,
- 'title': episode['name'],
+ 'title': episode.get('name'),
'url': clean_podcast_url(episode['assetUrl']),
'description': description.get('standard') or description.get('short'),
'timestamp': parse_iso8601(episode.get('releaseDateTime')),
'duration': int_or_none(episode.get('durationInMilliseconds'), 1000),
'series': series,
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'vcodec': 'none',
}
diff --git a/hypervideo_dl/extractor/archiveorg.py b/hypervideo_dl/extractor/archiveorg.py
index d90fcb1..2ab3c1b 100644
--- a/hypervideo_dl/extractor/archiveorg.py
+++ b/hypervideo_dl/extractor/archiveorg.py
@@ -3,33 +3,37 @@ from __future__ import unicode_literals
import re
import json
-
from .common import InfoExtractor
-from .youtube import YoutubeIE
+from .youtube import YoutubeIE, YoutubeBaseInfoExtractor
from ..compat import (
compat_urllib_parse_unquote,
compat_urllib_parse_unquote_plus,
compat_HTTPError
)
from ..utils import (
+ bug_reports_message,
clean_html,
- determine_ext,
dict_get,
extract_attributes,
ExtractorError,
+ get_element_by_id,
HEADRequest,
int_or_none,
+ join_nonempty,
KNOWN_EXTENSIONS,
merge_dicts,
mimetype2ext,
+ orderedSet,
parse_duration,
parse_qs,
- RegexNotFoundError,
str_to_int,
str_or_none,
+ traverse_obj,
try_get,
unified_strdate,
unified_timestamp,
+ urlhandle_detect_ext,
+ url_or_none
)
@@ -61,7 +65,7 @@ class ArchiveOrgIE(InfoExtractor):
'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c',
'uploader': 'yorkmba99@hotmail.com',
'timestamp': 1387699629,
- 'upload_date': "20131222",
+ 'upload_date': '20131222',
},
}, {
'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
@@ -147,8 +151,7 @@ class ArchiveOrgIE(InfoExtractor):
# Archive.org metadata API doesn't clearly demarcate playlist entries
# or subtitle tracks, so we get them from the embeddable player.
- embed_page = self._download_webpage(
- 'https://archive.org/embed/' + identifier, identifier)
+ embed_page = self._download_webpage(f'https://archive.org/embed/{identifier}', identifier)
playlist = self._playlist_data(embed_page)
entries = {}
@@ -163,17 +166,17 @@ class ArchiveOrgIE(InfoExtractor):
'thumbnails': [],
'artist': p.get('artist'),
'track': p.get('title'),
- 'subtitles': {}}
+ 'subtitles': {},
+ }
for track in p.get('tracks', []):
if track['kind'] != 'subtitles':
continue
-
entries[p['orig']][track['label']] = {
- 'url': 'https://archive.org/' + track['file'].lstrip('/')}
+ 'url': 'https://archive.org/' + track['file'].lstrip('/')
+ }
- metadata = self._download_json(
- 'http://archive.org/metadata/' + identifier, identifier)
+ metadata = self._download_json('http://archive.org/metadata/' + identifier, identifier)
m = metadata['metadata']
identifier = m['identifier']
@@ -186,7 +189,7 @@ class ArchiveOrgIE(InfoExtractor):
'license': m.get('licenseurl'),
'release_date': unified_strdate(m.get('date')),
'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
- 'webpage_url': 'https://archive.org/details/' + identifier,
+ 'webpage_url': f'https://archive.org/details/{identifier}',
'location': m.get('venue'),
'release_year': int_or_none(m.get('year'))}
@@ -204,7 +207,7 @@ class ArchiveOrgIE(InfoExtractor):
'discnumber': int_or_none(f.get('disc')),
'release_year': int_or_none(f.get('year'))})
entry = entries[f['name']]
- elif f.get('original') in entries:
+ elif traverse_obj(f, 'original', expected_type=str) in entries:
entry = entries[f['original']]
else:
continue
@@ -227,13 +230,12 @@ class ArchiveOrgIE(InfoExtractor):
'filesize': int_or_none(f.get('size')),
'protocol': 'https'})
- # Sort available formats by filesize
for entry in entries.values():
- entry['formats'] = list(sorted(entry['formats'], key=lambda x: x.get('filesize', -1)))
+ self._sort_formats(entry['formats'])
if len(entries) == 1:
# If there's only one item, use it as the main info dict
- only_video = entries[list(entries.keys())[0]]
+ only_video = next(iter(entries.values()))
if entry_id:
info = merge_dicts(only_video, info)
else:
@@ -258,19 +260,19 @@ class ArchiveOrgIE(InfoExtractor):
class YoutubeWebArchiveIE(InfoExtractor):
IE_NAME = 'web.archive:youtube'
- IE_DESC = 'web.archive.org saved youtube videos'
- _VALID_URL = r"""(?x)^
- (?:https?://)?web\.archive\.org/
- (?:web/)?
- (?:[0-9A-Za-z_*]+/)? # /web and the version index is optional
-
- (?:https?(?::|%3[Aa])//)?
- (?:
- (?:\w+\.)?youtube\.com/watch(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL
- |(wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url
- )
- (?P<id>[0-9A-Za-z_-]{11})(?:%26|\#|&|$)
- """
+ IE_DESC = 'web.archive.org saved youtube videos, "ytarchive:" prefix'
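+    # Matches both real web.archive.org capture URLs and a shorthand 'ytarchive:ID[:date]' form (handled by the (?(prefix)...) conditional below)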
+ _VALID_URL = r'''(?x)(?:(?P<prefix>ytarchive:)|
+ (?:https?://)?web\.archive\.org/
+            (?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index are optional
+ (?:https?(?::|%3[Aa])//)?(?:
+ (?:\w+\.)?youtube\.com(?::(?:80|443))?/watch(?:\.php)?(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL
+ |(?:wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url
+ )
+ )(?P<id>[0-9A-Za-z_-]{11})
+ (?(prefix)
+ (?::(?P<date2>[0-9]{14}))?$|
+ (?:%26|[#&]|$)
+ )'''
_TESTS = [
{
@@ -278,141 +280,395 @@ class YoutubeWebArchiveIE(InfoExtractor):
'info_dict': {
'id': 'aYAGB11YrSs',
'ext': 'webm',
- 'title': 'Team Fortress 2 - Sandviches!'
+ 'title': 'Team Fortress 2 - Sandviches!',
+ 'description': 'md5:4984c0f9a07f349fc5d8e82ab7af4eaf',
+ 'upload_date': '20110926',
+ 'uploader': 'Zeurel',
+ 'channel_id': 'UCukCyHaD-bK3in_pKpfH9Eg',
+ 'duration': 32,
+ 'uploader_id': 'Zeurel',
+ 'uploader_url': 'http://www.youtube.com/user/Zeurel'
}
- },
- {
+ }, {
# Internal link
'url': 'https://web.archive.org/web/2oe/http://wayback-fakeurl.archive.org/yt/97t7Xj_iBv0',
'info_dict': {
'id': '97t7Xj_iBv0',
'ext': 'mp4',
- 'title': 'How Flexible Machines Could Save The World'
+ 'title': 'Why Machines That Bend Are Better',
+ 'description': 'md5:00404df2c632d16a674ff8df1ecfbb6c',
+ 'upload_date': '20190312',
+ 'uploader': 'Veritasium',
+ 'channel_id': 'UCHnyfMqiRRG1u-2MsSQLbXA',
+ 'duration': 771,
+ 'uploader_id': '1veritasium',
+ 'uploader_url': 'http://www.youtube.com/user/1veritasium'
}
- },
- {
- # Video from 2012, webm format itag 45.
+ }, {
+        # Video from 2012, webm format itag 45. The newest capture is of a deleted video with an invalid description.
+        # Extraction should use the date in the link. Title ends with '- Youtube'; the capture has its description in eow-description.
'url': 'https://web.archive.org/web/20120712231619/http://www.youtube.com/watch?v=AkhihxRKcrs&gl=US&hl=en',
'info_dict': {
'id': 'AkhihxRKcrs',
'ext': 'webm',
- 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)'
+ 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)',
+ 'upload_date': '20120712',
+ 'duration': 398,
+ 'description': 'md5:ff4de6a7980cb65d951c2f6966a4f2f3',
+ 'uploader_id': 'machinima',
+ 'uploader_url': 'http://www.youtube.com/user/machinima'
}
- },
- {
- # Old flash-only video. Webpage title starts with "YouTube - ".
+ }, {
+ # FLV video. Video file URL does not provide itag information
'url': 'https://web.archive.org/web/20081211103536/http://www.youtube.com/watch?v=jNQXAC9IVRw',
'info_dict': {
'id': 'jNQXAC9IVRw',
- 'ext': 'unknown_video',
- 'title': 'Me at the zoo'
+ 'ext': 'flv',
+ 'title': 'Me at the zoo',
+ 'upload_date': '20050423',
+ 'channel_id': 'UC4QobU6STFB0P71PMvOGN5A',
+ 'duration': 19,
+ 'description': 'md5:10436b12e07ac43ff8df65287a56efb4',
+ 'uploader_id': 'jawed',
+ 'uploader_url': 'http://www.youtube.com/user/jawed'
}
- },
- {
- # Flash video with .flv extension (itag 34). Title has prefix "YouTube -"
- # Title has some weird unicode characters too.
+ }, {
'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA',
'info_dict': {
'id': 'lTx3G6h2xyA',
'ext': 'flv',
- 'title': '‪Madeon - Pop Culture (live mashup)‬‏'
+ 'title': 'Madeon - Pop Culture (live mashup)',
+ 'upload_date': '20110711',
+ 'uploader': 'Madeon',
+ 'channel_id': 'UCqMDNf3Pn5L7pcNkuSEeO3w',
+ 'duration': 204,
+ 'description': 'md5:f7535343b6eda34a314eff8b85444680',
+ 'uploader_id': 'itsmadeon',
+ 'uploader_url': 'http://www.youtube.com/user/itsmadeon'
}
- },
- { # Some versions of Youtube have have "YouTube" as page title in html (and later rewritten by js).
- 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw',
+ }, {
+        # First capture is of a dead video; the second is the oldest from the CDX response.
+ 'url': 'https://web.archive.org/https://www.youtube.com/watch?v=1JYutPM8O6E',
'info_dict': {
- 'id': 'kH-G_aIBlFw',
+ 'id': '1JYutPM8O6E',
'ext': 'mp4',
- 'title': 'kH-G_aIBlFw'
- },
- 'expected_warnings': [
- 'unable to extract title',
- ]
- },
- {
- # First capture is a 302 redirect intermediary page.
- 'url': 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=0altSZ96U4M',
+ 'title': 'Fake Teen Doctor Strikes AGAIN! - Weekly Weird News',
+ 'upload_date': '20160218',
+ 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA',
+ 'duration': 1236,
+ 'description': 'md5:21032bae736421e89c2edf36d1936947',
+ 'uploader_id': 'MachinimaETC',
+ 'uploader_url': 'http://www.youtube.com/user/MachinimaETC'
+ }
+ }, {
+        # First capture is of a dead video; the capture date in the link points to that dead capture.
+ 'url': 'https://web.archive.org/web/20180803221945/https://www.youtube.com/watch?v=6FPhZJGvf4E',
'info_dict': {
- 'id': '0altSZ96U4M',
+ 'id': '6FPhZJGvf4E',
'ext': 'mp4',
- 'title': '0altSZ96U4M'
+ 'title': 'WTF: Video Games Still Launch BROKEN?! - T.U.G.S.',
+ 'upload_date': '20160219',
+ 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA',
+ 'duration': 798,
+ 'description': 'md5:a1dbf12d9a3bd7cb4c5e33b27d77ffe7',
+ 'uploader_id': 'MachinimaETC',
+ 'uploader_url': 'http://www.youtube.com/user/MachinimaETC'
},
'expected_warnings': [
- 'unable to extract title',
+ r'unable to download capture webpage \(it may not be archived\)'
]
- },
- {
+ }, { # Very old YouTube page, has - YouTube in title.
+ 'url': 'http://web.archive.org/web/20070302011044/http://youtube.com/watch?v=-06-KB9XTzg',
+ 'info_dict': {
+ 'id': '-06-KB9XTzg',
+ 'ext': 'flv',
+ 'title': 'New Coin Hack!! 100% Safe!!'
+ }
+ }, {
+ 'url': 'web.archive.org/https://www.youtube.com/watch?v=dWW7qP423y8',
+ 'info_dict': {
+ 'id': 'dWW7qP423y8',
+ 'ext': 'mp4',
+ 'title': 'It\'s Bootleg AirPods Time.',
+ 'upload_date': '20211021',
+ 'channel_id': 'UC7Jwj9fkrf1adN4fMmTkpug',
+ 'channel_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug',
+ 'duration': 810,
+ 'description': 'md5:7b567f898d8237b256f36c1a07d6d7bc',
+ 'uploader': 'DankPods',
+ 'uploader_id': 'UC7Jwj9fkrf1adN4fMmTkpug',
+ 'uploader_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug'
+ }
+ }, {
+        # player response contains '};'. See: https://github.com/ytdl-org/youtube-dl/issues/27093
+ 'url': 'https://web.archive.org/web/20200827003909if_/http://www.youtube.com/watch?v=6Dh-RL__uN4',
+ 'info_dict': {
+ 'id': '6Dh-RL__uN4',
+ 'ext': 'mp4',
+ 'title': 'bitch lasagna',
+ 'upload_date': '20181005',
+ 'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw',
+ 'channel_url': 'http://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw',
+ 'duration': 135,
+ 'description': 'md5:2dbe4051feeff2dab5f41f82bb6d11d0',
+ 'uploader': 'PewDiePie',
+ 'uploader_id': 'PewDiePie',
+ 'uploader_url': 'http://www.youtube.com/user/PewDiePie'
+ }
+ }, {
+ 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw',
+ 'only_matching': True
+ }, {
+ 'url': 'https://web.archive.org/web/20050214000000_if/http://www.youtube.com/watch?v=0altSZ96U4M',
+ 'only_matching': True
+ }, {
# Video not archived, only capture is unavailable video page
'url': 'https://web.archive.org/web/20210530071008/https://www.youtube.com/watch?v=lHJTf93HL1s&spfreload=10',
- 'only_matching': True,
- },
- { # Encoded url
+ 'only_matching': True
+ }, { # Encoded url
'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fgl%3DUS%26v%3DAkhihxRKcrs%26hl%3Den',
- 'only_matching': True,
- },
- {
+ 'only_matching': True
+ }, {
'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fv%3DAkhihxRKcrs%26gl%3DUS%26hl%3Den',
- 'only_matching': True,
- }
+ 'only_matching': True
+ }, {
+ 'url': 'https://web.archive.org/web/20060527081937/http://www.youtube.com:80/watch.php?v=ELTFsLT73fA&amp;search=soccer',
+ 'only_matching': True
+ }, {
+ 'url': 'https://web.archive.org/http://www.youtube.com:80/watch?v=-05VVye-ffg',
+ 'only_matching': True
+ }, {
+ 'url': 'ytarchive:BaW_jenozKc:20050214000000',
+ 'only_matching': True
+ }, {
+ 'url': 'ytarchive:BaW_jenozKc',
+ 'only_matching': True
+ },
]
+ _YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE
+ _YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE
+ _YT_INITIAL_BOUNDARY_RE = r'(?:(?:var\s+meta|</script|\n)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_BOUNDARY_RE
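+    # Each regex above wraps the YouTube extractor's pattern with older variants (e.g. explicit window[...] assignments) found in archived pages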
+
+ _YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com'] # thumbnails most likely archived on these servers
+ _YT_ALL_THUMB_SERVERS = orderedSet(
+ _YT_DEFAULT_THUMB_SERVERS + ['img.youtube.com', *[f'{c}{n or ""}.ytimg.com' for c in ('i', 's') for n in (*range(0, 5), 9)]])
+
+ _WAYBACK_BASE_URL = 'https://web.archive.org/web/%sif_/'
+ _OLDEST_CAPTURE_DATE = 20050214000000
+ _NEWEST_CAPTURE_DATE = 20500101000000
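+    # Fallback timestamps: the Wayback Machine appears to redirect a too-early date to the earliest capture and a far-future date to the newest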
+
+ def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note=None, fatal=False):
+ # CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md
+ query = {
+ 'url': url,
+ 'output': 'json',
+ 'fl': 'original,mimetype,length,timestamp',
+ 'limit': 500,
+ 'filter': ['statuscode:200'] + (filters or []),
+ 'collapse': collapse or [],
+ **(query or {})
+ }
+ res = self._download_json(
+ 'https://web.archive.org/cdx/search/cdx', item_id,
+ note or 'Downloading CDX API JSON', query=query, fatal=fatal)
+ if isinstance(res, list) and len(res) >= 2:
+ # format response to make it easier to use
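+            # The first row is the header (the 'fl' fields requested above); zip each following row into a dict keyed by those names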
+ return list(dict(zip(res[0], v)) for v in res[1:])
+ elif not isinstance(res, list) or len(res) != 0:
+ self.report_warning('Error while parsing CDX API response' + bug_reports_message())
+
+ def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
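+        # Try the variable terminated by _YT_INITIAL_BOUNDARY_RE first, then fall back to the bare pattern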
+ return self._parse_json(self._search_regex(
+ (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
+ regex), webpage, name, default='{}'), video_id, fatal=False)
+
+ def _extract_webpage_title(self, webpage):
+ page_title = self._html_extract_title(webpage, default='')
+ # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix.
+ return self._html_search_regex(
+ r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)',
+ page_title, 'title', default='')
+
+ def _extract_metadata(self, video_id, webpage):
+ search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None))
+ player_response = self._extract_yt_initial_variable(
+ webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {}
+ initial_data = self._extract_yt_initial_variable(
+            webpage, self._YT_INITIAL_DATA_RE, video_id, 'initial data') or {}
+
+ initial_data_video = traverse_obj(
+ initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'),
+ expected_type=dict, get_all=False, default={})
+
+ video_details = traverse_obj(
+ player_response, 'videoDetails', expected_type=dict, get_all=False, default={})
+
+ microformats = traverse_obj(
+ player_response, ('microformat', 'playerMicroformatRenderer'), expected_type=dict, get_all=False, default={})
+
+ video_title = (
+ video_details.get('title')
+ or YoutubeBaseInfoExtractor._get_text(microformats, 'title')
+ or YoutubeBaseInfoExtractor._get_text(initial_data_video, 'title')
+ or self._extract_webpage_title(webpage)
+ or search_meta(['og:title', 'twitter:title', 'title']))
+
+ channel_id = str_or_none(
+ video_details.get('channelId')
+ or microformats.get('externalChannelId')
+ or search_meta('channelId')
+ or self._search_regex(
+ r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1', # @b45a9e6
+ webpage, 'channel id', default=None, group='id'))
+ channel_url = f'http://www.youtube.com/channel/{channel_id}' if channel_id else None
+
+ duration = int_or_none(
+ video_details.get('lengthSeconds')
+ or microformats.get('lengthSeconds')
+ or parse_duration(search_meta('duration')))
+ description = (
+ video_details.get('shortDescription')
+ or YoutubeBaseInfoExtractor._get_text(microformats, 'description')
+ or clean_html(get_element_by_id('eow-description', webpage)) # @9e6dd23
+ or search_meta(['description', 'og:description', 'twitter:description']))
+
+ uploader = video_details.get('author')
+
+ # Uploader ID and URL
+ uploader_mobj = re.search(
+ r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', # @fd05024
+ webpage)
+ if uploader_mobj is not None:
+ uploader_id, uploader_url = uploader_mobj.group('uploader_id'), uploader_mobj.group('uploader_url')
+ else:
+ # @a6211d2
+ uploader_url = url_or_none(microformats.get('ownerProfileUrl'))
+ uploader_id = self._search_regex(
+ r'(?:user|channel)/([^/]+)', uploader_url or '', 'uploader id', default=None)
+
+ upload_date = unified_strdate(
+ dict_get(microformats, ('uploadDate', 'publishDate'))
+ or search_meta(['uploadDate', 'datePublished'])
+ or self._search_regex(
+ [r'(?s)id="eow-date.*?>(.*?)</span>',
+ r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], # @7998520
+ webpage, 'upload date', default=None))
+
+ return {
+ 'title': video_title,
+ 'description': description,
+ 'upload_date': upload_date,
+ 'uploader': uploader,
+ 'channel_id': channel_id,
+ 'channel_url': channel_url,
+ 'duration': duration,
+ 'uploader_url': uploader_url,
+ 'uploader_id': uploader_id,
+ }
+
+ def _extract_thumbnails(self, video_id):
+ try_all = 'thumbnails' in self._configuration_arg('check_all')
+ thumbnail_base_urls = ['http://{server}/vi{webp}/{video_id}'.format(
+ webp='_webp' if ext == 'webp' else '', video_id=video_id, server=server)
+ for server in (self._YT_ALL_THUMB_SERVERS if try_all else self._YT_DEFAULT_THUMB_SERVERS) for ext in (('jpg', 'webp') if try_all else ('jpg',))]
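+        # Each base URL is queried with matchType=prefix so every archived size under it is found; file length doubles as a quality hint below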
+
+ thumbnails = []
+ for url in thumbnail_base_urls:
+ response = self._call_cdx_api(
+ video_id, url, filters=['mimetype:image/(?:webp|jpeg)'],
+ collapse=['urlkey'], query={'matchType': 'prefix'})
+ if not response:
+ continue
+ thumbnails.extend(
+ {
+ 'url': (self._WAYBACK_BASE_URL % (int_or_none(thumbnail_dict.get('timestamp')) or self._OLDEST_CAPTURE_DATE)) + thumbnail_dict.get('original'),
+ 'filesize': int_or_none(thumbnail_dict.get('length')),
+ 'preference': int_or_none(thumbnail_dict.get('length'))
+ } for thumbnail_dict in response)
+ if not try_all:
+ break
+
+ self._remove_duplicate_formats(thumbnails)
+ return thumbnails
+
+ def _get_capture_dates(self, video_id, url_date):
+ capture_dates = []
+ # Note: CDX API will not find watch pages with extra params in the url.
+ response = self._call_cdx_api(
+ video_id, f'https://www.youtube.com/watch?v={video_id}',
+ filters=['mimetype:text/html'], collapse=['timestamp:6', 'digest'], query={'matchType': 'prefix'}) or []
+ all_captures = sorted([int_or_none(r['timestamp']) for r in response if int_or_none(r['timestamp']) is not None])
+
+ # Prefer the new polymer UI captures as we support extracting more metadata from them
+ # WBM captures seem to all switch to this layout ~July 2020
+ modern_captures = [x for x in all_captures if x >= 20200701000000]
+ if modern_captures:
+ capture_dates.append(modern_captures[0])
+ capture_dates.append(url_date)
+ if all_captures:
+ capture_dates.append(all_captures[0])
+
+ if 'captures' in self._configuration_arg('check_all'):
+ capture_dates.extend(modern_captures + all_captures)
+
+ # Fallbacks if any of the above fail
+ capture_dates.extend([self._OLDEST_CAPTURE_DATE, self._NEWEST_CAPTURE_DATE])
+ return orderedSet(filter(None, capture_dates))
def _real_extract(self, url):
- video_id = self._match_id(url)
- title = video_id # if we are not able get a title
-
- def _extract_title(webpage):
- page_title = self._html_search_regex(
- r'<title>([^<]*)</title>', webpage, 'title', fatal=False) or ''
- # YouTube video pages appear to always have either 'YouTube -' as suffix or '- YouTube' as prefix.
- try:
- page_title = self._html_search_regex(
- r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)',
- page_title, 'title', default='')
- except RegexNotFoundError:
- page_title = None
-
- if not page_title:
- self.report_warning('unable to extract title', video_id=video_id)
- return
- return page_title
-
- # If the video is no longer available, the oldest capture may be one before it was removed.
- # Setting the capture date in url to early date seems to redirect to earliest capture.
- webpage = self._download_webpage(
- 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=%s' % video_id,
- video_id=video_id, fatal=False, errnote='unable to download video webpage (probably not archived).')
- if webpage:
- title = _extract_title(webpage) or title
-
- # Use link translator mentioned in https://github.com/ytdl-org/youtube-dl/issues/13655
- internal_fake_url = 'https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id
+ video_id, url_date, url_date_2 = self._match_valid_url(url).group('id', 'date', 'date2')
+ url_date = url_date or url_date_2
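+        # 'date' comes from web.archive.org capture URLs, 'date2' from the ytarchive:ID:date shorthand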
+
+ urlh = None
try:
- video_file_webpage = self._request_webpage(
- HEADRequest(internal_fake_url), video_id,
- note='Fetching video file url', expected_status=True)
+ urlh = self._request_webpage(
+ HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id),
+ video_id, note='Fetching archived video file url', expected_status=True)
except ExtractorError as e:
# HTTP Error 404 is expected if the video is not saved.
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
- raise ExtractorError(
- 'HTTP Error %s. Most likely the video is not archived or issue with web.archive.org.' % e.cause.code,
+ self.raise_no_formats(
+ 'The requested video is not archived, indexed, or there is an issue with web.archive.org',
expected=True)
- raise
- video_file_url = compat_urllib_parse_unquote(video_file_webpage.url)
- video_file_url_qs = parse_qs(video_file_url)
-
- # Attempt to recover any ext & format info from playback url
- format = {'url': video_file_url}
- itag = try_get(video_file_url_qs, lambda x: x['itag'][0])
- if itag and itag in YoutubeIE._formats: # Naughty access but it works
- format.update(YoutubeIE._formats[itag])
- format.update({'format_id': itag})
- else:
- mime = try_get(video_file_url_qs, lambda x: x['mime'][0])
- ext = mimetype2ext(mime) or determine_ext(video_file_url)
- format.update({'ext': ext})
- return {
- 'id': video_id,
- 'title': title,
- 'formats': [format],
- 'duration': str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0]))
- }
+ else:
+ raise
+
+ capture_dates = self._get_capture_dates(video_id, int_or_none(url_date))
+ self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', '))
+ info = {'id': video_id}
+ for capture in capture_dates:
+ webpage = self._download_webpage(
+ (self._WAYBACK_BASE_URL + 'http://www.youtube.com/watch?v=%s') % (capture, video_id),
+ video_id=video_id, fatal=False, errnote='unable to download capture webpage (it may not be archived)',
+ note='Downloading capture webpage')
+ current_info = self._extract_metadata(video_id, webpage or '')
+            # Try to avoid getting deleted video metadata
+ if current_info.get('title'):
+ info = merge_dicts(info, current_info)
+ if 'captures' not in self._configuration_arg('check_all'):
+ break
+
+ info['thumbnails'] = self._extract_thumbnails(video_id)
+
+ if urlh:
+ url = compat_urllib_parse_unquote(urlh.geturl())
+ video_file_url_qs = parse_qs(url)
+ # Attempt to recover any ext & format info from playback url & response headers
+ format = {'url': url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))}
+ itag = try_get(video_file_url_qs, lambda x: x['itag'][0])
+ if itag and itag in YoutubeIE._formats:
+ format.update(YoutubeIE._formats[itag])
+ format.update({'format_id': itag})
+ else:
+ mime = try_get(video_file_url_qs, lambda x: x['mime'][0])
+ ext = (mimetype2ext(mime)
+ or urlhandle_detect_ext(urlh)
+ or mimetype2ext(urlh.headers.get('x-archive-guessed-content-type')))
+ format.update({'ext': ext})
+ info['formats'] = [format]
+ if not info.get('duration'):
+ info['duration'] = str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0]))
+
+ if not info.get('title'):
+ info['title'] = video_id
+ return info
diff --git a/hypervideo_dl/extractor/arcpublishing.py b/hypervideo_dl/extractor/arcpublishing.py
index 5a9b818..8880e5c 100644
--- a/hypervideo_dl/extractor/arcpublishing.py
+++ b/hypervideo_dl/extractor/arcpublishing.py
@@ -124,8 +124,7 @@ class ArcPublishingIE(InfoExtractor):
formats.extend(smil_formats)
elif stream_type in ('ts', 'hls'):
m3u8_formats = self._extract_m3u8_formats(
- s_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native',
- m3u8_id='hls', fatal=False)
+ s_url, uuid, 'mp4', live=is_live, m3u8_id='hls', fatal=False)
if all([f.get('acodec') == 'none' for f in m3u8_formats]):
continue
for f in m3u8_formats:
@@ -158,7 +157,7 @@ class ArcPublishingIE(InfoExtractor):
return {
'id': uuid,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'thumbnail': try_get(video, lambda x: x['promo_image']['url']),
'description': try_get(video, lambda x: x['subheadlines']['basic']),
'formats': formats,
diff --git a/hypervideo_dl/extractor/ard.py b/hypervideo_dl/extractor/ard.py
index 048d30f..7ea339b 100644
--- a/hypervideo_dl/extractor/ard.py
+++ b/hypervideo_dl/extractor/ard.py
@@ -280,7 +280,7 @@ class ARDMediathekIE(ARDMediathekBaseIE):
info.update({
'id': video_id,
- 'title': self._live_title(title) if info.get('is_live') else title,
+ 'title': title,
'description': description,
'thumbnail': thumbnail,
})
@@ -376,9 +376,24 @@ class ARDIE(InfoExtractor):
formats.append(f)
self._sort_formats(formats)
+ _SUB_FORMATS = (
+ ('./dataTimedText', 'ttml'),
+ ('./dataTimedTextNoOffset', 'ttml'),
+ ('./dataTimedTextVtt', 'vtt'),
+ )
+
+ subtitles = {}
+ for subsel, subext in _SUB_FORMATS:
+ for node in video_node.findall(subsel):
+ subtitles.setdefault('de', []).append({
+ 'url': node.attrib['url'],
+ 'ext': subext,
+ })
+
return {
'id': xpath_text(video_node, './videoId', default=display_id),
'formats': formats,
+ 'subtitles': subtitles,
'display_id': display_id,
'title': video_node.find('./title').text,
'duration': parse_duration(video_node.find('./duration').text),
@@ -388,7 +403,14 @@ class ARDIE(InfoExtractor):
class ARDBetaMediathekIE(ARDMediathekBaseIE):
- _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?P<mode>player|live|video|sendung|sammlung)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)'
+ _VALID_URL = r'''(?x)https://
+ (?:(?:beta|www)\.)?ardmediathek\.de/
+ (?:(?P<client>[^/]+)/)?
+ (?:player|live|video|(?P<playlist>sendung|sammlung))/
+ (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
+ (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
+ (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
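+    # The (?(playlist)...) conditionals: sendung/sammlung URLs take an optional season suffix, while plain video ids start with Y3JpZDovL (base64 of 'crid://')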
+
_TESTS = [{
'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
@@ -403,6 +425,25 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
'upload_date': '20200805',
'ext': 'mp4',
},
+ 'skip': 'Error',
+ }, {
+ 'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
+ 'md5': 'f1837e563323b8a642a8ddeff0131f51',
+ 'info_dict': {
+ 'id': '10049223',
+ 'ext': 'mp4',
+ 'title': 'tagesschau, 20:00 Uhr',
+ 'timestamp': 1636398000,
+ 'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b',
+ 'upload_date': '20211108',
+ },
+ }, {
+ 'url': 'https://www.ardmediathek.de/sendung/beforeigners/beforeigners/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw/1',
+ 'playlist_count': 6,
+ 'info_dict': {
+ 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw',
+ 'title': 'beforeigners/beforeigners/staffel-1',
+ },
}, {
'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
'only_matching': True,
@@ -426,6 +467,12 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
# playlist of type 'sammlung'
'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
'only_matching': True,
+ }, {
+ 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet',
+ 'only_matching': True,
}]
def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber):
@@ -522,23 +569,16 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
break
pageNumber = pageNumber + 1
- return self.playlist_result(entries, playlist_title=display_id)
+ return self.playlist_result(entries, playlist_id, playlist_title=display_id)
def _real_extract(self, url):
- mobj = self._match_valid_url(url)
- video_id = mobj.group('video_id')
- display_id = mobj.group('display_id')
- if display_id:
- display_id = display_id.rstrip('/')
- if not display_id:
- display_id = video_id
-
- if mobj.group('mode') in ('sendung', 'sammlung'):
- # this is a playlist-URL
- return self._ARD_extract_playlist(
- url, video_id, display_id,
- mobj.group('client'),
- mobj.group('mode'))
+ video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group(
+ 'id', 'display_id', 'playlist', 'client', 'season')
+ display_id, client = display_id or video_id, client or 'ard'
+
+ if playlist_type:
+ # TODO: Extract only specified season
+ return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type)
player_page = self._download_json(
'https://api.ardmediathek.de/public-gateway',
@@ -574,7 +614,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
}
}
}
-}''' % (mobj.group('client'), video_id),
+}''' % (client, video_id),
}).encode(), headers={
'Content-Type': 'application/json'
})['data']['playerPage']
diff --git a/hypervideo_dl/extractor/arnes.py b/hypervideo_dl/extractor/arnes.py
index c0032fc..050c252 100644
--- a/hypervideo_dl/extractor/arnes.py
+++ b/hypervideo_dl/extractor/arnes.py
@@ -7,6 +7,7 @@ from ..compat import (
compat_urllib_parse_urlparse,
)
from ..utils import (
+ format_field,
float_or_none,
int_or_none,
parse_iso8601,
@@ -92,7 +93,7 @@ class ArnesIE(InfoExtractor):
'timestamp': parse_iso8601(video.get('creationTime')),
'channel': channel.get('name'),
'channel_id': channel_id,
- 'channel_url': self._BASE_URL + '/?channel=' + channel_id if channel_id else None,
+ 'channel_url': format_field(channel_id, template=f'{self._BASE_URL}/?channel=%s'),
'duration': float_or_none(video.get('duration'), 1000),
'view_count': int_or_none(video.get('views')),
'tags': video.get('hashtags'),
diff --git a/hypervideo_dl/extractor/arte.py b/hypervideo_dl/extractor/arte.py
index 296b169..c2f2c1b 100644
--- a/hypervideo_dl/extractor/arte.py
+++ b/hypervideo_dl/extractor/arte.py
@@ -12,6 +12,7 @@ from ..utils import (
int_or_none,
parse_qs,
qualities,
+ strip_or_none,
try_get,
unified_strdate,
url_or_none,
@@ -137,6 +138,7 @@ class ArteTVIE(ArteTVBaseIE):
break
else:
lang_pref = -1
+ format_note = '%s, %s' % (f.get('versionCode'), f.get('versionLibelle'))
media_type = f.get('mediaType')
if media_type == 'hls':
@@ -144,14 +146,17 @@ class ArteTVIE(ArteTVBaseIE):
format_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id=format_id, fatal=False)
for m3u8_format in m3u8_formats:
- m3u8_format['language_preference'] = lang_pref
+ m3u8_format.update({
+ 'language_preference': lang_pref,
+ 'format_note': format_note,
+ })
formats.extend(m3u8_formats)
continue
format = {
'format_id': format_id,
'language_preference': lang_pref,
- 'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')),
+ 'format_note': format_note,
'width': int_or_none(f.get('width')),
'height': int_or_none(f.get('height')),
'tbr': int_or_none(f.get('bitrate')),
@@ -253,3 +258,44 @@ class ArteTVPlaylistIE(ArteTVBaseIE):
title = collection.get('title')
description = collection.get('shortDescription') or collection.get('teaserText')
return self.playlist_result(entries, playlist_id, title, description)
+
+
+class ArteTVCategoryIE(ArteTVBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
+ _TESTS = [{
+ 'url': 'https://www.arte.tv/en/videos/politics-and-society/',
+ 'info_dict': {
+ 'id': 'politics-and-society',
+ 'title': 'Politics and society',
+ 'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
+ },
+ 'playlist_mincount': 13,
+ },
+ ]
+
+ @classmethod
+ def suitable(cls, url):
+ return (
+ not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
+ and super(ArteTVCategoryIE, cls).suitable(url))
+
+ def _real_extract(self, url):
+ lang, playlist_id = self._match_valid_url(url).groups()
+ webpage = self._download_webpage(url, playlist_id)
+
+ items = []
+ for video in re.finditer(
+ r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
+ webpage):
+ video = video.group('url')
+ if video == url:
+ continue
+ if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
+ items.append(video)
+
+        title = (self._og_search_title(webpage, default=None)
+                 or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title>', webpage, 'title', default=None))
+        title = strip_or_none((title or '').rsplit('|', 1)[0]) or self._generic_title(url)
+
+ return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
+ description=self._og_search_description(webpage, default=None))
diff --git a/hypervideo_dl/extractor/asiancrush.py b/hypervideo_dl/extractor/asiancrush.py
index 75a6329..7f1940f 100644
--- a/hypervideo_dl/extractor/asiancrush.py
+++ b/hypervideo_dl/extractor/asiancrush.py
@@ -181,8 +181,7 @@ class AsianCrushPlaylistIE(AsianCrushBaseIE):
'title', default=None) or self._og_search_title(
webpage, default=None) or self._html_search_meta(
'twitter:title', webpage, 'title',
- default=None) or self._search_regex(
- r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
+ default=None) or self._html_extract_title(webpage)
if title:
title = re.sub(r'\s*\|\s*.+?$', '', title)
diff --git a/hypervideo_dl/extractor/atresplayer.py b/hypervideo_dl/extractor/atresplayer.py
index 8143eb4..465af4e 100644
--- a/hypervideo_dl/extractor/atresplayer.py
+++ b/hypervideo_dl/extractor/atresplayer.py
@@ -24,9 +24,6 @@ class AtresPlayerIE(InfoExtractor):
'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc',
'duration': 3413,
},
- 'params': {
- 'format': 'bestvideo',
- },
'skip': 'This video is only available for registered users'
},
{
@@ -40,9 +37,6 @@ class AtresPlayerIE(InfoExtractor):
]
_API_BASE = 'https://api.atresplayer.com/'
- def _real_initialize(self):
- self._login()
-
def _handle_error(self, e, code):
if isinstance(e.cause, compat_HTTPError) and e.cause.code == code:
error = self._parse_json(e.cause.read(), None)
@@ -51,11 +45,7 @@ class AtresPlayerIE(InfoExtractor):
raise ExtractorError(error['error_description'], expected=True)
raise
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
self._request_webpage(
self._API_BASE + 'login', None, 'Downloading login page')
diff --git a/hypervideo_dl/extractor/atvat.py b/hypervideo_dl/extractor/atvat.py
index 7c30cfc..481a097 100644
--- a/hypervideo_dl/extractor/atvat.py
+++ b/hypervideo_dl/extractor/atvat.py
@@ -8,6 +8,7 @@ from ..utils import (
float_or_none,
jwt_encode_hs256,
try_get,
+ ExtractorError,
)
@@ -94,6 +95,11 @@ class ATVAtIE(InfoExtractor):
})
video_id, videos_data = list(videos['data'].items())[0]
+ error_msg = try_get(videos_data, lambda x: x['error']['title'])
+ if error_msg == 'Geo check failed':
+ self.raise_geo_restricted(error_msg)
+ elif error_msg:
+ raise ExtractorError(error_msg)
entries = [
self._extract_video_info(url, contentResource[video['id']], video)
for video in videos_data]
diff --git a/hypervideo_dl/extractor/audiomack.py b/hypervideo_dl/extractor/audiomack.py
index cc77713..19775cf 100644
--- a/hypervideo_dl/extractor/audiomack.py
+++ b/hypervideo_dl/extractor/audiomack.py
@@ -14,7 +14,7 @@ from ..utils import (
class AudiomackIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P<id>[\w/-]+)'
+ _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:song/|(?=.+/song/))(?P<id>[\w/-]+)'
IE_NAME = 'audiomack'
_TESTS = [
# hosted on audiomack
@@ -29,6 +29,7 @@ class AudiomackIE(InfoExtractor):
}
},
# audiomack wrapper around soundcloud song
+ # Needs new test URL.
{
'add_ie': ['Soundcloud'],
'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle',
@@ -39,15 +40,16 @@ class AudiomackIE(InfoExtractor):
'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]',
'uploader': 'ILOVEMAKONNEN',
'upload_date': '20160414',
- }
+ },
+ 'skip': 'Song has been removed from the site',
},
]
def _real_extract(self, url):
- # URLs end with [uploader name]/[uploader title]
+ # URLs end with [uploader name]/song/[uploader title]
# this title is whatever the user types in, and is rarely
# the proper song title. Real metadata is in the api response
- album_url_tag = self._match_id(url)
+ album_url_tag = self._match_id(url).replace('/song/', '/')
# Request the extended version of the api for extra fields like artist and title
api_response = self._download_json(
@@ -73,13 +75,13 @@ class AudiomackIE(InfoExtractor):
class AudiomackAlbumIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?audiomack\.com/album/(?P<id>[\w/-]+)'
+ _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:album/|(?=.+/album/))(?P<id>[\w/-]+)'
IE_NAME = 'audiomack:album'
_TESTS = [
# Standard album playlist
{
'url': 'http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape',
- 'playlist_count': 15,
+ 'playlist_count': 11,
'info_dict':
{
'id': '812251',
@@ -95,24 +97,27 @@ class AudiomackAlbumIE(InfoExtractor):
},
'playlist': [{
'info_dict': {
- 'title': 'PPP (Pistol P Project) - 9. Heaven or Hell (CHIMACA) ft Zuse (prod by DJ FU)',
- 'id': '837577',
+ 'title': 'PPP (Pistol P Project) - 8. Real (prod by SYK SENSE )',
+ 'id': '837576',
+ 'ext': 'mp3',
+ 'uploader': 'Lil Herb a.k.a. G Herbo',
+ }
+ }, {
+ 'info_dict': {
+ 'title': 'PPP (Pistol P Project) - 10. 4 Minutes Of Hell Part 4 (prod by DY OF 808 MAFIA)',
+ 'id': '837580',
'ext': 'mp3',
'uploader': 'Lil Herb a.k.a. G Herbo',
}
}],
- 'params': {
- 'playliststart': 9,
- 'playlistend': 9,
- }
}
]
def _real_extract(self, url):
- # URLs end with [uploader name]/[uploader title]
+ # URLs end with [uploader name]/album/[uploader title]
# this title is whatever the user types in, and is rarely
# the proper song title. Real metadata is in the api response
- album_url_tag = self._match_id(url)
+ album_url_tag = self._match_id(url).replace('/album/', '/')
result = {'_type': 'playlist', 'entries': []}
# There is no one endpoint for album metadata - instead it is included/repeated in each song's metadata
# Therefore we don't know how many songs the album has and must infi-loop until failure
@@ -134,7 +139,7 @@ class AudiomackAlbumIE(InfoExtractor):
# Pull out the album metadata and add to result (if it exists)
for resultkey, apikey in [('id', 'album_id'), ('title', 'album_title')]:
if apikey in api_response and resultkey not in result:
- result[resultkey] = api_response[apikey]
+ result[resultkey] = compat_str(api_response[apikey])
song_id = url_basename(api_response['url']).rpartition('.')[0]
result['entries'].append({
'id': compat_str(api_response.get('id', song_id)),
diff --git a/hypervideo_dl/extractor/awaan.py b/hypervideo_dl/extractor/awaan.py
index 22cc10d..f5e559c 100644
--- a/hypervideo_dl/extractor/awaan.py
+++ b/hypervideo_dl/extractor/awaan.py
@@ -9,6 +9,7 @@ from ..compat import (
compat_str,
)
from ..utils import (
+ format_field,
int_or_none,
parse_iso8601,
smuggle_url,
@@ -41,9 +42,9 @@ class AWAANBaseIE(InfoExtractor):
return {
'id': video_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'description': video_data.get('description_en') or video_data.get('description_ar'),
- 'thumbnail': 'http://admin.mangomolo.com/analytics/%s' % img if img else None,
+ 'thumbnail': format_field(img, template='http://admin.mangomolo.com/analytics/%s'),
'duration': int_or_none(video_data.get('duration')),
'timestamp': parse_iso8601(video_data.get('create_time'), ' '),
'is_live': is_live,
diff --git a/hypervideo_dl/extractor/azmedien.py b/hypervideo_dl/extractor/azmedien.py
index fee640e..0168340 100644
--- a/hypervideo_dl/extractor/azmedien.py
+++ b/hypervideo_dl/extractor/azmedien.py
@@ -11,11 +11,12 @@ class AZMedienIE(InfoExtractor):
IE_DESC = 'AZ Medien videos'
_VALID_URL = r'''(?x)
https?://
- (?:www\.)?
+ (?:www\.|tv\.)?
(?P<host>
telezueri\.ch|
telebaern\.tv|
- telem1\.ch
+ telem1\.ch|
+ tvo-online\.ch
)/
[^/]+/
(?P<id>
@@ -30,7 +31,7 @@ class AZMedienIE(InfoExtractor):
'''
_TESTS = [{
- 'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569',
+ 'url': 'https://tv.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569',
'info_dict': {
'id': '1_anruz3wy',
'ext': 'mp4',
@@ -38,6 +39,9 @@ class AZMedienIE(InfoExtractor):
'uploader_id': 'TVOnline',
'upload_date': '20180930',
'timestamp': 1538328802,
+ 'view_count': int,
+ 'thumbnail': 'http://cfvod.kaltura.com/p/1719221/sp/171922100/thumbnail/entry_id/1_anruz3wy/version/100031',
+ 'duration': 1930
},
'params': {
'skip_download': True,
diff --git a/hypervideo_dl/extractor/banbye.py b/hypervideo_dl/extractor/banbye.py
new file mode 100644
index 0000000..3d4d36e
--- /dev/null
+++ b/hypervideo_dl/extractor/banbye.py
@@ -0,0 +1,153 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import math
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse_urlparse,
+ compat_parse_qs,
+)
+from ..utils import (
+ format_field,
+ InAdvancePagedList,
+ traverse_obj,
+ unified_timestamp,
+)
+
+
+class BanByeBaseIE(InfoExtractor):
+ _API_BASE = 'https://api.banbye.com'
+ _CDN_BASE = 'https://cdn.banbye.com'
+ _VIDEO_BASE = 'https://banbye.com/watch'
+
+ @staticmethod
+ def _extract_playlist_id(url, param='playlist'):
+ return compat_parse_qs(
+ compat_urllib_parse_urlparse(url).query).get(param, [None])[0]
+
+ def _extract_playlist(self, playlist_id):
+ data = self._download_json(f'{self._API_BASE}/playlists/{playlist_id}', playlist_id)
+ return self.playlist_result([
+ self.url_result(f'{self._VIDEO_BASE}/{video_id}', BanByeIE)
+ for video_id in data['videoIds']], playlist_id, data.get('name'))
+
+
+class BanByeIE(BanByeBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?banbye\.com/(?:en/)?watch/(?P<id>\w+)'
+ _TESTS = [{
+ 'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T',
+ 'md5': '2f4ea15c5ca259a73d909b2cfd558eb5',
+ 'info_dict': {
+ 'id': 'v_ytfmvkVYLE8T',
+ 'ext': 'mp4',
+ 'title': 'md5:5ec098f88a0d796f987648de6322ba0f',
+ 'description': 'md5:4d94836e73396bc18ef1fa0f43e5a63a',
+ 'uploader': 'wRealu24',
+ 'channel_id': 'ch_wrealu24',
+ 'channel_url': 'https://banbye.com/channel/ch_wrealu24',
+ 'timestamp': 1647604800,
+ 'upload_date': '20220318',
+ 'duration': 1931,
+ 'thumbnail': r're:https?://.*\.webp',
+ 'tags': 'count:5',
+ 'like_count': int,
+ 'dislike_count': int,
+ 'view_count': int,
+ 'comment_count': int,
+ },
+ }, {
+ 'url': 'https://banbye.com/watch/v_2JjQtqjKUE_F?playlistId=p_Ld82N6gBw_OJ',
+ 'info_dict': {
+ 'title': 'Krzysztof Karoń',
+ 'id': 'p_Ld82N6gBw_OJ',
+ },
+ 'playlist_count': 9,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ playlist_id = self._extract_playlist_id(url, 'playlistId')
+
+ if self._yes_playlist(playlist_id, video_id):
+ return self._extract_playlist(playlist_id)
+
+ data = self._download_json(f'{self._API_BASE}/videos/{video_id}', video_id)
+ thumbnails = [{
+ 'id': f'{quality}p',
+ 'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.webp',
+ } for quality in [48, 96, 144, 240, 512, 1080]]
+ formats = [{
+ 'format_id': f'http-{quality}p',
+ 'quality': quality,
+ 'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.mp4',
+ } for quality in data['quality']]
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': data.get('title'),
+ 'description': data.get('desc'),
+ 'uploader': traverse_obj(data, ('channel', 'name')),
+ 'channel_id': data.get('channelId'),
+ 'channel_url': format_field(data, 'channelId', 'https://banbye.com/channel/%s'),
+ 'timestamp': unified_timestamp(data.get('publishedAt')),
+ 'duration': data.get('duration'),
+ 'tags': data.get('tags'),
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ 'like_count': data.get('likes'),
+ 'dislike_count': data.get('dislikes'),
+ 'view_count': data.get('views'),
+ 'comment_count': data.get('commentCount'),
+ }
+
+
+class BanByeChannelIE(BanByeBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?banbye\.com/(?:en/)?channel/(?P<id>\w+)'
+ _TESTS = [{
+ 'url': 'https://banbye.com/channel/ch_wrealu24',
+ 'info_dict': {
+ 'title': 'wRealu24',
+ 'id': 'ch_wrealu24',
+ 'description': 'md5:da54e48416b74dfdde20a04867c0c2f6',
+ },
+ 'playlist_mincount': 791,
+ }, {
+ 'url': 'https://banbye.com/channel/ch_wrealu24?playlist=p_Ld82N6gBw_OJ',
+ 'info_dict': {
+ 'title': 'Krzysztof Karoń',
+ 'id': 'p_Ld82N6gBw_OJ',
+ },
+ 'playlist_count': 9,
+ }]
+ _PAGE_SIZE = 100
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+ playlist_id = self._extract_playlist_id(url)
+
+ if playlist_id:
+ return self._extract_playlist(playlist_id)
+
+ def page_func(page_num):
+ data = self._download_json(f'{self._API_BASE}/videos', channel_id, query={
+ 'channelId': channel_id,
+ 'sort': 'new',
+ 'limit': self._PAGE_SIZE,
+ 'offset': page_num * self._PAGE_SIZE,
+ }, note=f'Downloading page {page_num+1}')
+ return [
+ self.url_result(f"{self._VIDEO_BASE}/{video['_id']}", BanByeIE)
+ for video in data['items']
+ ]
+
+ channel_data = self._download_json(f'{self._API_BASE}/channels/{channel_id}', channel_id)
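+        # videoCount gives the total up front, so InAdvancePagedList can compute the page count and fetch pages lazily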
+ entries = InAdvancePagedList(
+ page_func,
+ math.ceil(channel_data['videoCount'] / self._PAGE_SIZE),
+ self._PAGE_SIZE)
+
+ return self.playlist_result(
+ entries, channel_id, channel_data.get('name'), channel_data.get('description'))
diff --git a/hypervideo_dl/extractor/bandaichannel.py b/hypervideo_dl/extractor/bandaichannel.py
index d672859..f1bcdef 100644
--- a/hypervideo_dl/extractor/bandaichannel.py
+++ b/hypervideo_dl/extractor/bandaichannel.py
@@ -21,7 +21,6 @@ class BandaiChannelIE(BrightcoveNewIE):
'duration': 1387.733,
},
'params': {
- 'format': 'bestvideo',
'skip_download': True,
},
}]
diff --git a/hypervideo_dl/extractor/bandcamp.py b/hypervideo_dl/extractor/bandcamp.py
index b664145..745055e 100644
--- a/hypervideo_dl/extractor/bandcamp.py
+++ b/hypervideo_dl/extractor/bandcamp.py
@@ -183,6 +183,7 @@ class BandcampIE(InfoExtractor):
'format_note': f.get('description'),
'filesize': parse_filesize(f.get('size_mb')),
'vcodec': 'none',
+ 'acodec': format_id.split('-')[0],
})
self._sort_formats(formats)
@@ -212,7 +213,7 @@ class BandcampIE(InfoExtractor):
class BandcampAlbumIE(BandcampIE):
IE_NAME = 'Bandcamp:album'
- _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?!/music)(?:/album/(?P<id>[^/?#&]+))?'
+ _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com/album/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@@ -258,14 +259,6 @@ class BandcampAlbumIE(BandcampIE):
},
'playlist_mincount': 9,
}, {
- 'url': 'http://dotscale.bandcamp.com',
- 'info_dict': {
- 'title': 'Loom',
- 'id': 'dotscale',
- 'uploader_id': 'dotscale',
- },
- 'playlist_mincount': 7,
- }, {
# with escaped quote in title
'url': 'https://jstrecords.bandcamp.com/album/entropy-ep',
'info_dict': {
@@ -391,41 +384,63 @@ class BandcampWeeklyIE(BandcampIE):
}
-class BandcampMusicIE(InfoExtractor):
- _VALID_URL = r'https?://(?P<id>[^/]+)\.bandcamp\.com/music'
+class BandcampUserIE(InfoExtractor):
+ IE_NAME = 'Bandcamp:user'
+ _VALID_URL = r'https?://(?!www\.)(?P<id>[^.]+)\.bandcamp\.com(?:/music)?/?(?:[#?]|$)'
+
_TESTS = [{
+ # Type 1 Bandcamp user page.
+ 'url': 'https://adrianvonziegler.bandcamp.com',
+ 'info_dict': {
+ 'id': 'adrianvonziegler',
+ 'title': 'Discography of adrianvonziegler',
+ },
+ 'playlist_mincount': 23,
+ }, {
+ # Bandcamp user page with only one album
+ 'url': 'http://dotscale.bandcamp.com',
+ 'info_dict': {
+ 'id': 'dotscale',
+ 'title': 'Discography of dotscale'
+ },
+ 'playlist_count': 1,
+ }, {
+ # Type 2 Bandcamp user page.
+ 'url': 'https://nightcallofficial.bandcamp.com',
+ 'info_dict': {
+ 'id': 'nightcallofficial',
+ 'title': 'Discography of nightcallofficial',
+ },
+ 'playlist_count': 4,
+ }, {
'url': 'https://steviasphere.bandcamp.com/music',
'playlist_mincount': 47,
'info_dict': {
'id': 'steviasphere',
+ 'title': 'Discography of steviasphere',
},
}, {
'url': 'https://coldworldofficial.bandcamp.com/music',
'playlist_mincount': 10,
'info_dict': {
'id': 'coldworldofficial',
+ 'title': 'Discography of coldworldofficial',
},
}, {
'url': 'https://nuclearwarnowproductions.bandcamp.com/music',
'playlist_mincount': 399,
'info_dict': {
'id': 'nuclearwarnowproductions',
+ 'title': 'Discography of nuclearwarnowproductions',
},
- }
- ]
-
- _TYPE_IE_DICT = {
- 'album': BandcampAlbumIE.ie_key(),
- 'track': BandcampIE.ie_key()
- }
+ }]
def _real_extract(self, url):
- id = self._match_id(url)
- webpage = self._download_webpage(url, id)
- items = re.findall(r'href\=\"\/(?P<path>(?P<type>album|track)+/[^\"]+)', webpage)
- entries = [
- self.url_result(
- f'https://{id}.bandcamp.com/{item[0]}',
- ie=self._TYPE_IE_DICT[item[1]])
- for item in items]
- return self.playlist_result(entries, id)
+ uploader = self._match_id(url)
+ webpage = self._download_webpage(url, uploader)
+
+ discography_data = (re.findall(r'<li data-item-id=["\'][^>]+>\s*<a href=["\']([^"\']+)', webpage)
+ or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage))
+
+ return self.playlist_from_matches(
+ discography_data, uploader, f'Discography of {uploader}', getter=lambda x: urljoin(url, x))
diff --git a/hypervideo_dl/extractor/bbc.py b/hypervideo_dl/extractor/bbc.py
index 4e2dcd7..29ad7de 100644
--- a/hypervideo_dl/extractor/bbc.py
+++ b/hypervideo_dl/extractor/bbc.py
@@ -11,6 +11,7 @@ from ..compat import (
compat_etree_Element,
compat_HTTPError,
compat_str,
+ compat_urllib_error,
compat_urlparse,
)
from ..utils import (
@@ -38,7 +39,7 @@ from ..utils import (
class BBCCoUkIE(InfoExtractor):
IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer'
- _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})'
+ _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
_VALID_URL = r'''(?x)
https?://
(?:www\.)?bbc\.co\.uk/
@@ -263,11 +264,7 @@ class BBCCoUkIE(InfoExtractor):
'only_matching': True,
}]
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading signin page')
@@ -293,9 +290,6 @@ class BBCCoUkIE(InfoExtractor):
'Unable to login: %s' % error, expected=True)
raise ExtractorError('Unable to log in')
- def _real_initialize(self):
- self._login()
-
class MediaSelectionError(Exception):
def __init__(self, id):
self.id = id
@@ -394,9 +388,17 @@ class BBCCoUkIE(InfoExtractor):
formats.extend(self._extract_mpd_formats(
href, programme_id, mpd_id=format_id, fatal=False))
elif transfer_format == 'hls':
- formats.extend(self._extract_m3u8_formats(
- href, programme_id, ext='mp4', entry_protocol='m3u8_native',
- m3u8_id=format_id, fatal=False))
+ # TODO: let expected_status be passed into _extract_xxx_formats() instead
+ try:
+ fmts = self._extract_m3u8_formats(
+ href, programme_id, ext='mp4', entry_protocol='m3u8_native',
+ m3u8_id=format_id, fatal=False)
+ except ExtractorError as e:
+ if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError)
+ and e.exc_info[1].code in (403, 404)):
+ raise
+ fmts = []
+ formats.extend(fmts)
elif transfer_format == 'hds':
formats.extend(self._extract_f4m_formats(
href, programme_id, f4m_id=format_id, fatal=False))
@@ -451,9 +453,10 @@ class BBCCoUkIE(InfoExtractor):
playlist = self._download_json(
'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
playlist_id, 'Downloading playlist JSON')
+ formats = []
+ subtitles = {}
- version = playlist.get('defaultAvailableVersion')
- if version:
+ for version in playlist.get('allAvailableVersions', []):
smp_config = version['smpConfig']
title = smp_config['title']
description = smp_config['summary']
@@ -463,8 +466,17 @@ class BBCCoUkIE(InfoExtractor):
continue
programme_id = item.get('vpid')
duration = int_or_none(item.get('duration'))
- formats, subtitles = self._download_media_selector(programme_id)
- return programme_id, title, description, duration, formats, subtitles
+ version_formats, version_subtitles = self._download_media_selector(programme_id)
+ types = version['types']
+ for f in version_formats:
+ f['format_note'] = ', '.join(types)
+ if any('AudioDescribed' in x for x in types):
+ f['language_preference'] = -10
+ formats += version_formats
+ for tag, subformats in (version_subtitles or {}).items():
+ subtitles.setdefault(tag, []).extend(subformats)
+
+ return programme_id, title, description, duration, formats, subtitles
except ExtractorError as ee:
if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
raise
@@ -775,20 +787,32 @@ class BBCIE(BBCCoUkIE):
'upload_date': '20150725',
},
}, {
+ # video with window.__INITIAL_DATA__ whose value is a JSON string
+ 'url': 'https://www.bbc.com/news/av/world-europe-59468682',
+ 'info_dict': {
+ 'id': 'p0b71qth',
+ 'ext': 'mp4',
+ 'title': 'Why France is making this woman a national hero',
+ 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
+ 'thumbnail': r're:https?://.+/.+\.jpg',
+ 'timestamp': 1638230731,
+ 'upload_date': '20211130',
+ },
+ }, {
# single video article embedded with data-media-vpid
'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
'only_matching': True,
}, {
+ # bbcthreeConfig
'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
'info_dict': {
'id': 'p06556y7',
'ext': 'mp4',
- 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
- 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd',
+ 'title': 'Things Not To Say to people that live on council estates',
+ 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
+ 'duration': 360,
+ 'thumbnail': r're:https?://.+/.+\.jpg',
},
- 'params': {
- 'skip_download': True,
- }
}, {
# window.__PRELOADED_STATE__
'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
@@ -882,9 +906,8 @@ class BBCIE(BBCCoUkIE):
playlist_title = json_ld_info.get('title')
if not playlist_title:
- playlist_title = self._og_search_title(
- webpage, default=None) or self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
+ playlist_title = (self._og_search_title(webpage, default=None)
+ or self._html_extract_title(webpage, 'playlist title', default=None))
if playlist_title:
playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
@@ -1161,9 +1184,16 @@ class BBCIE(BBCCoUkIE):
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
- initial_data = self._parse_json(self._search_regex(
- r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
- 'preload state', default='{}'), playlist_id, fatal=False)
+ initial_data = self._search_regex(
+ r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
+ 'quoted preload state', default=None)
+ if initial_data is None:
+ initial_data = self._search_regex(
+ r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
+ 'preload state', default={})
+ else:
+ initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
+ initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
if initial_data:
def parse_media(media):
if not media:
@@ -1204,7 +1234,10 @@ class BBCIE(BBCCoUkIE):
if name == 'media-experience':
parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
elif name == 'article':
- for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []):
+ for block in (try_get(resp,
+ (lambda x: x['data']['blocks'],
+ lambda x: x['data']['content']['model']['blocks'],),
+ list) or []):
if block.get('type') != 'media':
continue
parse_media(block.get('model'))
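
BBC pages can embed window.__INITIAL_DATA__ either as a bare object or as a JSON-encoded string that itself contains JSON, which is why the hunk above parses the quoted form twice. A self-contained sketch of the same two-branch logic (inputs invented):

import json
import re

def read_initial_data(webpage):
    # Quoted form: the payload is a JSON string containing JSON, so decode twice.
    m = re.search(r'window\.__INITIAL_DATA__\s*=\s*("\{.+?\}")\s*;', webpage)
    if m:
        return json.loads(json.loads(m.group(1)))
    m = re.search(r'window\.__INITIAL_DATA__\s*=\s*(\{.+?\})\s*;', webpage)
    return json.loads(m.group(1)) if m else {}

print(read_initial_data('window.__INITIAL_DATA__ = "{\\"page\\": 1}";'))  # {'page': 1}
print(read_initial_data('window.__INITIAL_DATA__ = {"page": 2};'))        # {'page': 2}
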
diff --git a/hypervideo_dl/extractor/beeg.py b/hypervideo_dl/extractor/beeg.py
index 8fbabe7..717fff3 100644
--- a/hypervideo_dl/extractor/beeg.py
+++ b/hypervideo_dl/extractor/beeg.py
@@ -1,32 +1,45 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import (
- compat_str,
-)
+
from ..utils import (
int_or_none,
- parse_qs,
+ traverse_obj,
+ try_get,
unified_timestamp,
)
class BeegIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?beeg\.(?:com|porn(?:/video)?)/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?beeg\.(?:com(?:/video)?)/-?(?P<id>\d+)'
_TESTS = [{
- # api/v6 v1
- 'url': 'http://beeg.com/5416503',
- 'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820',
+ 'url': 'https://beeg.com/-0983946056129650',
+ 'md5': '51d235147c4627cfce884f844293ff88',
'info_dict': {
- 'id': '5416503',
+ 'id': '0983946056129650',
'ext': 'mp4',
- 'title': 'Sultry Striptease',
- 'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2',
- 'timestamp': 1391813355,
- 'upload_date': '20140207',
- 'duration': 383,
+ 'title': 'sucked cock and fucked in a private plane',
+ 'duration': 927,
'tags': list,
'age_limit': 18,
+ 'upload_date': '20220131',
+ 'timestamp': 1643656455,
+ 'display_id': 2540839,
+ }
+ }, {
+ 'url': 'https://beeg.com/-0599050563103750?t=4-861',
+ 'md5': 'bd8b5ea75134f7f07fad63008db2060e',
+ 'info_dict': {
+ 'id': '0599050563103750',
+ 'ext': 'mp4',
+ 'title': 'Bad Relatives',
+ 'duration': 2060,
+ 'tags': list,
+ 'age_limit': 18,
+ 'description': 'md5:b4fc879a58ae6c604f8f259155b7e3b9',
+ 'timestamp': 1643623200,
+ 'display_id': 2569965,
+ 'upload_date': '20220131',
}
}, {
# api/v6 v2
@@ -36,12 +49,6 @@ class BeegIE(InfoExtractor):
# api/v6 v2 w/o t
'url': 'https://beeg.com/1277207756',
'only_matching': True,
- }, {
- 'url': 'https://beeg.porn/video/5416503',
- 'only_matching': True,
- }, {
- 'url': 'https://beeg.porn/5416503',
- 'only_matching': True,
}]
def _real_extract(self, url):
@@ -49,68 +56,38 @@ class BeegIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- beeg_version = self._search_regex(
- r'beeg_version\s*=\s*([\da-zA-Z_-]+)', webpage, 'beeg version',
- default='1546225636701')
+ video = self._download_json(
+ 'https://store.externulls.com/facts/file/%s' % video_id,
+ video_id, 'Downloading JSON for %s' % video_id)
- if len(video_id) >= 10:
- query = {
- 'v': 2,
- }
- qs = parse_qs(url)
- t = qs.get('t', [''])[0].split('-')
- if len(t) > 1:
- query.update({
- 's': t[0],
- 'e': t[1],
- })
- else:
- query = {'v': 1}
+ fc_facts = video.get('fc_facts')
+ first_fact = {}
+ for fact in fc_facts:
+ if not first_fact or try_get(fact, lambda x: x['id'] < first_fact['id']):
+ first_fact = fact
- for api_path in ('', 'api.'):
- video = self._download_json(
- 'https://%sbeeg.com/api/v6/%s/video/%s'
- % (api_path, beeg_version, video_id), video_id,
- fatal=api_path == 'api.', query=query)
- if video:
- break
+ resources = traverse_obj(video, ('file', 'hls_resources')) or first_fact.get('hls_resources')
formats = []
- for format_id, video_url in video.items():
- if not video_url:
- continue
- height = self._search_regex(
- r'^(\d+)[pP]$', format_id, 'height', default=None)
- if not height:
+ for format_id, video_uri in resources.items():
+ if not video_uri:
continue
- formats.append({
- 'url': self._proto_relative_url(
- video_url.replace('{DATA_MARKERS}', 'data=pc_XX__%s_0' % beeg_version), 'https:'),
- 'format_id': format_id,
- 'height': int(height),
- })
- self._sort_formats(formats)
-
- title = video['title']
- video_id = compat_str(video.get('id') or video_id)
- display_id = video.get('code')
- description = video.get('desc')
- series = video.get('ps_name')
+ height = int_or_none(self._search_regex(r'fl_cdn_(\d+)', format_id, 'height', default=None))
+ current_formats = self._extract_m3u8_formats(f'https://video.beeg.com/{video_uri}', video_id, ext='mp4', m3u8_id=str(height))
+ for f in current_formats:
+ f['height'] = height
+ formats.extend(current_formats)
- timestamp = unified_timestamp(video.get('date'))
- duration = int_or_none(video.get('duration'))
-
- tags = [tag.strip() for tag in video['tags'].split(',')] if video.get('tags') else None
+ self._sort_formats(formats)
return {
'id': video_id,
- 'display_id': display_id,
- 'title': title,
- 'description': description,
- 'series': series,
- 'timestamp': timestamp,
- 'duration': duration,
- 'tags': tags,
+ 'display_id': first_fact.get('id'),
+ 'title': traverse_obj(video, ('file', 'stuff', 'sf_name')),
+ 'description': traverse_obj(video, ('file', 'stuff', 'sf_story')),
+ 'timestamp': unified_timestamp(first_fact.get('fc_created')),
+ 'duration': int_or_none(traverse_obj(video, ('file', 'fl_duration'))),
+ 'tags': traverse_obj(video, ('tags', ..., 'tg_name')),
'formats': formats,
'age_limit': self._rta_search(webpage),
}
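
The store.externulls.com payload can carry several "facts"; the loop above keeps the one with the lowest id, then maps the fl_cdn_* resource keys to heights. The same selection expressed with min(), over invented sample data:

import re

fc_facts = [{'id': 7, 'hls_resources': {'fl_cdn_720': '/hls/720.m3u8'}},
            {'id': 3, 'hls_resources': {'fl_cdn_1080': '/hls/1080.m3u8'}}]

first_fact = min(fc_facts, key=lambda f: f.get('id') or 0)  # lowest id wins

for format_id, uri in first_fact['hls_resources'].items():
    m = re.search(r'fl_cdn_(\d+)', format_id)
    print(int(m.group(1)) if m else None, 'https://video.beeg.com' + uri)
# 1080 https://video.beeg.com/hls/1080.m3u8
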
diff --git a/hypervideo_dl/extractor/bigo.py b/hypervideo_dl/extractor/bigo.py
new file mode 100644
index 0000000..ddf76ac
--- /dev/null
+++ b/hypervideo_dl/extractor/bigo.py
@@ -0,0 +1,59 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError, urlencode_postdata
+
+
+class BigoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?bigo\.tv/(?:[a-z]{2,}/)?(?P<id>[^/]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.bigo.tv/ja/221338632',
+ 'info_dict': {
+ 'id': '6576287577575737440',
+ 'title': '土よ〜💁‍♂️ 休憩室/REST room',
+ 'thumbnail': r're:https?://.+',
+ 'uploader': '✨Shin💫',
+ 'uploader_id': '221338632',
+ 'is_live': True,
+ },
+ 'skip': 'livestream',
+ }, {
+ 'url': 'https://www.bigo.tv/th/Tarlerm1304',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://bigo.tv/115976881',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ user_id = self._match_id(url)
+
+ info_raw = self._download_json(
+ 'https://bigo.tv/studio/getInternalStudioInfo',
+ user_id, data=urlencode_postdata({'siteId': user_id}))
+
+ if not isinstance(info_raw, dict):
+ raise ExtractorError('Received invalid JSON data')
+ if info_raw.get('code'):
+ raise ExtractorError(
+ 'Bigo says: %s (code %s)' % (info_raw.get('msg'), info_raw.get('code')), expected=True)
+ info = info_raw.get('data') or {}
+
+ if not info.get('alive'):
+ raise ExtractorError('This user is offline.', expected=True)
+
+ return {
+ 'id': info.get('roomId') or user_id,
+ 'title': info.get('roomTopic') or info.get('nick_name') or user_id,
+ 'formats': [{
+ 'url': info.get('hls_src'),
+ 'ext': 'mp4',
+ 'protocol': 'm3u8',
+ }],
+ 'thumbnail': info.get('snapshot'),
+ 'uploader': info.get('nick_name'),
+ 'uploader_id': user_id,
+ 'is_live': True,
+ }
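
The new extractor trusts nothing in the response: it checks the envelope type, the non-zero code field, and the alive flag before building a single HLS format. A sketch of that validation shape, against an invented payload (roomId taken from the test above):

info_raw = {'code': 0, 'msg': 'success',
            'data': {'alive': 1, 'roomId': '6576287577575737440',
                     'hls_src': 'https://example.test/live.m3u8'}}

if not isinstance(info_raw, dict):
    raise ValueError('Received invalid JSON data')
if info_raw.get('code'):  # any non-zero code is an API-level error
    raise ValueError('Bigo says: %s (code %s)' % (info_raw.get('msg'), info_raw.get('code')))
info = info_raw.get('data') or {}
if not info.get('alive'):
    raise ValueError('This user is offline.')
print(info['roomId'], info['hls_src'])
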
diff --git a/hypervideo_dl/extractor/bilibili.py b/hypervideo_dl/extractor/bilibili.py
index 8d66b43..909f7f8 100644
--- a/hypervideo_dl/extractor/bilibili.py
+++ b/hypervideo_dl/extractor/bilibili.py
@@ -1,5 +1,6 @@
# coding: utf-8
+import base64
import hashlib
import itertools
import functools
@@ -14,19 +15,21 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ filter_dict,
int_or_none,
float_or_none,
+ mimetype2ext,
parse_iso8601,
traverse_obj,
- try_get,
+ parse_count,
smuggle_url,
srt_subtitles_timecode,
str_or_none,
- str_to_int,
strip_jsonp,
unified_timestamp,
unsmuggle_url,
urlencode_postdata,
+ url_or_none,
OnDemandPagedList
)
@@ -50,16 +53,14 @@ class BiliBiliIE(InfoExtractor):
'url': 'http://www.bilibili.com/video/av1074402/',
'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
'info_dict': {
- 'id': '1074402',
- 'ext': 'flv',
+ 'id': '1074402_part1',
+ 'ext': 'mp4',
'title': '【金坷垃】金泡沫',
+ 'uploader_id': '156160',
+ 'uploader': '菊子桑',
+ 'upload_date': '20140420',
'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
- 'duration': 308.067,
'timestamp': 1398012678,
- 'upload_date': '20140420',
- 'thumbnail': r're:^https?://.+\.jpg',
- 'uploader': '菊子桑',
- 'uploader_id': '156160',
},
}, {
# Tested in BiliBiliBangumiIE
@@ -73,49 +74,27 @@ class BiliBiliIE(InfoExtractor):
'url': 'http://bangumi.bilibili.com/anime/5802/play#100643',
'md5': '3f721ad1e75030cc06faf73587cfec57',
'info_dict': {
- 'id': '100643',
+ 'id': '100643_part1',
'ext': 'mp4',
'title': 'CHAOS;CHILD',
'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...',
},
'skip': 'Geo-restricted to China',
}, {
- # Title with double quotes
'url': 'http://www.bilibili.com/video/av8903802/',
'info_dict': {
- 'id': '8903802',
+ 'id': '8903802_part1',
+ 'ext': 'mp4',
'title': '阿滴英文|英文歌分享#6 "Closer',
+ 'upload_date': '20170301',
'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
+ 'timestamp': 1488382634,
+ 'uploader_id': '65880958',
+ 'uploader': '阿滴英文',
+ },
+ 'params': {
+ 'skip_download': True,
},
- 'playlist': [{
- 'info_dict': {
- 'id': '8903802_part1',
- 'ext': 'flv',
- 'title': '阿滴英文|英文歌分享#6 "Closer',
- 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
- 'uploader': '阿滴英文',
- 'uploader_id': '65880958',
- 'timestamp': 1488382634,
- 'upload_date': '20170301',
- },
- 'params': {
- 'skip_download': True,
- },
- }, {
- 'info_dict': {
- 'id': '8903802_part2',
- 'ext': 'flv',
- 'title': '阿滴英文|英文歌分享#6 "Closer',
- 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
- 'uploader': '阿滴英文',
- 'uploader_id': '65880958',
- 'timestamp': 1488382634,
- 'upload_date': '20170301',
- },
- 'params': {
- 'skip_download': True,
- },
- }]
}, {
# new BV video id format
'url': 'https://www.bilibili.com/video/BV1JE411F741',
@@ -150,6 +129,7 @@ class BiliBiliIE(InfoExtractor):
av_id, bv_id = self._get_video_id_set(video_id, mobj.group('id_bv') is not None)
video_id = av_id
+ info = {}
anime_id = mobj.group('anime_id')
page_id = mobj.group('page')
webpage = self._download_webpage(url, video_id)
@@ -201,66 +181,95 @@ class BiliBiliIE(InfoExtractor):
}
headers.update(self.geo_verification_headers())
+ video_info = self._parse_json(
+ self._search_regex(r'window.__playinfo__\s*=\s*({.+?})</script>', webpage, 'video info', default=None) or '{}',
+ video_id, fatal=False)
+ video_info = video_info.get('data') or {}
+
+ durl = traverse_obj(video_info, ('dash', 'video'))
+ audios = traverse_obj(video_info, ('dash', 'audio')) or []
entries = []
RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4')
for num, rendition in enumerate(RENDITIONS, start=1):
payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition)
sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()
-
- video_info = self._download_json(
- 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign),
- video_id, note='Downloading video info page',
- headers=headers, fatal=num == len(RENDITIONS))
-
if not video_info:
- continue
+ video_info = self._download_json(
+ 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign),
+ video_id, note='Downloading video info page',
+ headers=headers, fatal=num == len(RENDITIONS))
+ if not video_info:
+ continue
- if 'durl' not in video_info:
+ if not durl and 'durl' not in video_info:
if num < len(RENDITIONS):
continue
self._report_error(video_info)
- for idx, durl in enumerate(video_info['durl']):
- formats = [{
- 'url': durl['url'],
- 'filesize': int_or_none(durl['size']),
- }]
- for backup_url in durl.get('backup_url', []):
+ formats = []
+ for idx, durl in enumerate(durl or video_info['durl']):
+ formats.append({
+ 'url': durl.get('baseUrl') or durl.get('base_url') or durl.get('url'),
+ 'ext': mimetype2ext(durl.get('mimeType') or durl.get('mime_type')),
+ 'fps': int_or_none(durl.get('frameRate') or durl.get('frame_rate')),
+ 'width': int_or_none(durl.get('width')),
+ 'height': int_or_none(durl.get('height')),
+ 'vcodec': durl.get('codecs'),
+ 'acodec': 'none' if audios else None,
+ 'tbr': float_or_none(durl.get('bandwidth'), scale=1000),
+ 'filesize': int_or_none(durl.get('size')),
+ })
+ for backup_url in traverse_obj(durl, 'backup_url', expected_type=list) or []:
formats.append({
'url': backup_url,
- # backup URLs have lower priorities
'quality': -2 if 'hd.mp4' in backup_url else -3,
})
- for a_format in formats:
- a_format.setdefault('http_headers', {}).update({
- 'Referer': url,
+ for audio in audios:
+ formats.append({
+ 'url': audio.get('baseUrl') or audio.get('base_url') or audio.get('url'),
+ 'ext': mimetype2ext(audio.get('mimeType') or audio.get('mime_type')),
+ 'fps': int_or_none(audio.get('frameRate') or audio.get('frame_rate')),
+ 'width': int_or_none(audio.get('width')),
+ 'height': int_or_none(audio.get('height')),
+ 'acodec': audio.get('codecs'),
+ 'vcodec': 'none',
+ 'tbr': float_or_none(audio.get('bandwidth'), scale=1000),
+ 'filesize': int_or_none(audio.get('size'))
+ })
+ for backup_url in traverse_obj(audio, 'backup_url', expected_type=list) or []:
+ formats.append({
+ 'url': backup_url,
+ # backup URLs have lower priorities
+ 'quality': -3,
})
- self._sort_formats(formats)
-
- entries.append({
- 'id': '%s_part%s' % (video_id, idx),
- 'duration': float_or_none(durl.get('length'), 1000),
- 'formats': formats,
- })
+ info.update({
+ 'id': video_id,
+ 'duration': float_or_none(durl.get('length'), 1000),
+ 'formats': formats,
+ 'http_headers': {
+ 'Referer': url,
+ },
+ })
break
- title = self._html_search_regex(
- (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
- r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
- group='title')
+ self._sort_formats(formats)
+
+ title = self._html_search_regex((
+ r'<h1[^>]+title=(["\'])(?P<content>[^"\']+)',
+ r'(?s)<h1[^>]*>(?P<content>.+?)</h1>',
+ self._meta_regex('title')
+ ), webpage, 'title', group='content', fatal=False)
# Get part title for anthologies
if page_id is not None:
- # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video
- part_title = try_get(
- self._download_json(
- f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp',
- video_id, note='Extracting videos in anthology'),
- lambda x: x['data'][int(page_id) - 1]['part'])
- title = part_title or title
+ # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video.
+ part_info = traverse_obj(self._download_json(
+ f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp',
+ video_id, note='Extracting videos in anthology'), 'data', expected_type=list)
+ title = title if len(part_info) == 1 else traverse_obj(part_info, (int(page_id) - 1, 'part')) or title
description = self._html_search_meta('description', webpage)
timestamp = unified_timestamp(self._html_search_regex(
@@ -270,15 +279,15 @@ class BiliBiliIE(InfoExtractor):
thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage)
# TODO 'view_count' requires deobfuscating Javascript
- info = {
- 'id': str(video_id) if page_id is None else '%s_part%s' % (video_id, page_id),
+ info.update({
+ 'id': f'{video_id}_part{page_id or 1}',
'cid': cid,
'title': title,
'description': description,
'timestamp': timestamp,
'thumbnail': thumbnail,
'duration': float_or_none(video_info.get('timelength'), scale=1000),
- }
+ })
uploader_mobj = re.search(
r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>\s*(?P<name>[^<]+?)\s*<',
@@ -299,7 +308,7 @@ class BiliBiliIE(InfoExtractor):
video_id, fatal=False, note='Downloading tags'), ('data', ..., 'tag_name')),
}
- entries[0]['subtitles'] = {
+ info['subtitles'] = {
'danmaku': [{
'ext': 'xml',
'url': f'https://comment.bilibili.com/{cid}.xml',
@@ -334,19 +343,18 @@ class BiliBiliIE(InfoExtractor):
entry['id'] = '%s_part%d' % (video_id, (idx + 1))
return {
- '_type': 'multi_video',
'id': str(video_id),
'bv_id': bv_id,
'title': title,
'description': description,
- 'entries': entries,
**info, **top_level_info
}
def _extract_anthology_entries(self, bv_id, video_id, webpage):
title = self._html_search_regex(
(r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
- r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
+ r'(?s)<h1[^>]*>(?P<title>.+?)</h1>',
+ r'<title>(?P<title>.+?)</title>'), webpage, 'title',
group='title')
json_data = self._download_json(
f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp',
@@ -376,8 +384,10 @@ class BiliBiliIE(InfoExtractor):
replies = traverse_obj(
self._download_json(
f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={video_id}&type=1&jsonp=jsonp&sort=2&_=1567227301685',
- video_id, note=f'Extracting comments from page {idx}'),
- ('data', 'replies')) or []
+ video_id, note=f'Extracting comments from page {idx}', fatal=False),
+ ('data', 'replies'))
+ if not replies:
+ return
for children in map(self._get_all_children, replies):
yield from children
@@ -477,9 +487,9 @@ class BilibiliChannelIE(InfoExtractor):
data = self._download_json(
self._API_URL % (list_id, page_num), list_id, note=f'Downloading page {page_num}')['data']
- max_count = max_count or try_get(data, lambda x: x['page']['count'])
+ max_count = max_count or traverse_obj(data, ('page', 'count'))
- entries = try_get(data, lambda x: x['list']['vlist'])
+ entries = traverse_obj(data, ('list', 'vlist'))
if not entries:
return
for entry in entries:
@@ -517,7 +527,7 @@ class BilibiliCategoryIE(InfoExtractor):
api_url, query, query={'Search_key': query, 'pn': page_num},
note='Extracting results from page %s of %s' % (page_num, num_pages))
- video_list = try_get(parsed_json, lambda x: x['data']['archives'], list)
+ video_list = traverse_obj(parsed_json, ('data', 'archives'), expected_type=list)
if not video_list:
raise ExtractorError('Failed to retrieve video list for page %d' % page_num)
@@ -547,7 +557,7 @@ class BilibiliCategoryIE(InfoExtractor):
api_url = 'https://api.bilibili.com/x/web-interface/newlist?rid=%d&type=1&ps=20&jsonp=jsonp' % rid_value
page_json = self._download_json(api_url, query, query={'Search_key': query, 'pn': '1'})
- page_data = try_get(page_json, lambda x: x['data']['page'], dict)
+ page_data = traverse_obj(page_json, ('data', 'page'), expected_type=dict)
count, size = int_or_none(page_data.get('count')), int_or_none(page_data.get('size'))
if count is None or not size:
raise ExtractorError('Failed to calculate either page count or size')
@@ -566,7 +576,7 @@ class BilibiliCategoryIE(InfoExtractor):
class BiliBiliSearchIE(SearchInfoExtractor):
- IE_DESC = 'Bilibili video search, "bilisearch" keyword'
+ IE_DESC = 'Bilibili video search'
_MAX_RESULTS = 100000
_SEARCH_KEY = 'bilisearch'
@@ -719,40 +729,68 @@ class BiliBiliPlayerIE(InfoExtractor):
class BiliIntlBaseIE(InfoExtractor):
- _API_URL = 'https://api.bili{}/intl/gateway{}'
-
- def _call_api(self, type, endpoint, id):
- return self._download_json(self._API_URL.format(type, endpoint), id)['data']
+ _API_URL = 'https://api.bilibili.tv/intl/gateway'
+ _NETRC_MACHINE = 'biliintl'
+
+ def _call_api(self, endpoint, *args, **kwargs):
+ json = self._download_json(self._API_URL + endpoint, *args, **kwargs)
+ if json.get('code'):
+ if json['code'] in (10004004, 10004005, 10023006):
+ self.raise_login_required()
+ elif json['code'] == 10004001:
+ self.raise_geo_restricted()
+ else:
+ if json.get('message') and str(json['code']) != json['message']:
+ errmsg = f'{kwargs.get("errnote", "Unable to download JSON metadata")}: {self.IE_NAME} said: {json["message"]}'
+ else:
+ errmsg = kwargs.get('errnote', 'Unable to download JSON metadata')
+ if kwargs.get('fatal'):
+ raise ExtractorError(errmsg)
+ else:
+ self.report_warning(errmsg)
+ return json.get('data')
def json2srt(self, json):
data = '\n\n'.join(
f'{i + 1}\n{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n{line["content"]}'
- for i, line in enumerate(json['body']))
+ for i, line in enumerate(json['body']) if line.get('content'))
return data
- def _get_subtitles(self, type, ep_id):
- sub_json = self._call_api(type, f'/m/subtitle?ep_id={ep_id}&platform=web', ep_id)
+ def _get_subtitles(self, *, ep_id=None, aid=None):
+ sub_json = self._call_api(
+ '/web/v2/subtitle', ep_id or aid, note='Downloading subtitles list',
+ errnote='Unable to download subtitles list', query=filter_dict({
+ 'platform': 'web',
+ 'episode_id': ep_id,
+ 'aid': aid,
+ }))
subtitles = {}
- for sub in sub_json.get('subtitles', []):
+ for sub in sub_json.get('subtitles') or []:
sub_url = sub.get('url')
if not sub_url:
continue
- sub_data = self._download_json(sub_url, ep_id, fatal=False)
+ sub_data = self._download_json(
+ sub_url, ep_id or aid, errnote='Unable to download subtitles', fatal=False,
+ note='Downloading subtitles%s' % f' for {sub["lang"]}' if sub.get('lang') else '')
if not sub_data:
continue
- subtitles.setdefault(sub.get('key', 'en'), []).append({
+ subtitles.setdefault(sub.get('lang_key', 'en'), []).append({
'ext': 'srt',
'data': self.json2srt(sub_data)
})
return subtitles
- def _get_formats(self, type, ep_id):
- video_json = self._call_api(type, f'/web/playurl?ep_id={ep_id}&platform=web', ep_id)
- if not video_json:
- self.raise_login_required(method='cookies')
+ def _get_formats(self, *, ep_id=None, aid=None):
+ video_json = self._call_api(
+ '/web/playurl', ep_id or aid, note='Downloading video formats',
+ errnote='Unable to download video formats', query=filter_dict({
+ 'platform': 'web',
+ 'ep_id': ep_id,
+ 'aid': aid,
+ }))
video_json = video_json['playurl']
formats = []
- for vid in video_json.get('video', []):
+ for vid in video_json.get('video') or []:
video_res = vid.get('video_resource') or {}
video_info = vid.get('stream_info') or {}
if not video_res.get('url'):
@@ -768,7 +806,7 @@ class BiliIntlBaseIE(InfoExtractor):
'vcodec': video_res.get('codecs'),
'filesize': video_res.get('size'),
})
- for aud in video_json.get('audio_resource', []):
+ for aud in video_json.get('audio_resource') or []:
if not aud.get('url'):
continue
formats.append({
@@ -783,85 +821,148 @@ class BiliIntlBaseIE(InfoExtractor):
self._sort_formats(formats)
return formats
- def _extract_ep_info(self, type, episode_data, ep_id):
+ def _extract_video_info(self, video_data, *, ep_id=None, aid=None):
return {
- 'id': ep_id,
- 'title': episode_data.get('long_title') or episode_data['title'],
- 'thumbnail': episode_data.get('cover'),
- 'episode_number': str_to_int(episode_data.get('title')),
- 'formats': self._get_formats(type, ep_id),
- 'subtitles': self._get_subtitles(type, ep_id),
+ 'id': ep_id or aid,
+ 'title': video_data.get('title_display') or video_data.get('title'),
+ 'thumbnail': video_data.get('cover'),
+ 'episode_number': int_or_none(self._search_regex(
+ r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)),
+ 'formats': self._get_formats(ep_id=ep_id, aid=aid),
+ 'subtitles': self._get_subtitles(ep_id=ep_id, aid=aid),
'extractor_key': BiliIntlIE.ie_key(),
}
+ def _perform_login(self, username, password):
+ try:
+ from Cryptodome.PublicKey import RSA
+ from Cryptodome.Cipher import PKCS1_v1_5
+ except ImportError:
+ try:
+ from Crypto.PublicKey import RSA
+ from Crypto.Cipher import PKCS1_v1_5
+ except ImportError:
+ raise ExtractorError('pycryptodomex not found. Please install it', expected=True)
+
+ key_data = self._download_json(
+ 'https://passport.bilibili.tv/x/intl/passport-login/web/key?lang=en-US', None,
+ note='Downloading login key', errnote='Unable to download login key')['data']
+
+ public_key = RSA.importKey(key_data['key'])
+ password_hash = PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode('utf-8'))
+ login_post = self._download_json(
+ 'https://passport.bilibili.tv/x/intl/passport-login/web/login/password?lang=en-US', None, data=urlencode_postdata({
+ 'username': username,
+ 'password': base64.b64encode(password_hash).decode('ascii'),
+ 'keep_me': 'true',
+ 's_locale': 'en_US',
+ 'isTrusted': 'true'
+ }), note='Logging in', errnote='Unable to log in')
+ if login_post.get('code'):
+ if login_post.get('message'):
+ raise ExtractorError(f'Unable to log in: {self.IE_NAME} said: {login_post["message"]}', expected=True)
+ else:
+ raise ExtractorError('Unable to log in')
+
class BiliIntlIE(BiliIntlBaseIE):
- _VALID_URL = r'https?://(?:www\.)?bili(?P<type>bili\.tv|intl.com)/(?:[a-z]{2}/)?play/(?P<season_id>\d+)/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?(play/(?P<season_id>\d+)/(?P<ep_id>\d+)|video/(?P<aid>\d+))'
_TESTS = [{
+ # Bstation page
'url': 'https://www.bilibili.tv/en/play/34613/341736',
'info_dict': {
'id': '341736',
'ext': 'mp4',
- 'title': 'The First Night',
- 'thumbnail': 'https://i0.hdslb.com/bfs/intl/management/91e30e5521235d9b163339a26a0b030ebda54310.png',
+ 'title': 'E2 - The First Night',
+ 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
'episode_number': 2,
- },
- 'params': {
- 'format': 'bv',
- },
+ }
}, {
- 'url': 'https://www.biliintl.com/en/play/34613/341736',
+ # Non-Bstation page
+ 'url': 'https://www.bilibili.tv/en/play/1033760/11005006',
'info_dict': {
- 'id': '341736',
+ 'id': '11005006',
'ext': 'mp4',
- 'title': 'The First Night',
- 'thumbnail': 'https://i0.hdslb.com/bfs/intl/management/91e30e5521235d9b163339a26a0b030ebda54310.png',
- 'episode_number': 2,
- },
- 'params': {
- 'format': 'bv',
+ 'title': 'E3 - Who?',
+ 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
+ 'episode_number': 3,
+ }
+ }, {
+ # Subtitle with empty content
+ 'url': 'https://www.bilibili.tv/en/play/1005144/10131790',
+ 'info_dict': {
+ 'id': '10131790',
+ 'ext': 'mp4',
+ 'title': 'E140 - Two Heartbeats: Kabuto\'s Trap',
+ 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
+ 'episode_number': 140,
},
+ 'skip': 'According to the copyright owner\'s request, you may only watch the video after you log in.'
+ }, {
+ 'url': 'https://www.biliintl.com/en/play/34613/341736',
+ 'only_matching': True,
+ }, {
+ # User-generated content (as opposed to a series licensed from a studio)
+ 'url': 'https://bilibili.tv/en/video/2019955076',
+ 'only_matching': True,
+ }, {
+ # No language in URL
+ 'url': 'https://www.bilibili.tv/video/2019955076',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- type, season_id, id = self._match_valid_url(url).groups()
- data_json = self._call_api(type, f'/web/view/ogv_collection?season_id={season_id}', id)
- episode_data = next(
- episode for episode in data_json.get('episodes', [])
- if str(episode.get('ep_id')) == id)
- return self._extract_ep_info(type, episode_data, id)
+ season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid')
+ video_id = ep_id or aid
+ webpage = self._download_webpage(url, video_id)
+ # Bstation layout
+ initial_data = self._parse_json(self._search_regex(
+ r'window\.__INITIAL_(?:DATA|STATE)__\s*=\s*({.+?});', webpage,
+ 'preload state', default='{}'), video_id, fatal=False) or {}
+ video_data = (
+ traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict)
+ or traverse_obj(initial_data, ('UgcVideo', 'videoData'), expected_type=dict) or {})
+
+ if season_id and not video_data:
+ # Non-Bstation layout, read through episode list
+ season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id)
+ video_data = traverse_obj(season_json,
+ ('sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == ep_id),
+ expected_type=dict, get_all=False)
+ return self._extract_video_info(video_data, ep_id=ep_id, aid=aid)
class BiliIntlSeriesIE(BiliIntlBaseIE):
- _VALID_URL = r'https?://(?:www\.)?bili(?P<type>bili\.tv|intl.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)$'
+ _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)$'
_TESTS = [{
'url': 'https://www.bilibili.tv/en/play/34613',
'playlist_mincount': 15,
'info_dict': {
'id': '34613',
+ 'title': 'Fly Me to the Moon',
+ 'description': 'md5:a861ee1c4dc0acfad85f557cc42ac627',
+ 'categories': ['Romance', 'Comedy', 'Slice of life'],
+ 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
+ 'view_count': int,
},
'params': {
'skip_download': True,
- 'format': 'bv',
},
}, {
'url': 'https://www.biliintl.com/en/play/34613',
- 'playlist_mincount': 15,
- 'info_dict': {
- 'id': '34613',
- },
- 'params': {
- 'skip_download': True,
- 'format': 'bv',
- },
+ 'only_matching': True,
}]
- def _entries(self, id, type):
- data_json = self._call_api(type, f'/web/view/ogv_collection?season_id={id}', id)
- for episode in data_json.get('episodes', []):
- episode_id = str(episode.get('ep_id'))
- yield self._extract_ep_info(type, episode, episode_id)
+ def _entries(self, series_id):
+ series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id)
+ for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]):
+ episode_id = str(episode.get('episode_id'))
+ yield self._extract_video_info(episode, ep_id=episode_id)
def _real_extract(self, url):
- type, id = self._match_valid_url(url).groups()
- return self.playlist_result(self._entries(id, type), playlist_id=id)
+ series_id = self._match_id(url)
+ series_info = self._call_api(f'/web/v2/ogv/play/season_info?season_id={series_id}&platform=web', series_id).get('season') or {}
+ return self.playlist_result(
+ self._entries(series_id), series_id, series_info.get('title'), series_info.get('description'),
+ categories=traverse_obj(series_info, ('styles', ..., 'title'), expected_type=str_or_none),
+ thumbnail=url_or_none(series_info.get('horizontal_cover')), view_count=parse_count(series_info.get('view')))
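
In the BiliBiliIE changes above, window.__playinfo__ supplies DASH streams as separate video-only and audio-only lists; video entries get acodec 'none' whenever audio streams exist, so the downloader can pair a video-only with an audio-only format. A compact sketch of that assembly with invented stream dicts:

dash = {'video': [{'baseUrl': 'https://cdn.example/v720.m4v', 'codecs': 'avc1.64001f',
                   'height': 720, 'bandwidth': 1400000}],
        'audio': [{'baseUrl': 'https://cdn.example/a128.m4a', 'codecs': 'mp4a.40.2',
                   'bandwidth': 128000}]}

formats = []
for v in dash['video']:
    formats.append({'url': v['baseUrl'], 'vcodec': v['codecs'], 'height': v.get('height'),
                    'acodec': 'none' if dash['audio'] else None,  # video-only stream
                    'tbr': v['bandwidth'] / 1000})
for a in dash['audio']:
    formats.append({'url': a['baseUrl'], 'acodec': a['codecs'],
                    'vcodec': 'none',  # audio-only stream
                    'tbr': a['bandwidth'] / 1000})
print(len(formats), formats[0]['acodec'], formats[1]['vcodec'])  # 2 none none
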
diff --git a/hypervideo_dl/extractor/biqle.py b/hypervideo_dl/extractor/biqle.py
index 17ebbb2..2b57bad 100644
--- a/hypervideo_dl/extractor/biqle.py
+++ b/hypervideo_dl/extractor/biqle.py
@@ -3,27 +3,28 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from .vk import VKIE
-from ..compat import (
- compat_b64decode,
- compat_urllib_parse_unquote,
+from ..compat import compat_b64decode
+from ..utils import (
+ int_or_none,
+ js_to_json,
+ traverse_obj,
+ unified_timestamp,
)
-from ..utils import int_or_none
class BIQLEIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)'
_TESTS = [{
- # Youtube embed
- 'url': 'https://biqle.ru/watch/-115995369_456239081',
- 'md5': '97af5a06ee4c29bbf9c001bdb1cf5c06',
+ 'url': 'https://biqle.ru/watch/-2000421746_85421746',
+ 'md5': 'ae6ef4f04d19ac84e4658046d02c151c',
'info_dict': {
- 'id': '8v4f-avW-VI',
+ 'id': '-2000421746_85421746',
'ext': 'mp4',
- 'title': "PASSE-PARTOUT - L'ete c'est fait pour jouer",
- 'description': 'Passe-Partout',
- 'uploader_id': 'mrsimpsonstef3',
- 'uploader': 'Phanolito',
- 'upload_date': '20120822',
+ 'title': 'Forsaken By Hope Studio Clip',
+ 'description': 'Forsaken By Hope Studio Clip — Смотреть онлайн',
+ 'upload_date': '19700101',
+ 'thumbnail': r're:https://[^/]+/impf/7vN3ACwSTgChP96OdOfzFjUCzFR6ZglDQgWsIw/KPaACiVJJxM\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=b48ea459c4d33dbcba5e26d63574b1cb&type=video_thumb',
+ 'timestamp': 0,
},
}, {
'url': 'http://biqle.org/watch/-44781847_168547604',
@@ -32,53 +33,62 @@ class BIQLEIE(InfoExtractor):
'id': '-44781847_168547604',
'ext': 'mp4',
'title': 'Ребенок в шоке от автоматической мойки',
+ 'description': 'Ребенок в шоке от автоматической мойки — Смотреть онлайн',
'timestamp': 1396633454,
- 'uploader': 'Dmitry Kotov',
'upload_date': '20140404',
- 'uploader_id': '47850140',
+ 'thumbnail': r're:https://[^/]+/c535507/u190034692/video/l_b84df002\.jpg',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- embed_url = self._proto_relative_url(self._search_regex(
- r'<iframe.+?src="((?:https?:)?//(?:daxab\.com|dxb\.to|[^/]+/player)/[^"]+)".*?></iframe>',
- webpage, 'embed url'))
+
+ title = self._html_search_meta('name', webpage, 'Title', fatal=False)
+ timestamp = unified_timestamp(self._html_search_meta('uploadDate', webpage, 'Upload Date', default=None))
+ description = self._html_search_meta('description', webpage, 'Description', default=None)
+
+ global_embed_url = self._search_regex(
+ r'<script[^<]+?window.globEmbedUrl\s*=\s*\'((?:https?:)?//(?:daxab\.com|dxb\.to|[^/]+/player)/[^\']+)\'',
+ webpage, 'global embed URL')
+ hash = self._search_regex(
+ r'<script id="data-embed-video[^<]+?hash: "([^"]+)"[^<]*</script>', webpage, 'Hash')
+
+ embed_url = global_embed_url + hash
+
if VKIE.suitable(embed_url):
return self.url_result(embed_url, VKIE.ie_key(), video_id)
embed_page = self._download_webpage(
- embed_url, video_id, headers={'Referer': url})
- video_ext = self._get_cookies(embed_url).get('video_ext')
- if video_ext:
- video_ext = compat_urllib_parse_unquote(video_ext.value)
- if not video_ext:
- video_ext = compat_b64decode(self._search_regex(
- r'video_ext\s*:\s*[\'"]([A-Za-z0-9+/=]+)',
- embed_page, 'video_ext')).decode()
- video_id, sig, _, access_token = video_ext.split(':')
+ embed_url, video_id, 'Downloading embed webpage', headers={'Referer': url})
+
+ glob_params = self._parse_json(self._search_regex(
+ r'<script id="globParams">[^<]*window.globParams = ([^;]+);[^<]+</script>',
+ embed_page, 'Global Parameters'), video_id, transform_source=js_to_json)
+ host_name = compat_b64decode(glob_params['server'][::-1]).decode()
+
item = self._download_json(
- 'https://api.vk.com/method/video.get', video_id,
- headers={'User-Agent': 'okhttp/3.4.1'}, query={
- 'access_token': access_token,
- 'sig': sig,
- 'v': 5.44,
+ f'https://{host_name}/method/video.get/{video_id}', video_id,
+ headers={'Referer': url}, query={
+ 'token': glob_params['video']['access_token'],
'videos': video_id,
+ 'ckey': glob_params['c_key'],
+ 'credentials': glob_params['video']['credentials'],
})['response']['items'][0]
- title = item['title']
formats = []
for f_id, f_url in item.get('files', {}).items():
if f_id == 'external':
return self.url_result(f_url)
ext, height = f_id.split('_')
- formats.append({
- 'format_id': height + 'p',
- 'url': f_url,
- 'height': int_or_none(height),
- 'ext': ext,
- })
+ height_extra_key = traverse_obj(glob_params, ('video', 'partial', 'quality', height))
+ if height_extra_key:
+ formats.append({
+ 'format_id': f'{height}p',
+ 'url': f'https://{host_name}/{f_url[8:]}&videos={video_id}&extra_key={height_extra_key}',
+ 'height': int_or_none(height),
+ 'ext': ext,
+ })
self._sort_formats(formats)
thumbnails = []
@@ -96,10 +106,9 @@ class BIQLEIE(InfoExtractor):
'title': title,
'formats': formats,
'comment_count': int_or_none(item.get('comments')),
- 'description': item.get('description'),
+ 'description': description,
'duration': int_or_none(item.get('duration')),
'thumbnails': thumbnails,
- 'timestamp': int_or_none(item.get('date')),
- 'uploader': item.get('owner_id'),
+ 'timestamp': timestamp,
'view_count': int_or_none(item.get('views')),
}
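
The rewritten BIQLE extractor no longer scrapes an iframe for the player host; window.globParams ships it base64-encoded and reversed, which the hunk above undoes with glob_params['server'][::-1]. A round-trip demonstration (the host name is invented):

import base64

host = 'api.example-player.test'
obfuscated = base64.b64encode(host.encode()).decode()[::-1]  # what the page ships

print(base64.b64decode(obfuscated[::-1]).decode())  # api.example-player.test
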
diff --git a/hypervideo_dl/extractor/bitwave.py b/hypervideo_dl/extractor/bitwave.py
index eb16c46..e6e093f 100644
--- a/hypervideo_dl/extractor/bitwave.py
+++ b/hypervideo_dl/extractor/bitwave.py
@@ -51,7 +51,7 @@ class BitwaveStreamIE(InfoExtractor):
return {
'id': username,
- 'title': self._live_title(channel['data']['title']),
+ 'title': channel['data']['title'],
'uploader': username,
'uploader_id': username,
'formats': formats,
diff --git a/hypervideo_dl/extractor/blogger.py b/hypervideo_dl/extractor/blogger.py
new file mode 100644
index 0000000..dba131c
--- /dev/null
+++ b/hypervideo_dl/extractor/blogger.py
@@ -0,0 +1,54 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from ..utils import (
+ mimetype2ext,
+ parse_duration,
+ parse_qs,
+ str_or_none,
+ traverse_obj,
+)
+from .common import InfoExtractor
+
+
+class BloggerIE(InfoExtractor):
+ IE_NAME = 'blogger.com'
+ _VALID_URL = r'https?://(?:www\.)?blogger\.com/video\.g\?token=(?P<id>.+)'
+ _VALID_EMBED = r'''<iframe[^>]+src=["']((?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']'''
+ _TESTS = [{
+ 'url': 'https://www.blogger.com/video.g?token=AD6v5dzEe9hfcARr5Hlq1WTkYy6t-fXH3BBahVhGvVHe5szdEUBEloSEDSTA8-b111089KbfWuBvTN7fnbxMtymsHhXAXwVvyzHH4Qch2cfLQdGxKQrrEuFpC1amSl_9GuLWODjPgw',
+ 'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac',
+ 'info_dict': {
+ 'id': 'BLOGGER-video-3c740e3a49197e16-796',
+ 'title': 'BLOGGER-video-3c740e3a49197e16-796',
+ 'ext': 'mp4',
+ 'thumbnail': r're:^https?://.*',
+ 'duration': 76.068,
+ }
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return re.findall(BloggerIE._VALID_EMBED, webpage)
+
+ def _real_extract(self, url):
+ token_id = self._match_id(url)
+ webpage = self._download_webpage(url, token_id)
+ data_json = self._search_regex(r'var\s+VIDEO_CONFIG\s*=\s*(\{.*)', webpage, 'JSON data')
+ data = self._parse_json(data_json.encode('utf-8').decode('unicode_escape'), token_id)
+ streams = data['streams']
+ formats = [{
+ 'ext': mimetype2ext(traverse_obj(parse_qs(stream['play_url']), ('mime', 0))),
+ 'url': stream['play_url'],
+ 'format_id': str_or_none(stream.get('format_id')),
+ } for stream in streams]
+
+ return {
+ 'id': data.get('iframe_id', token_id),
+ 'title': data.get('iframe_id', token_id),
+ 'formats': formats,
+ 'thumbnail': data.get('thumbnail'),
+ 'duration': parse_duration(traverse_obj(parse_qs(streams[0]['play_url']), ('dur', 0))),
+ }
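
The VIDEO_CONFIG blob is embedded in inline JavaScript, presumably with \xNN-style escapes that strict JSON rejects, which would explain the encode/decode('unicode_escape') round-trip before parsing. A self-contained sketch under that assumption (the fragment is invented):

import json

raw = '{"play_url": "https://example.test/play?token\\x3dabc123"}'
data = json.loads(raw.encode('utf-8').decode('unicode_escape'))
print(data['play_url'])  # https://example.test/play?token=abc123
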
diff --git a/hypervideo_dl/extractor/bongacams.py b/hypervideo_dl/extractor/bongacams.py
index 9e75511..4e346e7 100644
--- a/hypervideo_dl/extractor/bongacams.py
+++ b/hypervideo_dl/extractor/bongacams.py
@@ -49,7 +49,7 @@ class BongaCamsIE(InfoExtractor):
return {
'id': channel_id,
- 'title': self._live_title(uploader or uploader_id),
+ 'title': uploader or uploader_id,
'uploader': uploader,
'uploader_id': uploader_id,
'like_count': like_count,
diff --git a/hypervideo_dl/extractor/br.py b/hypervideo_dl/extractor/br.py
index 7169ece..0155827 100644
--- a/hypervideo_dl/extractor/br.py
+++ b/hypervideo_dl/extractor/br.py
@@ -175,7 +175,7 @@ class BRIE(InfoExtractor):
class BRMediathekIE(InfoExtractor):
IE_DESC = 'Bayerischer Rundfunk Mediathek'
- _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek/video/[^/?&#]*?-(?P<id>av:[0-9a-f]{24})'
+ _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek//?video/(?:[^/?&#]+?-)?(?P<id>av:[0-9a-f]{24})'
_TESTS = [{
'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e',
@@ -188,6 +188,9 @@ class BRMediathekIE(InfoExtractor):
'timestamp': 1511942766,
'upload_date': '20171129',
}
+ }, {
+ 'url': 'https://www.br.de/mediathek//video/av:61b0db581aed360007558c12',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/hypervideo_dl/extractor/breitbart.py b/hypervideo_dl/extractor/breitbart.py
new file mode 100644
index 0000000..e029aa6
--- /dev/null
+++ b/hypervideo_dl/extractor/breitbart.py
@@ -0,0 +1,38 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class BreitBartIE(InfoExtractor):
+ _VALID_URL = r'https?:\/\/(?:www\.)breitbart.com/videos/v/(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'https://www.breitbart.com/videos/v/5cOz1yup/?pl=Ij6NDOji',
+ 'md5': '0aa6d1d6e183ac5ca09207fe49f17ade',
+ 'info_dict': {
+ 'id': '5cOz1yup',
+ 'ext': 'mp4',
+ 'title': 'Watch \u2013 Clyburn: Statues in Congress Have to Go Because they Are Honoring Slavery',
+ 'description': 'md5:bac35eb0256d1cb17f517f54c79404d5',
+ 'thumbnail': 'https://cdn.jwplayer.com/thumbs/5cOz1yup-1920.jpg',
+ 'age_limit': 0,
+ }
+ }, {
+ 'url': 'https://www.breitbart.com/videos/v/eaiZjVOn/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ formats = self._extract_m3u8_formats(f'https://cdn.jwplayer.com/manifests/{video_id}.m3u8', video_id, ext='mp4')
+ self._sort_formats(formats)
+ return {
+ 'id': video_id,
+ 'title': (self._og_search_title(webpage, default=None)
+ or self._html_extract_title(webpage, 'video title')),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'age_limit': self._rta_search(webpage),
+ 'formats': formats
+ }
diff --git a/hypervideo_dl/extractor/brightcove.py b/hypervideo_dl/extractor/brightcove.py
index cd1c3f0..dcd332b 100644
--- a/hypervideo_dl/extractor/brightcove.py
+++ b/hypervideo_dl/extractor/brightcove.py
@@ -16,6 +16,7 @@ from ..compat import (
)
from ..utils import (
clean_html,
+ dict_get,
extract_attributes,
ExtractorError,
find_xpath_attr,
@@ -471,32 +472,22 @@ class BrightcoveNewIE(AdobePassIE):
def _parse_brightcove_metadata(self, json_data, video_id, headers={}):
title = json_data['name'].strip()
- num_drm_sources = 0
formats, subtitles = [], {}
sources = json_data.get('sources') or []
for source in sources:
container = source.get('container')
ext = mimetype2ext(source.get('type'))
src = source.get('src')
- skip_unplayable = not self.get_param('allow_unplayable_formats')
- # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
- if skip_unplayable and (container == 'WVM' or source.get('key_systems')):
- num_drm_sources += 1
- continue
- elif ext == 'ism' and skip_unplayable:
- continue
- elif ext == 'm3u8' or container == 'M2TS':
+ if ext == 'm3u8' or container == 'M2TS':
if not src:
continue
- f, subs = self._extract_m3u8_formats_and_subtitles(
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
- formats.extend(f)
subtitles = self._merge_subtitles(subtitles, subs)
elif ext == 'mpd':
if not src:
continue
- f, subs = self._extract_mpd_formats_and_subtitles(src, video_id, 'dash', fatal=False)
- formats.extend(f)
+ fmts, subs = self._extract_mpd_formats_and_subtitles(src, video_id, 'dash', fatal=False)
subtitles = self._merge_subtitles(subtitles, subs)
else:
streaming_src = source.get('streaming_src')
@@ -543,7 +534,13 @@ class BrightcoveNewIE(AdobePassIE):
'play_path': stream_name,
'format_id': build_format_id('rtmp'),
})
- formats.append(f)
+ fmts = [f]
+
+ # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
+ if container == 'WVM' or source.get('key_systems') or ext == 'ism':
+ for f in fmts:
+ f['has_drm'] = True
+ formats.extend(fmts)
if not formats:
errors = json_data.get('errors')
@@ -551,9 +548,6 @@ class BrightcoveNewIE(AdobePassIE):
error = errors[0]
self.raise_no_formats(
error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
- elif (not self.get_param('allow_unplayable_formats')
- and sources and num_drm_sources == len(sources)):
- self.report_drm(video_id)
self._sort_formats(formats)
@@ -577,11 +571,19 @@ class BrightcoveNewIE(AdobePassIE):
if duration is not None and duration <= 0:
is_live = True
+ common_res = [(160, 90), (320, 180), (480, 270), (640, 360), (768, 432), (1024, 576), (1280, 720), (1366, 768), (1920, 1080)]
+ thumb_base_url = dict_get(json_data, ('poster', 'thumbnail'))
+ thumbnails = [{
+ 'url': re.sub(r'\d+x\d+', f'{w}x{h}', thumb_base_url),
+ 'width': w,
+ 'height': h,
+ } for w, h in common_res] if thumb_base_url else None
+
return {
'id': video_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'description': clean_html(json_data.get('description')),
- 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'),
+ 'thumbnails': thumbnails,
'duration': duration,
'timestamp': parse_iso8601(json_data.get('published_at')),
'uploader_id': json_data.get('account_id'),
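
Brightcove thumbnail URLs embed their pixel size, so the hunk above manufactures a whole thumbnail set by substituting each common 16:9 resolution into one base URL. The substitution in isolation (base URL invented):

import re

thumb_base_url = 'https://cf-images.example/image/160x90/match/image.jpg'
common_res = [(160, 90), (320, 180), (640, 360), (1280, 720), (1920, 1080)]

thumbnails = [{'url': re.sub(r'\d+x\d+', f'{w}x{h}', thumb_base_url),
               'width': w, 'height': h} for w, h in common_res]
print(thumbnails[-1]['url'])  # https://cf-images.example/image/1920x1080/match/image.jpg
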
diff --git a/hypervideo_dl/extractor/cableav.py b/hypervideo_dl/extractor/cableav.py
new file mode 100644
index 0000000..77efdf4
--- /dev/null
+++ b/hypervideo_dl/extractor/cableav.py
@@ -0,0 +1,34 @@
+# coding: utf-8
+from .common import InfoExtractor
+
+
+class CableAVIE(InfoExtractor):
+ _VALID_URL = r'https://cableav\.tv/(?P<id>[a-zA-Z0-9]+)'
+ _TESTS = [{
+ 'url': 'https://cableav.tv/lS4iR9lWjN8/',
+ 'md5': '7e3fe5e49d61c4233b7f5b0f69b15e18',
+ 'info_dict': {
+ 'id': 'lS4iR9lWjN8',
+ 'ext': 'mp4',
+ 'title': '國產麻豆AV 叮叮映畫 DDF001 情欲小說家 - CableAV',
+ 'description': '國產AV 480p, 720p 国产麻豆AV 叮叮映画 DDF001 情欲小说家',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._og_search_video_url(webpage, secure=False)
+
+ formats = self._extract_m3u8_formats(video_url, video_id, 'mp4')
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'formats': formats,
+ }
diff --git a/hypervideo_dl/extractor/callin.py b/hypervideo_dl/extractor/callin.py
new file mode 100644
index 0000000..1f3b7cf
--- /dev/null
+++ b/hypervideo_dl/extractor/callin.py
@@ -0,0 +1,114 @@
+# coding: utf-8
+from .common import InfoExtractor
+from ..utils import (
+ traverse_obj,
+ float_or_none,
+ int_or_none
+)
+
+
+class CallinIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?callin\.com/(episode)/(?P<id>[-a-zA-Z]+)'
+ _TESTS = [{
+ 'url': 'https://www.callin.com/episode/the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
+ 'info_dict': {
+ 'id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd',
+ 'title': 'The Title IX Regime and the Long March Through and Beyond the Institutions',
+ 'ext': 'ts',
+ 'display_id': 'the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
+ 'thumbnail': 're:https://.+\\.png',
+ 'description': 'First episode',
+ 'uploader': 'Wesley Yang',
+ 'timestamp': 1639404128.65,
+ 'upload_date': '20211213',
+ 'uploader_id': 'wesyang',
+ 'uploader_url': 'http://wesleyyang.substack.com',
+ 'channel': 'Conversations in Year Zero',
+ 'channel_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553',
+ 'channel_url': 'https://callin.com/show/conversations-in-year-zero-oJNllRFSfx',
+ 'duration': 9951.936,
+ 'view_count': int,
+ 'categories': ['News & Politics', 'History', 'Technology'],
+ 'cast': ['Wesley Yang', 'KC Johnson', 'Gabi Abramovich'],
+ 'series': 'Conversations in Year Zero',
+ 'series_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553',
+ 'episode': 'The Title IX Regime and the Long March Through and Beyond the Institutions',
+ 'episode_number': 1,
+ 'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd'
+ }
+ }]
+
+ def try_get_user_name(self, d):
+ names = [d.get(n) for n in ('first', 'last')]
+ if None in names:
+ return next((n for n in names if n), None)
+ return ' '.join(names)
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ next_data = self._search_nextjs_data(webpage, display_id)
+ episode = next_data['props']['pageProps']['episode']
+
+ id = episode['id']
+ title = (episode.get('title')
+ or self._og_search_title(webpage, fatal=False)
+ or self._html_extract_title(webpage))
+ url = episode['m3u8']
+ formats = self._extract_m3u8_formats(url, display_id, ext='ts')
+ self._sort_formats(formats)
+
+ show = traverse_obj(episode, ('show', 'title'))
+ show_id = traverse_obj(episode, ('show', 'id'))
+
+ show_json = None
+ app_slug = (self._html_search_regex(
+ '<script\\s+src=["\']/_next/static/([-_a-zA-Z0-9]+)/_',
+ webpage, 'app slug', fatal=False) or next_data.get('buildId'))
+ show_slug = traverse_obj(episode, ('show', 'linkObj', 'resourceUrl'))
+ if app_slug and show_slug and '/' in show_slug:
+ show_slug = show_slug.rsplit('/', 1)[1]
+ show_json_url = f'https://www.callin.com/_next/data/{app_slug}/show/{show_slug}.json'
+ show_json = self._download_json(show_json_url, display_id, fatal=False)
+
+ host = (traverse_obj(show_json, ('pageProps', 'show', 'hosts', 0))
+ or traverse_obj(episode, ('speakers', 0)))
+
+ host_nick = traverse_obj(host, ('linkObj', 'resourceUrl'))
+ host_nick = host_nick.rsplit('/', 1)[1] if (host_nick and '/' in host_nick) else None
+
+ cast = list(filter(None, [
+ self.try_get_user_name(u) for u in
+ traverse_obj(episode, (('speakers', 'callerTags'), ...)) or []
+ ]))
+
+ episode_list = traverse_obj(show_json, ('pageProps', 'show', 'episodes')) or []
+ episode_number = next(
+ (len(episode_list) - i for (i, e) in enumerate(episode_list) if e.get('id') == id),
+ None)
+
+ return {
+ 'id': id,
+ 'display_id': display_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': traverse_obj(episode, ('show', 'photo')),
+ 'description': episode.get('description'),
+ 'uploader': self.try_get_user_name(host) if host else None,
+ 'timestamp': episode.get('publishedAt'),
+ 'uploader_id': host_nick,
+ 'uploader_url': traverse_obj(show_json, ('pageProps', 'show', 'url')),
+ 'channel': show,
+ 'channel_id': show_id,
+ 'channel_url': traverse_obj(episode, ('show', 'linkObj', 'resourceUrl')),
+ 'duration': float_or_none(episode.get('runtime')),
+ 'view_count': int_or_none(episode.get('plays')),
+ 'categories': traverse_obj(episode, ('show', 'categorizations', ..., 'name')),
+ 'cast': cast if cast else None,
+ 'series': show,
+ 'series_id': show_id,
+ 'episode': title,
+ 'episode_number': episode_number,
+ 'episode_id': id
+ }
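
Judging by the computation above, Callin's show JSON lists episodes newest-first, so the episode number is the entry's distance from the end of the list. The same computation over invented ids:

episode_list = [{'id': 'c'}, {'id': 'b'}, {'id': 'a'}]  # newest first
current_id = 'a'

episode_number = next(
    (len(episode_list) - i for i, e in enumerate(episode_list) if e.get('id') == current_id),
    None)
print(episode_number)  # 1: the oldest episode sits at the end of the list
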
diff --git a/hypervideo_dl/extractor/caltrans.py b/hypervideo_dl/extractor/caltrans.py
new file mode 100644
index 0000000..9ac740f
--- /dev/null
+++ b/hypervideo_dl/extractor/caltrans.py
@@ -0,0 +1,41 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class CaltransIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:[^/]+\.)?ca\.gov/vm/loc/[^/]+/(?P<id>[a-z0-9_]+)\.htm'
+ _TEST = {
+ 'url': 'https://cwwp2.dot.ca.gov/vm/loc/d3/hwy50at24th.htm',
+ 'info_dict': {
+ 'id': 'hwy50at24th',
+ 'ext': 'ts',
+ 'title': 'US-50 : Sacramento : Hwy 50 at 24th',
+ 'live_status': 'is_live',
+ 'thumbnail': 'https://cwwp2.dot.ca.gov/data/d3/cctv/image/hwy50at24th/hwy50at24th.jpg',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ global_vars = self._search_regex(
+ r'<script[^<]+?([^<]+\.m3u8[^<]+)</script>',
+ webpage, 'Global Vars')
+ route_place = self._search_regex(r'routePlace\s*=\s*"([^"]+)"', global_vars, 'Route Place', fatal=False)
+ location_name = self._search_regex(r'locationName\s*=\s*"([^"]+)"', global_vars, 'Location Name', fatal=False)
+ poster_url = self._search_regex(r'posterURL\s*=\s*"([^"]+)"', global_vars, 'Poster Url', fatal=False)
+ video_stream = self._search_regex(r'videoStreamURL\s*=\s*"([^"]+)"', global_vars, 'Video Stream URL', fatal=False)
+
+ formats = self._extract_m3u8_formats(video_stream, video_id, 'ts', live=True)
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': f'{route_place} : {location_name}',
+ 'is_live': True,
+ 'formats': formats,
+ 'thumbnail': poster_url,
+ }
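
Caltrans camera pages keep their metadata in plain JavaScript globals, so the extractor grabs the whole script block once and reads each variable with a small regex. The helper pattern, run against an invented script body:

import re

global_vars = '''
    var routePlace = "US-50 : Sacramento";
    var locationName = "Hwy 50 at 24th";
    var videoStreamURL = "https://example.test/stream.m3u8";
'''

def js_string(name):
    m = re.search(name + r'\s*=\s*"([^"]+)"', global_vars)
    return m.group(1) if m else None

print(js_string('routePlace'), '|', js_string('videoStreamURL'))
# US-50 : Sacramento | https://example.test/stream.m3u8
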
diff --git a/hypervideo_dl/extractor/cam4.py b/hypervideo_dl/extractor/cam4.py
index 30daf2b..2a3931f 100644
--- a/hypervideo_dl/extractor/cam4.py
+++ b/hypervideo_dl/extractor/cam4.py
@@ -13,6 +13,8 @@ class CAM4IE(InfoExtractor):
'ext': 'mp4',
'title': 're:^foxynesss [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'age_limit': 18,
+ 'live_status': 'is_live',
+ 'thumbnail': 'https://snapshots.xcdnpro.com/thumbnails/foxynesss',
}
}
@@ -25,8 +27,9 @@ class CAM4IE(InfoExtractor):
return {
'id': channel_id,
- 'title': self._live_title(channel_id),
+ 'title': channel_id,
'is_live': True,
'age_limit': 18,
'formats': formats,
+ 'thumbnail': f'https://snapshots.xcdnpro.com/thumbnails/{channel_id}',
}
diff --git a/hypervideo_dl/extractor/cammodels.py b/hypervideo_dl/extractor/cammodels.py
index eb2a8b4..3dc1937 100644
--- a/hypervideo_dl/extractor/cammodels.py
+++ b/hypervideo_dl/extractor/cammodels.py
@@ -91,7 +91,7 @@ class CamModelsIE(InfoExtractor):
return {
'id': user_id,
- 'title': self._live_title(user_id),
+ 'title': user_id,
'is_live': True,
'formats': formats,
'age_limit': 18
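
cam4 and cammodels here (and chaturbate and ceskatelevize further down) stop calling self._live_title(), which just suffixed the current time onto the name; that is also why the cam4 test title above is a timestamp regex. The helper was deprecated upstream, so extractors now return the plain name. A rough sketch of what the removed call did:

    import datetime

    def live_title(name):
        # roughly what the removed helper did: append the current time
        return name + ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M')

    print(live_title('foxynesss'))  # e.g. 'foxynesss 2022-04-05 19:37'
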
diff --git a/hypervideo_dl/extractor/canalalpha.py b/hypervideo_dl/extractor/canalalpha.py
new file mode 100644
index 0000000..0365cb2
--- /dev/null
+++ b/hypervideo_dl/extractor/canalalpha.py
@@ -0,0 +1,98 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ dict_get,
+ try_get,
+ unified_strdate,
+)
+
+
+class CanalAlphaIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?canalalpha\.ch/play/[^/]+/[^/]+/(?P<id>\d+)/?.*'
+
+ _TESTS = [{
+ 'url': 'https://www.canalalpha.ch/play/le-journal/episode/24520/jeudi-28-octobre-2021',
+ 'info_dict': {
+ 'id': '24520',
+ 'ext': 'mp4',
+ 'title': 'Jeudi 28 octobre 2021',
+ 'description': 'md5:d30c6c3e53f8ad40d405379601973b30',
+ 'thumbnail': 'https://static.canalalpha.ch/poster/journal/journal_20211028.jpg',
+ 'upload_date': '20211028',
+ 'duration': 1125,
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://www.canalalpha.ch/play/le-journal/topic/24512/la-poste-fait-de-neuchatel-un-pole-cryptographique',
+ 'info_dict': {
+ 'id': '24512',
+ 'ext': 'mp4',
+ 'title': 'La Poste fait de Neuchâtel un pôle cryptographique',
+ 'description': 'md5:4ba63ae78a0974d1a53d6703b6e1dedf',
+ 'thumbnail': 'https://static.canalalpha.ch/poster/news/news_39712.jpg',
+ 'upload_date': '20211028',
+ 'duration': 138,
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://www.canalalpha.ch/play/eureka/episode/24484/ces-innovations-qui-veulent-rendre-lagriculture-plus-durable',
+ 'info_dict': {
+ 'id': '24484',
+ 'ext': 'mp4',
+ 'title': 'Ces innovations qui veulent rendre l’agriculture plus durable',
+ 'description': 'md5:3de3f151180684621e85be7c10e4e613',
+ 'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_10236.jpg',
+ 'upload_date': '20211026',
+ 'duration': 360,
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://www.canalalpha.ch/play/avec-le-temps/episode/23516/redonner-de-leclat-grace-au-polissage',
+ 'info_dict': {
+ 'id': '23516',
+ 'ext': 'mp4',
+ 'title': 'Redonner de l\'éclat grâce au polissage',
+ 'description': 'md5:0d8fbcda1a5a4d6f6daa3165402177e1',
+ 'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_9990.png',
+ 'upload_date': '20210726',
+ 'duration': 360,
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ data_json = self._parse_json(self._search_regex(
+ r'window\.__SERVER_STATE__\s?=\s?({(?:(?!};)[^"]|"([^"]|\\")*")+})\s?;',
+ webpage, 'data_json'), id)['1']['data']['data']
+ manifests = try_get(data_json, lambda x: x['video']['manifests'], expected_type=dict) or {}
+ subtitles = {}
+ formats = [{
+ 'url': video['$url'],
+ 'ext': 'mp4',
+ 'width': try_get(video, lambda x: x['res']['width'], expected_type=int),
+ 'height': try_get(video, lambda x: x['res']['height'], expected_type=int),
+ } for video in try_get(data_json, lambda x: x['video']['mp4'], expected_type=list) or [] if video.get('$url')]
+ if manifests.get('hls'):
+ m3u8_frmts, m3u8_subs = self._parse_m3u8_formats_and_subtitles(manifests['hls'], video_id=id)
+ formats.extend(m3u8_frmts)
+ subtitles = self._merge_subtitles(subtitles, m3u8_subs)
+ if manifests.get('dash'):
+ dash_frmts, dash_subs = self._parse_mpd_formats_and_subtitles(manifests['dash'])
+ formats.extend(dash_frmts)
+ subtitles = self._merge_subtitles(subtitles, dash_subs)
+ self._sort_formats(formats)
+ return {
+ 'id': id,
+ 'title': try_get(data_json, lambda x: x['title'].strip()),
+ 'description': clean_html(dict_get(data_json, ('longDesc', 'shortDesc'))),
+ 'thumbnail': data_json.get('poster'),
+ 'upload_date': unified_strdate(dict_get(data_json, ('webPublishAt', 'featuredAt', 'diffusionDate'))),
+ 'duration': try_get(data_json, lambda x: x['video']['duration'], expected_type=int),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
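
CanalAlpha pulls everything from a window.__SERVER_STATE__ assignment; the capture group is written so that a '};' inside a double-quoted string does not terminate the match early. A standalone demonstration of that regex on a toy payload:

    import json
    import re

    html = '<script>window.__SERVER_STATE__ = {"a": "x}; y", "b": 2};</script>'

    # the pattern stops at the first `};` that is not inside a quoted string
    state = re.search(
        r'window\.__SERVER_STATE__\s?=\s?({(?:(?!};)[^"]|"([^"]|\\")*")+})\s?;',
        html).group(1)
    print(json.loads(state))  # {'a': 'x}; y', 'b': 2}
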
diff --git a/hypervideo_dl/extractor/canvas.py b/hypervideo_dl/extractor/canvas.py
index 49e7e4e..8b99037 100644
--- a/hypervideo_dl/extractor/canvas.py
+++ b/hypervideo_dl/extractor/canvas.py
@@ -1,4 +1,5 @@
from __future__ import unicode_literals
+import json
from .common import InfoExtractor
@@ -41,9 +42,9 @@ class CanvasIE(InfoExtractor):
_GEO_BYPASS = False
_HLS_ENTRY_PROTOCOLS_MAP = {
'HLS': 'm3u8_native',
- 'HLS_AES': 'm3u8',
+ 'HLS_AES': 'm3u8_native',
}
- _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1'
+ _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2'
def _real_extract(self, url):
mobj = self._match_valid_url(url)
@@ -59,18 +60,23 @@ class CanvasIE(InfoExtractor):
# New API endpoint
if not data:
+ vrtnutoken = self._download_json('https://token.vrt.be/refreshtoken',
+ video_id, note='refreshtoken: Retrieve vrtnutoken',
+ errnote='refreshtoken failed')['vrtnutoken']
headers = self.geo_verification_headers()
- headers.update({'Content-Type': 'application/json'})
- token = self._download_json(
+ headers.update({'Content-Type': 'application/json; charset=utf-8'})
+ vrtPlayerToken = self._download_json(
'%s/tokens' % self._REST_API_BASE, video_id,
- 'Downloading token', data=b'', headers=headers)['vrtPlayerToken']
+ 'Downloading token', headers=headers, data=json.dumps({
+ 'identityToken': vrtnutoken
+ }).encode('utf-8'))['vrtPlayerToken']
data = self._download_json(
'%s/videos/%s' % (self._REST_API_BASE, video_id),
video_id, 'Downloading video JSON', query={
- 'vrtPlayerToken': token,
- 'client': '%s@PROD' % site_id,
+ 'vrtPlayerToken': vrtPlayerToken,
+ 'client': 'null',
}, expected_status=400)
- if not data.get('title'):
+ if 'title' not in data:
code = data.get('code')
if code == 'AUTHENTICATION_REQUIRED':
self.raise_login_required()
@@ -78,7 +84,8 @@ class CanvasIE(InfoExtractor):
self.raise_geo_restricted(countries=['BE'])
raise ExtractorError(data.get('message') or code, expected=True)
- title = data['title']
+ # Note: The title may be an empty string
+ title = data['title'] or f'{site_id} {video_id}'
description = data.get('description')
formats = []
@@ -238,10 +245,6 @@ class VrtNUIE(GigyaBaseIE):
'upload_date': '20200727',
},
'skip': 'This video is only available for registered users',
- 'params': {
- 'username': '<snip>',
- 'password': '<snip>',
- },
'expected_warnings': ['is not a supported codec'],
}, {
# Only available via new API endpoint
@@ -257,34 +260,20 @@ class VrtNUIE(GigyaBaseIE):
'episode_number': 5,
},
'skip': 'This video is only available for registered users',
- 'params': {
- 'username': '<snip>',
- 'password': '<snip>',
- },
'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'],
}]
_NETRC_MACHINE = 'vrtnu'
- _APIKEY = '3_qhEcPa5JGFROVwu5SWKqJ4mVOIkwlFNMSKwzPDAh8QZOtHqu6L4nD5Q7lk0eXOOG'
+ _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy'
_CONTEXT_ID = 'R3595707040'
- def _real_initialize(self):
- self._login()
-
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
- auth_info = self._download_json(
- 'https://accounts.vrt.be/accounts.login', None,
- note='Login data', errnote='Could not get Login data',
- headers={}, data=urlencode_postdata({
- 'loginID': username,
- 'password': password,
- 'sessionExpiration': '-2',
- 'APIKey': self._APIKEY,
- 'targetEnv': 'jssdk',
- }))
+ def _perform_login(self, username, password):
+ auth_info = self._gigya_login({
+ 'APIKey': self._APIKEY,
+ 'targetEnv': 'jssdk',
+ 'loginID': username,
+ 'password': password,
+ 'authMode': 'cookie',
+ })
if auth_info.get('errorDetails'):
raise ExtractorError('Unable to login: VrtNU said: ' + auth_info.get('errorDetails'), expected=True)
@@ -301,14 +290,15 @@ class VrtNUIE(GigyaBaseIE):
'UID': auth_info['UID'],
'UIDSignature': auth_info['UIDSignature'],
'signatureTimestamp': auth_info['signatureTimestamp'],
- 'client_id': 'vrtnu-site',
'_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value,
}
self._request_webpage(
'https://login.vrt.be/perform_login',
- None, note='Requesting a token', errnote='Could not get a token',
- headers={}, data=urlencode_postdata(post_data))
+ None, note='Performing login', errnote='perform login failed',
+ headers={}, query={
+ 'client_id': 'vrtnu-site'
+ }, data=urlencode_postdata(post_data))
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
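
The reworked Canvas flow is a two-step token exchange: fetch a vrtnutoken from token.vrt.be/refreshtoken, POST it as identityToken to the v2 /tokens endpoint to get a vrtPlayerToken, then pass that token as a query parameter when requesting the video JSON. A hedged sketch of the same exchange using the third-party requests library (cookie handling and error paths omitted; the video id is a placeholder):

    import requests  # assumed third-party dependency for this sketch

    API = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2'

    session = requests.Session()  # a real run needs valid vrtnu cookies on this session
    vrtnutoken = session.get('https://token.vrt.be/refreshtoken').json()['vrtnutoken']

    player_token = session.post(
        '%s/tokens' % API,
        json={'identityToken': vrtnutoken},  # JSON body, matching the new Content-Type
    ).json()['vrtPlayerToken']

    video = session.get(
        '%s/videos/SOME_VIDEO_ID' % API,  # placeholder id
        params={'vrtPlayerToken': player_token, 'client': 'null'},
    ).json()
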
diff --git a/hypervideo_dl/extractor/carambatv.py b/hypervideo_dl/extractor/carambatv.py
index b57b86a..7e5cc90 100644
--- a/hypervideo_dl/extractor/carambatv.py
+++ b/hypervideo_dl/extractor/carambatv.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
+ format_field,
float_or_none,
int_or_none,
try_get,
@@ -43,7 +44,7 @@ class CarambaTVIE(InfoExtractor):
formats = [{
'url': base_url + f['fn'],
'height': int_or_none(f.get('height')),
- 'format_id': '%sp' % f['height'] if f.get('height') else None,
+ 'format_id': format_field(f, 'height', '%sp'),
} for f in video['qualities'] if f.get('fn')]
self._sort_formats(formats)
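
format_field applies a printf-style template to a dict field only when the field is present, replacing the `'%sp' % f['height'] if f.get('height') else None` idiom. A rough stand-in, assuming only the subset of its signature used here (the real utility has extra parameters):

    def format_field(obj, field, template='%s', default=None):
        # apply `template` to obj[field] when set, else fall back to `default`
        value = obj.get(field)
        return template % value if value is not None else default

    print(format_field({'height': 720}, 'height', '%sp'))  # '720p'
    print(format_field({}, 'height', '%sp'))               # None
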
diff --git a/hypervideo_dl/extractor/cbc.py b/hypervideo_dl/extractor/cbc.py
index 2429521..4892419 100644
--- a/hypervideo_dl/extractor/cbc.py
+++ b/hypervideo_dl/extractor/cbc.py
@@ -2,17 +2,22 @@
from __future__ import unicode_literals
import re
+import json
+import base64
+import time
from .common import InfoExtractor
from ..compat import (
compat_str,
)
from ..utils import (
+ int_or_none,
+ join_nonempty,
js_to_json,
- smuggle_url,
- try_get,
orderedSet,
+ smuggle_url,
strip_or_none,
+ try_get,
ExtractorError,
)
@@ -122,9 +127,9 @@ class CBCIE(InfoExtractor):
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- title = self._og_search_title(webpage, default=None) or self._html_search_meta(
- 'twitter:title', webpage, 'title', default=None) or self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
+ title = (self._og_search_title(webpage, default=None)
+ or self._html_search_meta('twitter:title', webpage, 'title', default=None)
+ or self._html_extract_title(webpage))
entries = [
self._extract_player_init(player_init, display_id)
for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]
@@ -244,37 +249,129 @@ class CBCGemIE(InfoExtractor):
'params': {'format': 'bv'},
'skip': 'Geo-restricted to Canada',
}]
- _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/assets/'
+
+ _GEO_COUNTRIES = ['CA']
+ _TOKEN_API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37'
+ _NETRC_MACHINE = 'cbcgem'
+ _claims_token = None
+
+ def _new_claims_token(self, email, password):
+ data = json.dumps({
+ 'email': email,
+ 'password': password,
+ }).encode()
+ headers = {'content-type': 'application/json'}
+ query = {'apikey': self._TOKEN_API_KEY}
+ resp = self._download_json('https://api.loginradius.com/identity/v2/auth/login',
+ None, data=data, headers=headers, query=query)
+ access_token = resp['access_token']
+
+ query = {
+ 'access_token': access_token,
+ 'apikey': self._TOKEN_API_KEY,
+ 'jwtapp': 'jwt',
+ }
+ resp = self._download_json('https://cloud-api.loginradius.com/sso/jwt/api/token',
+ None, headers=headers, query=query)
+ sig = resp['signature']
+
+ data = json.dumps({'jwt': sig}).encode()
+ headers = {'content-type': 'application/json', 'ott-device-type': 'web'}
+ resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/token',
+ None, data=data, headers=headers)
+ cbc_access_token = resp['accessToken']
+
+ headers = {'content-type': 'application/json', 'ott-device-type': 'web', 'ott-access-token': cbc_access_token}
+ resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/profile',
+ None, headers=headers)
+ return resp['claimsToken']
+
+ def _get_claims_token_expiry(self):
+ # Token is a JWT
+ # JWT is decoded here and 'exp' field is extracted
+ # It is a Unix timestamp for when the token expires
+ b64_data = self._claims_token.split('.')[1]
+ data = base64.urlsafe_b64decode(b64_data + "==")
+ return json.loads(data)['exp']
+
+ def claims_token_expired(self):
+ exp = self._get_claims_token_expiry()
+ if exp - time.time() < 10:
+ # It will expire in less than 10 seconds, or has already expired
+ return True
+ return False
+
+ def claims_token_valid(self):
+ return self._claims_token is not None and not self.claims_token_expired()
+
+ def _get_claims_token(self, email, password):
+ if not self.claims_token_valid():
+ self._claims_token = self._new_claims_token(email, password)
+ self._downloader.cache.store(self._NETRC_MACHINE, 'claims_token', self._claims_token)
+ return self._claims_token
+
+ def _real_initialize(self):
+ if self.claims_token_valid():
+ return
+ self._claims_token = self._downloader.cache.load(self._NETRC_MACHINE, 'claims_token')
+
+ def _find_secret_formats(self, formats, video_id):
+ """ Find a valid video url and convert it to the secret variant """
+ base_format = next((f for f in formats if f.get('vcodec') != 'none'), None)
+ if not base_format:
+ return
+
+ base_url = re.sub(r'(Manifest\(.*?),filter=[\w-]+(.*?\))', r'\1\2', base_format['url'])
+ url = re.sub(r'(Manifest\(.*?),format=[\w-]+(.*?\))', r'\1\2', base_url)
+
+ secret_xml = self._download_xml(url, video_id, note='Downloading secret XML', fatal=False)
+ if not secret_xml:
+ return
+
+ for child in secret_xml:
+ if child.attrib.get('Type') != 'video':
+ continue
+ for video_quality in child:
+ bitrate = int_or_none(video_quality.attrib.get('Bitrate'))
+ if not bitrate or 'Index' not in video_quality.attrib:
+ continue
+ height = int_or_none(video_quality.attrib.get('MaxHeight'))
+
+ yield {
+ **base_format,
+ 'format_id': join_nonempty('sec', height),
+ # Note: \g<1> is necessary instead of \1 since bitrate is a number
+ 'url': re.sub(r'(QualityLevels\()\d+(\))', fr'\g<1>{bitrate}\2', base_url),
+ 'width': int_or_none(video_quality.attrib.get('MaxWidth')),
+ 'tbr': bitrate / 1000.0,
+ 'height': height,
+ }
def _real_extract(self, url):
video_id = self._match_id(url)
- video_info = self._download_json(self._API_BASE + video_id, video_id)
-
- last_error = None
- attempt = -1
- retries = self.get_param('extractor_retries', 15)
- while attempt < retries:
- attempt += 1
- if last_error:
- self.report_warning('%s. Retrying ...' % last_error)
- m3u8_info = self._download_json(
- video_info['playSession']['url'], video_id,
- note='Downloading JSON metadata%s' % f' (attempt {attempt})')
- m3u8_url = m3u8_info.get('url')
- if m3u8_url:
- break
- elif m3u8_info.get('errorCode') == 1:
- self.raise_geo_restricted(countries=['CA'])
- else:
- last_error = f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}'
- # 35 means media unavailable, but retries work
- if m3u8_info.get('errorCode') != 35 or attempt >= retries:
- raise ExtractorError(last_error)
+ video_info = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + video_id, video_id)
+
+ email, password = self._get_login_info()
+ if email and password:
+ claims_token = self._get_claims_token(email, password)
+ headers = {'x-claims-token': claims_token}
+ else:
+ headers = {}
+ m3u8_info = self._download_json(video_info['playSession']['url'], video_id, headers=headers)
+ m3u8_url = m3u8_info.get('url')
+
+ if m3u8_info.get('errorCode') == 1:
+ self.raise_geo_restricted(countries=['CA'])
+ elif m3u8_info.get('errorCode') == 35:
+ self.raise_login_required(method='password')
+ elif m3u8_info.get('errorCode') != 0:
+ raise ExtractorError(f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}')
formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls')
self._remove_duplicate_formats(formats)
+ formats.extend(self._find_secret_formats(formats, video_id))
- for i, format in enumerate(formats):
+ for format in formats:
if format.get('vcodec') == 'none':
if format.get('ext') is None:
format['ext'] = 'm4a'
@@ -328,7 +425,8 @@ class CBCGemPlaylistIE(InfoExtractor):
show = match.group('show')
show_info = self._download_json(self._API_BASE + show, season_id)
season = int(match.group('season'))
- season_info = try_get(show_info, lambda x: x['seasons'][season - 1])
+
+ season_info = next((s for s in show_info['seasons'] if s.get('season') == season), None)
if season_info is None:
raise ExtractorError(f'Couldn\'t find season {season} of {show}')
@@ -377,7 +475,7 @@ class CBCGemPlaylistIE(InfoExtractor):
class CBCGemLiveIE(InfoExtractor):
IE_NAME = 'gem.cbc.ca:live'
- _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>[0-9]{12})'
+ _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>\d+)'
_TEST = {
'url': 'https://gem.cbc.ca/live/920604739687',
'info_dict': {
@@ -396,21 +494,21 @@ class CBCGemLiveIE(InfoExtractor):
# It's unclear where the chars at the end come from, but they appear to be
# constant. Might need updating in the future.
- _API = 'https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT'
+ # There are two URLs, some livestreams are in one, and some
+ # in the other. The JSON schema is the same for both.
+ _API_URLS = ['https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT', 'https://tpfeed.cbc.ca/f/ExhSPC/FNiv9xQx_BnT']
def _real_extract(self, url):
video_id = self._match_id(url)
- live_info = self._download_json(self._API, video_id)['entries']
- video_info = None
- for stream in live_info:
- if stream.get('guid') == video_id:
- video_info = stream
-
- if video_info is None:
- raise ExtractorError(
- 'Couldn\'t find video metadata, maybe this livestream is now offline',
- expected=True)
+ for api_url in self._API_URLS:
+ video_info = next((
+ stream for stream in self._download_json(api_url, video_id)['entries']
+ if stream.get('guid') == video_id), None)
+ if video_info:
+ break
+ else:
+ raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True)
return {
'_type': 'url_transparent',
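
The CBC Gem login above checks claims-token expiry by decoding the JWT payload locally: the payload is the second dot-separated segment, and appending '==' tops up the base64 padding for any segment length. The same trick in isolation, exercised on a toy unsigned token:

    import base64
    import json
    import time

    def jwt_expiry(token):
        # payload is the second '.'-separated segment; '==' tops up the padding
        payload = token.split('.')[1]
        return json.loads(base64.urlsafe_b64decode(payload + '=='))['exp']

    # build a toy, unsigned JWT just to exercise the helper
    claims = base64.urlsafe_b64encode(json.dumps({'exp': 1700000000}).encode()).decode().rstrip('=')
    token = 'eyJhbGciOiJub25lIn0.%s.' % claims
    expiry = jwt_expiry(token)
    print(expiry, expiry - time.time() < 10)  # expired long ago, so True
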
diff --git a/hypervideo_dl/extractor/cbs.py b/hypervideo_dl/extractor/cbs.py
index ae9ce58..2af36ea 100644
--- a/hypervideo_dl/extractor/cbs.py
+++ b/hypervideo_dl/extractor/cbs.py
@@ -77,21 +77,21 @@ class CBSIE(CBSBaseIE):
(?:
cbs:|
https?://(?:www\.)?(?:
- cbs\.com/(?:shows/[^/]+/video|movies/[^/]+)/|
+ cbs\.com/(?:shows|movies)/(?:video|[^/]+/video|[^/]+)/|
colbertlateshow\.com/(?:video|podcasts)/)
)(?P<id>[\w-]+)'''
# All tests are blocked outside US
_TESTS = [{
- 'url': 'https://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
+ 'url': 'https://www.cbs.com/shows/video/xrUyNLtl9wd8D_RWWAg9NU2F_V6QpB3R/',
'info_dict': {
- 'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_',
+ 'id': 'xrUyNLtl9wd8D_RWWAg9NU2F_V6QpB3R',
'ext': 'mp4',
- 'title': 'Connect Chat feat. Garth Brooks',
- 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
- 'duration': 1495,
- 'timestamp': 1385585425,
- 'upload_date': '20131127',
+ 'title': 'Tough As Nails - Dreams Never Die',
+ 'description': 'md5:a3535a62531cdd52b0364248a2c1ae33',
+ 'duration': 2588,
+ 'timestamp': 1639015200,
+ 'upload_date': '20211209',
'uploader': 'CBSI-NEW',
},
'params': {
@@ -99,14 +99,14 @@ class CBSIE(CBSBaseIE):
'skip_download': True,
},
}, {
- 'url': 'https://www.cbs.com/shows/the-late-show-with-stephen-colbert/video/60icOhMb9NcjbcWnF_gub9XXHdeBcNk2/the-late-show-6-23-21-christine-baranski-joy-oladokun-',
+ 'url': 'https://www.cbs.com/shows/video/sZH1MGgomIosZgxGJ1l263MFq16oMtW1/',
'info_dict': {
- 'id': '60icOhMb9NcjbcWnF_gub9XXHdeBcNk2',
- 'title': 'The Late Show - 6/23/21 (Christine Baranski, Joy Oladokun)',
- 'timestamp': 1624507140,
- 'description': 'md5:e01af24e95c74d55e8775aef86117b95',
+ 'id': 'sZH1MGgomIosZgxGJ1l263MFq16oMtW1',
+ 'title': 'The Late Show - 3/16/22 (Michael Buble, Rose Matafeo)',
+ 'timestamp': 1647488100,
+ 'description': 'md5:d0e6ec23c544b7fa8e39a8e6844d2439',
'uploader': 'CBSI-NEW',
- 'upload_date': '20210624',
+ 'upload_date': '20220317',
},
'params': {
'ignore_no_formats_error': True,
diff --git a/hypervideo_dl/extractor/ccma.py b/hypervideo_dl/extractor/ccma.py
index ea98f86..9dbaabf 100644
--- a/hypervideo_dl/extractor/ccma.py
+++ b/hypervideo_dl/extractor/ccma.py
@@ -1,17 +1,14 @@
# coding: utf-8
from __future__ import unicode_literals
-import calendar
-import datetime
-
from .common import InfoExtractor
from ..utils import (
clean_html,
- extract_timezone,
int_or_none,
parse_duration,
parse_resolution,
try_get,
+ unified_timestamp,
url_or_none,
)
@@ -95,14 +92,8 @@ class CCMAIE(InfoExtractor):
duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text'))
tematica = try_get(informacio, lambda x: x['tematica']['text'])
- timestamp = None
data_utc = try_get(informacio, lambda x: x['data_emissio']['utc'])
- try:
- timezone, data_utc = extract_timezone(data_utc)
- timestamp = calendar.timegm((datetime.datetime.strptime(
- data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple())
- except TypeError:
- pass
+ timestamp = unified_timestamp(data_utc)
subtitles = {}
subtitols = media.get('subtitols') or []
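
ccma now delegates to unified_timestamp instead of hand-rolling timezone math with a '%Y-%d-%mT%H:%M:%S' strptime pattern. For an offset-qualified ISO 8601 string, the stdlib shows the intended result:

    from datetime import datetime

    # what the unified parse boils down to for an offset-qualified string
    dt = datetime.fromisoformat('2020-01-01T12:00:00+02:00')
    print(dt.timestamp())  # 1577872800.0, i.e. 10:00 UTC that day
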
diff --git a/hypervideo_dl/extractor/cctv.py b/hypervideo_dl/extractor/cctv.py
index 9b86121..0ed5f32 100644
--- a/hypervideo_dl/extractor/cctv.py
+++ b/hypervideo_dl/extractor/cctv.py
@@ -162,7 +162,8 @@ class CCTVIE(InfoExtractor):
'url': video_url,
'format_id': 'http',
'quality': quality,
- 'source_preference': -10
+ # Sample clip
+ 'preference': -10
})
hls_url = try_get(data, lambda x: x['hls_url'], compat_str)
diff --git a/hypervideo_dl/extractor/ceskatelevize.py b/hypervideo_dl/extractor/ceskatelevize.py
index 5e04d38..ddf66b2 100644
--- a/hypervideo_dl/extractor/ceskatelevize.py
+++ b/hypervideo_dl/extractor/ceskatelevize.py
@@ -12,30 +12,15 @@ from ..utils import (
ExtractorError,
float_or_none,
sanitized_Request,
- unescapeHTML,
- update_url_query,
+ traverse_obj,
urlencode_postdata,
USER_AGENTS,
)
class CeskaTelevizeIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
+ _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady)/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
_TESTS = [{
- 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',
- 'info_dict': {
- 'id': '61924494877246241',
- 'ext': 'mp4',
- 'title': 'Hyde Park Civilizace: Život v Grónsku',
- 'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626',
- 'thumbnail': r're:^https?://.*\.jpg',
- 'duration': 3350,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- }, {
'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en',
'info_dict': {
'id': '61924494877028507',
@@ -66,12 +51,60 @@ class CeskaTelevizeIE(InfoExtractor):
}, {
'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25',
'only_matching': True,
+ }, {
+ # video with 18+ caution trailer
+ 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
+ 'info_dict': {
+ 'id': '215562210900007-bogotart',
+ 'title': 'Queer: Bogotart',
+ 'description': 'Hlavní město Kolumbie v doprovodu queer umělců. Vroucí svět plný vášně, sebevědomí, ale i násilí a bolesti. Připravil Peter Serge Butko',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '61924494877311053',
+ 'ext': 'mp4',
+ 'title': 'Queer: Bogotart (Varování 18+)',
+ 'duration': 11.9,
+ },
+ }, {
+ 'info_dict': {
+ 'id': '61924494877068022',
+ 'ext': 'mp4',
+ 'title': 'Queer: Bogotart (Queer)',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 1558.3,
+ },
+ }],
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ # iframe embed
+ 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
playlist_id = self._match_id(url)
-
+ parsed_url = compat_urllib_parse_urlparse(url)
webpage = self._download_webpage(url, playlist_id)
+ site_name = self._og_search_property('site_name', webpage, fatal=False, default=None)
+ playlist_title = self._og_search_title(webpage, default=None)
+ if site_name and playlist_title:
+ playlist_title = playlist_title.replace(f' — {site_name}', '', 1)
+ playlist_description = self._og_search_description(webpage, default=None)
+ if playlist_description:
+ playlist_description = playlist_description.replace('\xa0', ' ')
+
+ if parsed_url.path.startswith('/porady/'):
+ next_data = self._search_nextjs_data(webpage, playlist_id)
+ idec = traverse_obj(next_data, ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'), get_all=False)
+ if not idec:
+ raise ExtractorError('Failed to find IDEC id')
+ iframe_hash = self._download_webpage('https://www.ceskatelevize.cz/v-api/iframe-hash/', playlist_id)
+ webpage = self._download_webpage('https://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php', playlist_id,
+ query={'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', 'IDEC': idec})
NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
@@ -100,7 +133,7 @@ class CeskaTelevizeIE(InfoExtractor):
data = {
'playlist[0][type]': type_,
'playlist[0][id]': episode_id,
- 'requestUrl': compat_urllib_parse_urlparse(url).path,
+ 'requestUrl': parsed_url.path,
'requestSource': 'iVysilani',
}
@@ -108,7 +141,7 @@ class CeskaTelevizeIE(InfoExtractor):
for user_agent in (None, USER_AGENTS['Safari']):
req = sanitized_Request(
- 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
+ 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/',
data=urlencode_postdata(data))
req.add_header('Content-type', 'application/x-www-form-urlencoded')
@@ -130,9 +163,6 @@ class CeskaTelevizeIE(InfoExtractor):
req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
req.add_header('Referer', url)
- playlist_title = self._og_search_title(webpage, default=None)
- playlist_description = self._og_search_description(webpage, default=None)
-
playlist = self._download_json(req, playlist_id, fatal=False)
if not playlist:
continue
@@ -147,6 +177,7 @@ class CeskaTelevizeIE(InfoExtractor):
is_live = item.get('type') == 'LIVE'
formats = []
for format_id, stream_url in item.get('streamUrls', {}).items():
+ stream_url = stream_url.replace('https://', 'http://')
if 'playerType=flash' in stream_url:
stream_formats = self._extract_m3u8_formats(
stream_url, playlist_id, 'mp4', 'm3u8_native',
@@ -182,8 +213,6 @@ class CeskaTelevizeIE(InfoExtractor):
if playlist_len == 1:
final_title = playlist_title or title
- if is_live:
- final_title = self._live_title(final_title)
else:
final_title = '%s (%s)' % (playlist_title, title)
@@ -237,54 +266,3 @@ class CeskaTelevizeIE(InfoExtractor):
yield line
return '\r\n'.join(_fix_subtitle(subtitles))
-
-
-class CeskaTelevizePoradyIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
- _TESTS = [{
- # video with 18+ caution trailer
- 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
- 'info_dict': {
- 'id': '215562210900007-bogotart',
- 'title': 'Queer: Bogotart',
- 'description': 'Alternativní průvodce současným queer světem',
- },
- 'playlist': [{
- 'info_dict': {
- 'id': '61924494876844842',
- 'ext': 'mp4',
- 'title': 'Queer: Bogotart (Varování 18+)',
- 'duration': 10.2,
- },
- }, {
- 'info_dict': {
- 'id': '61924494877068022',
- 'ext': 'mp4',
- 'title': 'Queer: Bogotart (Queer)',
- 'thumbnail': r're:^https?://.*\.jpg',
- 'duration': 1558.3,
- },
- }],
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- }, {
- # iframe embed
- 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
-
- data_url = update_url_query(unescapeHTML(self._search_regex(
- (r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
- r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'),
- webpage, 'iframe player url', group='url')), query={
- 'autoStart': 'true',
- })
-
- return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key())
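
The new /porady/ branch above leans on _search_nextjs_data, added to common.py later in this diff; it is a regex for the __NEXT_DATA__ script tag followed by a JSON parse. A self-contained sketch using the same pattern (the embedded JSON here is a made-up miniature of the real page state):

    import json
    import re

    html = '''<script id="__NEXT_DATA__" type="application/json">
    {"props": {"pageProps": {"data": {"mediaMeta": {"idec": "217 562 22150/0004"}}}}}
    </script>'''

    next_data = json.loads(re.search(
        r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
        html).group(1))
    print(next_data['props']['pageProps']['data']['mediaMeta']['idec'])
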
diff --git a/hypervideo_dl/extractor/chaturbate.py b/hypervideo_dl/extractor/chaturbate.py
index a459dcb..8da51f9 100644
--- a/hypervideo_dl/extractor/chaturbate.py
+++ b/hypervideo_dl/extractor/chaturbate.py
@@ -101,7 +101,7 @@ class ChaturbateIE(InfoExtractor):
return {
'id': video_id,
- 'title': self._live_title(video_id),
+ 'title': video_id,
'thumbnail': 'https://roomimg.stream.highwebmedia.com/ri/%s.jpg' % video_id,
'age_limit': self._rta_search(webpage),
'is_live': True,
diff --git a/hypervideo_dl/extractor/chingari.py b/hypervideo_dl/extractor/chingari.py
index 6bdc4f6..e6841fb 100644
--- a/hypervideo_dl/extractor/chingari.py
+++ b/hypervideo_dl/extractor/chingari.py
@@ -67,7 +67,7 @@ class ChingariBaseIE(InfoExtractor):
class ChingariIE(ChingariBaseIE):
- _VALID_URL = r'(?:https?://)(?:www\.)?chingari\.io/share/post\?id=(?P<id>[^&/#?]+)'
+ _VALID_URL = r'https?://(?:www\.)?chingari\.io/share/post\?id=(?P<id>[^&/#?]+)'
_TESTS = [{
'url': 'https://chingari.io/share/post?id=612f8f4ce1dc57090e8a7beb',
'info_dict': {
@@ -102,7 +102,7 @@ class ChingariIE(ChingariBaseIE):
class ChingariUserIE(ChingariBaseIE):
- _VALID_URL = r'(?:https?://)(?:www\.)?chingari\.io/(?!share/post)(?P<id>[^/?]+)'
+ _VALID_URL = r'https?://(?:www\.)?chingari\.io/(?!share/post)(?P<id>[^/?]+)'
_TESTS = [{
'url': 'https://chingari.io/dada1023',
'playlist_mincount': 3,
diff --git a/hypervideo_dl/extractor/closertotruth.py b/hypervideo_dl/extractor/closertotruth.py
index 26243d5..517e121 100644
--- a/hypervideo_dl/extractor/closertotruth.py
+++ b/hypervideo_dl/extractor/closertotruth.py
@@ -54,8 +54,7 @@ class CloserToTruthIE(InfoExtractor):
r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)',
webpage, 'kaltura partner_id')
- title = self._search_regex(
- r'<title>(.+?)\s*\|\s*.+?</title>', webpage, 'video title')
+ title = self._html_extract_title(webpage, 'video title')
select = self._search_regex(
r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',
diff --git a/hypervideo_dl/extractor/common.py b/hypervideo_dl/extractor/common.py
index df74c75..0035191 100644
--- a/hypervideo_dl/extractor/common.py
+++ b/hypervideo_dl/extractor/common.py
@@ -2,7 +2,7 @@
from __future__ import unicode_literals
import base64
-import datetime
+import collections
import hashlib
import itertools
import json
@@ -45,15 +45,18 @@ from ..utils import (
determine_ext,
determine_protocol,
dict_get,
+ encode_data_uri,
error_to_compat_str,
extract_attributes,
ExtractorError,
+ filter_dict,
fix_xml_ampersands,
float_or_none,
format_field,
GeoRestrictedError,
GeoUtils,
int_or_none,
+ join_nonempty,
js_to_json,
JSON_LD_RE,
mimetype2ext,
@@ -73,7 +76,9 @@ from ..utils import (
str_to_int,
strip_or_none,
traverse_obj,
+ try_get,
unescapeHTML,
+ UnsupportedError,
unified_strdate,
unified_timestamp,
update_Request,
@@ -134,6 +139,8 @@ class InfoExtractor(object):
for HDS - URL of the F4M manifest,
for DASH - URL of the MPD manifest,
for MSS - URL of the ISM manifest.
+ * manifest_stream_number (For internal use only)
+ The index of the stream in the manifest file
* ext Will be calculated from URL if missing
* format A human-readable description of the format
("mp4 container with h264/opus").
@@ -161,9 +168,8 @@ class InfoExtractor(object):
* filesize_approx An estimate for the number of bytes
* player_url SWF Player URL (used for rtmpdump).
* protocol The protocol that will be used for the actual
- download, lower-case.
- "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
- "m3u8", "m3u8_native" or "http_dash_segments".
+ download, lower-case. One of "http", "https" or
+ one of the protocols defined in downloader.PROTOCOL_MAP
* fragment_base_url
Base URL for fragments. Each fragment's path
value (if present) will be relative to
@@ -179,6 +185,8 @@ class InfoExtractor(object):
fragment_base_url
* "duration" (optional, int or float)
* "filesize" (optional, int)
+ * is_from_start Is a live format that can be downloaded
+ from the start. Boolean
* preference Order number of this format. If this field is
present and not None, the formats get sorted
by this field, regardless of all other values.
@@ -209,7 +217,7 @@ class InfoExtractor(object):
(HTTP or RTMP) download. Boolean.
* has_drm The format has DRM and cannot be downloaded. Boolean
* downloader_options A dictionary of downloader options as
- described in FileDownloader
+ described in FileDownloader (For internal use only)
RTMP formats can also have the additional fields: page_url,
app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
rtmp_protocol, rtmp_real_time
@@ -221,6 +229,7 @@ class InfoExtractor(object):
The following fields are optional:
+ direct: True if a direct video file was given (must only be set by GenericIE)
alt_title: A secondary title of the video.
display_id An alternative identifier for the video, not necessarily
unique, but available before title. Typically, id is
@@ -235,16 +244,22 @@ class InfoExtractor(object):
* "resolution" (optional, string "{width}x{height}",
deprecated)
* "filesize" (optional, int)
+ * "http_headers" (dict) - HTTP headers for the request
thumbnail: Full URL to a video thumbnail image.
description: Full video description.
uploader: Full name of the video uploader.
license: License name the video is licensed under.
creator: The creator of the video.
- release_timestamp: UNIX timestamp of the moment the video was released.
- release_date: The date (YYYYMMDD) when the video was released.
timestamp: UNIX timestamp of the moment the video was uploaded
- upload_date: Video upload date (YYYYMMDD).
- If not explicitly set, calculated from timestamp.
+ upload_date: Video upload date in UTC (YYYYMMDD).
+ If not explicitly set, calculated from timestamp
+ release_timestamp: UNIX timestamp of the moment the video was released.
+ If it is not clear whether to use timestamp or this, use the former
+ release_date: The date (YYYYMMDD) when the video was released in UTC.
+ If not explicitly set, calculated from release_timestamp
+ modified_timestamp: UNIX timestamp of the moment the video was last modified.
+ modified_date: The date (YYYYMMDD) when the video was last modified in UTC.
+ If not explicitly set, calculated from modified_timestamp
uploader_id: Nickname or id of the video uploader.
uploader_url: Full URL to a personal webpage of the video uploader.
channel: Full name of the channel the video is uploaded on.
@@ -252,6 +267,7 @@ class InfoExtractor(object):
fields. This depends on a particular extractor.
channel_id: Id of the channel.
channel_url: Full URL to a channel webpage.
+ channel_follower_count: Number of followers of the channel.
location: Physical location where the video was filmed.
subtitles: The available subtitles as a dictionary in the format
{tag: subformats}. "tag" is usually a language code, and
@@ -262,6 +278,8 @@ class InfoExtractor(object):
* "url": A URL pointing to the subtitles file
It can optionally also have:
* "name": Name or description of the subtitles
+ * "http_headers": A dictionary of additional HTTP headers
+ to add to the request.
"ext" will be calculated from URL if missing
automatic_captions: Like 'subtitles'; contains automatically generated
captions instead of normal subtitles
@@ -340,6 +358,7 @@ class InfoExtractor(object):
series, programme or podcast:
series: Title of the series or programme the video episode belongs to.
+ series_id: Id of the series or programme the video episode belongs to, as a unicode string.
season: Title of the season the video episode belongs to.
season_number: Number of the season the video episode belongs to, as an integer.
season_id: Id of the season the video episode belongs to, as a unicode string.
@@ -366,6 +385,7 @@ class InfoExtractor(object):
disc_number: Number of the disc or other physical medium the track belongs to,
as an integer.
release_year: Year (YYYY) when the album was released.
+ composer: Composer of the piece
Unless mentioned otherwise, the fields should be Unicode strings.
@@ -379,6 +399,11 @@ class InfoExtractor(object):
Additionally, playlists can have "id", "title", and any other relevant
attributes with the same semantics as videos (see above).
+ It can also have the following optional fields:
+
+ playlist_count: The total number of videos in a playlist. If not given,
+ YoutubeDL tries to calculate it from "entries"
+
_type "multi_video" indicates that there are multiple videos that
form a single show, for example multiple acts of an opera or TV episode.
@@ -404,13 +429,21 @@ class InfoExtractor(object):
title, description etc.
- Subclasses of this one should re-define the _real_initialize() and
- _real_extract() methods and define a _VALID_URL regexp.
+ Subclasses of this should define a _VALID_URL regexp and re-define the
+ _real_extract() and (optionally) _real_initialize() methods.
Probably, they should also be added to the list of extractors.
Subclasses may also override suitable() if necessary, but ensure the function
signature is preserved and that this function imports everything it needs
- (except other extractors), so that lazy_extractors works correctly
+ (except other extractors), so that lazy_extractors works correctly.
+
+ To support username + password (or netrc) login, the extractor must define a
+ _NETRC_MACHINE and re-define _perform_login(username, password) and
+ (optionally) _initialize_pre_login() methods. The _perform_login method will
+ be called between _initialize_pre_login and _real_initialize if credentials
+ are passed by the user. In cases where it is necessary to have the login
+ process as part of the extraction rather than initialization, _perform_login
+ can be left undefined.
_GEO_BYPASS attribute may be set to False in order to disable
geo restriction bypass mechanisms for a particular extractor.
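
The updated docstring defines the new subclassing contract: a _VALID_URL pattern, a mandatory _real_extract (the base method now raises NotImplementedError), and, for credential support, a _NETRC_MACHINE plus _perform_login, which initialize() invokes between _initialize_pre_login and _real_initialize when credentials are given. A skeletal, entirely hypothetical extractor following that contract (endpoint and login payload are invented):

    from .common import InfoExtractor  # lives inside the hypervideo_dl/extractor package


    class ExampleIE(InfoExtractor):   # hypothetical extractor, for illustration only
        _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>\d+)'
        _NETRC_MACHINE = 'example'    # enables --username/--password/--netrc support

        def _initialize_pre_login(self):
            self._session_token = None      # runs before _perform_login

        def _perform_login(self, username, password):
            # called by initialize() only when credentials were actually provided
            self._session_token = self._download_json(
                'https://example.com/api/login', None,  # invented endpoint
                data=f'{username}:{password}'.encode())

        def _real_extract(self, url):
            video_id = self._match_id(url)
            webpage = self._download_webpage(url, video_id)
            return {
                'id': video_id,
                'title': self._html_extract_title(webpage),
                'url': self._og_search_video_url(webpage),
            }
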
@@ -438,17 +471,21 @@ class InfoExtractor(object):
_GEO_COUNTRIES = None
_GEO_IP_BLOCKS = None
_WORKING = True
+ _NETRC_MACHINE = None
+ IE_DESC = None
_LOGIN_HINTS = {
- 'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
+ 'any': 'Use --cookies, --cookies-from-browser, --username and --password, or --netrc to provide account credentials',
'cookies': (
'Use --cookies-from-browser or --cookies for the authentication. '
'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
- 'password': 'Use --username and --password or --netrc to provide account credentials',
+ 'password': 'Use --username and --password, or --netrc to provide account credentials',
}
def __init__(self, downloader=None):
- """Constructor. Receives an optional downloader."""
+ """Constructor. Receives an optional downloader (a YoutubeDL instance).
+ If a downloader is not passed during initialization,
+ it must be set using "set_downloader()" before "extract()" is called"""
self._ready = False
self._x_forwarded_for_ip = None
self._printed_messages = set()
@@ -460,6 +497,8 @@ class InfoExtractor(object):
# we have cached the regexp for *this* class, whereas getattr would also
# match the superclass
if '_VALID_URL_RE' not in cls.__dict__:
+ if '_VALID_URL' not in cls.__dict__:
+ cls._VALID_URL = cls._make_valid_url()
cls._VALID_URL_RE = re.compile(cls._VALID_URL)
return cls._VALID_URL_RE.match(url)
@@ -486,6 +525,10 @@ class InfoExtractor(object):
"""Getter method for _WORKING."""
return cls._WORKING
+ @classmethod
+ def supports_login(cls):
+ return bool(cls._NETRC_MACHINE)
+
def initialize(self):
"""Initializes an instance (authentication, etc)."""
self._printed_messages = set()
@@ -494,6 +537,13 @@ class InfoExtractor(object):
'ip_blocks': self._GEO_IP_BLOCKS,
})
if not self._ready:
+ self._initialize_pre_login()
+ if self.supports_login():
+ username, password = self._get_login_info()
+ if username:
+ self._perform_login(username, password)
+ elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
+ self.report_warning(f'Login with password is not supported for this website. {self._LOGIN_HINTS["cookies"]}')
self._real_initialize()
self._ready = True
@@ -602,10 +652,19 @@ class InfoExtractor(object):
if self.__maybe_fake_ip_and_retry(e.countries):
continue
raise
+ except UnsupportedError:
+ raise
except ExtractorError as e:
- video_id = e.video_id or self.get_temp_id(url)
- raise ExtractorError(
- e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause)
+ kwargs = {
+ 'video_id': e.video_id or self.get_temp_id(url),
+ 'ie': self.IE_NAME,
+ 'tb': e.traceback or sys.exc_info()[2],
+ 'expected': e.expected,
+ 'cause': e.cause
+ }
+ if hasattr(e, 'countries'):
+ kwargs['countries'] = e.countries
+ raise type(e)(e.orig_msg, **kwargs)
except compat_http_client.IncompleteRead as e:
raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
except (KeyError, StopIteration) as e:
@@ -627,16 +686,24 @@ class InfoExtractor(object):
return False
def set_downloader(self, downloader):
- """Sets the downloader for this IE."""
+ """Sets a YoutubeDL instance as the downloader for this IE."""
self._downloader = downloader
+ def _initialize_pre_login(self):
+ """ Intialization before login. Redefine in subclasses."""
+ pass
+
+ def _perform_login(self, username, password):
+ """ Login with username and password. Redefine in subclasses."""
+ pass
+
def _real_initialize(self):
"""Real initialization process. Redefine in subclasses."""
pass
def _real_extract(self, url):
"""Real extraction process. Redefine in subclasses."""
- pass
+ raise NotImplementedError('This method must be implemented by subclasses')
@classmethod
def ie_key(cls):
@@ -664,7 +731,7 @@ class InfoExtractor(object):
See _download_webpage docstring for arguments specification.
"""
if not self._downloader._first_webpage_request:
- sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
+ sleep_interval = self.get_param('sleep_interval_requests') or 0
if sleep_interval > 0:
self.to_screen('Sleeping %s seconds ...' % sleep_interval)
time.sleep(sleep_interval)
@@ -715,7 +782,7 @@ class InfoExtractor(object):
errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
if fatal:
- raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
+ raise ExtractorError(errmsg, cause=err)
else:
self.report_warning(errmsg)
return False
@@ -970,7 +1037,7 @@ class InfoExtractor(object):
if transform_source:
json_string = transform_source(json_string)
try:
- return json.loads(json_string)
+ return json.loads(json_string, strict=False)
except ValueError as ve:
errmsg = '%s: Failed to parse JSON ' % video_id
if fatal:
@@ -1063,23 +1130,30 @@ class InfoExtractor(object):
def raise_login_required(
self, msg='This video is only available for registered users',
- metadata_available=False, method='any'):
- if metadata_available and self.get_param('ignore_no_formats_error'):
+ metadata_available=False, method=NO_DEFAULT):
+ if metadata_available and (
+ self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
self.report_warning(msg)
+ return
+ if method is NO_DEFAULT:
+ method = 'any' if self.supports_login() else 'cookies'
if method is not None:
+ assert method in self._LOGIN_HINTS, 'Invalid login method'
msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
raise ExtractorError(msg, expected=True)
def raise_geo_restricted(
self, msg='This video is not available from your location due to geo restriction',
countries=None, metadata_available=False):
- if metadata_available and self.get_param('ignore_no_formats_error'):
+ if metadata_available and (
+ self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
self.report_warning(msg)
else:
raise GeoRestrictedError(msg, countries=countries)
def raise_no_formats(self, msg, expected=False, video_id=None):
- if expected and self.get_param('ignore_no_formats_error'):
+ if expected and (
+ self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
self.report_warning(msg, video_id)
elif isinstance(msg, ExtractorError):
raise msg
@@ -1088,39 +1162,39 @@ class InfoExtractor(object):
# Methods for following #608
@staticmethod
- def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
+ def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
"""Returns a URL that points to a page that should be processed"""
- # TODO: ie should be the class used for getting the info
- video_info = {'_type': 'url',
- 'url': url,
- 'ie_key': ie}
- video_info.update(kwargs)
+ if ie is not None:
+ kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
if video_id is not None:
- video_info['id'] = video_id
+ kwargs['id'] = video_id
if video_title is not None:
- video_info['title'] = video_title
- return video_info
+ kwargs['title'] = video_title
+ return {
+ **kwargs,
+ '_type': 'url_transparent' if url_transparent else 'url',
+ 'url': url,
+ }
- def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
- urls = orderedSet(
- self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
- for m in matches)
- return self.playlist_result(
- urls, playlist_id=playlist_id, playlist_title=playlist_title)
+ def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
+ urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
+ for m in orderedSet(map(getter, matches) if getter else matches))
+ return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
@staticmethod
- def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
+ def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
"""Returns a playlist"""
- video_info = {'_type': 'playlist',
- 'entries': entries}
- video_info.update(kwargs)
if playlist_id:
- video_info['id'] = playlist_id
+ kwargs['id'] = playlist_id
if playlist_title:
- video_info['title'] = playlist_title
+ kwargs['title'] = playlist_title
if playlist_description is not None:
- video_info['description'] = playlist_description
- return video_info
+ kwargs['description'] = playlist_description
+ return {
+ **kwargs,
+ '_type': 'multi_video' if multi_video else 'playlist',
+ 'entries': entries,
+ }
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
"""
@@ -1137,7 +1211,7 @@ class InfoExtractor(object):
if mobj:
break
- _name = self._downloader._color_text(name, 'blue')
+ _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
if mobj:
if group is None:
@@ -1225,8 +1299,8 @@ class InfoExtractor(object):
@staticmethod
def _og_regexes(prop):
content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
- property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
- % {'prop': re.escape(prop)})
+ property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
+ % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
template = r'<meta[^>]+?%s[^>]+?%s'
return [
template % (property_re, content_re),
@@ -1257,8 +1331,8 @@ class InfoExtractor(object):
def _og_search_description(self, html, **kargs):
return self._og_search_property('description', html, fatal=False, **kargs)
- def _og_search_title(self, html, **kargs):
- return self._og_search_property('title', html, **kargs)
+ def _og_search_title(self, html, *, fatal=False, **kargs):
+ return self._og_search_property('title', html, fatal=fatal, **kargs)
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
regexes = self._og_regexes('video') + self._og_regexes('video:url')
@@ -1269,6 +1343,9 @@ class InfoExtractor(object):
def _og_search_url(self, html, **kargs):
return self._og_search_property('url', html, **kargs)
+ def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
+ return self._html_search_regex(r'(?s)<title>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
+
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
name = variadic(name)
if display_name is None:
@@ -1409,6 +1486,23 @@ class InfoExtractor(object):
continue
info[count_key] = interaction_count
+ def extract_chapter_information(e):
+ chapters = [{
+ 'title': part.get('name'),
+ 'start_time': part.get('startOffset'),
+ 'end_time': part.get('endOffset'),
+ } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
+ for idx, (last_c, current_c, next_c) in enumerate(zip(
+ [{'end_time': 0}] + chapters, chapters, chapters[1:])):
+ current_c['end_time'] = current_c['end_time'] or next_c['start_time']
+ current_c['start_time'] = current_c['start_time'] or last_c['end_time']
+ if None in current_c.values():
+ self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
+ return
+ if chapters:
+ chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
+ info['chapters'] = chapters
+
def extract_video_object(e):
assert e['@type'] == 'VideoObject'
author = e.get('author')
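
extract_chapter_information fills gaps by borrowing a missing start_time from the previous chapter's end and a missing end_time from the next chapter's start, with the last end time falling back to the video duration. The same neighbour-filling pass on toy data:

    parts = [
        {'title': 'Intro', 'start_time': 0, 'end_time': None},
        {'title': 'Topic', 'start_time': 90, 'end_time': None},
    ]
    duration = 600

    chapters = [dict(p) for p in parts]
    for last_c, cur_c, nxt_c in zip([{'end_time': 0}] + chapters, chapters, chapters[1:]):
        cur_c['end_time'] = cur_c['end_time'] or nxt_c['start_time']
        cur_c['start_time'] = cur_c['start_time'] or last_c['end_time']
    chapters[-1]['end_time'] = chapters[-1]['end_time'] or duration
    print(chapters)
    # [{'title': 'Intro', 'start_time': 0, 'end_time': 90},
    #  {'title': 'Topic', 'start_time': 90, 'end_time': 600}]
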
@@ -1416,7 +1510,8 @@ class InfoExtractor(object):
'url': url_or_none(e.get('contentUrl')),
'title': unescapeHTML(e.get('name')),
'description': unescapeHTML(e.get('description')),
- 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
+ 'thumbnails': [{'url': url_or_none(url)}
+ for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
'duration': parse_duration(e.get('duration')),
'timestamp': unified_timestamp(e.get('uploadDate')),
# author can be an instance of 'Organization' or 'Person' types.
@@ -1431,12 +1526,21 @@ class InfoExtractor(object):
'view_count': int_or_none(e.get('interactionCount')),
})
extract_interaction_statistic(e)
+ extract_chapter_information(e)
- for e in json_ld:
- if '@context' in e:
+ def traverse_json_ld(json_ld, at_top_level=True):
+ for e in json_ld:
+ if at_top_level and '@context' not in e:
+ continue
+ if at_top_level and set(e.keys()) == {'@context', '@graph'}:
+ traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
+ break
item_type = e.get('@type')
if expected_type is not None and expected_type != item_type:
continue
+ rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
+ if rating is not None:
+ info['average_rating'] = rating
if item_type in ('TVEpisode', 'Episode'):
episode_name = unescapeHTML(e.get('name'))
info.update({
@@ -1466,8 +1570,10 @@ class InfoExtractor(object):
info.update({
'timestamp': parse_iso8601(e.get('datePublished')),
'title': unescapeHTML(e.get('headline')),
- 'description': unescapeHTML(e.get('articleBody')),
+ 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
})
+ if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
+ extract_video_object(e['video'][0])
elif item_type == 'VideoObject':
extract_video_object(e)
if expected_type is None:
@@ -1481,7 +1587,34 @@ class InfoExtractor(object):
continue
else:
break
- return dict((k, v) for k, v in info.items() if v is not None)
+ traverse_json_ld(json_ld)
+
+ return filter_dict(info)
+
+ def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
+ return self._parse_json(
+ self._search_regex(
+ r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
+ webpage, 'next.js data', fatal=fatal, **kw),
+ video_id, transform_source=transform_source, fatal=fatal)
+
+ def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
+ ''' Parses Nuxt.js metadata. This works as long as the function whose result is assigned to __NUXT__ is a pure function. '''
+ # not all websites do this, but it can be changed
+ # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
+ rectx = re.escape(context_name)
+ js, arg_keys, arg_vals = self._search_regex(
+ (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
+ r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
+ webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
+
+ args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
+
+ for key, val in args.items():
+ if val in ('undefined', 'void 0'):
+ args[key] = 'null'
+
+ return self._parse_json(js_to_json(js, args), video_id)['data'][0]
@staticmethod
def _hidden_inputs(html):
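
_search_nuxt_data targets pages that assign window.__NUXT__ from an immediately-invoked function, zipping the parameter names against the literal arguments so js_to_json can substitute them; hence the purity caveat in the docstring. A toy version of the unwrapping (with a crude textual substitution standing in for js_to_json):

    import re

    html = '<script>window.__NUXT__=(function(a,b){return {"x":a,"y":b}}(1,"two"));</script>'

    m = re.search(
        r'window\.__NUXT__=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);',
        html)
    args = dict(zip(m.group('arg_keys').split(','), m.group('arg_vals').split(',')))
    js = m.group('js')
    for key, val in args.items():
        js = re.sub(r'\b%s\b' % key, val, js)  # crude substitution; js_to_json does this properly
    print(js)  # {"x":1,"y":"two"}
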
@@ -1510,20 +1643,20 @@ class InfoExtractor(object):
default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
- 'proto', 'ext', 'hasaud', 'source', 'format_id') # These must not be aliases
+ 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
'height', 'width', 'proto', 'vext', 'abr', 'aext',
- 'fps', 'fs_approx', 'source', 'format_id')
+ 'fps', 'fs_approx', 'source', 'id')
settings = {
'vcodec': {'type': 'ordered', 'regex': True,
'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
'acodec': {'type': 'ordered', 'regex': True,
- 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
+ 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
- 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
+ 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
'vext': {'type': 'ordered', 'field': 'video_ext',
'order': ('mp4', 'webm', 'flv', '', 'none'),
'order_free': ('webm', 'mp4', 'flv', '', 'none')},
@@ -1537,8 +1670,8 @@ class InfoExtractor(object):
'ie_pref': {'priority': True, 'type': 'extractor'},
'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
- 'lang': {'convert': 'ignore', 'field': 'language_preference'},
- 'quality': {'convert': 'float_none', 'default': -1},
+ 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
+ 'quality': {'convert': 'float', 'default': -1},
'filesize': {'convert': 'bytes'},
'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
'id': {'convert': 'string', 'field': 'format_id'},
@@ -1549,7 +1682,7 @@ class InfoExtractor(object):
'vbr': {'convert': 'float_none'},
'abr': {'convert': 'float_none'},
'asr': {'convert': 'float_none'},
- 'source': {'convert': 'ignore', 'field': 'source_preference'},
+ 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
@@ -1558,39 +1691,51 @@ class InfoExtractor(object):
'res': {'type': 'multiple', 'field': ('height', 'width'),
'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
- # Most of these exist only for compatibility reasons
- 'dimension': {'type': 'alias', 'field': 'res'},
- 'resolution': {'type': 'alias', 'field': 'res'},
- 'extension': {'type': 'alias', 'field': 'ext'},
- 'bitrate': {'type': 'alias', 'field': 'br'},
- 'total_bitrate': {'type': 'alias', 'field': 'tbr'},
- 'video_bitrate': {'type': 'alias', 'field': 'vbr'},
- 'audio_bitrate': {'type': 'alias', 'field': 'abr'},
- 'framerate': {'type': 'alias', 'field': 'fps'},
- 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists
- 'protocol': {'type': 'alias', 'field': 'proto'},
+ # For compatibility with youtube-dl
+ 'format_id': {'type': 'alias', 'field': 'id'},
+ 'preference': {'type': 'alias', 'field': 'ie_pref'},
+ 'language_preference': {'type': 'alias', 'field': 'lang'},
'source_preference': {'type': 'alias', 'field': 'source'},
+ 'protocol': {'type': 'alias', 'field': 'proto'},
'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
- 'filesize_estimate': {'type': 'alias', 'field': 'size'},
- 'samplerate': {'type': 'alias', 'field': 'asr'},
- 'video_ext': {'type': 'alias', 'field': 'vext'},
- 'audio_ext': {'type': 'alias', 'field': 'aext'},
- 'video_codec': {'type': 'alias', 'field': 'vcodec'},
- 'audio_codec': {'type': 'alias', 'field': 'acodec'},
- 'video': {'type': 'alias', 'field': 'hasvid'},
- 'has_video': {'type': 'alias', 'field': 'hasvid'},
- 'audio': {'type': 'alias', 'field': 'hasaud'},
- 'has_audio': {'type': 'alias', 'field': 'hasaud'},
- 'extractor': {'type': 'alias', 'field': 'ie_pref'},
- 'preference': {'type': 'alias', 'field': 'ie_pref'},
- 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
- 'format_id': {'type': 'alias', 'field': 'id'},
+
+ # Deprecated
+ 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
+ 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
+ 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
+ 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
+ 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
+ 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
+ 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
+ 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
+ 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
+ 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
+ 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
+ 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
+ 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
+ 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
+ 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
+ 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
+ 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
+ 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
+ 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
+ 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
}
- _order = []
+ def __init__(self, ie, field_preference):
+ self._order = []
+ self.ydl = ie._downloader
+ self.evaluate_params(self.ydl.params, field_preference)
+ if ie.get_param('verbose'):
+ self.print_verbose_info(self.ydl.write_debug)
def _get_field_setting(self, field, key):
if field not in self.settings:
+ if key in ('forced', 'priority'):
+ return False
+ self.ydl.deprecation_warning(
+ f'Using arbitrary fields ({field}) for format sorting is deprecated '
+ 'and may be removed in a future version')
self.settings[field] = {}
propObj = self.settings[field]
if key not in propObj:
@@ -1673,7 +1818,11 @@ class InfoExtractor(object):
if field is None:
continue
if self._get_field_setting(field, 'type') == 'alias':
- field = self._get_field_setting(field, 'field')
+ alias, field = field, self._get_field_setting(field, 'field')
+ if self._get_field_setting(alias, 'deprecated'):
+ self.ydl.deprecation_warning(
+ f'Format sorting alias {alias} is deprecated '
+ f'and may be removed in a future version. Please use {field} instead')
reverse = match.group('reverse') is not None
closest = match.group('separator') == '~'
limit_text = match.group('limit')
@@ -1777,10 +1926,7 @@ class InfoExtractor(object):
def _sort_formats(self, formats, field_preference=[]):
if not formats:
return
- format_sort = self.FormatSort() # params and to_screen are taken from the downloader
- format_sort.evaluate_params(self._downloader.params, field_preference)
- if self.get_param('verbose', False):
- format_sort.print_verbose_info(self._downloader.write_debug)
+ format_sort = self.FormatSort(self, field_preference)
formats.sort(key=lambda f: format_sort.calculate_preference(f))
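# A minimal sketch of what the alias deprecation above means for callers,
# assuming the yt-dlp-style 'format_sort' option that hypervideo inherits;
# the URL is a placeholder:
from hypervideo_dl import YoutubeDL

with YoutubeDL({'format_sort': ['video_bitrate', 'res']}) as ydl:
    # expected warning: "Format sorting alias video_bitrate is deprecated
    # and may be removed in a future version. Please use vbr instead"
    ydl.download(['https://example.com/some-video'])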
def _check_formats(self, formats, video_id):
@@ -1899,7 +2045,7 @@ class InfoExtractor(object):
tbr = int_or_none(media_el.attrib.get('bitrate'))
width = int_or_none(media_el.attrib.get('width'))
height = int_or_none(media_el.attrib.get('height'))
- format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
+ format_id = join_nonempty(f4m_id, tbr or i)
# If <bootstrapInfo> is present, the specified f4m is a
# stream-level manifest, and only set-level manifests may refer to
# external resources. See section 11.4 and section 4 of F4M spec
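# Several hunks below swap the "'-'.join(filter(None, ...))" idiom for
# join_nonempty. A minimal sketch of that helper, assuming the signature
# in hypervideo_dl/utils.py:
def join_nonempty(*values, delim='-'):
    # stringify and join only the truthy values
    return delim.join(str(v) for v in values if v)

assert join_nonempty('f4m', None) == 'f4m'
assert join_nonempty('hls', 'audio', 3) == 'hls-audio-3'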
@@ -1961,7 +2107,7 @@ class InfoExtractor(object):
def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
return {
- 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
+ 'format_id': join_nonempty(m3u8_id, 'meta'),
'url': m3u8_url,
'ext': ext,
'protocol': 'm3u8',
@@ -2008,16 +2154,16 @@ class InfoExtractor(object):
headers=headers, query=query, video_id=video_id)
def _parse_m3u8_formats_and_subtitles(
- self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
+ self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
preference=None, quality=None, m3u8_id=None, live=False, note=None,
errnote=None, fatal=True, data=None, headers={}, query={},
video_id=None):
formats, subtitles = [], {}
- if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
- return formats, subtitles
-
- has_drm = re.search(r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', m3u8_doc)
+ has_drm = re.search('|'.join([
+ r'#EXT-X-FAXS-CM:', # Adobe Flash Access
+ r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
+ ]), m3u8_doc)
def format_url(url):
return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
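# The single has_drm probe above now covers both DRM schemes; the same
# check in isolation (patterns copied from the hunk, sample manifest
# lines invented for the asserts):
import re

DRM_RE = re.compile('|'.join([
    r'#EXT-X-FAXS-CM:',                         # Adobe Flash Access
    r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
]))
assert DRM_RE.search('#EXT-X-FAXS-CM:MIIa...')
assert DRM_RE.search('#EXT-X-SESSION-KEY:METHOD=SAMPLE-AES,URI="skd://key"')
assert not DRM_RE.search('#EXT-X-KEY:METHOD=AES-128,URI="https://k"')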
@@ -2056,9 +2202,9 @@ class InfoExtractor(object):
if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
formats = [{
- 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
+ 'format_id': join_nonempty(m3u8_id, idx),
'format_index': idx,
- 'url': m3u8_url,
+ 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
'ext': ext,
'protocol': entry_protocol,
'preference': preference,
@@ -2105,7 +2251,7 @@ class InfoExtractor(object):
if media_url:
manifest_url = format_url(media_url)
formats.extend({
- 'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
+ 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
'format_note': name,
'format_index': idx,
'url': manifest_url,
@@ -2162,9 +2308,9 @@ class InfoExtractor(object):
# format_id intact.
if not live:
stream_name = build_stream_name()
- format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
+ format_id[1] = stream_name or '%d' % (tbr or len(formats))
f = {
- 'format_id': '-'.join(map(str, filter(None, format_id))),
+ 'format_id': join_nonempty(*format_id),
'format_index': idx,
'url': manifest_url,
'manifest_url': m3u8_url,
@@ -2264,7 +2410,7 @@ class InfoExtractor(object):
if smil is False:
assert not fatal
- return []
+ return [], {}
namespace = self._parse_smil_namespace(smil)
@@ -2628,7 +2774,7 @@ class InfoExtractor(object):
mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
formats, subtitles = [], {}
- stream_numbers = {'audio': 0, 'video': 0}
+ stream_numbers = collections.defaultdict(int)
for period in mpd_doc.findall(_add_ns('Period')):
period_duration = parse_duration(period.get('duration')) or mpd_duration
period_ms_info = extract_multisegment_info(period, {
@@ -2644,11 +2790,15 @@ class InfoExtractor(object):
mime_type = representation_attrib['mimeType']
content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
- codecs = representation_attrib.get('codecs', '')
+ codecs = parse_codecs(representation_attrib.get('codecs', ''))
if content_type not in ('video', 'audio', 'text'):
if mime_type == 'image/jpeg':
content_type = mime_type
- elif codecs.split('.')[0] == 'stpp':
+ elif codecs['vcodec'] != 'none':
+ content_type = 'video'
+ elif codecs['acodec'] != 'none':
+ content_type = 'audio'
+ elif codecs.get('tcodec', 'none') != 'none':
content_type = 'text'
elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
content_type = 'text'
@@ -2694,10 +2844,8 @@ class InfoExtractor(object):
'format_note': 'DASH %s' % content_type,
'filesize': filesize,
'container': mimetype2ext(mime_type) + '_dash',
- 'manifest_stream_number': stream_numbers[content_type]
+ **codecs
}
- f.update(parse_codecs(codecs))
- stream_numbers[content_type] += 1
elif content_type == 'text':
f = {
'ext': mimetype2ext(mime_type),
@@ -2770,7 +2918,8 @@ class InfoExtractor(object):
segment_duration = None
if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
- representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
+ representation_ms_info['total_number'] = int(math.ceil(
+ float_or_none(period_duration, segment_duration, default=0)))
representation_ms_info['fragments'] = [{
media_location_key: media_template % {
'Number': segment_number,
@@ -2861,10 +3010,16 @@ class InfoExtractor(object):
f['url'] = initialization_url
f['fragments'].append({location_key(initialization_url): initialization_url})
f['fragments'].extend(representation_ms_info['fragments'])
+ if not period_duration:
+ period_duration = try_get(
+ representation_ms_info,
+ lambda r: sum(frag['duration'] for frag in r['fragments']), float)
else:
# Assuming direct URL to unfragmented media.
f['url'] = base_url
- if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
+ if content_type in ('video', 'audio', 'image/jpeg'):
+ f['manifest_stream_number'] = stream_numbers[f['url']]
+ stream_numbers[f['url']] += 1
formats.append(f)
elif content_type == 'text':
subtitles.setdefault(lang or 'und', []).append(f)
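# Why defaultdict(int): manifest_stream_number is now counted per format
# URL instead of per content type, so new content types need no manual
# seeding (toy format dicts for illustration):
import collections

stream_numbers = collections.defaultdict(int)
formats = [{'url': u} for u in ('https://a/v.mp4', 'https://a/v.mp4', 'https://a/a.m4a')]
for f in formats:
    f['manifest_stream_number'] = stream_numbers[f['url']]
    stream_numbers[f['url']] += 1
assert [f['manifest_stream_number'] for f in formats] == [0, 1, 0]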
@@ -2953,13 +3108,6 @@ class InfoExtractor(object):
})
fragment_ctx['time'] += fragment_ctx['duration']
- format_id = []
- if ism_id:
- format_id.append(ism_id)
- if stream_name:
- format_id.append(stream_name)
- format_id.append(compat_str(tbr))
-
if stream_type == 'text':
subtitles.setdefault(stream_language, []).append({
'ext': 'ismt',
@@ -2978,7 +3126,7 @@ class InfoExtractor(object):
})
elif stream_type in ('video', 'audio'):
formats.append({
- 'format_id': '-'.join(format_id),
+ 'format_id': join_nonempty(ism_id, stream_name, tbr),
'url': ism_url,
'manifest_url': ism_url,
'ext': 'ismv' if stream_type == 'video' else 'isma',
@@ -3008,7 +3156,7 @@ class InfoExtractor(object):
})
return formats, subtitles
- def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
+ def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
def absolute_url(item_url):
return urljoin(base_url, item_url)
@@ -3402,15 +3550,11 @@ class InfoExtractor(object):
return formats
def _live_title(self, name):
- """ Generate the title for a live video """
- now = datetime.datetime.now()
- now_str = now.strftime('%Y-%m-%d %H:%M')
- return name + ' ' + now_str
+ self._downloader.deprecation_warning('hypervideo_dl.InfoExtractor._live_title is deprecated and does not work as expected')
+ return name
def _int(self, v, name, fatal=False, **kwargs):
res = int_or_none(v, **kwargs)
- if 'get_attr' in kwargs:
- print(getattr(v, kwargs['get_attr']))
if res is None:
msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
if fatal:
@@ -3515,14 +3659,18 @@ class InfoExtractor(object):
def extractor():
comments = []
+ interrupted = True
try:
while True:
comments.append(next(generator))
- except KeyboardInterrupt:
- interrupted = True
- self.to_screen('Interrupted by user')
except StopIteration:
interrupted = False
+ except KeyboardInterrupt:
+ self.to_screen('Interrupted by user')
+ except Exception as e:
+ if self.get_param('ignoreerrors') is not True:
+ raise
+ self._downloader.report_error(e)
comment_count = len(comments)
self.to_screen(f'Extracted {comment_count} comments')
return {
@@ -3536,11 +3684,11 @@ class InfoExtractor(object):
@staticmethod
def _merge_subtitle_items(subtitle_list1, subtitle_list2):
- """ Merge subtitle items for one language. Items with duplicated URLs
+ """ Merge subtitle items for one language. Items with duplicated URLs/data
will be dropped. """
- list1_urls = set([item['url'] for item in subtitle_list1])
+ list1_data = set((item.get('url'), item.get('data')) for item in subtitle_list1)
ret = list(subtitle_list1)
- ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
+ ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
return ret
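# Effect of keying the dedup on the (url, data) pair instead of 'url'
# alone: data-only items no longer raise KeyError and get deduplicated
# too (toy subtitle items; InfoExtractor is the class patched above):
list1 = [{'url': 'https://s/en.vtt'}, {'data': 'WEBVTT variant A'}]
list2 = [{'url': 'https://s/en.vtt'}, {'data': 'WEBVTT variant B'}]
merged = InfoExtractor._merge_subtitle_items(list1, list2)
assert merged == list1 + [{'data': 'WEBVTT variant B'}]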
@classmethod
@@ -3565,9 +3713,8 @@ class InfoExtractor(object):
def mark_watched(self, *args, **kwargs):
if not self.get_param('mark_watched', False):
return
- if (self._get_login_info()[0] is not None
- or self.get_param('cookiefile')
- or self.get_param('cookiesfrombrowser')):
+ if (self.supports_login() and self._get_login_info()[0] is not None
+ or self.get_param('cookiefile') or self.get_param('cookiesfrombrowser')):
self._mark_watched(*args, **kwargs)
def _mark_watched(self, *args, **kwargs):
@@ -3600,7 +3747,7 @@ class InfoExtractor(object):
else 'public' if all_known
else None)
- def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
+ def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
'''
@returns A list of values for the extractor argument given by "key"
or "default" if no such key is present
@@ -3608,34 +3755,43 @@ class InfoExtractor(object):
@param casesense When false, the values are converted to lower case
'''
val = traverse_obj(
- self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
+ self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
if val is None:
return [] if default is NO_DEFAULT else default
return list(val) if casesense else [x.lower() for x in val]
+ def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
+ if not playlist_id or not video_id:
+ return not video_id
+
+ no_playlist = (smuggled_data or {}).get('force_noplaylist')
+ if no_playlist is not None:
+ return not no_playlist
+
+ video_id = '' if video_id is True else f' {video_id}'
+ playlist_id = '' if playlist_id is True else f' {playlist_id}'
+ if self.get_param('noplaylist'):
+ self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
+ return False
+ self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
+ return True
+
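# A hedged usage sketch for the new _yes_playlist helper; the extractor
# class, URL pattern and group names here are hypothetical:
class SomePlaylistIE(InfoExtractor):
    _VALID_URL = r'https?://example\.com/watch\?v=(?P<video>\w+)&list=(?P<list>\w+)'

    def _real_extract(self, url):
        video_id, playlist_id = self._match_valid_url(url).group('video', 'list')
        if self._yes_playlist(playlist_id, video_id):
            # "Downloading playlist <id> - add --no-playlist to download just the video <id>"
            return self.url_result(f'https://example.com/playlist/{playlist_id}')
        # with --no-playlist: "Downloading just the video <id> because of --no-playlist"
        return self.url_result(f'https://example.com/watch?v={video_id}')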
class SearchInfoExtractor(InfoExtractor):
"""
Base class for paged search queries extractors.
They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
- Instances should define _SEARCH_KEY and _MAX_RESULTS.
+ Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
"""
+ _MAX_RESULTS = float('inf')
+
@classmethod
def _make_valid_url(cls):
return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
- @classmethod
- def suitable(cls, url):
- return re.match(cls._make_valid_url(), url) is not None
-
def _real_extract(self, query):
- mobj = re.match(self._make_valid_url(), query)
- if mobj is None:
- raise ExtractorError('Invalid search query "%s"' % query)
-
- prefix = mobj.group('prefix')
- query = mobj.group('query')
+ prefix, query = self._match_valid_url(query).group('prefix', 'query')
if prefix == '':
return self._get_n_results(query, 1)
elif prefix == 'all':
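# The search-URL grammar in isolation ('ytsearch' stands in for a real
# _SEARCH_KEY): an empty prefix means one result, a number means that
# many, 'all' means up to _MAX_RESULTS.
import re

pattern = r'ytsearch(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
assert re.match(pattern, 'ytsearch5:cute cats').group('prefix', 'query') == ('5', 'cute cats')
assert re.match(pattern, 'ytsearch:one result').group('prefix') == ''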
diff --git a/hypervideo_dl/extractor/corus.py b/hypervideo_dl/extractor/corus.py
index 352951e..1194613 100644
--- a/hypervideo_dl/extractor/corus.py
+++ b/hypervideo_dl/extractor/corus.py
@@ -55,7 +55,6 @@ class CorusIE(ThePlatformFeedIE):
'timestamp': 1486392197,
},
'params': {
- 'format': 'bestvideo',
'skip_download': True,
},
'expected_warnings': ['Failed to parse JSON'],
diff --git a/hypervideo_dl/extractor/coub.py b/hypervideo_dl/extractor/coub.py
index eba6b73..e90aa19 100644
--- a/hypervideo_dl/extractor/coub.py
+++ b/hypervideo_dl/extractor/coub.py
@@ -57,7 +57,7 @@ class CoubIE(InfoExtractor):
file_versions = coub['file_versions']
- QUALITIES = ('low', 'med', 'high')
+ QUALITIES = ('low', 'med', 'high', 'higher')
MOBILE = 'mobile'
IPHONE = 'iphone'
@@ -86,6 +86,7 @@ class CoubIE(InfoExtractor):
'format_id': '%s-%s-%s' % (HTML5, kind, quality),
'filesize': int_or_none(item.get('size')),
'vcodec': 'none' if kind == 'audio' else None,
+ 'acodec': 'none' if kind == 'video' else None,
'quality': quality_key(quality),
'source_preference': preference_key(HTML5),
})
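# Conventions assumed from the format sorter above: 'none' asserts the
# track is absent, while None leaves it unknown. Marking video-only
# variants with acodec='none' lets 'hasaud' rank them below muxed
# formats (toy dicts):
video_only = {'vcodec': None, 'acodec': 'none'}   # definitely no audio
audio_only = {'vcodec': 'none', 'acodec': None}   # definitely no video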
diff --git a/hypervideo_dl/extractor/cozytv.py b/hypervideo_dl/extractor/cozytv.py
new file mode 100644
index 0000000..d49f1ca
--- /dev/null
+++ b/hypervideo_dl/extractor/cozytv.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+
+class CozyTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?cozy\.tv/(?P<uploader>[^/]+)/replays/(?P<id>[^/$#&?]+)'
+
+ _TESTS = [{
+ 'url': 'https://cozy.tv/beardson/replays/2021-11-19_1',
+ 'info_dict': {
+ 'id': 'beardson-2021-11-19_1',
+ 'ext': 'mp4',
+ 'title': 'pokemon pt2',
+ 'uploader': 'beardson',
+ 'upload_date': '20211119',
+ 'was_live': True,
+ 'duration': 7981,
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ uploader, date = self._match_valid_url(url).groups()
+ id = f'{uploader}-{date}'
+ data_json = self._download_json(f'https://api.cozy.tv/cache/{uploader}/replay/{date}', id)
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ f'https://cozycdn.foxtrotstream.xyz/replays/{uploader}/{date}/index.m3u8', id, ext='mp4')
+ return {
+ 'id': id,
+ 'title': data_json.get('title'),
+ 'uploader': data_json.get('user') or uploader,
+ 'upload_date': unified_strdate(data_json.get('date')),
+ 'was_live': True,
+ 'duration': data_json.get('duration'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
diff --git a/hypervideo_dl/extractor/cpac.py b/hypervideo_dl/extractor/cpac.py
new file mode 100644
index 0000000..2274115
--- /dev/null
+++ b/hypervideo_dl/extractor/cpac.py
@@ -0,0 +1,148 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ str_or_none,
+ try_get,
+ unified_timestamp,
+ update_url_query,
+ urljoin,
+)
+
+# compat_range
+try:
+ if callable(xrange):
+ range = xrange
+except (NameError, TypeError):
+ pass
+
+
+class CPACIE(InfoExtractor):
+ IE_NAME = 'cpac'
+ _VALID_URL = r'https?://(?:www\.)?cpac\.ca/(?P<fr>l-)?episode\?id=(?P<id>[\da-f]{8}(?:-[\da-f]{4}){3}-[\da-f]{12})'
+ _TEST = {
+ # 'url': 'http://www.cpac.ca/en/programs/primetime-politics/episodes/65490909',
+ 'url': 'https://www.cpac.ca/episode?id=fc7edcae-4660-47e1-ba61-5b7f29a9db0f',
+ 'md5': 'e46ad699caafd7aa6024279f2614e8fa',
+ 'info_dict': {
+ 'id': 'fc7edcae-4660-47e1-ba61-5b7f29a9db0f',
+ 'ext': 'mp4',
+ 'upload_date': '20220215',
+ 'title': 'News Conference to Celebrate National Kindness Week – February 15, 2022',
+ 'description': 'md5:466a206abd21f3a6f776cdef290c23fb',
+ 'timestamp': 1644901200,
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'hls_prefer_native': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ url_lang = 'fr' if '/l-episode?' in url else 'en'
+
+ content = self._download_json(
+ 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/episode/index.xml&crafterSite=cpacca&id=' + video_id,
+ video_id)
+ video_url = try_get(content, lambda x: x['page']['details']['videoUrl'], compat_str)
+ formats = []
+ if video_url:
+ content = content['page']
+ title = str_or_none(content['details']['title_%s_t' % (url_lang, )])
+ formats = self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', ext='mp4')
+ for fmt in formats:
+ # prefer language to match URL
+ fmt_lang = fmt.get('language')
+ if fmt_lang == url_lang:
+ fmt['language_preference'] = 10
+ elif not fmt_lang:
+ fmt['language_preference'] = -1
+ else:
+ fmt['language_preference'] = -10
+
+ self._sort_formats(formats)
+
+ category = str_or_none(content['details']['category_%s_t' % (url_lang, )])
+
+ def is_live(v_type):
+ return (v_type == 'live') if v_type is not None else None
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': title,
+ 'description': str_or_none(content['details'].get('description_%s_t' % (url_lang, ))),
+ 'timestamp': unified_timestamp(content['details'].get('liveDateTime')),
+ 'category': [category] if category else None,
+ 'thumbnail': urljoin(url, str_or_none(content['details'].get('image_%s_s' % (url_lang, )))),
+ 'is_live': is_live(content['details'].get('type')),
+ }
+
+
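# The three-way language ranking assigned in CPACIE above, in isolation:
# a format matching the page language sorts first, and an unknown
# language beats an explicit mismatch (toy values):
def lang_pref(fmt_lang, url_lang='en'):
    return 10 if fmt_lang == url_lang else -1 if not fmt_lang else -10

assert sorted(['fr', None, 'en'], key=lang_pref, reverse=True) == ['en', None, 'fr']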
+class CPACPlaylistIE(InfoExtractor):
+ IE_NAME = 'cpac:playlist'
+ _VALID_URL = r'(?i)https?://(?:www\.)?cpac\.ca/(?:program|search|(?P<fr>emission|rechercher))\?(?:[^&]+&)*?(?P<id>(?:id=\d+|programId=\d+|key=[^&]+))'
+
+ _TESTS = [{
+ 'url': 'https://www.cpac.ca/program?id=6',
+ 'info_dict': {
+ 'id': 'id=6',
+ 'title': 'Headline Politics',
+ 'description': 'Watch CPAC’s signature long-form coverage of the day’s pressing political events as they unfold.',
+ },
+ 'playlist_count': 10,
+ }, {
+ 'url': 'https://www.cpac.ca/search?key=hudson&type=all&order=desc',
+ 'info_dict': {
+ 'id': 'key=hudson',
+ 'title': 'hudson',
+ },
+ 'playlist_count': 22,
+ }, {
+ 'url': 'https://www.cpac.ca/search?programId=50',
+ 'info_dict': {
+ 'id': 'programId=50',
+ 'title': '50',
+ },
+ 'playlist_count': 9,
+ }, {
+ 'url': 'https://www.cpac.ca/emission?id=6',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.cpac.ca/rechercher?key=hudson&type=all&order=desc',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ url_lang = 'fr' if any(x in url for x in ('/emission?', '/rechercher?')) else 'en'
+ pl_type, list_type = ('program', 'itemList') if any(x in url for x in ('/program?', '/emission?')) else ('search', 'searchResult')
+ api_url = (
+ 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/%s/index.xml&crafterSite=cpacca&%s'
+ % (pl_type, video_id, ))
+ content = self._download_json(api_url, video_id)
+ entries = []
+ total_pages = int_or_none(try_get(content, lambda x: x['page'][list_type]['totalPages']), default=1)
+ for page in range(1, total_pages + 1):
+ if page > 1:
+ api_url = update_url_query(api_url, {'page': '%d' % (page, ), })
+ content = self._download_json(
+ api_url, video_id,
+ note='Downloading continuation - %d' % (page, ),
+ fatal=False)
+
+ for item in try_get(content, lambda x: x['page'][list_type]['item'], list) or []:
+ episode_url = urljoin(url, try_get(item, lambda x: x['url_%s_s' % (url_lang, )]))
+ if episode_url:
+ entries.append(episode_url)
+
+ return self.playlist_result(
+ (self.url_result(entry) for entry in entries),
+ playlist_id=video_id,
+ playlist_title=try_get(content, lambda x: x['page']['program']['title_%s_t' % (url_lang, )]) or video_id.split('=')[-1],
+ playlist_description=try_get(content, lambda x: x['page']['program']['description_%s_t' % (url_lang, )]),
+ )
diff --git a/hypervideo_dl/extractor/crackle.py b/hypervideo_dl/extractor/crackle.py
index 2c9d28d..db4962c 100644
--- a/hypervideo_dl/extractor/crackle.py
+++ b/hypervideo_dl/extractor/crackle.py
@@ -23,32 +23,35 @@ from ..utils import (
class CrackleIE(InfoExtractor):
_VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?(?:sony)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)'
_TESTS = [{
- # geo restricted to CA
- 'url': 'https://www.crackle.com/andromeda/2502343',
+ # Crackle is available in the United States and territories
+ 'url': 'https://www.crackle.com/thanksgiving/2510064',
'info_dict': {
- 'id': '2502343',
+ 'id': '2510064',
'ext': 'mp4',
- 'title': 'Under The Night',
- 'description': 'md5:d2b8ca816579ae8a7bf28bfff8cefc8a',
- 'duration': 2583,
+ 'title': 'Touch Football',
+ 'description': 'md5:cfbb513cf5de41e8b56d7ab756cff4df',
+ 'duration': 1398,
'view_count': int,
'average_rating': 0,
- 'age_limit': 14,
- 'genre': 'Action, Sci-Fi',
- 'creator': 'Allan Kroeker',
- 'artist': 'Keith Hamilton Cobb, Kevin Sorbo, Lisa Ryder, Lexa Doig, Robert Hewitt Wolfe',
- 'release_year': 2000,
- 'series': 'Andromeda',
- 'episode': 'Under The Night',
+ 'age_limit': 17,
+ 'genre': 'Comedy',
+ 'creator': 'Daniel Powell',
+ 'artist': 'Chris Elliott, Amy Sedaris',
+ 'release_year': 2016,
+ 'series': 'Thanksgiving',
+ 'episode': 'Touch Football',
'season_number': 1,
'episode_number': 1,
},
'params': {
# m3u8 download
'skip_download': True,
- }
+ },
+ 'expected_warnings': [
+ 'Trying with a list of known countries'
+ ],
}, {
- 'url': 'https://www.sonycrackle.com/andromeda/2502343',
+ 'url': 'https://www.sonycrackle.com/thanksgiving/2510064',
'only_matching': True,
}]
@@ -129,7 +132,6 @@ class CrackleIE(InfoExtractor):
break
ignore_no_formats = self.get_param('ignore_no_formats_error')
- allow_unplayable_formats = self.get_param('allow_unplayable_formats')
if not media or (not media.get('MediaURLs') and not ignore_no_formats):
raise ExtractorError(
@@ -143,9 +145,9 @@ class CrackleIE(InfoExtractor):
for e in media.get('MediaURLs') or []:
if e.get('UseDRM'):
has_drm = True
- if not allow_unplayable_formats:
- continue
- format_url = url_or_none(e.get('Path'))
+ format_url = url_or_none(e.get('DRMPath'))
+ else:
+ format_url = url_or_none(e.get('Path'))
if not format_url:
continue
ext = determine_ext(format_url)
diff --git a/hypervideo_dl/extractor/craftsy.py b/hypervideo_dl/extractor/craftsy.py
new file mode 100644
index 0000000..ed2f442
--- /dev/null
+++ b/hypervideo_dl/extractor/craftsy.py
@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .brightcove import BrightcoveNewIE
+from .common import InfoExtractor
+
+from ..utils import (
+ dict_get,
+ get_element_by_id,
+ js_to_json,
+ traverse_obj,
+)
+
+
+class CraftsyIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.craftsy\.com/class/(?P<id>[a-z0-9_-]+)/'
+ _TESTS = [{
+ 'url': 'https://www.craftsy.com/class/the-midnight-quilt-show-season-5/',
+ 'info_dict': {
+ 'id': 'the-midnight-quilt-show-season-5',
+ 'title': 'The Midnight Quilt Show Season 5',
+ 'description': 'md5:113eda818e985d1a566625fb2f833b7a',
+ },
+ 'playlist_count': 10,
+ }, {
+ 'url': 'https://www.craftsy.com/class/sew-your-own-designer-handbag/',
+ 'info_dict': {
+ 'id': 'sew-your-own-designer-handbag',
+ 'title': 'Sew Your Own Designer Handbag',
+ 'description': 'md5:8270d0ef5427d3c895a27351aeaac276',
+ },
+ 'playlist_mincount': 1,
+ }, {
+ 'url': 'https://www.craftsy.com/class/all-access-estes-park-wool-market/',
+ 'info_dict': {
+ 'id': 'all-access-estes-park-wool-market',
+ 'title': 'All Access: Estes Park Wool Market',
+ 'description': 'md5:aded1bd8d38ae2fae4dae936c0ae01e7',
+ },
+ 'playlist_count': 6,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ video_data = self._parse_json(self._search_regex(
+ r'class_video_player_vars\s*=\s*({.*})\s*;',
+ get_element_by_id('vidstore-classes_class-video-player-js-extra', webpage),
+ 'video data'), video_id, transform_source=js_to_json)
+
+ account_id = traverse_obj(video_data, ('video_player', 'bc_account_id'))
+
+ entries = []
+ class_preview = traverse_obj(video_data, ('video_player', 'class_preview'))
+ if class_preview:
+ v_id = class_preview.get('video_id')
+ entries.append(self.url_result(
+ f'http://players.brightcove.net/{account_id}/default_default/index.html?videoId={v_id}',
+ BrightcoveNewIE, v_id, class_preview.get('title')))
+
+ if dict_get(video_data, ('is_free', 'user_has_access')):
+ entries += [
+ self.url_result(
+ f'http://players.brightcove.net/{account_id}/default_default/index.html?videoId={lesson["video_id"]}',
+ BrightcoveNewIE, lesson['video_id'], lesson.get('title'))
+ for lesson in video_data['lessons']]
+
+ return self.playlist_result(
+ entries, video_id, video_data.get('class_title'),
+ self._html_search_meta(('og:description', 'description'), webpage, default=None))
diff --git a/hypervideo_dl/extractor/crowdbunker.py b/hypervideo_dl/extractor/crowdbunker.py
new file mode 100644
index 0000000..72906af
--- /dev/null
+++ b/hypervideo_dl/extractor/crowdbunker.py
@@ -0,0 +1,113 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ try_get,
+ unified_strdate,
+)
+
+
+class CrowdBunkerIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?crowdbunker\.com/v/(?P<id>[^/?#$&]+)'
+
+ _TESTS = [{
+ 'url': 'https://crowdbunker.com/v/0z4Kms8pi8I',
+ 'info_dict': {
+ 'id': '0z4Kms8pi8I',
+ 'ext': 'mp4',
+ 'title': '117) Pass vax et solutions',
+ 'description': 'md5:86bcb422c29475dbd2b5dcfa6ec3749c',
+ 'view_count': int,
+ 'duration': 5386,
+ 'uploader': 'Jérémie Mercier',
+ 'uploader_id': 'UCeN_qQV829NYf0pvPJhW5dQ',
+ 'like_count': int,
+ 'upload_date': '20211218',
+ 'thumbnail': 'https://scw.divulg.org/cb-medias4/images/0z4Kms8pi8I/maxres.jpg'
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ data_json = self._download_json(f'https://api.divulg.org/post/{id}/details',
+ id, headers={'accept': 'application/json, text/plain, */*'})
+ video_json = data_json['video']
+ formats, subtitles = [], {}
+ for sub in video_json.get('captions') or []:
+ sub_url = try_get(sub, lambda x: x['file']['url'])
+ if not sub_url:
+ continue
+ subtitles.setdefault(sub.get('languageCode', 'fr'), []).append({
+ 'url': sub_url,
+ })
+
+ mpd_url = try_get(video_json, lambda x: x['dashManifest']['url'])
+ if mpd_url:
+ fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ m3u8_url = try_get(video_json, lambda x: x['hlsManifest']['url'])
+ if m3u8_url:
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+
+ thumbnails = [{
+ 'url': image['url'],
+ 'height': int_or_none(image.get('height')),
+ 'width': int_or_none(image.get('width')),
+ } for image in video_json.get('thumbnails') or [] if image.get('url')]
+
+ self._sort_formats(formats)
+ return {
+ 'id': id,
+ 'title': video_json.get('title'),
+ 'description': video_json.get('description'),
+ 'view_count': video_json.get('viewCount'),
+ 'duration': video_json.get('duration'),
+ 'uploader': try_get(data_json, lambda x: x['channel']['name']),
+ 'uploader_id': try_get(data_json, lambda x: x['channel']['id']),
+ 'like_count': data_json.get('likesCount'),
+ 'upload_date': unified_strdate(video_json.get('publishedAt') or video_json.get('createdAt')),
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+
+class CrowdBunkerChannelIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?crowdbunker\.com/@(?P<id>[^/?#$&]+)'
+
+ _TESTS = [{
+ 'url': 'https://crowdbunker.com/@Milan_UHRIN',
+ 'playlist_mincount': 14,
+ 'info_dict': {
+ 'id': 'Milan_UHRIN',
+ },
+ }]
+
+ def _entries(self, id):
+ last = None
+
+ for page in itertools.count():
+ channel_json = self._download_json(
+ f'https://api.divulg.org/organization/{id}/posts', id, headers={'accept': 'application/json, text/plain, */*'},
+ query={'after': last} if last else {}, note=f'Downloading Page {page}')
+ for item in channel_json.get('items') or []:
+ v_id = item.get('uid')
+ if not v_id:
+ continue
+ yield self.url_result(
+ 'https://crowdbunker.com/v/%s' % v_id, ie=CrowdBunkerIE.ie_key(), video_id=v_id)
+ last = channel_json.get('last')
+ if not last:
+ break
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ return self.playlist_result(self._entries(id), playlist_id=id)
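# Generic sketch of the cursor pagination _entries uses above: the API
# returns items plus a 'last' cursor that is echoed back as ?after=...
# (endpoint as in the diff; requests stands in for _download_json and
# error handling is omitted):
import requests

def iter_posts(org_id):
    last = None
    while True:
        resp = requests.get(
            f'https://api.divulg.org/organization/{org_id}/posts',
            params={'after': last} if last else {},
            headers={'accept': 'application/json, text/plain, */*'}).json()
        yield from resp.get('items') or []
        last = resp.get('last')
        if not last:
            return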
diff --git a/hypervideo_dl/extractor/crunchyroll.py b/hypervideo_dl/extractor/crunchyroll.py
index 511ac1b..7edb645 100644
--- a/hypervideo_dl/extractor/crunchyroll.py
+++ b/hypervideo_dl/extractor/crunchyroll.py
@@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
+import base64
import re
import json
import zlib
@@ -8,7 +9,7 @@ import zlib
from hashlib import sha1
from math import pow, sqrt, floor
from .common import InfoExtractor
-from .vrv import VRVIE
+from .vrv import VRVBaseIE
from ..compat import (
compat_b64decode,
compat_etree_Element,
@@ -23,14 +24,17 @@ from ..utils import (
bytes_to_intlist,
extract_attributes,
float_or_none,
+ format_field,
intlist_to_bytes,
int_or_none,
+ join_nonempty,
lowercase_escape,
merge_dicts,
+ qualities,
remove_end,
sanitized_Request,
+ traverse_obj,
try_get,
- urlencode_postdata,
xpath_text,
)
from ..aes import (
@@ -39,8 +43,8 @@ from ..aes import (
class CrunchyrollBaseIE(InfoExtractor):
- _LOGIN_URL = 'https://www.crunchyroll.com/login'
- _LOGIN_FORM = 'login_form'
+ _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login'
+ _API_BASE = 'https://api.crunchyroll.com'
_NETRC_MACHINE = 'crunchyroll'
def _call_rpc_api(self, method, video_id, note=None, data=None):
@@ -53,57 +57,50 @@ class CrunchyrollBaseIE(InfoExtractor):
'Content-Type': 'application/x-www-form-urlencoded',
})
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
+ def _perform_login(self, username, password):
+ if self._get_cookies(self._LOGIN_URL).get('etp_rt'):
return
- login_page = self._download_webpage(
- self._LOGIN_URL, None, 'Downloading login page')
-
- def is_logged(webpage):
- return 'href="/logout"' in webpage
-
- # Already logged in
- if is_logged(login_page):
- return
-
- login_form_str = self._search_regex(
- r'(?P<form><form[^>]+?id=(["\'])%s\2[^>]*>)' % self._LOGIN_FORM,
- login_page, 'login form', group='form')
-
- post_url = extract_attributes(login_form_str).get('action')
- if not post_url:
- post_url = self._LOGIN_URL
- elif not post_url.startswith('http'):
- post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
-
- login_form = self._form_hidden_inputs(self._LOGIN_FORM, login_page)
-
- login_form.update({
- 'login_form[name]': username,
- 'login_form[password]': password,
- })
-
- response = self._download_webpage(
- post_url, None, 'Logging in', 'Wrong login info',
- data=urlencode_postdata(login_form),
- headers={'Content-Type': 'application/x-www-form-urlencoded'})
-
- # Successful login
- if is_logged(response):
- return
-
- error = self._html_search_regex(
- '(?s)<ul[^>]+class=["\']messages["\'][^>]*>(.+?)</ul>',
- response, 'error message', default=None)
- if error:
- raise ExtractorError('Unable to login: %s' % error, expected=True)
-
- raise ExtractorError('Unable to log in')
-
- def _real_initialize(self):
- self._login()
+ upsell_response = self._download_json(
+ f'{self._API_BASE}/get_upsell_data.0.json', None, 'Getting session id',
+ query={
+ 'sess_id': 1,
+ 'device_id': 'whatvalueshouldbeforweb',
+ 'device_type': 'com.crunchyroll.static',
+ 'access_token': 'giKq5eY27ny3cqz',
+ 'referer': self._LOGIN_URL
+ })
+ if upsell_response['code'] != 'ok':
+ raise ExtractorError('Could not get session id')
+ session_id = upsell_response['data']['session_id']
+
+ login_response = self._download_json(
+ f'{self._API_BASE}/login.1.json', None, 'Logging in',
+ data=compat_urllib_parse_urlencode({
+ 'account': username,
+ 'password': password,
+ 'session_id': session_id
+ }).encode('ascii'))
+ if login_response['code'] != 'ok':
+ raise ExtractorError('Login failed. Server message: %s' % login_response['message'], expected=True)
+ if not self._get_cookies(self._LOGIN_URL).get('etp_rt'):
+ raise ExtractorError('Login succeeded but did not set etp_rt cookie')
+
+ # Beta-specific, but needed for redirects
+ def _get_beta_embedded_json(self, webpage, display_id):
+ initial_state = self._parse_json(self._search_regex(
+ r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id)
+ app_config = self._parse_json(self._search_regex(
+ r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id)
+ return initial_state, app_config
+
+ def _redirect_to_beta(self, webpage, iekey, video_id):
+ if not self._get_cookies(self._LOGIN_URL).get('etp_rt'):
+ raise ExtractorError('Received a beta page from non-beta url when not logged in.')
+ initial_state, app_config = self._get_beta_embedded_json(webpage, video_id)
+ url = app_config['baseSiteUrl'] + initial_state['router']['locations']['current']['pathname']
+ self.to_screen(f'{video_id}: Redirected to beta site - {url}')
+ return self.url_result(f'{url}', iekey, video_id)
@staticmethod
def _add_skip_wall(url):
@@ -119,7 +116,7 @@ class CrunchyrollBaseIE(InfoExtractor):
parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
-class CrunchyrollIE(CrunchyrollBaseIE, VRVIE):
+class CrunchyrollIE(CrunchyrollBaseIE, VRVBaseIE):
IE_NAME = 'crunchyroll'
_VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<id>[0-9]+))(?:[/?&]|$)'
_TESTS = [{
@@ -425,6 +422,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
webpage = self._download_webpage(
self._add_skip_wall(webpage_url), video_id,
headers=self.geo_verification_headers())
+ if re.search(r'<div id="preload-data">', webpage):
+ return self._redirect_to_beta(webpage, CrunchyrollBetaIE.ie_key(), video_id)
note_m = self._html_search_regex(
r'<div class="showmedia-trailer-notice">(.+?)</div>',
webpage, 'trailer-notice', default='')
@@ -478,19 +477,24 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
[r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'],
webpage, 'video_uploader', default=False)
+ requested_languages = self._configuration_arg('language')
+ requested_hardsubs = [('' if val == 'none' else val) for val in self._configuration_arg('hardsub')]
+ language_preference = qualities((requested_languages or [language or ''])[::-1])
+ hardsub_preference = qualities((requested_hardsubs or ['', language or ''])[::-1])
+
formats = []
for stream in media.get('streams', []):
- audio_lang = stream.get('audio_lang')
- hardsub_lang = stream.get('hardsub_lang')
+ audio_lang = stream.get('audio_lang') or ''
+ hardsub_lang = stream.get('hardsub_lang') or ''
+ if (requested_languages and audio_lang.lower() not in requested_languages
+ or requested_hardsubs and hardsub_lang.lower() not in requested_hardsubs):
+ continue
vrv_formats = self._extract_vrv_formats(
stream.get('url'), video_id, stream.get('format'),
audio_lang, hardsub_lang)
for f in vrv_formats:
- f['language_preference'] = 1 if audio_lang == language else 0
- f['quality'] = (
- 1 if not hardsub_lang
- else 0 if hardsub_lang == language
- else -1)
+ f['language_preference'] = language_preference(audio_lang)
+ f['quality'] = hardsub_preference(hardsub_lang)
formats.extend(vrv_formats)
if not formats:
available_fmts = []
@@ -684,6 +688,8 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
# https:// gives a 403, but http:// does not
self._add_skip_wall(url).replace('https://', 'http://'), show_id,
headers=self.geo_verification_headers())
+ if re.search(r'<div id="preload-data">', webpage):
+ return self._redirect_to_beta(webpage, CrunchyrollBetaShowIE.ie_key(), show_id)
title = self._html_search_meta('name', webpage, default=None)
episode_re = r'<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"'
@@ -706,9 +712,56 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
}
-class CrunchyrollBetaIE(CrunchyrollBaseIE):
+class CrunchyrollBetaBaseIE(CrunchyrollBaseIE):
+ params = None
+
+ def _get_params(self, lang):
+ if not CrunchyrollBetaBaseIE.params:
+ initial_state, app_config = self._get_beta_embedded_json(self._download_webpage(
+ f'https://beta.crunchyroll.com/{lang}', None, note='Retrieving main page'), None)
+ api_domain = app_config['cxApiParams']['apiDomain']
+ basic_token = str(base64.b64encode(('%s:' % app_config['cxApiParams']['accountAuthClientId']).encode('ascii')), 'ascii')
+ auth_response = self._download_json(
+ f'{api_domain}/auth/v1/token', None, note='Authenticating with cookie',
+ headers={
+ 'Authorization': 'Basic ' + basic_token
+ }, data='grant_type=etp_rt_cookie'.encode('ascii'))
+ policy_response = self._download_json(
+ f'{api_domain}/index/v2', None, note='Retrieving signed policy',
+ headers={
+ 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']
+ })
+ bucket = policy_response['cms']['bucket']
+ params = {
+ 'Policy': policy_response['cms']['policy'],
+ 'Signature': policy_response['cms']['signature'],
+ 'Key-Pair-Id': policy_response['cms']['key_pair_id']
+ }
+ locale = traverse_obj(initial_state, ('localization', 'locale'))
+ if locale:
+ params['locale'] = locale
+ CrunchyrollBetaBaseIE.params = (api_domain, bucket, params)
+ return CrunchyrollBetaBaseIE.params
+
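# Standalone sketch of the etp_rt -> Bearer exchange _get_params performs
# (endpoints and field names as in the hunk; the cookie value and client
# id are assumed to come from a logged-in browser session):
import base64
import requests

def get_cms_params(api_domain, client_id, etp_rt_cookie):
    basic = base64.b64encode(f'{client_id}:'.encode()).decode()
    auth = requests.post(
        f'{api_domain}/auth/v1/token',
        headers={'Authorization': f'Basic {basic}'},
        cookies={'etp_rt': etp_rt_cookie},
        data='grant_type=etp_rt_cookie').json()
    policy = requests.get(
        f'{api_domain}/index/v2',
        headers={'Authorization': f"{auth['token_type']} {auth['access_token']}"}).json()
    cms = policy['cms']
    return cms['bucket'], {'Policy': cms['policy'],
                           'Signature': cms['signature'],
                           'Key-Pair-Id': cms['key_pair_id']}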
+ def _redirect_from_beta(self, url, lang, internal_id, display_id, is_episode, iekey):
+ initial_state, app_config = self._get_beta_embedded_json(self._download_webpage(url, display_id), display_id)
+ content_data = initial_state['content']['byId'][internal_id]
+ if is_episode:
+ video_id = content_data['external_id'].split('.')[1]
+ series_id = content_data['episode_metadata']['series_slug_title']
+ else:
+ series_id = content_data['slug_title']
+ series_id = re.sub(r'-{2,}', '-', series_id)
+ url = f'https://www.crunchyroll.com/{lang}{series_id}'
+ if is_episode:
+ url = url + f'/{display_id}-{video_id}'
+ self.to_screen(f'{display_id}: Not logged in. Redirecting to non-beta site - {url}')
+ return self.url_result(url, iekey, display_id)
+
+
+class CrunchyrollBetaIE(CrunchyrollBetaBaseIE):
IE_NAME = 'crunchyroll:beta'
- _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<internal_id>\w+)/(?P<id>[\w\-]+)/?(?:\?|$)'
+ _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)'
_TESTS = [{
'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/to-the-future',
'info_dict': {
@@ -719,26 +772,129 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE):
'uploader': 'Toei Animation',
'title': 'World Trigger Episode 73 – To the Future',
'upload_date': '20160402',
+ 'episode_number': 73,
+ 'series': 'World Trigger',
+ 'average_rating': 4.9,
+ 'episode': 'To the Future',
+ 'season': 'World Trigger',
+ 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/c870dedca1a83137c2d3d144984155ed1459527119_main.jpg',
+ 'season_number': 1,
+ },
+ 'params': {'skip_download': 'm3u8'},
+ 'expected_warnings': ['Unable to download XML']
+ }, {
+ 'url': 'https://beta.crunchyroll.com/watch/GYK53DMPR/wicked-lord-shingan-reborn',
+ 'info_dict': {
+ 'id': '648781',
+ 'ext': 'mp4',
+ 'episode_number': 1,
+ 'timestamp': 1389173400,
+ 'series': 'Love, Chunibyo & Other Delusions - Heart Throb -',
+ 'description': 'md5:5579d1a0355cc618558ba23d27067a62',
+ 'uploader': 'TBS',
+ 'episode': 'Wicked Lord Shingan... Reborn',
+ 'average_rating': 4.9,
+ 'season': 'Love, Chunibyo & Other Delusions - Heart Throb -',
+ 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/2ba0384e225a5370d5f0ee9496d91ea51389046521_main.jpg',
+ 'title': 'Love, Chunibyo & Other Delusions - Heart Throb - Episode 1 – Wicked Lord Shingan... Reborn',
+ 'season_number': 2,
+ 'upload_date': '20140108',
},
'params': {'skip_download': 'm3u8'},
'expected_warnings': ['Unable to download XML']
+ }, {
+ 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'internal_id', 'id')
- webpage = self._download_webpage(url, display_id)
- episode_data = self._parse_json(
- self._search_regex(r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'episode data'),
- display_id)['content']['byId'][internal_id]
- video_id = episode_data['external_id'].split('.')[1]
- series_id = episode_data['episode_metadata']['series_slug_title']
- return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id}/{display_id}-{video_id}',
- CrunchyrollIE.ie_key(), video_id)
-
-
-class CrunchyrollBetaShowIE(CrunchyrollBaseIE):
+ lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id')
+
+ if not self._get_cookies(url).get('etp_rt'):
+ return self._redirect_from_beta(url, lang, internal_id, display_id, True, CrunchyrollIE.ie_key())
+
+ api_domain, bucket, params = self._get_params(lang)
+
+ episode_response = self._download_json(
+ f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id,
+ note='Retrieving episode metadata',
+ query=params)
+ if episode_response.get('is_premium_only') and not episode_response.get('playback'):
+ raise ExtractorError('This video is for premium members only.', expected=True)
+ stream_response = self._download_json(
+ episode_response['playback'], display_id,
+ note='Retrieving stream info')
+
+ thumbnails = []
+ for thumbnails_data in traverse_obj(episode_response, ('images', 'thumbnail')):
+ for thumbnail_data in thumbnails_data:
+ thumbnails.append({
+ 'url': thumbnail_data.get('source'),
+ 'width': thumbnail_data.get('width'),
+ 'height': thumbnail_data.get('height'),
+ })
+ subtitles = {}
+ for lang, subtitle_data in (stream_response.get('subtitles') or {}).items():
+ subtitles[lang] = [{
+ 'url': subtitle_data.get('url'),
+ 'ext': subtitle_data.get('format')
+ }]
+
+ requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])]
+ hardsub_preference = qualities(requested_hardsubs[::-1])
+ requested_formats = self._configuration_arg('format') or ['adaptive_hls']
+
+ formats = []
+ for stream_type, streams in stream_response.get('streams', {}).items():
+ if stream_type not in requested_formats:
+ continue
+ for stream in streams.values():
+ hardsub_lang = stream.get('hardsub_locale') or ''
+ if hardsub_lang.lower() not in requested_hardsubs:
+ continue
+ format_id = join_nonempty(
+ stream_type,
+ format_field(stream, 'hardsub_locale', 'hardsub-%s'))
+ if not stream.get('url'):
+ continue
+ if stream_type.split('_')[-1] == 'hls':
+ adaptive_formats = self._extract_m3u8_formats(
+ stream['url'], display_id, 'mp4', m3u8_id=format_id,
+ note='Downloading %s information' % format_id,
+ fatal=False)
+ elif stream_type.split('_')[-1] == 'dash':
+ adaptive_formats = self._extract_mpd_formats(
+ stream['url'], display_id, mpd_id=format_id,
+ note='Downloading %s information' % format_id,
+ fatal=False)
+ for f in adaptive_formats:
+ if f.get('acodec') != 'none':
+ f['language'] = stream_response.get('audio_locale')
+ f['quality'] = hardsub_preference(hardsub_lang.lower())
+ formats.extend(adaptive_formats)
+ self._sort_formats(formats)
+
+ return {
+ 'id': internal_id,
+ 'title': '%s Episode %s – %s' % (episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')),
+ 'description': episode_response.get('description').replace(r'\r\n', '\n'),
+ 'duration': float_or_none(episode_response.get('duration_ms'), 1000),
+ 'thumbnails': thumbnails,
+ 'series': episode_response.get('series_title'),
+ 'series_id': episode_response.get('series_id'),
+ 'season': episode_response.get('season_title'),
+ 'season_id': episode_response.get('season_id'),
+ 'season_number': episode_response.get('season_number'),
+ 'episode': episode_response.get('title'),
+ 'episode_number': episode_response.get('sequence_number'),
+ 'subtitles': subtitles,
+ 'formats': formats
+ }
+
+
+class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE):
IE_NAME = 'crunchyroll:playlist:beta'
- _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/\w+/(?P<id>[\w\-]+)/?(?:\?|$)'
+ _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)'
_TESTS = [{
'url': 'https://beta.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA',
'info_dict': {
@@ -747,11 +903,56 @@ class CrunchyrollBetaShowIE(CrunchyrollBaseIE):
},
'playlist_mincount': 10,
}, {
+ 'url': 'https://beta.crunchyroll.com/series/GYJQV73V6/love-chunibyo--other-delusions---heart-throb--',
+ 'info_dict': {
+ 'id': 'love-chunibyo-other-delusions-heart-throb-',
+ 'title': 'Love, Chunibyo & Other Delusions - Heart Throb -',
+ },
+ 'playlist_mincount': 10,
+ }, {
'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR/Girl-Friend-BETA',
'only_matching': True,
}]
def _real_extract(self, url):
- lang, series_id = self._match_valid_url(url).group('lang', 'id')
- return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id.lower()}',
- CrunchyrollShowPlaylistIE.ie_key(), series_id)
+ lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id')
+
+ if not self._get_cookies(url).get('etp_rt'):
+ return self._redirect_from_beta(url, lang, internal_id, display_id, False, CrunchyrollShowPlaylistIE.ie_key())
+
+ api_domain, bucket, params = self._get_params(lang)
+
+ series_response = self._download_json(
+ f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id,
+ note='Retrieving series metadata', query=params)
+
+ seasons_response = self._download_json(
+ f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id,
+ note='Retrieving season list', query=params)
+
+ def entries():
+ for season in seasons_response['items']:
+ episodes_response = self._download_json(
+ f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id,
+ note=f'Retrieving episode list for {season.get("slug_title")}', query=params)
+ for episode in episodes_response['items']:
+ episode_id = episode['id']
+ episode_display_id = episode['slug_title']
+ yield {
+ '_type': 'url',
+ 'url': f'https://beta.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}',
+ 'ie_key': CrunchyrollBetaIE.ie_key(),
+ 'id': episode_id,
+ 'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')),
+ 'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')),
+ 'duration': float_or_none(episode.get('duration_ms'), 1000),
+ 'series': episode.get('series_title'),
+ 'series_id': episode.get('series_id'),
+ 'season': episode.get('season_title'),
+ 'season_id': episode.get('season_id'),
+ 'season_number': episode.get('season_number'),
+ 'episode': episode.get('title'),
+ 'episode_number': episode.get('sequence_number')
+ }
+
+ return self.playlist_result(entries(), internal_id, series_response.get('title'))
diff --git a/hypervideo_dl/extractor/cspan.py b/hypervideo_dl/extractor/cspan.py
index 2e01aff..f51159b 100644
--- a/hypervideo_dl/extractor/cspan.py
+++ b/hypervideo_dl/extractor/cspan.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_HTMLParseError
from ..utils import (
determine_ext,
ExtractorError,
@@ -11,14 +12,16 @@ from ..utils import (
get_element_by_attribute,
get_element_by_class,
int_or_none,
+ join_nonempty,
js_to_json,
merge_dicts,
parse_iso8601,
+ parse_qs,
smuggle_url,
str_to_int,
unescapeHTML,
)
-from .senateisvp import SenateISVPIE
+from .senategov import SenateISVPIE
from .ustream import UstreamIE
@@ -126,8 +129,12 @@ class CSpanIE(InfoExtractor):
ext = 'vtt'
subtitle['ext'] = ext
ld_info = self._search_json_ld(webpage, video_id, default={})
- title = get_element_by_class('video-page-title', webpage) or \
- self._og_search_title(webpage)
+ try:
+ title = get_element_by_class('video-page-title', webpage)
+ except compat_HTMLParseError:
+ title = None
+ if title is None:
+ title = self._og_search_title(webpage)
description = get_element_by_attribute('itemprop', 'description', webpage) or \
self._html_search_meta(['og:description', 'description'], webpage)
return merge_dicts(info, ld_info, {
@@ -242,3 +249,42 @@ class CSpanIE(InfoExtractor):
'title': title,
'id': 'c' + video_id if video_type == 'clip' else video_id,
}
+
+
+class CSpanCongressIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?c-span\.org/congress/'
+ _TESTS = [{
+ 'url': 'https://www.c-span.org/congress/?chamber=house&date=2017-12-13&t=1513208380',
+ 'info_dict': {
+ 'id': 'house_2017-12-13',
+ 'title': 'Congressional Chronicle - Members of Congress, Hearings and More',
+ 'description': 'md5:54c264b7a8f219937987610243305a84',
+ 'thumbnail': r're:https://ximage.c-spanvideo.org/.+',
+ 'ext': 'mp4'
+ }
+ }]
+
+ def _real_extract(self, url):
+ query = parse_qs(url)
+ video_date = query.get('date', [None])[0]
+ video_id = join_nonempty(query.get('chamber', ['senate'])[0], video_date, delim='_')
+ webpage = self._download_webpage(url, video_id)
+ if not video_date:
+ jwp_date = re.search(r'jwsetup.clipprogdate = \'(?P<date>\d{4}-\d{2}-\d{2})\';', webpage)
+ if jwp_date:
+ video_id = f'{video_id}_{jwp_date.group("date")}'
+ jwplayer_data = self._parse_json(
+ self._search_regex(r'jwsetup\s*=\s*({(?:.|\n)[^;]+});', webpage, 'player config'),
+ video_id, transform_source=js_to_json)
+
+ title = (self._og_search_title(webpage, default=None)
+ or self._html_extract_title(webpage, 'video title'))
+ description = (self._og_search_description(webpage, default=None)
+ or self._html_search_meta('description', webpage, 'description', default=None))
+
+ return {
+ **self._parse_jwplayer_data(jwplayer_data, video_id, False),
+ 'title': re.sub(r'\s+', ' ', title.split('|')[0]).strip(),
+ 'description': description,
+ 'http_headers': {'Referer': 'https://www.c-span.org/'},
+ }
diff --git a/hypervideo_dl/extractor/ctvnews.py b/hypervideo_dl/extractor/ctvnews.py
index 03f8cef..952f4c7 100644
--- a/hypervideo_dl/extractor/ctvnews.py
+++ b/hypervideo_dl/extractor/ctvnews.py
@@ -65,4 +65,9 @@ class CTVNewsIE(InfoExtractor):
})
entries = [ninecninemedia_url_result(clip_id) for clip_id in orderedSet(
re.findall(r'clip\.id\s*=\s*(\d+);', webpage))]
+ if not entries:
+ webpage = self._download_webpage(url, page_id)
+ if 'getAuthStates("' in webpage:
+ entries = [ninecninemedia_url_result(clip_id) for clip_id in
+ self._search_regex(r'getAuthStates\("([\d+,]+)"', webpage, 'clip ids').split(',')]
return self.playlist_result(entries, page_id)
diff --git a/hypervideo_dl/extractor/curiositystream.py b/hypervideo_dl/extractor/curiositystream.py
index 034a5c9..b8abcf7 100644
--- a/hypervideo_dl/extractor/curiositystream.py
+++ b/hypervideo_dl/extractor/curiositystream.py
@@ -15,7 +15,6 @@ from ..utils import (
class CuriosityStreamBaseIE(InfoExtractor):
_NETRC_MACHINE = 'curiositystream'
_auth_token = None
- _API_BASE_URL = 'https://api.curiositystream.com/v1/'
def _handle_errors(self, result):
error = result.get('error', {}).get('message')
@@ -34,43 +33,46 @@ class CuriosityStreamBaseIE(InfoExtractor):
self._handle_errors(result)
return result['data']
- def _real_initialize(self):
- email, password = self._get_login_info()
- if email is None:
- return
+ def _perform_login(self, username, password):
result = self._download_json(
- self._API_BASE_URL + 'login', None, data=urlencode_postdata({
- 'email': email,
+ 'https://api.curiositystream.com/v1/login', None,
+ note='Logging in', data=urlencode_postdata({
+ 'email': username,
'password': password,
}))
self._handle_errors(result)
- self._auth_token = result['message']['auth_token']
+ CuriosityStreamBaseIE._auth_token = result['message']['auth_token']
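# Why the token is now stored on the class instead of the instance: each
# URL gets a fresh extractor instance, and only a class attribute
# survives across them (minimal illustration):
class Base:
    _auth_token = None

first, second = Base(), Base()
Base._auth_token = 'tok'            # what _perform_login does above
assert second._auth_token == 'tok'  # later instances see the login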
class CuriosityStreamIE(CuriosityStreamBaseIE):
IE_NAME = 'curiositystream'
_VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'https://app.curiositystream.com/video/2',
'info_dict': {
'id': '2',
'ext': 'mp4',
'title': 'How Did You Develop The Internet?',
'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.',
+ 'channel': 'Curiosity Stream',
+ 'categories': ['Technology', 'Interview'],
+ 'average_rating': 96.79,
+ 'series_id': '2',
},
'params': {
- 'format': 'bestvideo',
# m3u8 download
'skip_download': True,
},
- }
+ }]
+
+ _API_BASE_URL = 'https://api.curiositystream.com/v1/media/'
def _real_extract(self, url):
video_id = self._match_id(url)
formats = []
for encoding_format in ('m3u8', 'mpd'):
- media = self._call_api('media/' + video_id, video_id, query={
+ media = self._call_api(video_id, video_id, query={
'encodingsNew': 'true',
'encodingsFormat': encoding_format,
})
@@ -140,12 +142,33 @@ class CuriosityStreamIE(CuriosityStreamBaseIE):
'duration': int_or_none(media.get('duration')),
'tags': media.get('tags'),
'subtitles': subtitles,
+ 'channel': media.get('producer'),
+ 'categories': [media.get('primary_category'), media.get('type')],
+ 'average_rating': media.get('rating_percentage'),
+ 'series_id': str(media.get('collection_id') or '') or None,
}
-class CuriosityStreamCollectionIE(CuriosityStreamBaseIE):
- IE_NAME = 'curiositystream:collection'
- _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collections?|series)/(?P<id>\d+)'
+class CuriosityStreamCollectionBaseIE(CuriosityStreamBaseIE):
+
+ def _real_extract(self, url):
+ collection_id = self._match_id(url)
+ collection = self._call_api(collection_id, collection_id)
+ entries = []
+ for media in collection.get('media', []):
+ media_id = compat_str(media.get('id'))
+ media_type, ie = ('series', CuriosityStreamSeriesIE) if media.get('is_collection') else ('video', CuriosityStreamIE)
+ entries.append(self.url_result(
+ 'https://curiositystream.com/%s/%s' % (media_type, media_id),
+ ie=ie.ie_key(), video_id=media_id))
+ return self.playlist_result(
+ entries, collection_id,
+ collection.get('title'), collection.get('description'))
+
+
+class CuriosityStreamCollectionsIE(CuriosityStreamCollectionBaseIE):
+ IE_NAME = 'curiositystream:collections'
+ _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/collections/(?P<id>\d+)'
_API_BASE_URL = 'https://api.curiositystream.com/v2/collections/'
_TESTS = [{
'url': 'https://curiositystream.com/collections/86',
@@ -156,7 +179,17 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE):
},
'playlist_mincount': 7,
}, {
- 'url': 'https://app.curiositystream.com/collection/2',
+ 'url': 'https://curiositystream.com/collections/36',
+ 'only_matching': True,
+ }]
+
+
+class CuriosityStreamSeriesIE(CuriosityStreamCollectionBaseIE):
+ IE_NAME = 'curiositystream:series'
+ _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:series|collection)/(?P<id>\d+)'
+ _API_BASE_URL = 'https://api.curiositystream.com/v2/series/'
+ _TESTS = [{
+ 'url': 'https://curiositystream.com/series/2',
'info_dict': {
'id': '2',
'title': 'Curious Minds: The Internet',
@@ -164,23 +197,6 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE):
},
'playlist_mincount': 16,
}, {
- 'url': 'https://curiositystream.com/series/2',
- 'only_matching': True,
- }, {
- 'url': 'https://curiositystream.com/collections/36',
+ 'url': 'https://curiositystream.com/collection/2',
'only_matching': True,
}]
-
- def _real_extract(self, url):
- collection_id = self._match_id(url)
- collection = self._call_api(collection_id, collection_id)
- entries = []
- for media in collection.get('media', []):
- media_id = compat_str(media.get('id'))
- media_type, ie = ('series', CuriosityStreamCollectionIE) if media.get('is_collection') else ('video', CuriosityStreamIE)
- entries.append(self.url_result(
- 'https://curiositystream.com/%s/%s' % (media_type, media_id),
- ie=ie.ie_key(), video_id=media_id))
- return self.playlist_result(
- entries, collection_id,
- collection.get('title'), collection.get('description'))
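
Note that `_perform_login` now assigns the token to `CuriosityStreamBaseIE._auth_token` rather than `self._auth_token`: binding it to the class makes a single login visible to every extractor instance in the session, the same idea as the `_auth_token_cache` introduced for DPlay later in this patch. A minimal sketch of the pattern, independent of the extractor framework:

    class ApiClientBase:
        _auth_token = None  # shared across all instances

        def login(self, token):
            # Assign on the class, not the instance, so subclasses see it too
            ApiClientBase._auth_token = token

    class VideoClient(ApiClientBase):
        pass

    class SeriesClient(ApiClientBase):
        pass

    VideoClient().login('abc123')
    print(SeriesClient()._auth_token)  # 'abc123' - one login, shared state
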
diff --git a/hypervideo_dl/extractor/cybrary.py b/hypervideo_dl/extractor/cybrary.py
new file mode 100644
index 0000000..c278f0f
--- /dev/null
+++ b/hypervideo_dl/extractor/cybrary.py
@@ -0,0 +1,146 @@
+# coding: utf-8
+from .common import InfoExtractor
+
+from ..utils import (
+ ExtractorError,
+ smuggle_url,
+ str_or_none,
+ traverse_obj,
+ urlencode_postdata
+)
+
+
+class CybraryBaseIE(InfoExtractor):
+ _API_KEY = 'AIzaSyCX9ru6j70PX2My1Eq6Q1zoMAhuTdXlzSw'
+ _ENDPOINTS = {
+ 'course': 'https://app.cybrary.it/courses/api/catalog/browse/course/{}',
+ 'course_enrollment': 'https://app.cybrary.it/courses/api/catalog/{}/enrollment',
+ 'enrollment': 'https://app.cybrary.it/courses/api/enrollment/{}',
+ 'launch': 'https://app.cybrary.it/courses/api/catalog/{}/launch',
+ 'vimeo_oembed': 'https://vimeo.com/api/oembed.json?url=https://vimeo.com/{}',
+ }
+ _NETRC_MACHINE = 'cybrary'
+ _TOKEN = None
+
+ def _perform_login(self, username, password):
+ CybraryBaseIE._TOKEN = self._download_json(
+ f'https://identitytoolkit.googleapis.com/v1/accounts:signInWithPassword?key={self._API_KEY}',
+ None, data=urlencode_postdata({'email': username, 'password': password, 'returnSecureToken': True}),
+ note='Logging in')['idToken']
+
+ def _real_initialize(self):
+ if not self._TOKEN:
+ self.raise_login_required(method='password')
+
+ def _call_api(self, endpoint, item_id):
+ return self._download_json(
+ self._ENDPOINTS[endpoint].format(item_id), item_id,
+ note=f'Downloading {endpoint} JSON metadata',
+ headers={'Authorization': f'Bearer {self._TOKEN}'})
+
+ def _get_vimeo_id(self, activity_id):
+ launch_api = self._call_api('launch', activity_id)
+
+ if launch_api.get('url'):
+ return self._search_regex(r'https?://player\.vimeo\.com/video/(?P<vimeo_id>[0-9]+)', launch_api['url'], 'vimeo_id')
+ return traverse_obj(launch_api, ('vendor_data', 'content', ..., 'videoId'), get_all=False)
+
+
+class CybraryIE(CybraryBaseIE):
+ _VALID_URL = r'https?://app.cybrary.it/immersive/(?P<enrollment>[0-9]+)/activity/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'https://app.cybrary.it/immersive/12487950/activity/63102',
+ 'md5': '9ae12d37e555cb2ed554223a71a701d0',
+ 'info_dict': {
+ 'id': '646609770',
+ 'ext': 'mp4',
+ 'title': 'Getting Started',
+ 'thumbnail': 'https://i.vimeocdn.com/video/1301817996-76a268f0c56cff18a5cecbbdc44131eb9dda0c80eb0b3a036_1280',
+ 'series_id': '63111',
+ 'uploader_url': 'https://vimeo.com/user30867300',
+ 'duration': 88,
+ 'uploader_id': 'user30867300',
+ 'series': 'Cybrary Orientation',
+ 'uploader': 'Cybrary',
+ 'chapter': 'Cybrary Orientation Series',
+ 'chapter_id': '63110'
+ },
+ 'expected_warnings': ['No authenticators for vimeo']
+ }, {
+ 'url': 'https://app.cybrary.it/immersive/12747143/activity/52686',
+ 'md5': '62f26547dccc59c44363e2a13d4ad08d',
+ 'info_dict': {
+ 'id': '445638073',
+ 'ext': 'mp4',
+ 'title': 'Azure Virtual Network IP Addressing',
+ 'thumbnail': 'https://i.vimeocdn.com/video/936667051-1647ace66c627d4a2382185e0dae8deb830309bfddd53f8b2367b2f91e92ed0e-d_1280',
+ 'series_id': '52733',
+ 'uploader_url': 'https://vimeo.com/user30867300',
+ 'duration': 426,
+ 'uploader_id': 'user30867300',
+ 'series': 'AZ-500: Microsoft Azure Security Technologies',
+ 'uploader': 'Cybrary',
+ 'chapter': 'Implement Network Security',
+ 'chapter_id': '52693'
+ },
+ 'expected_warnings': ['No authenticators for vimeo']
+ }]
+
+ def _real_extract(self, url):
+ activity_id, enrollment_id = self._match_valid_url(url).group('id', 'enrollment')
+ course = self._call_api('enrollment', enrollment_id)['content']
+ activity = traverse_obj(course, ('learning_modules', ..., 'activities', lambda _, v: int(activity_id) == v['id']), get_all=False)
+
+ if activity.get('type') not in ['Video Activity', 'Lesson Activity']:
+ raise ExtractorError('The activity is not a video', expected=True)
+
+ module = next((m for m in course.get('learning_modules') or []
+ if int(activity_id) in (traverse_obj(m, ('activities', ..., 'id')) or [])), None)
+
+ vimeo_id = self._get_vimeo_id(activity_id)
+
+ return {
+ '_type': 'url_transparent',
+ 'series': traverse_obj(course, ('content_description', 'title')),
+ 'series_id': str_or_none(traverse_obj(course, ('content_description', 'id'))),
+ 'id': vimeo_id,
+ 'chapter': module.get('title'),
+ 'chapter_id': str_or_none(module.get('id')),
+ 'title': activity.get('title'),
+ 'url': smuggle_url(f'https://player.vimeo.com/video/{vimeo_id}', {'http_headers': {'Referer': 'https://api.cybrary.it'}})
+ }
+
+
+class CybraryCourseIE(CybraryBaseIE):
+ _VALID_URL = r'https://app.cybrary.it/browse/course/(?P<id>[\w-]+)/?(?:$|[#?])'
+ _TESTS = [{
+ 'url': 'https://app.cybrary.it/browse/course/az-500-microsoft-azure-security-technologies',
+ 'info_dict': {
+ 'id': 898,
+ 'title': 'AZ-500: Microsoft Azure Security Technologies',
+ 'description': 'md5:69549d379c0fc1dec92926d4e8b6fbd4'
+ },
+ 'playlist_count': 59
+ }, {
+ 'url': 'https://app.cybrary.it/browse/course/cybrary-orientation',
+ 'info_dict': {
+ 'id': 1245,
+ 'title': 'Cybrary Orientation',
+ 'description': 'md5:9e69ff66b32fe78744e0ad4babe2e88e'
+ },
+ 'playlist_count': 4
+ }]
+
+ def _real_extract(self, url):
+ course_id = self._match_id(url)
+ course = self._call_api('course', course_id)
+ enrollment_info = self._call_api('course_enrollment', course['id'])
+
+ entries = [self.url_result(
+ f'https://app.cybrary.it/immersive/{enrollment_info["id"]}/activity/{activity["id"]}')
+ for activity in traverse_obj(course, ('content_item', 'learning_modules', ..., 'activities', ...))]
+
+ return self.playlist_result(
+ entries,
+ traverse_obj(course, ('content_item', 'id'), expected_type=str_or_none),
+ course.get('title'), course.get('short_description'))
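
The lookup `traverse_obj(course, ('learning_modules', ..., 'activities', lambda _, v: int(activity_id) == v['id']), get_all=False)` walks every module's activity list and returns the first activity whose `id` matches. (The fallback `or []` in the `module` search above belonged outside the `traverse_obj` call, fixed here, otherwise a `None` result would make the `in` test raise.) In plain Python the matching lookup is roughly equivalent to this, with hypothetical data:

    course = {'learning_modules': [
        {'activities': [{'id': 63101}, {'id': 63102, 'title': 'Getting Started'}]},
    ]}
    activity_id = '63102'

    activity = next(
        (a for m in course.get('learning_modules') or []
         for a in m.get('activities') or []
         if a.get('id') == int(activity_id)),
        None)
    print(activity)  # {'id': 63102, 'title': 'Getting Started'}
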
diff --git a/hypervideo_dl/extractor/daftsex.py b/hypervideo_dl/extractor/daftsex.py
new file mode 100644
index 0000000..6037fd9
--- /dev/null
+++ b/hypervideo_dl/extractor/daftsex.py
@@ -0,0 +1,146 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_b64decode
+from ..utils import (
+ int_or_none,
+ js_to_json,
+ parse_count,
+ parse_duration,
+ traverse_obj,
+ try_get,
+ unified_timestamp,
+)
+
+
+class DaftsexIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?daftsex\.com/watch/(?P<id>-?\d+_\d+)'
+ _TESTS = [{
+ 'url': 'https://daftsex.com/watch/-35370899_456246186',
+ 'md5': 'd95135e6cea2d905bea20dbe82cda64a',
+ 'info_dict': {
+ 'id': '-35370899_456246186',
+ 'ext': 'mp4',
+ 'title': 'just relaxing',
+ 'description': 'just relaxing - Watch video Watch video in high quality',
+ 'upload_date': '20201113',
+ 'timestamp': 1605261911,
+ 'thumbnail': r're:https://[^/]+/impf/-43BuMDIawmBGr3GLcZ93CYwWf2PBv_tVWoS1A/dnu41DnARU4\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=6af2c26ff4a45e55334189301c867384&type=video_thumb',
+ },
+ }, {
+ 'url': 'https://daftsex.com/watch/-156601359_456242791',
+ 'info_dict': {
+ 'id': '-156601359_456242791',
+ 'ext': 'mp4',
+ 'title': 'Skye Blue - Dinner And A Show',
+ 'description': 'Skye Blue - Dinner And A Show - Watch video Watch video in high quality',
+ 'upload_date': '20200916',
+ 'timestamp': 1600250735,
+ 'thumbnail': 'https://psv153-1.crazycloud.ru/videos/-156601359/456242791/thumb.jpg?extra=i3D32KaBbBFf9TqDRMAVmQ',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ title = self._html_search_meta('name', webpage, 'title')
+ timestamp = unified_timestamp(self._html_search_meta('uploadDate', webpage, 'Upload Date', default=None))
+ description = self._html_search_meta('description', webpage, 'Description', default=None)
+
+ duration = parse_duration(self._search_regex(
+ r'Duration: ((?:[0-9]{2}:){0,2}[0-9]{2})',
+ webpage, 'duration', fatal=False))
+ views = parse_count(self._search_regex(
+ r'Views: ([0-9 ]+)',
+ webpage, 'views', fatal=False))
+
+ player_hash = self._search_regex(
+ r'DaxabPlayer\.Init\({[\s\S]*hash:\s*"([0-9a-zA-Z_\-]+)"[\s\S]*}',
+ webpage, 'player hash')
+ player_color = self._search_regex(
+ r'DaxabPlayer\.Init\({[\s\S]*color:\s*"([0-9a-z]+)"[\s\S]*}',
+ webpage, 'player color', fatal=False) or ''
+
+ embed_page = self._download_webpage(
+ 'https://daxab.com/player/%s?color=%s' % (player_hash, player_color),
+ video_id, headers={'Referer': url})
+ video_params = self._parse_json(
+ self._search_regex(
+ r'window\.globParams\s*=\s*({[\S\s]+})\s*;\s*<\/script>',
+ embed_page, 'video parameters'),
+ video_id, transform_source=js_to_json)
+
+ server_domain = 'https://%s' % compat_b64decode(video_params['server'][::-1]).decode('utf-8')
+
+ cdn_files = traverse_obj(video_params, ('video', 'cdn_files')) or {}
+ if cdn_files:
+ formats = []
+ for format_id, format_data in cdn_files.items():
+ ext, height = format_id.split('_')
+ formats.append({
+ 'format_id': format_id,
+ 'url': f'{server_domain}/videos/{video_id.replace("_", "/")}/{height}.mp4?extra={format_data.split(".")[-1]}',
+ 'height': int_or_none(height),
+ 'ext': ext,
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'description': description,
+ 'duration': duration,
+ 'thumbnail': try_get(video_params, lambda vi: 'https:' + compat_b64decode(vi['video']['thumb']).decode('utf-8')),
+ 'timestamp': timestamp,
+ 'view_count': views,
+ 'age_limit': 18,
+ }
+
+ item = self._download_json(
+ f'{server_domain}/method/video.get/{video_id}', video_id,
+ headers={'Referer': url}, query={
+ 'token': video_params['video']['access_token'],
+ 'videos': video_id,
+ 'ckey': video_params['c_key'],
+ 'credentials': video_params['video']['credentials'],
+ })['response']['items'][0]
+
+ formats = []
+ for f_id, f_url in item.get('files', {}).items():
+ if f_id == 'external':
+ return self.url_result(f_url)
+ ext, height = f_id.split('_')
+ height_extra_key = traverse_obj(video_params, ('video', 'partial', 'quality', height))
+ if height_extra_key:
+ formats.append({
+ 'format_id': f'{height}p',
+ 'url': f'{server_domain}/{f_url[8:]}&videos={video_id}&extra_key={height_extra_key}',
+ 'height': int_or_none(height),
+ 'ext': ext,
+ })
+ self._sort_formats(formats)
+
+ thumbnails = []
+ for k, v in item.items():
+ if k.startswith('photo_') and v:
+ width = k.replace('photo_', '')
+ thumbnails.append({
+ 'id': width,
+ 'url': v,
+ 'width': int_or_none(width),
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'comment_count': int_or_none(item.get('comments')),
+ 'description': description,
+ 'duration': duration,
+ 'thumbnails': thumbnails,
+ 'timestamp': timestamp,
+ 'view_count': views,
+ 'age_limit': 18,
+ }
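
The daftsex server domain is stored reversed and base64-encoded in the player config; `compat_b64decode(video_params['server'][::-1])` undoes both steps. A self-contained round-trip sketch with a hypothetical hostname:

    import base64

    encoded = base64.b64encode(b'psv153-1.crazycloud.ru').decode()[::-1]  # as shipped in the page config
    server_domain = 'https://' + base64.b64decode(encoded[::-1]).decode('utf-8')
    print(server_domain)  # https://psv153-1.crazycloud.ru
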
diff --git a/hypervideo_dl/extractor/dailymotion.py b/hypervideo_dl/extractor/dailymotion.py
index e04e10b..9cb5618 100644
--- a/hypervideo_dl/extractor/dailymotion.py
+++ b/hypervideo_dl/extractor/dailymotion.py
@@ -94,10 +94,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
_VALID_URL = r'''(?ix)
https?://
(?:
- (?:(?:www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|\#)/)?video|swf)|
+ (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:(?:embed|swf|\#)/)|player\.html\?)?video|swf)|
(?:www\.)?lequipe\.fr/video
)
- /(?P<id>[^/?_]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))?
+ [/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))?
'''
IE_NAME = 'dailymotion'
_TESTS = [{
@@ -116,6 +116,25 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'age_limit': 0,
},
}, {
+ 'url': 'https://geo.dailymotion.com/player.html?video=x89eyek&mute=true',
+ 'md5': 'e2f9717c6604773f963f069ca53a07f8',
+ 'info_dict': {
+ 'id': 'x89eyek',
+ 'ext': 'mp4',
+ 'title': "En quête d'esprit du 27/03/2022",
+ 'description': 'md5:66542b9f4df2eb23f314fc097488e553',
+ 'duration': 2756,
+ 'timestamp': 1648383669,
+ 'upload_date': '20220327',
+ 'uploader': 'CNEWS',
+ 'uploader_id': 'x24vth',
+ 'age_limit': 0,
+ 'view_count': int,
+ 'like_count': int,
+ 'tags': ['en_quete_d_esprit'],
+ 'thumbnail': 'https://s2.dmcdn.net/v/Tncwi1YGKdvFbDuDY/x1080',
+ }
+ }, {
'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
'md5': '2137c41a8e78554bb09225b8eb322406',
'info_dict': {
@@ -207,12 +226,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
video_id, playlist_id = self._match_valid_url(url).groups()
if playlist_id:
- if not self.get_param('noplaylist'):
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id)
+ if self._yes_playlist(playlist_id, video_id):
return self.url_result(
'http://www.dailymotion.com/playlist/' + playlist_id,
'DailymotionPlaylist', playlist_id)
- self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
password = self.get_param('videopassword')
media = self._call_api(
@@ -261,9 +278,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
continue
if media_type == 'application/x-mpegURL':
formats.extend(self._extract_m3u8_formats(
- media_url, video_id, 'mp4',
- 'm3u8' if is_live else 'm3u8_native',
- m3u8_id='hls', fatal=False))
+ media_url, video_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False))
else:
f = {
'url': media_url,
@@ -305,7 +320,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
return {
'id': video_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'description': clean_html(media.get('description')),
'thumbnails': thumbnails,
'duration': int_or_none(metadata.get('duration')) or None,
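
The manual `--no-playlist` branches are replaced by `self._yes_playlist(playlist_id, video_id)`, which centralizes the parameter check and the user-facing messages; the daum.py change below applies the same cleanup. A rough standalone sketch of what such a helper does, with the exact message wording approximated:

    params = {'noplaylist': False}  # hypothetical parsed options

    def yes_playlist(playlist_id, video_id):
        # Prefer the playlist unless --no-playlist was given
        if playlist_id and not params.get('noplaylist'):
            print(f'Downloading playlist {playlist_id} - add --no-playlist to download just the video')
            return True
        print(f'Downloading just video {video_id} because of --no-playlist')
        return False

    yes_playlist('xvyz123', 'x89eyek')  # -> True, playlist chosen
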
diff --git a/hypervideo_dl/extractor/daum.py b/hypervideo_dl/extractor/daum.py
index 8aa2af9..4362e92 100644
--- a/hypervideo_dl/extractor/daum.py
+++ b/hypervideo_dl/extractor/daum.py
@@ -157,11 +157,8 @@ class DaumListIE(InfoExtractor):
query_dict = parse_qs(url)
if 'clipid' in query_dict:
clip_id = query_dict['clipid'][0]
- if self.get_param('noplaylist'):
- self.to_screen('Downloading just video %s because of --no-playlist' % clip_id)
+ if not self._yes_playlist(list_id, clip_id):
return self.url_result(DaumClipIE._URL_TEMPLATE % clip_id, 'DaumClip')
- else:
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % list_id)
class DaumPlaylistIE(DaumListIE):
diff --git a/hypervideo_dl/extractor/daystar.py b/hypervideo_dl/extractor/daystar.py
new file mode 100644
index 0000000..4f59d90
--- /dev/null
+++ b/hypervideo_dl/extractor/daystar.py
@@ -0,0 +1,48 @@
+from .common import InfoExtractor
+from ..utils import js_to_json, urljoin
+
+
+class DaystarClipIE(InfoExtractor):
+ IE_NAME = 'daystar:clip'
+ _VALID_URL = r'https?://player\.daystar\.tv/(?P<id>\w+)'
+ _TESTS = [{
+ 'url': 'https://player.daystar.tv/0MTO2ITM',
+ 'info_dict': {
+ 'id': '0MTO2ITM',
+ 'ext': 'mp4',
+ 'title': 'The Dark World of COVID Pt. 1 | Aaron Siri',
+ 'description': 'md5:a420d320dda734e5f29458df3606c5f4',
+ 'thumbnail': r're:^https?://.+\.jpg',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ src_iframe = self._search_regex(r'\<iframe[^>]+src="([^"]+)"', webpage, 'src iframe')
+ webpage_iframe = self._download_webpage(
+ src_iframe.replace('player.php', 'config2.php'), video_id, headers={'Referer': src_iframe})
+
+ sources = self._parse_json(self._search_regex(
+ r'sources\:\s*(\[.*?\])', webpage_iframe, 'm3u8 source'), video_id, transform_source=js_to_json)
+
+ formats, subtitles = [], {}
+ for source in sources:
+ file = source.get('file')
+ if file and source.get('type') == 'm3u8':
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
+ urljoin('https://www.lightcast.com/embed/', file),
+ video_id, 'mp4', fatal=False, headers={'Referer': src_iframe})
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage),
+ 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage),
+ 'thumbnail': self._search_regex(r'image:\s*"([^"]+)', webpage_iframe, 'thumbnail'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
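
The daystar loop accumulates formats from every m3u8 source and folds the per-source subtitle dicts together. A minimal sketch of the merge pattern; the helper here is a simplified stand-in for `_merge_subtitles`, which extends the entry list per language:

    def merge_subtitles(*dicts):
        # Simplified stand-in: concatenate subtitle entries per language
        merged = {}
        for d in dicts:
            for lang, entries in d.items():
                merged.setdefault(lang, []).extend(entries)
        return merged

    subtitles = {}
    for subs in ({'en': [{'url': 'a.vtt'}]}, {'en': [{'url': 'b.vtt'}], 'de': [{'url': 'c.vtt'}]}):
        subtitles = merge_subtitles(subtitles, subs)
    print(subtitles)  # {'en': [{'url': 'a.vtt'}, {'url': 'b.vtt'}], 'de': [{'url': 'c.vtt'}]}
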
diff --git a/hypervideo_dl/extractor/digitalconcerthall.py b/hypervideo_dl/extractor/digitalconcerthall.py
new file mode 100644
index 0000000..8398ae3
--- /dev/null
+++ b/hypervideo_dl/extractor/digitalconcerthall.py
@@ -0,0 +1,141 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+from ..utils import (
+ ExtractorError,
+ parse_resolution,
+ traverse_obj,
+ try_get,
+ urlencode_postdata,
+)
+
+
+class DigitalConcertHallIE(InfoExtractor):
+ IE_DESC = 'DigitalConcertHall extractor'
+ _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P<language>[a-z]+)/concert/(?P<id>[0-9]+)'
+ _OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token'
+ _ACCESS_TOKEN = None
+ _NETRC_MACHINE = 'digitalconcerthall'
+ _TESTS = [{
+ 'note': 'Playlist with only one video',
+ 'url': 'https://www.digitalconcerthall.com/en/concert/53201',
+ 'info_dict': {
+ 'id': '53201-1',
+ 'ext': 'mp4',
+ 'composer': 'Kurt Weill',
+ 'title': '[Magic Night]',
+ 'thumbnail': r're:^https?://images.digitalconcerthall.com/cms/thumbnails.*\.jpg$',
+ 'upload_date': '20210624',
+ 'timestamp': 1624548600,
+ 'duration': 2798,
+ 'album_artist': 'Members of the Berliner Philharmoniker / Simon Rössler',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'note': 'Concert with several works and an interview',
+ 'url': 'https://www.digitalconcerthall.com/en/concert/53785',
+ 'info_dict': {
+ 'id': '53785',
+ 'album_artist': 'Berliner Philharmoniker / Kirill Petrenko',
+ 'title': 'Kirill Petrenko conducts Mendelssohn and Shostakovich',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ 'playlist_count': 3,
+ }]
+
+ def _perform_login(self, username, password):
+ token_response = self._download_json(
+ self._OAUTH_URL,
+ None, 'Obtaining token', errnote='Unable to obtain token', data=urlencode_postdata({
+ 'affiliate': 'none',
+ 'grant_type': 'device',
+ 'device_vendor': 'unknown',
+ 'app_id': 'dch.webapp',
+ 'app_version': '1.0.0',
+ 'client_secret': '2ySLN+2Fwb',
+ }), headers={
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ })
+ self._ACCESS_TOKEN = token_response['access_token']
+ try:
+ self._download_json(
+ self._OAUTH_URL,
+ None, note='Logging in', errnote='Unable to login', data=urlencode_postdata({
+ 'grant_type': 'password',
+ 'username': username,
+ 'password': password,
+ }), headers={
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ 'Referer': 'https://www.digitalconcerthall.com',
+ 'Authorization': f'Bearer {self._ACCESS_TOKEN}'
+ })
+ except ExtractorError:
+ self.raise_login_required(msg='Login info incorrect')
+
+ def _real_initialize(self):
+ if not self._ACCESS_TOKEN:
+ self.raise_login_required(method='password')
+
+ def _entries(self, items, language, **kwargs):
+ for item in items:
+ video_id = item['id']
+ stream_info = self._download_json(
+ self._proto_relative_url(item['_links']['streams']['href']), video_id, headers={
+ 'Accept': 'application/json',
+ 'Authorization': f'Bearer {self._ACCESS_TOKEN}',
+ 'Accept-Language': language
+ })
+
+ m3u8_url = traverse_obj(
+ stream_info, ('channel', lambda x: x.startswith('vod_mixed'), 'stream', 0, 'url'), get_all=False)
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native', fatal=False)
+ self._sort_formats(formats)
+
+ yield {
+ 'id': video_id,
+ 'title': item.get('title'),
+ 'composer': item.get('name_composer'),
+ 'url': m3u8_url,
+ 'formats': formats,
+ 'duration': item.get('duration_total'),
+ 'timestamp': traverse_obj(item, ('date', 'published')),
+ 'description': item.get('short_description') or stream_info.get('short_description'),
+ **kwargs,
+ 'chapters': [{
+ 'start_time': chapter.get('time'),
+ 'end_time': try_get(chapter, lambda x: x['time'] + x['duration']),
+ 'title': chapter.get('text'),
+ } for chapter in item['cuepoints']] if item.get('cuepoints') else None,
+ }
+
+ def _real_extract(self, url):
+ language, video_id = self._match_valid_url(url).group('language', 'id')
+ if not language:
+ language = 'en'
+
+ thumbnail_url = self._html_search_regex(
+ r'(https?://images\.digitalconcerthall\.com/cms/thumbnails/.*\.jpg)',
+ self._download_webpage(url, video_id), 'thumbnail')
+ thumbnails = [{
+ 'url': thumbnail_url,
+ **parse_resolution(thumbnail_url)
+ }]
+
+ vid_info = self._download_json(
+ f'https://api.digitalconcerthall.com/v2/concert/{video_id}', video_id, headers={
+ 'Accept': 'application/json',
+ 'Accept-Language': language
+ })
+ album_artist = ' / '.join(traverse_obj(vid_info, ('_links', 'artist', ..., 'name')) or '')
+
+ return {
+ '_type': 'playlist',
+ 'id': video_id,
+ 'title': vid_info.get('title'),
+ 'entries': self._entries(traverse_obj(vid_info, ('_embedded', ..., ...)), language,
+ thumbnails=thumbnails, album_artist=album_artist),
+ 'thumbnails': thumbnails,
+ 'album_artist': album_artist,
+ }
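
Login here is a two-step OAuth flow against a single token endpoint: a `device` grant first yields an anonymous access token, and the `password` grant is then sent with that token in the `Authorization` header. A hedged sketch of the flow using `requests` (endpoint and field names as in the extractor above; credentials are placeholders and error handling is omitted):

    import requests

    OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token'

    # Step 1: device grant -> anonymous access token
    token = requests.post(OAUTH_URL, data={
        'affiliate': 'none', 'grant_type': 'device', 'device_vendor': 'unknown',
        'app_id': 'dch.webapp', 'app_version': '1.0.0', 'client_secret': '2ySLN+2Fwb',
    }).json()['access_token']

    # Step 2: password grant, authorized with the anonymous token
    requests.post(OAUTH_URL, data={
        'grant_type': 'password', 'username': 'user@example.com', 'password': 'hunter2',
    }, headers={'Authorization': f'Bearer {token}',
                'Referer': 'https://www.digitalconcerthall.com'})
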
diff --git a/hypervideo_dl/extractor/disney.py b/hypervideo_dl/extractor/disney.py
index f018cbe..0ad7b1f 100644
--- a/hypervideo_dl/extractor/disney.py
+++ b/hypervideo_dl/extractor/disney.py
@@ -7,8 +7,8 @@ from .common import InfoExtractor
from ..utils import (
int_or_none,
unified_strdate,
- compat_str,
determine_ext,
+ join_nonempty,
update_url_query,
)
@@ -119,18 +119,13 @@ class DisneyIE(InfoExtractor):
continue
formats.append(f)
continue
- format_id = []
- if flavor_format:
- format_id.append(flavor_format)
- if tbr:
- format_id.append(compat_str(tbr))
ext = determine_ext(flavor_url)
if flavor_format == 'applehttp' or ext == 'm3u8':
ext = 'mp4'
width = int_or_none(flavor.get('width'))
height = int_or_none(flavor.get('height'))
formats.append({
- 'format_id': '-'.join(format_id),
+ 'format_id': join_nonempty(flavor_format, tbr),
'url': flavor_url,
'width': width,
'height': height,
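
`join_nonempty(flavor_format, tbr)` replaces the hand-rolled list building in disney.py: it stringifies its arguments, drops falsy ones, and joins the rest with `-` by default. A minimal reimplementation of the idea, for illustration only:

    def join_nonempty(*values, delim='-'):
        # Keep only truthy values, stringified, joined by the delimiter
        return delim.join(str(v) for v in values if v)

    print(join_nonempty('applehttp', 1500))  # 'applehttp-1500'
    print(join_nonempty(None, 1500))         # '1500'
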
diff --git a/hypervideo_dl/extractor/dispeak.py b/hypervideo_dl/extractor/dispeak.py
index be7ad12..3d651f3 100644
--- a/hypervideo_dl/extractor/dispeak.py
+++ b/hypervideo_dl/extractor/dispeak.py
@@ -74,13 +74,11 @@ class DigitallySpeakingIE(InfoExtractor):
tbr = int_or_none(bitrate)
vbr = int_or_none(self._search_regex(
r'-(\d+)\.mp4', video_path, 'vbr', default=None))
- abr = tbr - vbr if tbr and vbr else None
video_formats.append({
'format_id': bitrate,
'url': url,
'tbr': tbr,
'vbr': vbr,
- 'abr': abr,
})
return video_formats
@@ -121,6 +119,7 @@ class DigitallySpeakingIE(InfoExtractor):
video_formats = self._parse_mp4(metadata)
if video_formats is None:
video_formats = self._parse_flv(metadata)
+ self._sort_formats(video_formats)
return {
'id': video_id,
diff --git a/hypervideo_dl/extractor/dlive.py b/hypervideo_dl/extractor/dlive.py
index 90462c0..7410eb6 100644
--- a/hypervideo_dl/extractor/dlive.py
+++ b/hypervideo_dl/extractor/dlive.py
@@ -84,7 +84,7 @@ class DLiveStreamIE(InfoExtractor):
self._sort_formats(formats)
return {
'id': display_name,
- 'title': self._live_title(title),
+ 'title': title,
'uploader': display_name,
'uploader_id': username,
'formats': formats,
diff --git a/hypervideo_dl/extractor/doodstream.py b/hypervideo_dl/extractor/doodstream.py
index 2c9ea68..f692127 100644
--- a/hypervideo_dl/extractor/doodstream.py
+++ b/hypervideo_dl/extractor/doodstream.py
@@ -21,6 +21,16 @@ class DoodStreamIE(InfoExtractor):
'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg',
}
}, {
+ 'url': 'http://dood.watch/d/5s1wmbdacezb',
+ 'md5': '4568b83b31e13242b3f1ff96c55f0595',
+ 'info_dict': {
+ 'id': '5s1wmbdacezb',
+ 'ext': 'mp4',
+ 'title': 'Kat Wonders - Monthly May 2020',
+ 'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com',
+ 'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg',
+ }
+ }, {
'url': 'https://dood.to/d/jzrxn12t2s7n',
'md5': '3207e199426eca7c2aa23c2872e6728a',
'info_dict': {
@@ -34,31 +44,26 @@ class DoodStreamIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
+ url = f'https://dood.to/e/{video_id}'
webpage = self._download_webpage(url, video_id)
- if '/d/' in url:
- url = "https://dood.to" + self._html_search_regex(
- r'<iframe src="(/e/[a-z0-9]+)"', webpage, 'embed')
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- title = self._html_search_meta(['og:title', 'twitter:title'],
- webpage, default=None)
- thumb = self._html_search_meta(['og:image', 'twitter:image'],
- webpage, default=None)
+ title = self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None)
+ thumb = self._html_search_meta(['og:image', 'twitter:image'], webpage, default=None)
token = self._html_search_regex(r'[?&]token=([a-z0-9]+)[&\']', webpage, 'token')
description = self._html_search_meta(
- ['og:description', 'description', 'twitter:description'],
- webpage, default=None)
- auth_url = 'https://dood.to' + self._html_search_regex(
- r'(/pass_md5.*?)\'', webpage, 'pass_md5')
+ ['og:description', 'description', 'twitter:description'], webpage, default=None)
+
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/66.0',
'referer': url
}
- webpage = self._download_webpage(auth_url, video_id, headers=headers)
- final_url = webpage + ''.join([random.choice(string.ascii_letters + string.digits) for _ in range(10)]) + "?token=" + token + "&expiry=" + str(int(time.time() * 1000))
+ pass_md5 = self._html_search_regex(r'(/pass_md5.*?)\'', webpage, 'pass_md5')
+ final_url = ''.join((
+ self._download_webpage(f'https://dood.to{pass_md5}', video_id, headers=headers),
+ *(random.choice(string.ascii_letters + string.digits) for _ in range(10)),
+ f'?token={token}&expiry={int(time.time() * 1000)}',
+ ))
return {
'id': video_id,
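
The rewritten doodstream `final_url` concatenates the `/pass_md5` response body, ten random alphanumeric characters, and the token plus a millisecond expiry. As a standalone sketch, where the base URL is a hypothetical stand-in for the `/pass_md5` response:

    import random
    import string
    import time

    pass_md5_body = 'https://host.example/abcd/'  # hypothetical /pass_md5 response
    token = 'deadbeef'

    final_url = ''.join((
        pass_md5_body,
        *(random.choice(string.ascii_letters + string.digits) for _ in range(10)),
        f'?token={token}&expiry={int(time.time() * 1000)}',
    ))
    print(final_url)
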
diff --git a/hypervideo_dl/extractor/douyutv.py b/hypervideo_dl/extractor/douyutv.py
index 9757f44..26a8d64 100644
--- a/hypervideo_dl/extractor/douyutv.py
+++ b/hypervideo_dl/extractor/douyutv.py
@@ -105,7 +105,7 @@ class DouyuTVIE(InfoExtractor):
'aid': 'pcclient'
})['data']['live_url']
- title = self._live_title(unescapeHTML(room['room_name']))
+ title = unescapeHTML(room['room_name'])
description = room.get('show_details')
thumbnail = room.get('room_src')
uploader = room.get('nickname')
diff --git a/hypervideo_dl/extractor/dplay.py b/hypervideo_dl/extractor/dplay.py
index e0e446b..a25f27c 100644
--- a/hypervideo_dl/extractor/dplay.py
+++ b/hypervideo_dl/extractor/dplay.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
import json
+import uuid
from .common import InfoExtractor
from ..compat import compat_HTTPError
@@ -11,12 +12,172 @@ from ..utils import (
float_or_none,
int_or_none,
strip_or_none,
+ try_get,
unified_timestamp,
)
-class DPlayIE(InfoExtractor):
+class DPlayBaseIE(InfoExtractor):
_PATH_REGEX = r'/(?P<id>[^/]+/[^/?#]+)'
+ _auth_token_cache = {}
+
+ def _get_auth(self, disco_base, display_id, realm, needs_device_id=True):
+ key = (disco_base, realm)
+ st = self._get_cookies(disco_base).get('st')
+ token = (st and st.value) or self._auth_token_cache.get(key)
+
+ if not token:
+ query = {'realm': realm}
+ if needs_device_id:
+ query['deviceId'] = uuid.uuid4().hex
+ token = self._download_json(
+ disco_base + 'token', display_id, 'Downloading token',
+ query=query)['data']['attributes']['token']
+
+ # Save cache only if cookies are not being set
+ if not self._get_cookies(disco_base).get('st'):
+ self._auth_token_cache[key] = token
+
+ return f'Bearer {token}'
+
+ def _process_errors(self, e, geo_countries):
+ info = self._parse_json(e.cause.read().decode('utf-8'), None)
+ error = info['errors'][0]
+ error_code = error.get('code')
+ if error_code == 'access.denied.geoblocked':
+ self.raise_geo_restricted(countries=geo_countries)
+ elif error_code in ('access.denied.missingpackage', 'invalid.token'):
+ raise ExtractorError(
+ 'This video is only available for registered users. You may want to use --cookies.', expected=True)
+ raise ExtractorError(info['errors'][0]['detail'], expected=True)
+
+ def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
+ headers['Authorization'] = self._get_auth(disco_base, display_id, realm, False)
+
+ def _download_video_playback_info(self, disco_base, video_id, headers):
+ streaming = self._download_json(
+ disco_base + 'playback/videoPlaybackInfo/' + video_id,
+ video_id, headers=headers)['data']['attributes']['streaming']
+ streaming_list = []
+ for format_id, format_dict in streaming.items():
+ streaming_list.append({
+ 'type': format_id,
+ 'url': format_dict.get('url'),
+ })
+ return streaming_list
+
+ def _get_disco_api_info(self, url, display_id, disco_host, realm, country, domain=''):
+ geo_countries = [country.upper()]
+ self._initialize_geo_bypass({
+ 'countries': geo_countries,
+ })
+ disco_base = 'https://%s/' % disco_host
+ headers = {
+ 'Referer': url,
+ }
+ self._update_disco_api_headers(headers, disco_base, display_id, realm)
+ try:
+ video = self._download_json(
+ disco_base + 'content/videos/' + display_id, display_id,
+ headers=headers, query={
+ 'fields[channel]': 'name',
+ 'fields[image]': 'height,src,width',
+ 'fields[show]': 'name',
+ 'fields[tag]': 'name',
+ 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration',
+ 'include': 'images,primaryChannel,show,tags'
+ })
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+ self._process_errors(e, geo_countries)
+ raise
+ video_id = video['data']['id']
+ info = video['data']['attributes']
+ title = info['name'].strip()
+ formats = []
+ subtitles = {}
+ try:
+ streaming = self._download_video_playback_info(
+ disco_base, video_id, headers)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ self._process_errors(e, geo_countries)
+ raise
+ for format_dict in streaming:
+ if not isinstance(format_dict, dict):
+ continue
+ format_url = format_dict.get('url')
+ if not format_url:
+ continue
+ format_id = format_dict.get('type')
+ ext = determine_ext(format_url)
+ if format_id == 'dash' or ext == 'mpd':
+ dash_fmts, dash_subs = self._extract_mpd_formats_and_subtitles(
+ format_url, display_id, mpd_id='dash', fatal=False)
+ formats.extend(dash_fmts)
+ subtitles = self._merge_subtitles(subtitles, dash_subs)
+ elif format_id == 'hls' or ext == 'm3u8':
+ m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
+ format_url, display_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls',
+ fatal=False)
+ formats.extend(m3u8_fmts)
+ subtitles = self._merge_subtitles(subtitles, m3u8_subs)
+ else:
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ })
+ self._sort_formats(formats)
+
+ creator = series = None
+ tags = []
+ thumbnails = []
+ included = video.get('included') or []
+ if isinstance(included, list):
+ for e in included:
+ attributes = e.get('attributes')
+ if not attributes:
+ continue
+ e_type = e.get('type')
+ if e_type == 'channel':
+ creator = attributes.get('name')
+ elif e_type == 'image':
+ src = attributes.get('src')
+ if src:
+ thumbnails.append({
+ 'url': src,
+ 'width': int_or_none(attributes.get('width')),
+ 'height': int_or_none(attributes.get('height')),
+ })
+ if e_type == 'show':
+ series = attributes.get('name')
+ elif e_type == 'tag':
+ name = attributes.get('name')
+ if name:
+ tags.append(name)
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': strip_or_none(info.get('description')),
+ 'duration': float_or_none(info.get('videoDuration'), 1000),
+ 'timestamp': unified_timestamp(info.get('publishStart')),
+ 'series': series,
+ 'season_number': int_or_none(info.get('seasonNumber')),
+ 'episode_number': int_or_none(info.get('episodeNumber')),
+ 'creator': creator,
+ 'tags': tags,
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'http_headers': {
+ 'referer': domain,
+ },
+ }
+
+
+class DPlayIE(DPlayBaseIE):
_VALID_URL = r'''(?x)https?://
(?P<domain>
(?:www\.)?(?P<host>d
@@ -26,7 +187,7 @@ class DPlayIE(InfoExtractor):
)
)|
(?P<subdomain_country>es|it)\.dplay\.com
- )/[^/]+''' + _PATH_REGEX
+ )/[^/]+''' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
# non geo restricted, via secure api, unsigned download hls URL
@@ -46,7 +207,6 @@ class DPlayIE(InfoExtractor):
'episode_number': 1,
},
'params': {
- 'format': 'bestvideo',
'skip_download': True,
},
}, {
@@ -67,7 +227,6 @@ class DPlayIE(InfoExtractor):
'episode_number': 1,
},
'params': {
- 'format': 'bestvideo',
'skip_download': True,
},
}, {
@@ -87,7 +246,6 @@ class DPlayIE(InfoExtractor):
'episode_number': 7,
},
'params': {
- 'format': 'bestvideo',
'skip_download': True,
},
'skip': 'Available for Premium users',
@@ -153,138 +311,6 @@ class DPlayIE(InfoExtractor):
'only_matching': True,
}]
- def _process_errors(self, e, geo_countries):
- info = self._parse_json(e.cause.read().decode('utf-8'), None)
- error = info['errors'][0]
- error_code = error.get('code')
- if error_code == 'access.denied.geoblocked':
- self.raise_geo_restricted(countries=geo_countries)
- elif error_code in ('access.denied.missingpackage', 'invalid.token'):
- raise ExtractorError(
- 'This video is only available for registered users. You may want to use --cookies.', expected=True)
- raise ExtractorError(info['errors'][0]['detail'], expected=True)
-
- def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
- headers['Authorization'] = 'Bearer ' + self._download_json(
- disco_base + 'token', display_id, 'Downloading token',
- query={
- 'realm': realm,
- })['data']['attributes']['token']
-
- def _download_video_playback_info(self, disco_base, video_id, headers):
- streaming = self._download_json(
- disco_base + 'playback/videoPlaybackInfo/' + video_id,
- video_id, headers=headers)['data']['attributes']['streaming']
- streaming_list = []
- for format_id, format_dict in streaming.items():
- streaming_list.append({
- 'type': format_id,
- 'url': format_dict.get('url'),
- })
- return streaming_list
-
- def _get_disco_api_info(self, url, display_id, disco_host, realm, country):
- geo_countries = [country.upper()]
- self._initialize_geo_bypass({
- 'countries': geo_countries,
- })
- disco_base = 'https://%s/' % disco_host
- headers = {
- 'Referer': url,
- }
- self._update_disco_api_headers(headers, disco_base, display_id, realm)
- try:
- video = self._download_json(
- disco_base + 'content/videos/' + display_id, display_id,
- headers=headers, query={
- 'fields[channel]': 'name',
- 'fields[image]': 'height,src,width',
- 'fields[show]': 'name',
- 'fields[tag]': 'name',
- 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration',
- 'include': 'images,primaryChannel,show,tags'
- })
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
- self._process_errors(e, geo_countries)
- raise
- video_id = video['data']['id']
- info = video['data']['attributes']
- title = info['name'].strip()
- formats = []
- try:
- streaming = self._download_video_playback_info(
- disco_base, video_id, headers)
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- self._process_errors(e, geo_countries)
- raise
- for format_dict in streaming:
- if not isinstance(format_dict, dict):
- continue
- format_url = format_dict.get('url')
- if not format_url:
- continue
- format_id = format_dict.get('type')
- ext = determine_ext(format_url)
- if format_id == 'dash' or ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- format_url, display_id, mpd_id='dash', fatal=False))
- elif format_id == 'hls' or ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- format_url, display_id, 'mp4',
- entry_protocol='m3u8_native', m3u8_id='hls',
- fatal=False))
- else:
- formats.append({
- 'url': format_url,
- 'format_id': format_id,
- })
- self._sort_formats(formats)
-
- creator = series = None
- tags = []
- thumbnails = []
- included = video.get('included') or []
- if isinstance(included, list):
- for e in included:
- attributes = e.get('attributes')
- if not attributes:
- continue
- e_type = e.get('type')
- if e_type == 'channel':
- creator = attributes.get('name')
- elif e_type == 'image':
- src = attributes.get('src')
- if src:
- thumbnails.append({
- 'url': src,
- 'width': int_or_none(attributes.get('width')),
- 'height': int_or_none(attributes.get('height')),
- })
- if e_type == 'show':
- series = attributes.get('name')
- elif e_type == 'tag':
- name = attributes.get('name')
- if name:
- tags.append(name)
-
- return {
- 'id': video_id,
- 'display_id': display_id,
- 'title': title,
- 'description': strip_or_none(info.get('description')),
- 'duration': float_or_none(info.get('videoDuration'), 1000),
- 'timestamp': unified_timestamp(info.get('publishStart')),
- 'series': series,
- 'season_number': int_or_none(info.get('seasonNumber')),
- 'episode_number': int_or_none(info.get('episodeNumber')),
- 'creator': creator,
- 'tags': tags,
- 'thumbnails': thumbnails,
- 'formats': formats,
- }
-
def _real_extract(self, url):
mobj = self._match_valid_url(url)
display_id = mobj.group('id')
@@ -292,11 +318,11 @@ class DPlayIE(InfoExtractor):
country = mobj.group('country') or mobj.group('subdomain_country') or mobj.group('plus_country')
host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com'
return self._get_disco_api_info(
- url, display_id, host, 'dplay' + country, country)
+ url, display_id, host, 'dplay' + country, country, domain)
-class HGTVDeIE(DPlayIE):
- _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayIE._PATH_REGEX
+class HGTVDeIE(DPlayBaseIE):
+ _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/',
'info_dict': {
@@ -313,9 +339,6 @@ class HGTVDeIE(DPlayIE):
'season_number': 3,
'episode_number': 3,
},
- 'params': {
- 'format': 'bestvideo',
- },
}]
def _real_extract(self, url):
@@ -324,30 +347,7 @@ class HGTVDeIE(DPlayIE):
url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de')
-class DiscoveryPlusIE(DPlayIE):
- _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video' + DPlayIE._PATH_REGEX
- _TESTS = [{
- 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family',
- 'info_dict': {
- 'id': '1140794',
- 'display_id': 'property-brothers-forever-home/food-and-family',
- 'ext': 'mp4',
- 'title': 'Food and Family',
- 'description': 'The brothers help a Richmond family expand their single-level home.',
- 'duration': 2583.113,
- 'timestamp': 1609304400,
- 'upload_date': '20201230',
- 'creator': 'HGTV',
- 'series': 'Property Brothers: Forever Home',
- 'season_number': 1,
- 'episode_number': 1,
- },
- 'skip': 'Available for Premium users',
- }]
-
- _PRODUCT = 'dplus_us'
- _API_URL = 'us1-prod-direct.discoveryplus.com'
-
+class DiscoveryPlusBaseIE(DPlayBaseIE):
def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
headers['x-disco-client'] = f'WEB:UNKNOWN:{self._PRODUCT}:25.2.6'
@@ -366,13 +366,227 @@ class DiscoveryPlusIE(DPlayIE):
}).encode('utf-8'))['data']['attributes']['streaming']
def _real_extract(self, url):
- display_id = self._match_id(url)
- return self._get_disco_api_info(
- url, display_id, self._API_URL, 'go', 'us')
+ return self._get_disco_api_info(url, self._match_id(url), **self._DISCO_API_PARAMS)
+
+
+class GoDiscoveryIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:go\.)?discovery\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://go.discovery.com/video/dirty-jobs-discovery-atve-us/rodbuster-galvanizer',
+ 'info_dict': {
+ 'id': '4164906',
+ 'display_id': 'dirty-jobs-discovery-atve-us/rodbuster-galvanizer',
+ 'ext': 'mp4',
+ 'title': 'Rodbuster / Galvanizer',
+ 'description': 'Mike installs rebar with a team of rodbusters, then he galvanizes steel.',
+ 'season_number': 9,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://discovery.com/video/dirty-jobs-discovery-atve-us/rodbuster-galvanizer',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'dsc'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.go.discovery.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class TravelChannelIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:watch\.)?travelchannel\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://watch.travelchannel.com/video/ghost-adventures-travel-channel/ghost-train-of-ely',
+ 'info_dict': {
+ 'id': '2220256',
+ 'display_id': 'ghost-adventures-travel-channel/ghost-train-of-ely',
+ 'ext': 'mp4',
+ 'title': 'Ghost Train of Ely',
+ 'description': 'The crew investigates the dark history of the Nevada Northern Railway.',
+ 'season_number': 24,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://watch.travelchannel.com/video/ghost-adventures-travel-channel/ghost-train-of-ely',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'trav'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.watch.travelchannel.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class CookingChannelIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:watch\.)?cookingchanneltv\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://watch.cookingchanneltv.com/video/carnival-eats-cooking-channel/the-postman-always-brings-rice-2348634',
+ 'info_dict': {
+ 'id': '2348634',
+ 'display_id': 'carnival-eats-cooking-channel/the-postman-always-brings-rice-2348634',
+ 'ext': 'mp4',
+ 'title': 'The Postman Always Brings Rice',
+ 'description': 'Noah visits the Maui Fair and the Aurora Winter Festival in Vancouver.',
+ 'season_number': 9,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://watch.cookingchanneltv.com/video/carnival-eats-cooking-channel/the-postman-always-brings-rice-2348634',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'cook'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.watch.cookingchanneltv.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class HGTVUsaIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:watch\.)?hgtv\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://watch.hgtv.com/video/home-inspector-joe-hgtv-atve-us/this-mold-house',
+ 'info_dict': {
+ 'id': '4289736',
+ 'display_id': 'home-inspector-joe-hgtv-atve-us/this-mold-house',
+ 'ext': 'mp4',
+ 'title': 'This Mold House',
+ 'description': 'Joe and Noel help take a familys dream home from hazardous to fabulous.',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://watch.hgtv.com/video/home-inspector-joe-hgtv-atve-us/this-mold-house',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'hgtv'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.watch.hgtv.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class FoodNetworkIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:watch\.)?foodnetwork\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://watch.foodnetwork.com/video/kids-baking-championship-food-network/float-like-a-butterfly',
+ 'info_dict': {
+ 'id': '4116449',
+ 'display_id': 'kids-baking-championship-food-network/float-like-a-butterfly',
+ 'ext': 'mp4',
+ 'title': 'Float Like a Butterfly',
+ 'description': 'The 12 kid bakers create colorful carved butterfly cakes.',
+ 'season_number': 10,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://watch.foodnetwork.com/video/kids-baking-championship-food-network/float-like-a-butterfly',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'food'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.watch.foodnetwork.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
-class ScienceChannelIE(DiscoveryPlusIE):
- _VALID_URL = r'https?://(?:www\.)?sciencechannel\.com/video' + DPlayIE._PATH_REGEX
+class DestinationAmericaIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?destinationamerica\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://www.destinationamerica.com/video/alaska-monsters-destination-america-atve-us/central-alaskas-bigfoot',
+ 'info_dict': {
+ 'id': '4210904',
+ 'display_id': 'alaska-monsters-destination-america-atve-us/central-alaskas-bigfoot',
+ 'ext': 'mp4',
+ 'title': 'Central Alaskas Bigfoot',
+ 'description': 'A team heads to central Alaska to investigate an aggressive Bigfoot.',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://www.destinationamerica.com/video/alaska-monsters-destination-america-atve-us/central-alaskas-bigfoot',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'dam'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.destinationamerica.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class InvestigationDiscoveryIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?investigationdiscovery\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://www.investigationdiscovery.com/video/unmasked-investigation-discovery/the-killer-clown',
+ 'info_dict': {
+ 'id': '2139409',
+ 'display_id': 'unmasked-investigation-discovery/the-killer-clown',
+ 'ext': 'mp4',
+ 'title': 'The Killer Clown',
+ 'description': 'A wealthy Florida woman is fatally shot in the face by a clown at her door.',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://www.investigationdiscovery.com/video/unmasked-investigation-discovery/the-killer-clown',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'ids'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.investigationdiscovery.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class AmHistoryChannelIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?ahctv\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://www.ahctv.com/video/modern-sniper-ahc/army',
+ 'info_dict': {
+ 'id': '2309730',
+ 'display_id': 'modern-sniper-ahc/army',
+ 'ext': 'mp4',
+ 'title': 'Army',
+ 'description': 'Snipers today face challenges their predecessors couldve only dreamed of.',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://www.ahctv.com/video/modern-sniper-ahc/army',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'ahc'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.ahctv.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class ScienceChannelIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?sciencechannel\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://www.sciencechannel.com/video/strangest-things-science-atve-us/nazi-mystery-machine',
'info_dict': {
@@ -385,14 +599,21 @@ class ScienceChannelIE(DiscoveryPlusIE):
'episode_number': 1,
},
'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://www.sciencechannel.com/video/strangest-things-science-atve-us/nazi-mystery-machine',
+ 'only_matching': True,
}]
_PRODUCT = 'sci'
- _API_URL = 'us1-prod-direct.sciencechannel.com'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.sciencechannel.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
-class DIYNetworkIE(DiscoveryPlusIE):
- _VALID_URL = r'https?://(?:watch\.)?diynetwork\.com/video' + DPlayIE._PATH_REGEX
+class DIYNetworkIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:watch\.)?diynetwork\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://watch.diynetwork.com/video/pool-kings-diy-network/bringing-beach-life-to-texas',
'info_dict': {
@@ -405,14 +626,48 @@ class DIYNetworkIE(DiscoveryPlusIE):
'episode_number': 2,
},
'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://watch.diynetwork.com/video/pool-kings-diy-network/bringing-beach-life-to-texas',
+ 'only_matching': True,
}]
_PRODUCT = 'diy'
- _API_URL = 'us1-prod-direct.watch.diynetwork.com'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.watch.diynetwork.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class DiscoveryLifeIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?discoverylife\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://www.discoverylife.com/video/surviving-death-discovery-life-atve-us/bodily-trauma',
+ 'info_dict': {
+ 'id': '2218238',
+ 'display_id': 'surviving-death-discovery-life-atve-us/bodily-trauma',
+ 'ext': 'mp4',
+ 'title': 'Bodily Trauma',
+ 'description': 'Meet three people who tested the limits of the human body.',
+ 'season_number': 1,
+ 'episode_number': 2,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://www.discoverylife.com/video/surviving-death-discovery-life-atve-us/bodily-trauma',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'dlf'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.discoverylife.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
-class AnimalPlanetIE(DiscoveryPlusIE):
- _VALID_URL = r'https?://(?:www\.)?animalplanet\.com/video' + DPlayIE._PATH_REGEX
+class AnimalPlanetIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?animalplanet\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://www.animalplanet.com/video/north-woods-law-animal-planet/squirrel-showdown',
'info_dict': {
@@ -425,7 +680,251 @@ class AnimalPlanetIE(DiscoveryPlusIE):
'episode_number': 11,
},
'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://www.animalplanet.com/video/north-woods-law-animal-planet/squirrel-showdown',
+ 'only_matching': True,
}]
_PRODUCT = 'apl'
- _API_URL = 'us1-prod-direct.animalplanet.com'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.animalplanet.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class TLCIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:go\.)?tlc\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://go.tlc.com/video/my-600-lb-life-tlc/melissas-story-part-1',
+ 'info_dict': {
+ 'id': '2206540',
+ 'display_id': 'my-600-lb-life-tlc/melissas-story-part-1',
+ 'ext': 'mp4',
+ 'title': 'Melissas Story (Part 1)',
+ 'description': 'At 650 lbs, Melissa is ready to begin her seven-year weight loss journey.',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://go.tlc.com/video/my-600-lb-life-tlc/melissas-story-part-1',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'tlc'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.tlc.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class DiscoveryPlusIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/(?!it/)(?:\w{2}/)?video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family',
+ 'info_dict': {
+ 'id': '1140794',
+ 'display_id': 'property-brothers-forever-home/food-and-family',
+ 'ext': 'mp4',
+ 'title': 'Food and Family',
+ 'description': 'The brothers help a Richmond family expand their single-level home.',
+ 'duration': 2583.113,
+ 'timestamp': 1609304400,
+ 'upload_date': '20201230',
+ 'creator': 'HGTV',
+ 'series': 'Property Brothers: Forever Home',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://discoveryplus.com/ca/video/bering-sea-gold-discovery-ca/goldslingers',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'dplus_us'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.discoveryplus.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class DiscoveryPlusIndiaIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/videos?' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://www.discoveryplus.in/videos/how-do-they-do-it/fugu-and-more?seasonId=8&type=EPISODE',
+ 'info_dict': {
+ 'id': '27104',
+ 'ext': 'mp4',
+ 'display_id': 'how-do-they-do-it/fugu-and-more',
+ 'title': 'Fugu and More',
+ 'description': 'The Japanese catch, prepare and eat the deadliest fish on the planet.',
+ 'duration': 1319.32,
+ 'timestamp': 1582309800,
+ 'upload_date': '20200221',
+ 'series': 'How Do They Do It?',
+ 'season_number': 8,
+ 'episode_number': 2,
+ 'creator': 'Discovery Channel',
+ 'thumbnail': r're:https://.+\.jpeg',
+ 'episode': 'Episode 2',
+ 'season': 'Season 8',
+ 'tags': [],
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }]
+
+ _PRODUCT = 'dplus-india'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'ap2-prod-direct.discoveryplus.in',
+ 'realm': 'dplusindia',
+ 'country': 'in',
+ 'domain': 'https://www.discoveryplus.in/',
+ }
+
+ def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
+ headers.update({
+ 'x-disco-params': 'realm=%s' % realm,
+ 'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:17.0.0',
+ 'Authorization': self._get_auth(disco_base, display_id, realm),
+ })
+
+
+class DiscoveryNetworksDeIE(DPlayBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100',
+ 'info_dict': {
+ 'id': '78867',
+ 'ext': 'mp4',
+ 'title': 'Die Welt da draußen',
+ 'description': 'md5:61033c12b73286e409d99a41742ef608',
+ 'timestamp': 1554069600,
+ 'upload_date': '20190331',
+ 'creator': 'TLC',
+ 'season': 'Season 1',
+ 'series': 'Breaking Amish',
+ 'episode_number': 1,
+ 'tags': ['new york', 'großstadt', 'amische', 'landleben', 'modern', 'infos', 'tradition', 'herausforderung'],
+ 'display_id': 'breaking-amish/die-welt-da-drauen',
+ 'episode': 'Episode 1',
+ 'duration': 2625.024,
+ 'season_number': 1,
+ 'thumbnail': r're:https://.+\.jpg',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.dmax.de/programme/dmax-highlights/video/tuning-star-sidney-hoffmann-exklusiv-bei-dmax/191023082312316',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ domain, programme, alternate_id = self._match_valid_url(url).groups()
+ country = 'GB' if domain == 'dplay.co.uk' else 'DE'
+ realm = 'questuk' if country == 'GB' else domain.replace('.', '')
+ return self._get_disco_api_info(
+ url, '%s/%s' % (programme, alternate_id),
+ 'sonic-eu1-prod.disco-api.com', realm, country)
+
+
+class DiscoveryPlusShowBaseIE(DPlayBaseIE):
+
+ def _entries(self, show_name):
+ headers = {
+ 'x-disco-client': self._X_CLIENT,
+ 'x-disco-params': f'realm={self._REALM}',
+ 'referer': self._DOMAIN,
+ 'Authentication': self._get_auth(self._BASE_API, None, self._REALM),
+ }
+ show_json = self._download_json(
+ f'{self._BASE_API}cms/routes/{self._SHOW_STR}/{show_name}?include=default',
+ video_id=show_name, headers=headers)['included'][self._INDEX]['attributes']['component']
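+ # the show id is carried as the last '='-separated value of the mandatoryParams query string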
+ show_id = show_json['mandatoryParams'].split('=')[-1]
+ season_url = self._BASE_API + 'content/videos?sort=episodeNumber&filter[seasonNumber]={}&filter[show.id]={}&page[size]=100&page[number]={}'
+ for season in show_json['filters'][0]['options']:
+ season_id = season['id']
+ total_pages, page_num = 1, 0
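+ # the total page count is only known after the first response, so assume a single page to start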
+ while page_num < total_pages:
+ season_json = self._download_json(
+ season_url.format(season_id, show_id, str(page_num + 1)), show_name, headers=headers,
+ note='Downloading season %s JSON metadata%s' % (season_id, ' page %d' % page_num if page_num else ''))
+ if page_num == 0:
+ total_pages = try_get(season_json, lambda x: x['meta']['totalPages'], int) or 1
+ episodes_json = season_json['data']
+ for episode in episodes_json:
+ video_path = episode['attributes']['path']
+ yield self.url_result(
+ '%svideos/%s' % (self._DOMAIN, video_path),
+ ie=self._VIDEO_IE.ie_key(), video_id=episode.get('id') or video_path)
+ page_num += 1
+
+ def _real_extract(self, url):
+ show_name = self._match_valid_url(url).group('show_name')
+ return self.playlist_result(self._entries(show_name), playlist_id=show_name)
+
+
+class DiscoveryPlusItalyIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/it/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://www.discoveryplus.com/it/video/i-signori-della-neve/stagione-2-episodio-1-i-preparativi',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'dplus_us'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'eu1-prod-direct.discoveryplus.com',
+ 'realm': 'dplay',
+ 'country': 'it',
+ }
+
+
+class DiscoveryPlusItalyShowIE(DiscoveryPlusShowBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?discoveryplus\.it/programmi/(?P<show_name>[^/]+)/?(?:[?#]|$)'
+ _TESTS = [{
+ 'url': 'https://www.discoveryplus.it/programmi/deal-with-it-stai-al-gioco',
+ 'playlist_mincount': 168,
+ 'info_dict': {
+ 'id': 'deal-with-it-stai-al-gioco',
+ },
+ }]
+
+ _BASE_API = 'https://disco-api.discoveryplus.it/'
+ _DOMAIN = 'https://www.discoveryplus.it/'
+ _X_CLIENT = 'WEB:UNKNOWN:dplay-client:2.6.0'
+ _REALM = 'dplayit'
+ _SHOW_STR = 'programmi'
+ _INDEX = 1
+ _VIDEO_IE = DPlayIE
+
+
+class DiscoveryPlusIndiaShowIE(DiscoveryPlusShowBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/show/(?P<show_name>[^/]+)/?(?:[?#]|$)'
+ _TESTS = [{
+ 'url': 'https://www.discoveryplus.in/show/how-do-they-do-it',
+ 'playlist_mincount': 140,
+ 'info_dict': {
+ 'id': 'how-do-they-do-it',
+ },
+ }]
+
+ _BASE_API = 'https://ap2-prod-direct.discoveryplus.in/'
+ _DOMAIN = 'https://www.discoveryplus.in/'
+ _X_CLIENT = 'WEB:UNKNOWN:dplus-india:prod'
+ _REALM = 'dplusindia'
+ _SHOW_STR = 'show'
+ _INDEX = 4
+ _VIDEO_IE = DiscoveryPlusIndiaIE
diff --git a/hypervideo_dl/extractor/drooble.py b/hypervideo_dl/extractor/drooble.py
new file mode 100644
index 0000000..0584250
--- /dev/null
+++ b/hypervideo_dl/extractor/drooble.py
@@ -0,0 +1,116 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ try_get,
+)
+
+
+class DroobleIE(InfoExtractor):
+ _VALID_URL = r'''(?x)https?://drooble\.com/(?:
+ (?:(?P<user>[^/]+)/)?(?P<kind>song|videos|music/albums)/(?P<id>\d+)|
+ (?P<user_2>[^/]+)/(?P<kind_2>videos|music))
+ '''
+ _TESTS = [{
+ 'url': 'https://drooble.com/song/2858030',
+ 'md5': '5ffda90f61c7c318dc0c3df4179eb064',
+ 'info_dict': {
+ 'id': '2858030',
+ 'ext': 'mp3',
+ 'title': 'Skankocillin',
+ 'upload_date': '20200801',
+ 'timestamp': 1596241390,
+ 'uploader_id': '95894',
+ 'uploader': 'Bluebeat Shelter',
+ }
+ }, {
+ 'url': 'https://drooble.com/karl340758/videos/2859183',
+ 'info_dict': {
+ 'id': 'J6QCQY_I5Tk',
+ 'ext': 'mp4',
+ 'title': 'Skankocillin',
+ 'uploader_id': 'UCrSRoI5vVyeYihtWEYua7rg',
+ 'description': 'md5:ffc0bd8ba383db5341a86a6cd7d9bcca',
+ 'upload_date': '20200731',
+ 'uploader': 'Bluebeat Shelter',
+ }
+ }, {
+ 'url': 'https://drooble.com/karl340758/music/albums/2858031',
+ 'info_dict': {
+ 'id': '2858031',
+ },
+ 'playlist_mincount': 8,
+ }, {
+ 'url': 'https://drooble.com/karl340758/music',
+ 'info_dict': {
+ 'id': 'karl340758',
+ },
+ 'playlist_mincount': 8,
+ }, {
+ 'url': 'https://drooble.com/karl340758/videos',
+ 'info_dict': {
+ 'id': 'karl340758',
+ },
+ 'playlist_mincount': 8,
+ }]
+
+ def _call_api(self, method, video_id, data=None):
+ response = self._download_json(
+ f'https://drooble.com/api/dt/{method}', video_id, data=json.dumps(data).encode())
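+ # the endpoint replies with a two-element list: [success_flag, payload]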
+ if not response[0]:
+ raise ExtractorError('Unable to download JSON metadata')
+ return response[1]
+
+ def _real_extract(self, url):
+ mobj = self._match_valid_url(url)
+ user = mobj.group('user') or mobj.group('user_2')
+ kind = mobj.group('kind') or mobj.group('kind_2')
+ display_id = mobj.group('id') or user
+
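+ # the API expects a different payload depending on whether we list videos, music or a single item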
+ if mobj.group('kind_2') == 'videos':
+ data = {'from_user': display_id, 'album': -1, 'limit': 18, 'offset': 0, 'order': 'new2old', 'type': 'video'}
+ elif kind in ('music/albums', 'music'):
+ data = {'user': user, 'public_only': True, 'individual_limit': {'singles': 1, 'albums': 1, 'playlists': 1}}
+ else:
+ data = {'url_slug': display_id, 'children': 10, 'order': 'old2new'}
+
+ method = 'getMusicOverview' if kind in ('music/albums', 'music') else 'getElements'
+ json_data = self._call_api(method, display_id, data=data)
+ if kind in ('music/albums', 'music'):
+ json_data = json_data['singles']['list']
+
+ entities = []
+ for media in json_data:
+ url = media.get('external_media_url') or media.get('link')
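+ # YouTube-hosted media is handed off to the YouTube extractor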
+ if url.startswith('https://www.youtube.com'):
+ entities.append({
+ '_type': 'url',
+ 'url': url,
+ 'ie_key': 'Youtube'
+ })
+ continue
+ is_audio = (media.get('type') or '').lower() == 'audio'
+ entities.append({
+ 'url': url,
+ 'id': media['id'],
+ 'title': media['title'],
+ 'duration': int_or_none(media.get('duration')),
+ 'timestamp': int_or_none(media.get('timestamp')),
+ 'album': try_get(media, lambda x: x['album']['title']),
+ 'uploader': try_get(media, lambda x: x['creator']['display_name']),
+ 'uploader_id': try_get(media, lambda x: x['creator']['id']),
+ 'thumbnail': media.get('image_comment'),
+ 'like_count': int_or_none(media.get('likes')),
+ 'vcodec': 'none' if is_audio else None,
+ 'ext': 'mp3' if is_audio else None,
+ })
+
+ if len(entities) > 1:
+ return self.playlist_result(entities, display_id)
+
+ return entities[0]
diff --git a/hypervideo_dl/extractor/dropbox.py b/hypervideo_dl/extractor/dropbox.py
index 6a7d050..2559657 100644
--- a/hypervideo_dl/extractor/dropbox.py
+++ b/hypervideo_dl/extractor/dropbox.py
@@ -6,7 +6,12 @@ import re
from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
-from ..utils import url_basename
+from ..utils import (
+ ExtractorError,
+ traverse_obj,
+ try_get,
+ url_basename,
+)
class DropboxIE(InfoExtractor):
@@ -28,13 +33,44 @@ class DropboxIE(InfoExtractor):
def _real_extract(self, url):
mobj = self._match_valid_url(url)
video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
fn = compat_urllib_parse_unquote(url_basename(url))
title = os.path.splitext(fn)[0]
- video_url = re.sub(r'[?&]dl=0', '', url)
- video_url += ('?' if '?' not in video_url else '&') + 'dl=1'
+
+ password = self.get_param('videopassword')
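+ # password-protected links serve an interstitial page instead of the file preview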
+ if (self._og_search_title(webpage) == 'Dropbox - Password Required'
+ or 'Enter the password for this link' in webpage):
+
+ if password:
+ content_id = self._search_regex(r'content_id=(.*?)["\']', webpage, 'content_id')
+ payload = f'is_xhr=true&t={self._get_cookies("https://www.dropbox.com").get("t").value}&content_id={content_id}&password={password}&url={url}'
+ response = self._download_json(
+ 'https://www.dropbox.com/sm/auth', video_id, 'POSTing video password', data=payload.encode('UTF-8'),
+ headers={'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'})
+
+ if response.get('status') != 'authed':
+ raise ExtractorError('Authentication failed!', expected=True)
+ webpage = self._download_webpage(url, video_id)
+ elif self._get_cookies('https://dropbox.com').get('sm_auth'):
+ webpage = self._download_webpage(url, video_id)
+ else:
+ raise ExtractorError('Password protected video, use --video-password <password>', expected=True)
+
+ json_string = self._html_search_regex(r'InitReact\.mountComponent\(.*?,\s*(\{.+\})\s*?\)', webpage, 'Info JSON')
+ info_json = self._parse_json(json_string, video_id).get('props')
+ transcode_url = traverse_obj(info_json, ((None, 'preview'), 'file', 'preview', 'content', 'transcode_url'), get_all=False)
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id)
+
+ # if downloads are enabled, we can also fetch the original file
+ if 'anonymous' in (try_get(info_json, lambda x: x['sharePermission']['canDownloadRoles']) or []):
+ video_url = re.sub(r'[?&]dl=0', '', url)
+ video_url += ('?' if '?' not in video_url else '&') + 'dl=1'
+ formats.append({'url': video_url, 'format_id': 'original', 'format_note': 'Original', 'quality': 1})
+ self._sort_formats(formats)
return {
'id': video_id,
'title': title,
- 'url': video_url,
+ 'formats': formats,
+ 'subtitles': subtitles
}
diff --git a/hypervideo_dl/extractor/dropout.py b/hypervideo_dl/extractor/dropout.py
new file mode 100644
index 0000000..2fa6195
--- /dev/null
+++ b/hypervideo_dl/extractor/dropout.py
@@ -0,0 +1,212 @@
+# coding: utf-8
+from .common import InfoExtractor
+from .vimeo import VHXEmbedIE
+from ..utils import (
+ clean_html,
+ ExtractorError,
+ get_element_by_class,
+ get_element_by_id,
+ get_elements_by_class,
+ int_or_none,
+ join_nonempty,
+ unified_strdate,
+ urlencode_postdata,
+)
+
+
+class DropoutIE(InfoExtractor):
+ _LOGIN_URL = 'https://www.dropout.tv/login'
+ _NETRC_MACHINE = 'dropout'
+
+ _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?:[^/]+/)*videos/(?P<id>[^/]+)/?$'
+ _TESTS = [
+ {
+ 'url': 'https://www.dropout.tv/game-changer/season:2/videos/yes-or-no',
+ 'note': 'Episode in a series',
+ 'md5': '5e000fdfd8d8fa46ff40456f1c2af04a',
+ 'info_dict': {
+ 'id': '738153',
+ 'display_id': 'yes-or-no',
+ 'ext': 'mp4',
+ 'title': 'Yes or No',
+ 'description': 'Ally, Brennan, and Zac are asked a simple question, but is there a correct answer?',
+ 'release_date': '20200508',
+ 'thumbnail': 'https://vhx.imgix.net/chuncensoredstaging/assets/351e3f24-c4a3-459a-8b79-dc80f1e5b7fd.jpg',
+ 'series': 'Game Changer',
+ 'season_number': 2,
+ 'season': 'Season 2',
+ 'episode_number': 6,
+ 'episode': 'Yes or No',
+ 'duration': 1180,
+ 'uploader_id': 'user80538407',
+ 'uploader_url': 'https://vimeo.com/user80538407',
+ 'uploader': 'OTT Videos'
+ },
+ 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest']
+ },
+ {
+ 'url': 'https://www.dropout.tv/dimension-20-fantasy-high/season:1/videos/episode-1',
+ 'note': 'Episode in a series (missing release_date)',
+ 'md5': '712caf7c191f1c47c8f1879520c2fa5c',
+ 'info_dict': {
+ 'id': '320562',
+ 'display_id': 'episode-1',
+ 'ext': 'mp4',
+ 'title': 'The Beginning Begins',
+ 'description': 'The cast introduces their PCs, including a neurotic elf, a goblin PI, and a corn-worshipping cleric.',
+ 'thumbnail': 'https://vhx.imgix.net/chuncensoredstaging/assets/4421ed0d-f630-4c88-9004-5251b2b8adfa.jpg',
+ 'series': 'Dimension 20: Fantasy High',
+ 'season_number': 1,
+ 'season': 'Season 1',
+ 'episode_number': 1,
+ 'episode': 'The Beginning Begins',
+ 'duration': 6838,
+ 'uploader_id': 'user80538407',
+ 'uploader_url': 'https://vimeo.com/user80538407',
+ 'uploader': 'OTT Videos'
+ },
+ 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest']
+ },
+ {
+ 'url': 'https://www.dropout.tv/videos/misfits-magic-holiday-special',
+ 'note': 'Episode not in a series',
+ 'md5': 'c30fa18999c5880d156339f13c953a26',
+ 'info_dict': {
+ 'id': '1915774',
+ 'display_id': 'misfits-magic-holiday-special',
+ 'ext': 'mp4',
+ 'title': 'Misfits & Magic Holiday Special',
+ 'description': 'The magical misfits spend Christmas break at Gowpenny, with an unwelcome visitor.',
+ 'release_date': '20211215',
+ 'thumbnail': 'https://vhx.imgix.net/chuncensoredstaging/assets/d91ea8a6-b250-42ed-907e-b30fb1c65176-8e24b8e5.jpg',
+ 'duration': 11698,
+ 'uploader_id': 'user80538407',
+ 'uploader_url': 'https://vimeo.com/user80538407',
+ 'uploader': 'OTT Videos'
+ },
+ 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest']
+ }
+ ]
+
+ def _get_authenticity_token(self, display_id):
+ signin_page = self._download_webpage(
+ self._LOGIN_URL, display_id, note='Getting authenticity token')
+ return self._html_search_regex(
+ r'name=["\']authenticity_token["\'] value=["\'](.+?)["\']',
+ signin_page, 'authenticity_token')
+
+ def _login(self, display_id):
+ username, password = self._get_login_info()
+ if not (username and password):
+ self.raise_login_required(method='password')
+
+ response = self._download_webpage(
+ self._LOGIN_URL, display_id, note='Logging in', data=urlencode_postdata({
+ 'email': username,
+ 'password': password,
+ 'authenticity_token': self._get_authenticity_token(display_id),
+ 'utf8': True
+ }))
+
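+ # the login response embeds the account's subscription state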
+ user_has_subscription = self._search_regex(
+ r'user_has_subscription:\s*["\'](.+?)["\']', response, 'subscription status', default='none')
+ if user_has_subscription.lower() == 'true':
+ return response
+ elif user_has_subscription.lower() == 'false':
+ raise ExtractorError('Account is not subscribed')
+ else:
+ raise ExtractorError('Incorrect username/password')
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ try:
+ self._login(display_id)
+ webpage = self._download_webpage(url, display_id, note='Downloading video webpage')
+ finally:
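+ # always close the session, even if extraction fails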
+ self._download_webpage('https://www.dropout.tv/logout', display_id, note='Logging out', fatal=False)
+
+ embed_url = self._search_regex(r'embed_url:\s*["\'](.+?)["\']', webpage, 'embed url')
+ thumbnail = self._og_search_thumbnail(webpage)
+ watch_info = get_element_by_id('watch-info', webpage) or ''
+
+ title = clean_html(get_element_by_class('video-title', watch_info))
+ season_episode = get_element_by_class(
+ 'site-font-secondary-color', get_element_by_class('text', watch_info))
+ episode_number = int_or_none(self._search_regex(
+ r'Episode (\d+)', season_episode or '', 'episode', default=None))
+
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': VHXEmbedIE.ie_key(),
+ 'url': embed_url,
+ 'id': self._search_regex(r'embed\.vhx\.tv/videos/(.+?)\?', embed_url, 'id'),
+ 'display_id': display_id,
+ 'title': title,
+ 'description': self._html_search_meta('description', webpage, fatal=False),
+ 'thumbnail': thumbnail.split('?')[0] if thumbnail else None, # Ignore crop/downscale
+ 'series': clean_html(get_element_by_class('series-title', watch_info)),
+ 'episode_number': episode_number,
+ 'episode': title if episode_number else None,
+ 'season_number': int_or_none(self._search_regex(
+ r'Season (\d+),', season_episode or '', 'season', default=None)),
+ 'release_date': unified_strdate(self._search_regex(
+ r'data-meta-field-name=["\']release_dates["\'] data-meta-field-value=["\'](.+?)["\']',
+ watch_info, 'release date', default=None)),
+ }
+
+
+class DropoutSeasonIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?P<id>[^\/$&?#]+)(?:/?$|/season:[0-9]+/?$)'
+ _TESTS = [
+ {
+ 'url': 'https://www.dropout.tv/dimension-20-fantasy-high/season:1',
+ 'note': 'Multi-season series with the season in the url',
+ 'playlist_count': 17,
+ 'info_dict': {
+ 'id': 'dimension-20-fantasy-high-season-1',
+ 'title': 'Dimension 20 Fantasy High - Season 1'
+ }
+ },
+ {
+ 'url': 'https://www.dropout.tv/dimension-20-fantasy-high',
+ 'note': 'Multi-season series with the season not in the url',
+ 'playlist_count': 17,
+ 'info_dict': {
+ 'id': 'dimension-20-fantasy-high-season-1',
+ 'title': 'Dimension 20 Fantasy High - Season 1'
+ }
+ },
+ {
+ 'url': 'https://www.dropout.tv/dimension-20-shriek-week',
+ 'note': 'Single-season series',
+ 'playlist_count': 4,
+ 'info_dict': {
+ 'id': 'dimension-20-shriek-week-season-1',
+ 'title': 'Dimension 20 Shriek Week - Season 1'
+ }
+ }
+ ]
+
+ def _real_extract(self, url):
+ season_id = self._match_id(url)
+ season_title = season_id.replace('-', ' ').title()
+ webpage = self._download_webpage(url, season_id)
+
+ entries = [
+ self.url_result(
+ url=self._search_regex(r'<a href=["\'](.+?)["\'] class=["\']browse-item-link["\']',
+ item, 'item_url'),
+ ie=DropoutIE.ie_key()
+ ) for item in get_elements_by_class('js-collection-item', webpage)
+ ]
+
+ seasons = (get_element_by_class('select-dropdown-wrapper', webpage) or '').strip().replace('\n', '')
+ current_season = self._search_regex(r'<option[^>]+selected>([^<]+)</option>',
+ seasons, 'current_season', default='').strip()
+
+ return {
+ '_type': 'playlist',
+ 'id': join_nonempty(season_id, current_season.lower().replace(' ', '-')),
+ 'title': join_nonempty(season_title, current_season, delim=' - '),
+ 'entries': entries
+ }
diff --git a/hypervideo_dl/extractor/drtv.py b/hypervideo_dl/extractor/drtv.py
index 7bb15f8..37e4d5b 100644
--- a/hypervideo_dl/extractor/drtv.py
+++ b/hypervideo_dl/extractor/drtv.py
@@ -7,13 +7,11 @@ import re
from .common import InfoExtractor
-from ..aes import aes_cbc_decrypt
+from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
from ..compat import compat_urllib_parse_unquote
from ..utils import (
- bytes_to_intlist,
ExtractorError,
int_or_none,
- intlist_to_bytes,
float_or_none,
mimetype2ext,
str_or_none,
@@ -191,13 +189,11 @@ class DRTVIE(InfoExtractor):
def decrypt_uri(e):
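+ # layout of e: 2-char prefix, 8 hex digits giving n, n hex chars of ciphertext, then the hex-encoded IV (which also salts the key)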
n = int(e[2:10], 16)
a = e[10 + n:]
- data = bytes_to_intlist(hex_to_bytes(e[10:10 + n]))
- key = bytes_to_intlist(hashlib.sha256(
- ('%s:sRBzYNXBzkKgnjj8pGtkACch' % a).encode('utf-8')).digest())
- iv = bytes_to_intlist(hex_to_bytes(a))
- decrypted = aes_cbc_decrypt(data, key, iv)
- return intlist_to_bytes(
- decrypted[:-decrypted[-1]]).decode('utf-8').split('?')[0]
+ data = hex_to_bytes(e[10:10 + n])
+ key = hashlib.sha256(('%s:sRBzYNXBzkKgnjj8pGtkACch' % a).encode('utf-8')).digest()
+ iv = hex_to_bytes(a)
+ decrypted = unpad_pkcs7(aes_cbc_decrypt_bytes(data, key, iv))
+ return decrypted.decode('utf-8').split('?')[0]
for asset in assets:
kind = asset.get('Kind')
@@ -321,7 +317,7 @@ class DRTVLiveIE(InfoExtractor):
channel_data = self._download_json(
'https://www.dr.dk/mu-online/api/1.0/channel/' + channel_id,
channel_id)
- title = self._live_title(channel_data['Title'])
+ title = channel_data['Title']
formats = []
for streaming_server in channel_data.get('StreamingServers', []):
diff --git a/hypervideo_dl/extractor/dvtv.py b/hypervideo_dl/extractor/dvtv.py
index de7f6d6..08663cf 100644
--- a/hypervideo_dl/extractor/dvtv.py
+++ b/hypervideo_dl/extractor/dvtv.py
@@ -8,6 +8,7 @@ from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
+ join_nonempty,
js_to_json,
mimetype2ext,
try_get,
@@ -139,13 +140,9 @@ class DVTVIE(InfoExtractor):
label = video.get('label')
height = self._search_regex(
r'^(\d+)[pP]', label or '', 'height', default=None)
- format_id = ['http']
- for f in (ext, label):
- if f:
- format_id.append(f)
formats.append({
'url': video_url,
- 'format_id': '-'.join(format_id),
+ 'format_id': join_nonempty('http', ext, label),
'height': int_or_none(height),
})
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/egghead.py b/hypervideo_dl/extractor/egghead.py
index f6b50e7..b6b8676 100644
--- a/hypervideo_dl/extractor/egghead.py
+++ b/hypervideo_dl/extractor/egghead.py
@@ -86,7 +86,6 @@ class EggheadLessonIE(EggheadBaseIE):
},
'params': {
'skip_download': True,
- 'format': 'bestvideo',
},
}, {
'url': 'https://egghead.io/api/v1/lessons/react-add-redux-to-a-react-application',
diff --git a/hypervideo_dl/extractor/ellentube.py b/hypervideo_dl/extractor/ellentube.py
index 5444732..d451bc0 100644
--- a/hypervideo_dl/extractor/ellentube.py
+++ b/hypervideo_dl/extractor/ellentube.py
@@ -26,7 +26,7 @@ class EllenTubeBaseIE(InfoExtractor):
duration = None
for entry in data.get('media'):
if entry.get('id') == 'm3u8':
- formats = self._extract_m3u8_formats(
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
entry['url'], video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id='hls')
duration = int_or_none(entry.get('duration'))
@@ -48,6 +48,7 @@ class EllenTubeBaseIE(InfoExtractor):
'view_count': get_insight('view'),
'like_count': get_insight('like'),
'formats': formats,
+ 'subtitles': subtitles,
}
diff --git a/hypervideo_dl/extractor/elonet.py b/hypervideo_dl/extractor/elonet.py
index eefba4e..9c6aea2 100644
--- a/hypervideo_dl/extractor/elonet.py
+++ b/hypervideo_dl/extractor/elonet.py
@@ -1,30 +1,22 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
- base_url,
- ExtractorError,
- try_get,
-)
-from ..compat import compat_str
+from ..utils import determine_ext
class ElonetIE(InfoExtractor):
_VALID_URL = r'https?://elonet\.finna\.fi/Record/kavi\.elonet_elokuva_(?P<id>[0-9]+)'
_TESTS = [{
- # m3u8 with subtitles
'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_107867',
- 'md5': '8efc954b96c543711707f87de757caea',
'info_dict': {
'id': '107867',
'ext': 'mp4',
'title': 'Valkoinen peura',
- 'description': 'Valkoinen peura (1952) on Erik Blombergin ohjaama ja yhdessä Mirjami Kuosmasen kanssa käsikirjoittama tarunomainen kertomus valkoisen peuran hahmossa lii...',
- 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_107867&index=0&size=large',
+ 'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_107867.+',
+ 'description': 'md5:bded4201c9677fab10854884fe8f7312',
},
+ 'params': {'skip_download': 'dash'},
}, {
# DASH with subtitles
'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_116539',
@@ -32,58 +24,45 @@ class ElonetIE(InfoExtractor):
'id': '116539',
'ext': 'mp4',
'title': 'Minulla on tiikeri',
- 'description': 'Pienellä pojalla, joka asuu kerrostalossa, on kotieläimenä tiikeri. Se on kuitenkin salaisuus. Kerrostalon räpätäti on Kotilaisen täti, joka on aina vali...',
- 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_116539&index=0&size=large&source=Solr',
- }
+ 'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_116539.+',
+ 'description': 'md5:5ab72b3fe76d3414e46cc8f277104419',
+ },
+ 'params': {'skip_download': 'dash'},
+ }, {
+ # Page with multiple videos; download the main one
+ 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_117396',
+ 'info_dict': {
+ 'id': '117396',
+ 'ext': 'mp4',
+ 'title': 'Sampo',
+ 'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_117396.+',
+ 'description': 'md5:ec69572a5b054d0ecafe8086b1fa96f7',
+ },
+ 'params': {'skip_download': 'dash'},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(
- r'<meta .*property="og&#x3A;title" .*content="(.+?)"', webpage, 'title')
- description = self._html_search_regex(
- r'<meta .*property="og&#x3A;description" .*content="(.+?)"', webpage, 'description')
- thumbnail = self._html_search_regex(
- r'<meta .*property="og&#x3A;image" .*content="(.+?)"', webpage, 'thumbnail')
+ src = self._parse_json(self._html_search_regex(
+ r'id=\'video-data\'[^>]+data-video-sources="([^"]+)"', webpage, 'json'), video_id)[0]['src']
+ ext = determine_ext(src)
- json_s = self._html_search_regex(
- r'data-video-sources="(.+?)"', webpage, 'json')
- src = try_get(
- self._parse_json(json_s, video_id),
- lambda x: x[0]["src"], compat_str)
- formats = []
- subtitles = {}
- if re.search(r'\.m3u8\??', src):
- res = self._download_webpage_handle(
- # elonet servers have certificate problems
- src.replace('https:', 'http:'), video_id,
- note='Downloading m3u8 information',
- errnote='Failed to download m3u8 information')
- if res:
- doc, urlh = res
- url = urlh.geturl()
- formats, subtitles = self._parse_m3u8_formats_and_subtitles(doc, url)
- for f in formats:
- f['ext'] = 'mp4'
- elif re.search(r'\.mpd\??', src):
- res = self._download_xml_handle(
- src, video_id,
- note='Downloading MPD manifest',
- errnote='Failed to download MPD manifest')
- if res:
- doc, urlh = res
- url = base_url(urlh.geturl())
- formats, subtitles = self._parse_mpd_formats_and_subtitles(doc, mpd_base_url=url)
+ if ext == 'm3u8':
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(src, video_id, fatal=False)
+ elif ext == 'mpd':
+ formats, subtitles = self._extract_mpd_formats_and_subtitles(src, video_id, fatal=False)
else:
- raise ExtractorError("Unknown streaming format")
+ formats, subtitles = [], {}
+ self.raise_no_formats(f'Unknown streaming format {ext}')
+ self._sort_formats(formats)
return {
'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats,
'subtitles': subtitles,
}
diff --git a/hypervideo_dl/extractor/engadget.py b/hypervideo_dl/extractor/engadget.py
index 65635c1..733bf32 100644
--- a/hypervideo_dl/extractor/engadget.py
+++ b/hypervideo_dl/extractor/engadget.py
@@ -7,16 +7,6 @@ class EngadgetIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?engadget\.com/video/(?P<id>[^/?#]+)'
_TESTS = [{
- # video with 5min ID
- 'url': 'http://www.engadget.com/video/518153925/',
- 'md5': 'c6820d4828a5064447a4d9fc73f312c9',
- 'info_dict': {
- 'id': '518153925',
- 'ext': 'mp4',
- 'title': 'Samsung Galaxy Tab Pro 8.4 Review',
- },
- 'add_ie': ['FiveMin'],
- }, {
# video with vidible ID
'url': 'https://www.engadget.com/video/57a28462134aa15a39f0421a/',
'only_matching': True,
diff --git a/hypervideo_dl/extractor/epicon.py b/hypervideo_dl/extractor/epicon.py
index b4e544d..cd19325 100644
--- a/hypervideo_dl/extractor/epicon.py
+++ b/hypervideo_dl/extractor/epicon.py
@@ -8,7 +8,7 @@ from ..utils import ExtractorError
class EpiconIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)(?:www\.)?epicon\.in/(?:documentaries|movies|tv-shows/[^/?#]+/[^/?#]+)/(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?epicon\.in/(?:documentaries|movies|tv-shows/[^/?#]+/[^/?#]+)/(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.epicon.in/documentaries/air-battle-of-srinagar',
'info_dict': {
@@ -84,7 +84,7 @@ class EpiconIE(InfoExtractor):
class EpiconSeriesIE(InfoExtractor):
- _VALID_URL = r'(?!.*season)(?:https?://)(?:www\.)?epicon\.in/tv-shows/(?P<id>[^/?#]+)'
+ _VALID_URL = r'(?!.*season)https?://(?:www\.)?epicon\.in/tv-shows/(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.epicon.in/tv-shows/1-of-something',
'playlist_mincount': 5,
diff --git a/hypervideo_dl/extractor/eroprofile.py b/hypervideo_dl/extractor/eroprofile.py
index a8396f1..5d5e7f2 100644
--- a/hypervideo_dl/extractor/eroprofile.py
+++ b/hypervideo_dl/extractor/eroprofile.py
@@ -39,11 +39,7 @@ class EroProfileIE(InfoExtractor):
'skip': 'Requires login',
}]
- def _login(self):
- (username, password) = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
query = compat_urllib_parse_urlencode({
'username': username,
'password': password,
@@ -62,9 +58,6 @@ class EroProfileIE(InfoExtractor):
r'<script[^>]+?src="([^"]+)"', login_page, 'login redirect url')
self._download_webpage(redirect_url, None, False)
- def _real_initialize(self):
- self._login()
-
def _real_extract(self, url):
display_id = self._match_id(url)
diff --git a/hypervideo_dl/extractor/ertgr.py b/hypervideo_dl/extractor/ertgr.py
new file mode 100644
index 0000000..19ce23f
--- /dev/null
+++ b/hypervideo_dl/extractor/ertgr.py
@@ -0,0 +1,316 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ clean_html,
+ determine_ext,
+ ExtractorError,
+ dict_get,
+ int_or_none,
+ merge_dicts,
+ parse_qs,
+ parse_age_limit,
+ parse_iso8601,
+ str_or_none,
+ try_get,
+ unescapeHTML,
+ url_or_none,
+ variadic,
+)
+
+
+class ERTFlixBaseIE(InfoExtractor):
+ def _call_api(
+ self, video_id, method='Player/AcquireContent', api_version=1,
+ param_headers=None, data=None, headers=None, **params):
+ platform_codename = {'platformCodename': 'www'}
+ headers_as_param = {'X-Api-Date-Format': 'iso', 'X-Api-Camel-Case': False}
+ headers_as_param.update(param_headers or {})
+ headers = headers or {}
+ if data:
+ headers['Content-Type'] = headers_as_param['Content-Type'] = 'application/json;charset=utf-8'
+ data = json.dumps(merge_dicts(platform_codename, data)).encode('utf-8')
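+ # this API also accepts request headers JSON-encoded in a '$headers' query parameter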
+ query = merge_dicts(
+ {} if data else platform_codename,
+ {'$headers': json.dumps(headers_as_param)},
+ params)
+ response = self._download_json(
+ 'https://api.app.ertflix.gr/v%s/%s' % (str(api_version), method),
+ video_id, fatal=False, query=query, data=data, headers=headers)
+ if try_get(response, lambda x: x['Result']['Success']) is True:
+ return response
+
+ def _call_api_get_tiles(self, video_id, *tile_ids):
+ requested_tile_ids = [video_id] + list(tile_ids)
+ requested_tiles = [{'Id': tile_id} for tile_id in requested_tile_ids]
+ tiles_response = self._call_api(
+ video_id, method='Tile/GetTiles', api_version=2,
+ data={'RequestedTiles': requested_tiles})
+ tiles = try_get(tiles_response, lambda x: x['Tiles'], list) or []
+ if tile_ids:
+ if sorted([tile['Id'] for tile in tiles]) != sorted(requested_tile_ids):
+ raise ExtractorError('Requested tiles not found', video_id=video_id)
+ return tiles
+ try:
+ return next(tile for tile in tiles if tile['Id'] == video_id)
+ except StopIteration:
+ raise ExtractorError('No matching tile found', video_id=video_id)
+
+
+class ERTFlixCodenameIE(ERTFlixBaseIE):
+ IE_NAME = 'ertflix:codename'
+ IE_DESC = 'ERTFLIX videos by codename'
+ _VALID_URL = r'ertflix:(?P<id>[\w-]+)'
+ _TESTS = [{
+ 'url': 'ertflix:monogramma-praxitelis-tzanoylinos',
+ 'md5': '5b9c2cd171f09126167e4082fc1dd0ef',
+ 'info_dict': {
+ 'id': 'monogramma-praxitelis-tzanoylinos',
+ 'ext': 'mp4',
+ 'title': 'md5:ef0b439902963d56c43ac83c3f41dd0e',
+ },
+ },
+ ]
+
+ def _extract_formats_and_subs(self, video_id, allow_none=True):
+ media_info = self._call_api(video_id, codename=video_id)
+ formats, subs = [], {}
+ for media_file in try_get(media_info, lambda x: x['MediaFiles'], list) or []:
+ for media in try_get(media_file, lambda x: x['Formats'], list) or []:
+ fmt_url = url_or_none(try_get(media, lambda x: x['Url']))
+ if not fmt_url:
+ continue
+ ext = determine_ext(fmt_url)
+ if ext == 'm3u8':
+ formats_, subs_ = self._extract_m3u8_formats_and_subtitles(
+ fmt_url, video_id, m3u8_id='hls', ext='mp4', fatal=False)
+ elif ext == 'mpd':
+ formats_, subs_ = self._extract_mpd_formats_and_subtitles(
+ fmt_url, video_id, mpd_id='dash', fatal=False)
+ else:
+ formats.append({
+ 'url': fmt_url,
+ 'format_id': str_or_none(media.get('Id')),
+ })
+ continue
+ formats.extend(formats_)
+ self._merge_subtitles(subs_, target=subs)
+
+ if formats or not allow_none:
+ self._sort_formats(formats)
+ return formats, subs
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ formats, subs = self._extract_formats_and_subs(video_id)
+
+ if formats:
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subs,
+ 'title': self._generic_title(url),
+ }
+
+
+class ERTFlixIE(ERTFlixBaseIE):
+ IE_NAME = 'ertflix'
+ IE_DESC = 'ERTFLIX videos'
+ _VALID_URL = r'https?://www\.ertflix\.gr/(?:series|vod)/(?P<id>[a-z]{3}\.\d+)'
+ _TESTS = [{
+ 'url': 'https://www.ertflix.gr/vod/vod.173258-aoratoi-ergates',
+ 'md5': '6479d5e60fd7e520b07ba5411dcdd6e7',
+ 'info_dict': {
+ 'id': 'aoratoi-ergates',
+ 'ext': 'mp4',
+ 'title': 'md5:c1433d598fbba0211b0069021517f8b4',
+ 'description': 'md5:01a64d113c31957eb7eb07719ab18ff4',
+ 'thumbnail': r're:https?://.+\.jpg',
+ 'episode_id': 'vod.173258',
+ 'timestamp': 1639648800,
+ 'upload_date': '20211216',
+ 'duration': 3166,
+ 'age_limit': 8,
+ },
+ }, {
+ 'url': 'https://www.ertflix.gr/series/ser.3448-monogramma',
+ 'info_dict': {
+ 'id': 'ser.3448',
+ 'age_limit': 8,
+ 'description': 'Η εκπομπή σαράντα ετών που σημάδεψε τον πολιτισμό μας.',
+ 'title': 'Μονόγραμμα',
+ },
+ 'playlist_mincount': 64,
+ }, {
+ 'url': 'https://www.ertflix.gr/series/ser.3448-monogramma?season=1',
+ 'info_dict': {
+ 'id': 'ser.3448',
+ 'age_limit': 8,
+ 'description': 'Η εκπομπή σαράντα ετών που σημάδεψε τον πολιτισμό μας.',
+ 'title': 'Μονόγραμμα',
+ },
+ 'playlist_count': 22,
+ }, {
+ 'url': 'https://www.ertflix.gr/series/ser.3448-monogramma?season=1&season=2021%20-%202022',
+ 'info_dict': {
+ 'id': 'ser.3448',
+ 'age_limit': 8,
+ 'description': 'Η εκπομπή σαράντα ετών που σημάδεψε τον πολιτισμό μας.',
+ 'title': 'Μονόγραμμα',
+ },
+ 'playlist_mincount': 36,
+ }, {
+ 'url': 'https://www.ertflix.gr/series/ser.164991-to-diktuo-1?season=1-9',
+ 'info_dict': {
+ 'id': 'ser.164991',
+ 'age_limit': 8,
+ 'description': 'Η πρώτη ελληνική εκπομπή με θεματολογία αποκλειστικά γύρω από το ίντερνετ.',
+ 'title': 'Το δίκτυο',
+ },
+ 'playlist_mincount': 9,
+ }]
+
+ def _extract_episode(self, episode):
+ codename = try_get(episode, lambda x: x['Codename'], compat_str)
+ title = episode.get('Title')
+ description = clean_html(dict_get(episode, ('ShortDescription', 'TinyDescription', )))
+ if not codename or not title or not episode.get('HasPlayableStream', True):
+ return
+ thumbnail = next((
+ url_or_none(thumb.get('Url'))
+ for thumb in variadic(dict_get(episode, ('Images', 'Image')) or {})
+ if thumb.get('IsMain')),
+ None)
+ return {
+ '_type': 'url_transparent',
+ 'thumbnail': thumbnail,
+ 'id': codename,
+ 'episode_id': episode.get('Id'),
+ 'title': title,
+ 'alt_title': episode.get('Subtitle'),
+ 'description': description,
+ 'timestamp': parse_iso8601(episode.get('PublishDate')),
+ 'duration': episode.get('DurationSeconds'),
+ 'age_limit': self._parse_age_rating(episode),
+ 'url': 'ertflix:%s' % (codename, ),
+ }
+
+ @staticmethod
+ def _parse_age_rating(info_dict):
+ return parse_age_limit(
+ info_dict.get('AgeRating')
+ or (info_dict.get('IsAdultContent') and 18)
+ or (info_dict.get('IsKidsContent') and 0))
+
+ def _extract_series(self, video_id, season_titles=None, season_numbers=None):
+ media_info = self._call_api(video_id, method='Tile/GetSeriesDetails', id=video_id)
+
+ series = try_get(media_info, lambda x: x['Series'], dict) or {}
+ series_info = {
+ 'age_limit': self._parse_age_rating(series),
+ 'title': series.get('Title'),
+ 'description': dict_get(series, ('ShortDescription', 'TinyDescription', )),
+ }
+ if season_numbers:
+ season_titles = season_titles or []
+ for season in try_get(series, lambda x: x['Seasons'], list) or []:
+ if season.get('SeasonNumber') in season_numbers and season.get('Title'):
+ season_titles.append(season['Title'])
+
+ def gen_episode(m_info, season_titles):
+ for episode_group in try_get(m_info, lambda x: x['EpisodeGroups'], list) or []:
+ if season_titles and episode_group.get('Title') not in season_titles:
+ continue
+ episodes = try_get(episode_group, lambda x: x['Episodes'], list)
+ if not episodes:
+ continue
+ season_info = {
+ 'season': episode_group.get('Title'),
+ 'season_number': int_or_none(episode_group.get('SeasonNumber')),
+ }
+ try:
+ episodes = [(int(ep['EpisodeNumber']), ep) for ep in episodes]
+ episodes.sort()
+ except (KeyError, ValueError):
+ episodes = enumerate(episodes, 1)
+ for n, episode in episodes:
+ info = self._extract_episode(episode)
+ if info is None:
+ continue
+ info['episode_number'] = n
+ info.update(season_info)
+ yield info
+
+ return self.playlist_result(
+ gen_episode(media_info, season_titles), playlist_id=video_id, **series_info)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ if video_id.startswith('ser.'):
+ param_season = parse_qs(url).get('season', [None])
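+ # '?season=' values may be numeric season numbers or textual season titles; split them into the matching kwargs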
+ param_season = [
+ (have_number, int_or_none(v) if have_number else str_or_none(v))
+ for have_number, v in
+ [(int_or_none(ps) is not None, ps) for ps in param_season]
+ if v is not None
+ ]
+ season_kwargs = {
+ k: [v for is_num, v in param_season if is_num is c] or None
+ for k, c in
+ [('season_titles', False), ('season_numbers', True)]
+ }
+ return self._extract_series(video_id, **season_kwargs)
+
+ return self._extract_episode(self._call_api_get_tiles(video_id))
+
+
+class ERTWebtvEmbedIE(InfoExtractor):
+ IE_NAME = 'ertwebtv:embed'
+ IE_DESC = 'ert.gr webtv embedded videos'
+ _BASE_PLAYER_URL_RE = re.escape('//www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php')
+ _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?f=(?P<id>[^#&]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php?f=trailers/E2251_TO_DIKTYO_E09_16-01_1900.mp4&bgimg=/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg',
+ 'md5': 'f9e9900c25c26f4ecfbddbb4b6305854',
+ 'info_dict': {
+ 'id': 'trailers/E2251_TO_DIKTYO_E09_16-01_1900.mp4',
+ 'title': 'md5:914f06a73cd8b62fbcd6fb90c636e497',
+ 'ext': 'mp4',
+ 'thumbnail': 'https://program.ert.gr/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg'
+ },
+ }]
+
+ @classmethod
+ def _extract_urls(cls, webpage):
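+ # EMBED_URL_RE backreferences the quote group (?P=_q1), which only exists once interpolated into EMBED_RE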
+ EMBED_URL_RE = rf'(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
+ EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{EMBED_URL_RE})(?P=_q1)'
+
+ for mobj in re.finditer(EMBED_RE, webpage):
+ url = unescapeHTML(mobj.group('url'))
+ if not cls.suitable(url):
+ continue
+ yield url
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ formats, subs = self._extract_m3u8_formats_and_subtitles(
+ f'https://mediastream.ert.gr/vodedge/_definst_/mp4:dvrorigin/{video_id}/playlist.m3u8',
+ video_id, 'mp4')
+ self._sort_formats(formats)
+ thumbnail_id = parse_qs(url).get('bgimg', [None])[0]
+ if thumbnail_id and not thumbnail_id.startswith('http'):
+ thumbnail_id = f'https://program.ert.gr{thumbnail_id}'
+ return {
+ 'id': video_id,
+ 'title': f'VOD - {video_id}',
+ 'thumbnail': thumbnail_id,
+ 'formats': formats,
+ 'subtitles': subs,
+ }
diff --git a/hypervideo_dl/extractor/espn.py b/hypervideo_dl/extractor/espn.py
index d4a66c2..dc50f3b 100644
--- a/hypervideo_dl/extractor/espn.py
+++ b/hypervideo_dl/extractor/espn.py
@@ -7,7 +7,9 @@ from .once import OnceIE
from ..compat import compat_str
from ..utils import (
determine_ext,
+ dict_get,
int_or_none,
+ unified_strdate,
unified_timestamp,
)
@@ -236,3 +238,44 @@ class FiveThirtyEightIE(InfoExtractor):
webpage, 'embed url')
return self.url_result(embed_url, 'AbcNewsVideo')
+
+
+class ESPNCricInfoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?espncricinfo\.com/video/[^#$&?/]+-(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.espncricinfo.com/video/finch-chasing-comes-with-risks-despite-world-cup-trend-1289135',
+ 'info_dict': {
+ 'id': '1289135',
+ 'ext': 'mp4',
+ 'title': 'Finch: Chasing comes with \'risks\' despite World Cup trend',
+ 'description': 'md5:ea32373303e25efbb146efdfc8a37829',
+ 'upload_date': '20211113',
+ 'duration': 96,
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ data_json = self._download_json(f'https://hs-consumer-api.espncricinfo.com/v1/pages/video/video-details?videoId={id}', id)['video']
+ formats, subtitles = [], {}
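+ # HLS entries carry the video streams; AUDIO entries are audio-only renditions (other types are skipped)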
+ for item in data_json.get('playbacks') or []:
+ if item.get('type') == 'HLS' and item.get('url'):
+ m3u8_frmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(item['url'], id)
+ formats.extend(m3u8_frmts)
+ subtitles = self._merge_subtitles(subtitles, m3u8_subs)
+ elif item.get('type') == 'AUDIO' and item.get('url'):
+ formats.append({
+ 'url': item['url'],
+ 'vcodec': 'none',
+ })
+ self._sort_formats(formats)
+ return {
+ 'id': id,
+ 'title': data_json.get('title'),
+ 'description': data_json.get('summary'),
+ 'upload_date': unified_strdate(dict_get(data_json, ('publishedAt', 'recordedAt'))),
+ 'duration': data_json.get('duration'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
diff --git a/hypervideo_dl/extractor/europeantour.py b/hypervideo_dl/extractor/europeantour.py
new file mode 100644
index 0000000..e28f067
--- /dev/null
+++ b/hypervideo_dl/extractor/europeantour.py
@@ -0,0 +1,37 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class EuropeanTourIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?europeantour\.com/dpworld-tour/news/video/(?P<id>[^/&?#$]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.europeantour.com/dpworld-tour/news/video/the-best-shots-of-the-2021-seasons/',
+ 'info_dict': {
+ 'id': '6287788195001',
+ 'ext': 'mp4',
+ 'title': 'The best shots of the 2021 seasons',
+ 'duration': 2416.512,
+ 'timestamp': 1640010141,
+ 'uploader_id': '5136026580001',
+ 'tags': ['prod-imported'],
+ 'thumbnail': 'md5:fdac52bc826548860edf8145ee74e71a',
+ 'upload_date': '20211220'
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ vid, aid = re.search(r'(?s)brightcove-player\s?video-id="([^"]+)".*"ACCOUNT_ID":"([^"]+)"', webpage).groups()
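+ # fall back to the default Brightcove account id when the page does not expose one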
+ if not aid:
+ aid = '5136026580001'
+ return self.url_result(
+ self.BRIGHTCOVE_URL_TEMPLATE % (aid, vid), 'BrightcoveNew')
diff --git a/hypervideo_dl/extractor/euscreen.py b/hypervideo_dl/extractor/euscreen.py
index 3980c23..2759e74 100644
--- a/hypervideo_dl/extractor/euscreen.py
+++ b/hypervideo_dl/extractor/euscreen.py
@@ -10,7 +10,7 @@ from ..utils import (
class EUScreenIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)(?:www\.)?euscreen\.eu/item.html\?id=(?P<id>[^&?$/]+)'
+ _VALID_URL = r'https?://(?:www\.)?euscreen\.eu/item.html\?id=(?P<id>[^&?$/]+)'
_TESTS = [{
'url': 'https://euscreen.eu/item.html?id=EUS_0EBCBF356BFC4E12A014023BA41BD98C',
diff --git a/hypervideo_dl/extractor/extractors.py b/hypervideo_dl/extractor/extractors.py
index f4f817f..457f4c2 100644
--- a/hypervideo_dl/extractor/extractors.py
+++ b/hypervideo_dl/extractor/extractors.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
from .abc import (
ABCIE,
ABCIViewIE,
+ ABCIViewShowSeriesIE,
)
from .abcnews import (
AbcNewsIE,
@@ -13,6 +14,10 @@ from .abcotvs import (
ABCOTVSIE,
ABCOTVSClipsIE,
)
+from .abematv import (
+ AbemaTVIE,
+ AbemaTVTitleIE,
+)
from .academicearth import AcademicEarthCourseIE
from .acast import (
ACastIE,
@@ -36,7 +41,10 @@ from .aenetworks import (
HistoryPlayerIE,
BiographyIE,
)
-from .afreecatv import AfreecaTVIE
+from .afreecatv import (
+ AfreecaTVIE,
+ AfreecaTVLiveIE,
+)
from .airmozilla import AirMozillaIE
from .aljazeera import AlJazeeraIE
from .alphaporno import AlphaPornoIE
@@ -50,6 +58,7 @@ from .animelab import (
AnimeLabIE,
AnimeLabShowsIE,
)
+from .amazon import AmazonStoreIE
from .americastestkitchen import (
AmericasTestKitchenIE,
AmericasTestKitchenSeasonIE,
@@ -59,6 +68,10 @@ from .anvato import AnvatoIE
from .aol import AolIE
from .allocine import AllocineIE
from .aliexpress import AliExpressLiveIE
+from .alsace20tv import (
+ Alsace20TVIE,
+ Alsace20TVEmbedIE,
+)
from .apa import APAIE
from .aparat import AparatIE
from .appleconnect import AppleConnectIE
@@ -82,6 +95,7 @@ from .arte import (
ArteTVIE,
ArteTVEmbedIE,
ArteTVPlaylistIE,
+ ArteTVCategoryIE,
)
from .arnes import ArnesIE
from .asiancrush import (
@@ -108,12 +122,16 @@ from .awaan import (
)
from .azmedien import AZMedienIE
from .baidu import BaiduVideoIE
+from .banbye import (
+ BanByeIE,
+ BanByeChannelIE,
+)
from .bandaichannel import BandaiChannelIE
from .bandcamp import (
BandcampIE,
BandcampAlbumIE,
BandcampWeeklyIE,
- BandcampMusicIE,
+ BandcampUserIE,
)
from .bannedvideo import BannedVideoIE
from .bbc import (
@@ -137,6 +155,7 @@ from .bfmtv import (
)
from .bibeltv import BibelTVIE
from .bigflix import BigflixIE
+from .bigo import BigoIE
from .bild import BildIE
from .bilibili import (
BiliBiliIE,
@@ -165,6 +184,7 @@ from .bleacherreport import (
BleacherReportIE,
BleacherReportCMSIE,
)
+from .blogger import BloggerIE
from .bloomberg import BloombergIE
from .bokecc import BokeCCIE
from .bongacams import BongaCamsIE
@@ -177,6 +197,7 @@ from .br import (
)
from .bravotv import BravoTVIE
from .breakcom import BreakIE
+from .breitbart import BreitBartIE
from .brightcove import (
BrightcoveLegacyIE,
BrightcoveNewIE,
@@ -185,6 +206,9 @@ from .businessinsider import BusinessInsiderIE
from .buzzfeed import BuzzFeedIE
from .byutv import BYUtvIE
from .c56 import C56IE
+from .cableav import CableAVIE
+from .callin import CallinIE
+from .caltrans import CaltransIE
from .cam4 import CAM4IE
from .camdemy import (
CamdemyIE,
@@ -192,6 +216,7 @@ from .camdemy import (
)
from .cammodels import CamModelsIE
from .camwithher import CamWithHerIE
+from .canalalpha import CanalAlphaIE
from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
from .canvas import (
@@ -235,10 +260,7 @@ from .ccc import (
from .ccma import CCMAIE
from .cctv import CCTVIE
from .cda import CDAIE
-from .ceskatelevize import (
- CeskaTelevizeIE,
- CeskaTelevizePoradyIE,
-)
+from .ceskatelevize import CeskaTelevizeIE
from .cgtn import CGTNIE
from .channel9 import Channel9IE
from .charlierose import CharlieRoseIE
@@ -293,25 +315,41 @@ from .commonprotocols import (
from .condenast import CondeNastIE
from .contv import CONtvIE
from .corus import CorusIE
+from .cpac import (
+ CPACIE,
+ CPACPlaylistIE,
+)
+from .cozytv import CozyTVIE
from .cracked import CrackedIE
from .crackle import CrackleIE
+from .craftsy import CraftsyIE
from .crooksandliars import CrooksAndLiarsIE
+from .crowdbunker import (
+ CrowdBunkerIE,
+ CrowdBunkerChannelIE,
+)
from .crunchyroll import (
CrunchyrollIE,
CrunchyrollShowPlaylistIE,
CrunchyrollBetaIE,
CrunchyrollBetaShowIE,
)
-from .cspan import CSpanIE
+from .cspan import CSpanIE, CSpanCongressIE
from .ctsnews import CtsNewsIE
from .ctv import CTVIE
from .ctvnews import CTVNewsIE
from .cultureunplugged import CultureUnpluggedIE
from .curiositystream import (
CuriosityStreamIE,
- CuriosityStreamCollectionIE,
+ CuriosityStreamCollectionsIE,
+ CuriosityStreamSeriesIE,
)
from .cwtv import CWTVIE
+from .cybrary import (
+ CybraryIE,
+ CybraryCourseIE
+)
+from .daftsex import DaftsexIE
from .dailymail import DailyMailIE
from .dailymotion import (
DailymotionIE,
@@ -328,6 +366,7 @@ from .daum import (
DaumPlaylistIE,
DaumUserIE,
)
+from .daystar import DaystarClipIE
from .dbtv import DBTVIE
from .dctp import DctpTvIE
from .deezer import (
@@ -338,10 +377,6 @@ from .democracynow import DemocracynowIE
from .dfb import DFBIE
from .dhm import DHMIE
from .digg import DiggIE
-from .discoveryplusindia import (
- DiscoveryPlusIndiaIE,
- DiscoveryPlusIndiaShowIE,
-)
from .dotsub import DotsubIE
from .douyutv import (
DouyuShowIE,
@@ -351,9 +386,24 @@ from .dplay import (
DPlayIE,
DiscoveryPlusIE,
HGTVDeIE,
+ GoDiscoveryIE,
+ TravelChannelIE,
+ CookingChannelIE,
+ HGTVUsaIE,
+ FoodNetworkIE,
+ InvestigationDiscoveryIE,
+ DestinationAmericaIE,
+ AmHistoryChannelIE,
ScienceChannelIE,
DIYNetworkIE,
- AnimalPlanetIE
+ DiscoveryLifeIE,
+ AnimalPlanetIE,
+ TLCIE,
+ DiscoveryPlusIndiaIE,
+ DiscoveryNetworksDeIE,
+ DiscoveryPlusItalyIE,
+ DiscoveryPlusItalyShowIE,
+ DiscoveryPlusIndiaShowIE,
)
from .dreisat import DreiSatIE
from .drbonanza import DRBonanzaIE
@@ -370,17 +420,16 @@ from .duboku import (
)
from .dumpert import DumpertIE
from .defense import DefenseGouvFrIE
+from .digitalconcerthall import DigitalConcertHallIE
from .discovery import DiscoveryIE
-from .discoverygo import (
- DiscoveryGoIE,
- DiscoveryGoPlaylistIE,
-)
-from .discoverynetworks import DiscoveryNetworksDeIE
-from .discoveryvr import DiscoveryVRIE
from .disney import DisneyIE
from .dispeak import DigitallySpeakingIE
from .doodstream import DoodStreamIE
from .dropbox import DropboxIE
+from .dropout import (
+ DropoutSeasonIE,
+ DropoutIE
+)
from .dw import (
DWIE,
DWArticleIE,
@@ -414,14 +463,21 @@ from .eroprofile import (
EroProfileIE,
EroProfileAlbumIE,
)
+from .ertgr import (
+ ERTFlixCodenameIE,
+ ERTFlixIE,
+ ERTWebtvEmbedIE,
+)
from .escapist import EscapistIE
from .espn import (
ESPNIE,
ESPNArticleIE,
FiveThirtyEightIE,
+ ESPNCricInfoIE,
)
from .esri import EsriVideoIE
from .europa import EuropaIE
+from .europeantour import EuropeanTourIE
from .euscreen import EUScreenIE
from .expotv import ExpoTVIE
from .expressen import ExpressenIE
@@ -430,6 +486,7 @@ from .eyedotv import EyedoTVIE
from .facebook import (
FacebookIE,
FacebookPluginsVideoIE,
+ FacebookRedirectURLIE,
)
from .fancode import (
FancodeVodIE,
@@ -440,6 +497,7 @@ from .faz import FazIE
from .fc2 import (
FC2IE,
FC2EmbedIE,
+ FC2LiveIE,
)
from .fczenit import FczenitIE
from .filmmodu import FilmmoduIE
@@ -449,7 +507,6 @@ from .filmon import (
)
from .filmweb import FilmwebIE
from .firsttv import FirstTVIE
-from .fivemin import FiveMinIE
from .fivetv import FiveTVIE
from .flickr import FlickrIE
from .folketinget import FolketingetIE
@@ -472,6 +529,7 @@ from .foxnews import (
FoxNewsArticleIE,
)
from .foxsports import FoxSportsIE
+from .fptplay import FptplayIE
from .franceculture import FranceCultureIE
from .franceinter import FranceInterIE
from .francetv import (
@@ -481,7 +539,6 @@ from .francetv import (
)
from .freesound import FreesoundIE
from .freespeech import FreespeechIE
-from .freshlive import FreshLiveIE
from .frontendmasters import (
FrontendMastersIE,
FrontendMastersLessonIE,
@@ -495,9 +552,20 @@ from .funimation import (
)
from .funk import FunkIE
from .fusion import FusionIE
-from .gab import GabTVIE
+from .gab import (
+ GabTVIE,
+ GabIE,
+)
from .gaia import GaiaIE
from .gameinformer import GameInformerIE
+from .gamejolt import (
+ GameJoltIE,
+ GameJoltUserIE,
+ GameJoltGameIE,
+ GameJoltGameSoundtrackIE,
+ GameJoltCommunityIE,
+ GameJoltSearchIE,
+)
from .gamespot import GameSpotIE
from .gamestar import GameStarIE
from .gaskrank import GaskrankIE
@@ -505,7 +573,10 @@ from .gazeta import GazetaIE
from .gdcvault import GDCVaultIE
from .gedidigital import GediDigitalIE
from .generic import GenericIE
-from .gettr import GettrIE
+from .gettr import (
+ GettrIE,
+ GettrStreamingIE,
+)
from .gfycat import GfycatIE
from .giantbomb import GiantBombIE
from .giga import GigaIE
@@ -516,6 +587,7 @@ from .globo import (
)
from .go import GoIE
from .godtube import GodTubeIE
+from .gofile import GofileIE
from .golem import GolemIE
from .googledrive import GoogleDriveIE
from .googlepodcasts import (
@@ -541,7 +613,6 @@ from .hidive import HiDiveIE
from .historicfilms import HistoricFilmsIE
from .hitbox import HitboxIE, HitboxLiveIE
from .hitrecord import HitRecordIE
-from .hornbunny import HornBunnyIE
from .hotnewhiphop import HotNewHipHopIE
from .hotstar import (
HotStarIE,
@@ -555,7 +626,12 @@ from .hrti import (
HRTiIE,
HRTiPlaylistIE,
)
+from .hse import (
+ HSEShowIE,
+ HSEProductIE,
+)
from .huajiao import HuajiaoIE
+from .huya import HuyaLiveIE
from .huffpost import HuffPostIE
from .hungama import (
HungamaIE,
@@ -591,14 +667,28 @@ from .indavideo import IndavideoEmbedIE
from .infoq import InfoQIE
from .instagram import (
InstagramIE,
+ InstagramIOSIE,
InstagramUserIE,
InstagramTagIE,
+ InstagramStoryIE,
)
from .internazionale import InternazionaleIE
from .internetvideoarchive import InternetVideoArchiveIE
-from .iprima import IPrimaIE
-from .iqiyi import IqiyiIE
-from .ir90tv import Ir90TvIE
+from .iprima import (
+ IPrimaIE,
+ IPrimaCNNIE
+)
+from .iqiyi import (
+ IqiyiIE,
+ IqIE,
+ IqAlbumIE
+)
+
+from .itprotv import (
+ ITProTVIE,
+ ITProTVCourseIE
+)
+
from .itv import (
ITVIE,
ITVBTCCIE,
@@ -620,10 +710,10 @@ from .joj import JojIE
from .jwplatform import JWPlatformIE
from .kakao import KakaoIE
from .kaltura import KalturaIE
-from .kankan import KankanIE
from .karaoketv import KaraoketvIE
from .karrierevideos import KarriereVideosIE
from .keezmovies import KeezMoviesIE
+from .kelbyone import KelbyOneIE
from .ketnet import KetnetIE
from .khanacademy import (
KhanAcademyIE,
@@ -656,6 +746,11 @@ from .laola1tv import (
EHFTVIE,
ITTFIE,
)
+from .lastfm import (
+ LastFMIE,
+ LastFMPlaylistIE,
+ LastFMUserIE,
+)
from .lbry import (
LBRYIE,
LBRYChannelIE,
@@ -691,11 +786,11 @@ from .limelight import (
LimelightChannelListIE,
)
from .line import (
- LineTVIE,
LineLiveIE,
LineLiveChannelIE,
)
from .linkedin import (
+ LinkedInIE,
LinkedInLearningIE,
LinkedInLearningCourseIE,
)
@@ -707,7 +802,10 @@ from .livestream import (
LivestreamOriginalIE,
LivestreamShortenerIE,
)
-from .lnkgo import LnkGoIE
+from .lnkgo import (
+ LnkGoIE,
+ LnkIE,
+)
from .localnews8 import LocalNews8IE
from .lovehomeporn import LoveHomePornIE
from .lrt import LRTIE
@@ -722,6 +820,7 @@ from .mailru import (
MailRuMusicIE,
MailRuMusicSearchIE,
)
+from .mainstreaming import MainStreamingIE
from .malltv import MallTVIE
from .mangomolo import (
MangomoloVideoIE,
@@ -744,7 +843,10 @@ from .mdr import MDRIE
from .medaltv import MedalTVIE
from .mediaite import MediaiteIE
from .mediaklikk import MediaKlikkIE
-from .mediaset import MediasetIE
+from .mediaset import (
+ MediasetIE,
+ MediasetShowIE,
+)
from .mediasite import (
MediasiteIE,
MediasiteCatalogIE,
@@ -760,6 +862,7 @@ from .metacritic import MetacriticIE
from .mgoon import MgoonIE
from .mgtv import MGTVIE
from .miaopai import MiaoPaiIE
+from .microsoftstream import MicrosoftStreamIE
from .microsoftvirtualacademy import (
MicrosoftVirtualAcademyIE,
MicrosoftVirtualAcademyCourseIE,
@@ -767,6 +870,7 @@ from .microsoftvirtualacademy import (
from .mildom import (
MildomIE,
MildomVodIE,
+ MildomClipIE,
MildomUserVodIE,
)
from .minds import (
@@ -783,6 +887,10 @@ from .mirrativ import (
)
from .mit import TechTVMITIE, OCWMITIE
from .mitele import MiTeleIE
+from .mixch import (
+ MixchIE,
+ MixchArchiveIE,
+)
from .mixcloud import (
MixcloudIE,
MixcloudUserIE,
@@ -792,6 +900,7 @@ from .mlb import (
MLBIE,
MLBVideoIE,
)
+from .mlssoccer import MLSSoccerIE
from .mnet import MnetIE
from .moevideo import MoeVideoIE
from .mofosex import (
@@ -819,7 +928,14 @@ from .mtv import (
MTVItaliaProgrammaIE,
)
from .muenchentv import MuenchenTVIE
+from .murrtube import MurrtubeIE, MurrtubeUserIE
from .musescore import MuseScoreIE
+from .musicdex import (
+ MusicdexSongIE,
+ MusicdexAlbumIE,
+ MusicdexArtistIE,
+ MusicdexPlaylistIE,
+)
from .mwave import MwaveIE, MwaveMeetGreetIE
from .mxplayer import (
MxplayerIE,
@@ -834,7 +950,14 @@ from .myvi import (
)
from .myvideoge import MyVideoGeIE
from .myvidster import MyVidsterIE
-from .n1 import N1InfoIIE, N1InfoAssetIE
+from .n1 import (
+ N1InfoAssetIE,
+ N1InfoIIE,
+)
+from .nate import (
+ NateIE,
+ NateProgramIE,
+)
from .nationalgeographic import (
NationalGeographicVideoIE,
NationalGeographicTVIE,
@@ -868,7 +991,10 @@ from .ndr import (
NJoyEmbedIE,
)
from .ndtv import NDTVIE
-from .nebula import NebulaIE
+from .nebula import (
+ NebulaIE,
+ NebulaCollectionIE,
+)
from .nerdcubed import NerdCubedFeedIE
from .netzkino import NetzkinoIE
from .neteasemusic import (
@@ -886,6 +1012,7 @@ from .newgrounds import (
NewgroundsUserIE,
)
from .newstube import NewstubeIE
+from .newsy import NewsyIE
from .nextmedia import (
NextMediaIE,
NextMediaActionNewsIE,
@@ -896,6 +1023,7 @@ from .nexx import (
NexxIE,
NexxEmbedIE,
)
+from .nfb import NFBIE
from .nfhsnetwork import NFHSNetworkIE
from .nfl import (
NFLIE,
@@ -904,6 +1032,9 @@ from .nfl import (
from .nhk import (
NhkVodIE,
NhkVodProgramIE,
+ NhkForSchoolBangumiIE,
+ NhkForSchoolSubjectIE,
+ NhkForSchoolProgramListIE,
)
from .nhl import NHLIE
from .nick import (
@@ -913,16 +1044,21 @@ from .nick import (
NickNightIE,
NickRuIE,
)
-
from .niconico import (
NiconicoIE,
NiconicoPlaylistIE,
NiconicoUserIE,
+ NiconicoSeriesIE,
+ NiconicoHistoryIE,
NicovideoSearchDateIE,
NicovideoSearchIE,
NicovideoSearchURLIE,
+ NicovideoTagURLIE,
+)
+from .ninecninemedia import (
+ NineCNineMediaIE,
+ CPTwentyFourIE,
)
-from .ninecninemedia import NineCNineMediaIE
from .ninegag import NineGagIE
from .ninenow import NineNowIE
from .nintendo import NintendoIE
@@ -930,6 +1066,7 @@ from .nitter import NitterIE
from .njpwworld import NJPWWorldIE
from .nobelprize import NobelPrizeIE
from .nonktube import NonkTubeIE
+from .noodlemagazine import NoodleMagazineIE
from .noovo import NoovoIE
from .normalboots import NormalbootsIE
from .nosvideo import NosVideoIE
@@ -986,6 +1123,7 @@ from .oktoberfesttv import OktoberfestTVIE
from .olympics import OlympicsReplayIE
from .on24 import On24IE
from .ondemandkorea import OnDemandKoreaIE
+from .onefootball import OneFootballIE
from .onet import (
OnetIE,
OnetChannelIE,
@@ -997,9 +1135,14 @@ from .ooyala import (
OoyalaIE,
OoyalaExternalIE,
)
+from .opencast import (
+ OpencastIE,
+ OpencastPlaylistIE,
+)
from .openrec import (
OpenRecIE,
OpenRecCaptureIE,
+ OpenRecMovieIE,
)
from .ora import OraTVIE
from .orf import (
@@ -1030,6 +1173,11 @@ from .palcomp3 import (
PalcoMP3VideoIE,
)
from .pandoratv import PandoraTVIE
+from .panopto import (
+ PanoptoIE,
+ PanoptoListIE,
+ PanoptoPlaylistIE
+)
from .paramountplus import (
ParamountPlusIE,
ParamountPlusSeriesIE,
@@ -1042,10 +1190,12 @@ from .patreon import (
)
from .pbs import PBSIE
from .pearvideo import PearVideoIE
+from .peekvids import PeekVidsIE, PlayVidsIE
from .peertube import (
PeerTubeIE,
PeerTubePlaylistIE,
)
+from .peertv import PeerTVIE
from .peloton import (
PelotonIE,
PelotonLiveIE
@@ -1059,6 +1209,7 @@ from .periscope import (
from .philharmoniedeparis import PhilharmonieDeParisIE
from .phoenix import PhoenixIE
from .photobucket import PhotobucketIE
+from .piapro import PiaproIE
from .picarto import (
PicartoIE,
PicartoVodIE,
@@ -1069,7 +1220,12 @@ from .pinterest import (
PinterestIE,
PinterestCollectionIE,
)
+from .pixivsketch import (
+ PixivSketchIE,
+ PixivSketchUserIE,
+)
from .pladform import PladformIE
+from .planetmarathi import PlanetMarathiIE
from .platzi import (
PlatziIE,
PlatziCourseIE,
@@ -1090,10 +1246,20 @@ from .podomatic import PodomaticIE
from .pokemon import (
PokemonIE,
PokemonWatchIE,
+ PokemonSoundLibraryIE,
)
+from .pokergo import (
+ PokerGoIE,
+ PokerGoCollectionIE,
+)
+from .polsatgo import PolsatGoIE
from .polskieradio import (
PolskieRadioIE,
PolskieRadioCategoryIE,
+ PolskieRadioPlayerIE,
+ PolskieRadioPodcastIE,
+ PolskieRadioPodcastListIE,
+ PolskieRadioRadioKierowcowIE,
)
from .popcorntimes import PopcorntimesIE
from .popcorntv import PopcornTVIE
@@ -1111,6 +1277,7 @@ from .pornhub import (
from .pornotube import PornotubeIE
from .pornovoisines import PornoVoisinesIE
from .pornoxo import PornoXOIE
+from .pornez import PornezIE
from .puhutv import (
PuhuTVIE,
PuhuTVSerieIE,
@@ -1118,6 +1285,13 @@ from .puhutv import (
from .presstv import PressTVIE
from .projectveritas import ProjectVeritasIE
from .prosiebensat1 import ProSiebenSat1IE
+from .prx import (
+ PRXStoryIE,
+ PRXSeriesIE,
+ PRXAccountIE,
+ PRXStoriesSearchIE,
+ PRXSeriesSearchIE
+)
from .puls4 import Puls4IE
from .pyvideo import PyvideoIE
from .qqmusic import (
@@ -1140,6 +1314,11 @@ from .radiode import RadioDeIE
from .radiojavan import RadioJavanIE
from .radiobremen import RadioBremenIE
from .radiofrance import RadioFranceIE
+from .radiozet import RadioZetPodcastIE
+from .radiokapital import (
+ RadioKapitalIE,
+ RadioKapitalShowIE,
+)
from .radlive import (
RadLiveIE,
RadLiveChannelIE,
@@ -1149,6 +1328,9 @@ from .rai import (
RaiPlayIE,
RaiPlayLiveIE,
RaiPlayPlaylistIE,
+ RaiPlaySoundIE,
+ RaiPlaySoundLiveIE,
+ RaiPlaySoundPlaylistIE,
RaiIE,
)
from .raywenderlich import (
@@ -1173,9 +1355,11 @@ from .redbulltv import (
RedBullTVRrnContentIE,
RedBullIE,
)
-from .reddit import (
- RedditIE,
- RedditRIE,
+from .reddit import RedditIE
+from .redgifs import (
+ RedGifsIE,
+ RedGifsSearchIE,
+ RedGifsUserIE,
)
from .redtube import RedTubeIE
from .regiotv import RegioTVIE
@@ -1188,11 +1372,14 @@ from .reuters import ReutersIE
from .reverbnation import ReverbNationIE
from .rice import RICEIE
from .rmcdecouverte import RMCDecouverteIE
-from .ro220 import Ro220IE
from .rockstargames import RockstarGamesIE
-from .roosterteeth import RoosterTeethIE
+from .rokfin import (
+ RokfinIE,
+ RokfinStackIE,
+ RokfinChannelIE,
+)
+from .roosterteeth import RoosterTeethIE, RoosterTeethSeriesIE
from .rottentomatoes import RottenTomatoesIE
-from .roxwel import RoxwelIE
from .rozhlas import RozhlasIE
from .rtbf import RTBFIE
from .rte import RteIE, RteRadioIE
@@ -1202,12 +1389,26 @@ from .rtl2 import (
RTL2YouIE,
RTL2YouSeriesIE,
)
+from .rtnews import (
+ RTNewsIE,
+ RTDocumentryIE,
+ RTDocumentryPlaylistIE,
+ RuptlyIE,
+)
from .rtp import RTPIE
+from .rtrfm import RTRFMIE
from .rts import RTSIE
-from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE
+from .rtve import (
+ RTVEALaCartaIE,
+ RTVEAudioIE,
+ RTVELiveIE,
+ RTVEInfantilIE,
+ RTVETelevisionIE,
+)
from .rtvnh import RTVNHIE
from .rtvs import RTVSIE
from .ruhd import RUHDIE
+from .rule34video import Rule34VideoIE
from .rumble import (
RumbleEmbedIE,
RumbleChannelIE,
@@ -1219,10 +1420,27 @@ from .rutube import (
RutubeMovieIE,
RutubePersonIE,
RutubePlaylistIE,
+ RutubeTagsIE,
+)
+from .glomex import (
+ GlomexIE,
+ GlomexEmbedIE,
+)
+from .megatvcom import (
+ MegaTVComIE,
+ MegaTVComEmbedIE,
+)
+from .ant1newsgr import (
+ Ant1NewsGrWatchIE,
+ Ant1NewsGrArticleIE,
+ Ant1NewsGrEmbedIE,
)
from .rutv import RUTVIE
from .ruutu import RuutuIE
-from .ruv import RuvIE
+from .ruv import (
+ RuvIE,
+ RuvSpilaIE
+)
from .safari import (
SafariIE,
SafariApiIE,
@@ -1244,7 +1462,7 @@ from .scte import (
SCTECourseIE,
)
from .seeker import SeekerIE
-from .senateisvp import SenateISVPIE
+from .senategov import SenateISVPIE, SenateGovIE
from .sendtonews import SendtoNewsIE
from .servus import ServusIE
from .sevenplus import SevenPlusIE
@@ -1270,6 +1488,7 @@ from .simplecast import (
)
from .sina import SinaIE
from .sixplay import SixPlayIE
+from .skeb import SkebIE
from .skyit import (
SkyItPlayerIE,
SkyItVideoIE,
@@ -1288,6 +1507,7 @@ from .skynewsarabia import (
from .skynewsau import SkyNewsAUIE
from .sky import (
SkyNewsIE,
+ SkyNewsStoryIE,
SkySportsIE,
SkySportsNewsIE,
)
@@ -1304,6 +1524,7 @@ from .soundcloud import (
SoundcloudEmbedIE,
SoundcloudIE,
SoundcloudSetIE,
+ SoundcloudRelatedIE,
SoundcloudUserIE,
SoundcloudTrackStationIE,
SoundcloudPlaylistIE,
@@ -1370,8 +1591,10 @@ from .streamable import StreamableIE
from .streamanity import StreamanityIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
+from .streamff import StreamFFIE
from .streetvoice import StreetVoiceIE
from .stretchinternet import StretchInternetIE
+from .stripchat import StripchatIE
from .stv import STVPlayerIE
from .sunporno import SunPornoIE
from .sverigesradio import (
@@ -1387,10 +1610,7 @@ from .svt import (
from .swrmediathek import SWRMediathekIE
from .syfy import SyfyIE
from .sztvhu import SztvHuIE
-from .tagesschau import (
- TagesschauPlayerIE,
- TagesschauIE,
-)
+from .tagesschau import TagesschauIE
from .tass import TassIE
from .tbs import TBSIE
from .tdslifeway import TDSLifewayIE
@@ -1406,12 +1626,18 @@ from .teachingchannel import TeachingChannelIE
from .teamcoco import TeamcocoIE
from .teamtreehouse import TeamTreeHouseIE
from .techtalks import TechTalksIE
-from .ted import TEDIE
+from .ted import (
+ TedEmbedIE,
+ TedPlaylistIE,
+ TedSeriesIE,
+ TedTalkIE,
+)
from .tele5 import Tele5IE
from .tele13 import Tele13IE
from .telebruxelles import TeleBruxellesIE
from .telecinco import TelecincoIE
from .telegraaf import TelegraafIE
+from .telegram import TelegramEmbedIE
from .telemb import TeleMBIE
from .telemundo import TelemundoIE
from .telequebec import (
@@ -1433,7 +1659,6 @@ from .theplatform import (
ThePlatformIE,
ThePlatformFeedIE,
)
-from .thescene import TheSceneIE
from .thestar import TheStarIE
from .thesun import TheSunIE
from .theta import (
@@ -1444,10 +1669,18 @@ from .theweatherchannel import TheWeatherChannelIE
from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE
from .thisoldhouse import ThisOldHouseIE
+from .threespeak import (
+ ThreeSpeakIE,
+ ThreeSpeakUserIE,
+)
from .threeqsdn import ThreeQSDNIE
from .tiktok import (
TikTokIE,
TikTokUserIE,
+ TikTokSoundIE,
+ TikTokEffectIE,
+ TikTokTagIE,
+ TikTokVMIE,
DouyinIE,
)
from .tinypic import TinyPicIE
@@ -1462,6 +1695,9 @@ from .toggle import (
ToggleIE,
MeWatchIE,
)
+from .toggo import (
+ ToggoIE,
+)
from .tokentube import (
TokentubeIE,
TokentubeChannelIE
@@ -1478,6 +1714,7 @@ from .trovo import (
TrovoChannelVodIE,
TrovoChannelClipIE,
)
+from .trueid import TrueIDIE
from .trunews import TruNewsIE
from .trutv import TruTVIE
from .tube8 import Tube8IE
@@ -1541,9 +1778,14 @@ from .tvnow import (
TVNowAnnualIE,
TVNowShowIE,
)
+from .tvopengr import (
+ TVOpenGrWatchIE,
+ TVOpenGrEmbedIE,
+)
from .tvp import (
TVPEmbedIE,
TVPIE,
+ TVPStreamIE,
TVPWebsiteIE,
)
from .tvplay import (
@@ -1593,6 +1835,7 @@ from .dlive import (
DLiveVODIE,
DLiveStreamIE,
)
+from .drooble import DroobleIE
from .umg import UMGDeIE
from .unistra import UnistraIE
from .unity import UnityIE
@@ -1635,6 +1878,10 @@ from .vice import (
from .vidbit import VidbitIE
from .viddler import ViddlerIE
from .videa import VideaIE
+from .videocampus_sachsen import (
+ VideocampusSachsenIE,
+ VideocampusSachsenEmbedIE,
+)
from .videodetective import VideoDetectiveIE
from .videofyme import VideofyMeIE
from .videomore import (
@@ -1667,6 +1914,10 @@ from .vimeo import (
VimeoWatchLaterIE,
VHXEmbedIE,
)
+from .vimm import (
+ VimmIE,
+ VimmRecordingIE,
+)
from .vimple import VimpleIE
from .vine import (
VineIE,
@@ -1717,7 +1968,6 @@ from .vrv import (
from .vshare import VShareIE
from .vtm import VTMIE
from .medialaan import MedialaanIE
-from .vube import VubeIE
from .vuclip import VuClipIE
from .vupload import VuploadIE
from .vvvvid import (
@@ -1732,6 +1982,11 @@ from .washingtonpost import (
WashingtonPostIE,
WashingtonPostArticleIE,
)
+from .wasdtv import (
+ WASDTVStreamIE,
+ WASDTVRecordIE,
+ WASDTVClipIE,
+)
from .wat import WatIE
from .watchbox import WatchBoxIE
from .watchindianporn import WatchIndianPornIE
@@ -1754,6 +2009,7 @@ from .weibo import (
WeiboMobileIE
)
from .weiqitv import WeiqiTVIE
+from .willow import WillowIE
from .wimtv import WimTVIE
from .whowatch import WhoWatchIE
from .wistia import (
@@ -1761,6 +2017,10 @@ from .wistia import (
WistiaPlaylistIE,
)
from .worldstarhiphop import WorldStarHipHopIE
+from .wppilot import (
+ WPPilotIE,
+ WPPilotChannelsIE,
+)
from .wsj import (
WSJIE,
WSJArticleIE,
@@ -1784,6 +2044,7 @@ from .ximalaya import (
XimalayaIE,
XimalayaAlbumIE
)
+from .xinpianchang import XinpianchangIE
from .xminus import XMinusIE
from .xnxx import XNXXIE
from .xstream import XstreamIE
@@ -1808,6 +2069,7 @@ from .yandexmusic import (
)
from .yandexvideo import (
YandexVideoIE,
+ YandexVideoPreviewIE,
ZenYandexIE,
ZenYandexChannelIE,
)
@@ -1834,11 +2096,13 @@ from .youtube import (
YoutubeFavouritesIE,
YoutubeHistoryIE,
YoutubeTabIE,
+ YoutubeLivestreamEmbedIE,
YoutubePlaylistIE,
YoutubeRecommendedIE,
YoutubeSearchDateIE,
YoutubeSearchIE,
YoutubeSearchURLIE,
+ YoutubeMusicSearchURLIE,
YoutubeSubscriptionsIE,
YoutubeTruncatedIDIE,
YoutubeTruncatedURLIE,
diff --git a/hypervideo_dl/extractor/facebook.py b/hypervideo_dl/extractor/facebook.py
index f32700f..022ea85 100644
--- a/hypervideo_dl/extractor/facebook.py
+++ b/hypervideo_dl/extractor/facebook.py
@@ -13,21 +13,26 @@ from ..compat import (
)
from ..utils import (
clean_html,
+ determine_ext,
error_to_compat_str,
ExtractorError,
float_or_none,
get_element_by_id,
+ get_first,
int_or_none,
js_to_json,
- limit_length,
merge_dicts,
network_exceptions,
parse_count,
+ parse_qs,
qualities,
sanitized_Request,
+ traverse_obj,
try_get,
+ url_or_none,
urlencode_postdata,
urljoin,
+ variadic,
)
@@ -161,7 +166,7 @@ class FacebookIE(InfoExtractor):
'info_dict': {
'id': '1417995061575415',
'ext': 'mp4',
- 'title': 'Yaroslav Korpan - Довгоочікуване відео',
+ 'title': 'Ukrainian Scientists Worldwide | Довгоочікуване відео',
'description': 'Довгоочікуване відео',
'timestamp': 1486648771,
'upload_date': '20170209',
@@ -192,8 +197,8 @@ class FacebookIE(InfoExtractor):
'info_dict': {
'id': '202882990186699',
'ext': 'mp4',
- 'title': 'Elisabeth Ahtn - Hello? Yes your uber ride is here\n* Jukin...',
- 'description': 'Hello? Yes your uber ride is here\n* Jukin Media Verified *\nFind this video and others like it by visiting...',
+ 'title': 'birb (O v O") | Hello? Yes your uber ride is here',
+ 'description': 'Hello? Yes your uber ride is here * Jukin Media Verified * Find this video and others like it by visiting...',
'timestamp': 1486035513,
'upload_date': '20170202',
'uploader': 'Elisabeth Ahtn',
@@ -324,11 +329,7 @@ class FacebookIE(InfoExtractor):
urls.append(mobj.group('url'))
return urls
- def _login(self):
- useremail, password = self._get_login_info()
- if useremail is None:
- return
-
+ def _perform_login(self, username, password):
login_page_req = sanitized_Request(self._LOGIN_URL)
self._set_cookie('facebook.com', 'locale', 'en_US')
login_page = self._download_webpage(login_page_req, None,
@@ -340,7 +341,7 @@ class FacebookIE(InfoExtractor):
lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd')
login_form = {
- 'email': useremail,
+ 'email': username,
'pass': password,
'lsd': lsd,
'lgnrnd': lgnrnd,
@@ -387,36 +388,36 @@ class FacebookIE(InfoExtractor):
self.report_warning('unable to log in: %s' % error_to_compat_str(err))
return
- def _real_initialize(self):
- self._login()
-
def _extract_from_url(self, url, video_id):
webpage = self._download_webpage(
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
def extract_metadata(webpage):
- video_title = self._html_search_regex(
- r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage,
- 'title', default=None)
- if not video_title:
- video_title = self._html_search_regex(
- r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
- webpage, 'alternative title', default=None)
- if not video_title:
- video_title = self._html_search_meta(
- ['og:title', 'twitter:title', 'description'],
- webpage, 'title', default=None)
- if video_title:
- video_title = limit_length(video_title, 80)
- else:
- video_title = 'Facebook video #%s' % video_id
- description = self._html_search_meta(
+ post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall(
+ r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)]
+ post = traverse_obj(post_data, (
+ ..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
+ media = traverse_obj(
+ post,
+ (..., 'attachments', ..., 'media', lambda _, m: str(m['id']) == video_id and m['__typename'] == 'Video'),
+ expected_type=dict)
+ title = get_first(media, ('title', 'text'))
+ description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
+ uploader_data = get_first(media, 'owner') or get_first(post, ('node', 'actors', ...)) or {}
+
+ page_title = title or self._html_search_regex((
+ r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
+ r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>',
+ self._meta_regex('og:title'), self._meta_regex('twitter:title'), r'<title>(?P<content>.+?)</title>'
+ ), webpage, 'title', default=None, group='content')
+ description = description or self._html_search_meta(
['description', 'og:description', 'twitter:description'],
webpage, 'description', default=None)
- uploader = clean_html(get_element_by_id(
- 'fbPhotoPageAuthorName', webpage)) or self._search_regex(
- r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',
- default=None) or self._og_search_title(webpage, fatal=False)
+ uploader = uploader_data.get('name') or (
+ clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
+ or self._search_regex(
+ (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False))
+
timestamp = int_or_none(self._search_regex(
r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
'timestamp', default=None))
@@ -431,17 +432,17 @@ class FacebookIE(InfoExtractor):
r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
default=None))
info_dict = {
- 'title': video_title,
'description': description,
'uploader': uploader,
+ 'uploader_id': uploader_data.get('id'),
'timestamp': timestamp,
'thumbnail': thumbnail,
'view_count': view_count,
}
+
info_json_ld = self._search_json_ld(webpage, video_id, default={})
- if info_json_ld.get('title'):
- info_json_ld['title'] = limit_length(
- re.sub(r'\s*\|\s*Facebook$', '', info_json_ld['title']), 80)
+ info_json_ld['title'] = (re.sub(r'\s*\|\s*Facebook$', '', title or info_json_ld.get('title') or page_title or '')
+ or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}')
return merge_dicts(info_json_ld, info_dict)
video_data = None
@@ -508,15 +509,19 @@ class FacebookIE(InfoExtractor):
def parse_graphql_video(video):
formats = []
q = qualities(['sd', 'hd'])
- for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]:
- playable_url = video.get('playable_url' + suffix)
+ for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
+ ('playable_url_dash', '')):
+ playable_url = video.get(key)
if not playable_url:
continue
- formats.append({
- 'format_id': format_id,
- 'quality': q(format_id),
- 'url': playable_url,
- })
+ if determine_ext(playable_url) == 'mpd':
+ formats.extend(self._extract_mpd_formats(playable_url, video_id))
+ else:
+ formats.append({
+ 'format_id': format_id,
+ 'quality': q(format_id),
+ 'url': playable_url,
+ })
extract_dash_manifest(video, formats)
process_formats(formats)
v_id = video.get('videoId') or video.get('id') or video_id
@@ -544,22 +549,15 @@ class FacebookIE(InfoExtractor):
if media.get('__typename') == 'Video':
return parse_graphql_video(media)
- nodes = data.get('nodes') or []
- node = data.get('node') or {}
- if not nodes and node:
- nodes.append(node)
- for node in nodes:
- story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {}
- attachments = try_get(story, [
- lambda x: x['attached_story']['attachments'],
- lambda x: x['attachments']
- ], list) or []
- for attachment in attachments:
- attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict)
- ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
- for n in ns:
- parse_attachment(n)
- parse_attachment(attachment)
+ nodes = variadic(traverse_obj(data, 'nodes', 'node') or [])
+ attachments = traverse_obj(nodes, (
+ ..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments',
+ ..., ('styles', 'style_type_renderer'), 'attachment'), expected_type=dict) or []
+ for attachment in attachments:
+ ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
+ for n in ns:
+ parse_attachment(n)
+ parse_attachment(attachment)
edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or []
for edge in edges:
@@ -728,6 +726,7 @@ class FacebookPluginsVideoIE(InfoExtractor):
'info_dict': {
'id': '10154383743583686',
'ext': 'mp4',
+ # TODO: Fix title, uploader
'title': 'What to do during the haze?',
'uploader': 'Gov.sg',
'upload_date': '20160826',
@@ -746,3 +745,42 @@ class FacebookPluginsVideoIE(InfoExtractor):
return self.url_result(
compat_urllib_parse_unquote(self._match_id(url)),
FacebookIE.ie_key())
+
+
+class FacebookRedirectURLIE(InfoExtractor):
+ IE_DESC = False # Do not list
+ _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/flx/warn[/?]'
+ _TESTS = [{
+ 'url': 'https://www.facebook.com/flx/warn/?h=TAQHsoToz&u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&s=1',
+ 'info_dict': {
+ 'id': 'pO8h3EaFRdo',
+ 'ext': 'mp4',
+ 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set',
+ 'description': 'md5:2d713ccbb45b686a1888397b2c77ca6b',
+ 'channel_id': 'UCGBpxWJr9FNOcFYA5GkKrMg',
+ 'playable_in_embed': True,
+ 'categories': ['Music'],
+ 'channel': 'Boiler Room',
+ 'uploader_id': 'brtvofficial',
+ 'uploader': 'Boiler Room',
+ 'tags': 'count:11',
+ 'duration': 3332,
+ 'live_status': 'not_live',
+ 'thumbnail': 'https://i.ytimg.com/vi/pO8h3EaFRdo/maxresdefault.jpg',
+ 'channel_url': 'https://www.youtube.com/channel/UCGBpxWJr9FNOcFYA5GkKrMg',
+ 'availability': 'public',
+ 'uploader_url': 'http://www.youtube.com/user/brtvofficial',
+ 'upload_date': '20150917',
+ 'age_limit': 0,
+ 'view_count': int,
+ 'like_count': int,
+ },
+ 'add_ie': ['Youtube'],
+ 'params': {'skip_download': 'Youtube'},
+ }]
+
+ def _real_extract(self, url):
+ redirect_url = url_or_none(parse_qs(url).get('u', [None])[-1])
+ if not redirect_url:
+ raise ExtractorError('Invalid facebook redirect URL', expected=True)
+ return self.url_result(redirect_url)
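
The rewritten Facebook extractor replaces hand-rolled try_get chains with traverse_obj path queries such as (..., 'attachments', ..., 'media', <filter>). A toy re-implementation of just the path features used here (string keys, `...` wildcards, callable filters) makes those queries easier to read; this is an illustrative sketch, not the actual helper, which in hypervideo_dl/utils.py also supports tuples of alternative paths, expected_type coercion and more:

    def traverse(obj, *path):
        """Walk nested dicts/lists; return the list of all matches."""
        nodes = [obj]
        for key in path:
            branched = []
            for node in nodes:
                if key is ...:  # wildcard: fan out over every child
                    if isinstance(node, dict):
                        branched.extend(node.values())
                    elif isinstance(node, list):
                        branched.extend(node)
                elif callable(key):  # filter over (key/index, value) pairs
                    if isinstance(node, dict):
                        items = node.items()
                    elif isinstance(node, list):
                        items = enumerate(node)
                    else:
                        continue
                    branched.extend(v for k, v in items if key(k, v))
                elif isinstance(node, dict) and key in node:
                    branched.append(node[key])
            nodes = branched
        return nodes

    post = {'attachments': [{'media': [
        {'id': '123', '__typename': 'Video'},
        {'id': '456', '__typename': 'Photo'}]}]}
    assert traverse(post, 'attachments', ..., 'media',
                    lambda _, m: m['__typename'] == 'Video') == [
        {'id': '123', '__typename': 'Video'}]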
diff --git a/hypervideo_dl/extractor/fancode.py b/hypervideo_dl/extractor/fancode.py
index 912feb7..7ea16c6 100644
--- a/hypervideo_dl/extractor/fancode.py
+++ b/hypervideo_dl/extractor/fancode.py
@@ -21,7 +21,6 @@ class FancodeVodIE(InfoExtractor):
'url': 'https://fancode.com/video/15043/match-preview-pbks-vs-mi',
'params': {
'skip_download': True,
- 'format': 'bestvideo'
},
'info_dict': {
'id': '6249806281001',
@@ -42,7 +41,7 @@ class FancodeVodIE(InfoExtractor):
_ACCESS_TOKEN = None
_NETRC_MACHINE = 'fancode'
- _LOGIN_HINT = 'Use "--user refresh --password <refresh_token>" to login using a refresh token'
+ _LOGIN_HINT = 'Use "--username refresh --password <refresh_token>" to login using a refresh token'
headers = {
'content-type': 'application/json',
@@ -50,30 +49,26 @@ class FancodeVodIE(InfoExtractor):
'referer': 'https://fancode.com',
}
- def _login(self):
+ def _perform_login(self, username, password):
 # Access tokens are short-lived, so get them using the refresh token.
- username, password = self._get_login_info()
- if username == 'refresh' and password is not None:
- self.report_login()
- data = '''{
- "query":"mutation RefreshToken($refreshToken: String\\u0021) { refreshToken(refreshToken: $refreshToken) { accessToken }}",
- "variables":{
- "refreshToken":"%s"
- },
- "operationName":"RefreshToken"
- }''' % password
-
- token_json = self.download_gql('refresh token', data, "Getting the Access token")
- self._ACCESS_TOKEN = try_get(token_json, lambda x: x['data']['refreshToken']['accessToken'])
- if self._ACCESS_TOKEN is None:
- self.report_warning('Failed to get Access token')
- else:
- self.headers.update({'Authorization': 'Bearer %s' % self._ACCESS_TOKEN})
- elif username is not None:
+ if username != 'refresh':
self.report_warning(f'Login using username and password is not currently supported. {self._LOGIN_HINT}')
- def _real_initialize(self):
- self._login()
+ self.report_login()
+ data = '''{
+ "query":"mutation RefreshToken($refreshToken: String\\u0021) { refreshToken(refreshToken: $refreshToken) { accessToken }}",
+ "variables":{
+ "refreshToken":"%s"
+ },
+ "operationName":"RefreshToken"
+ }''' % password
+
+ token_json = self.download_gql('refresh token', data, "Getting the Access token")
+ self._ACCESS_TOKEN = try_get(token_json, lambda x: x['data']['refreshToken']['accessToken'])
+ if self._ACCESS_TOKEN is None:
+ self.report_warning('Failed to get Access token')
+ else:
+ self.headers.update({'Authorization': 'Bearer %s' % self._ACCESS_TOKEN})
def _check_login_required(self, is_available, is_premium):
msg = None
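
The fancode change is one instance of a pattern repeated throughout this commit (also facebook, frontendmasters, funimation, gaia): the per-extractor _real_initialize/_login boilerplate moves into a _perform_login(username, password) hook that the framework invokes only when credentials exist. Schematically, with a stand-in base class (the real dispatch lives in hypervideo_dl/extractor/common.py):

    class FakeExtractor:
        def _get_login_info(self):
            # stub: the real method reads --username/--password or .netrc
            return None, None

        def initialize(self):
            username, password = self._get_login_info()
            if username is not None:
                # the hook only fires when credentials were supplied, so
                # subclasses drop their own "if username is None: return"
                self._perform_login(username, password)

        def _perform_login(self, username, password):
            pass  # overridden by FancodeVodIE, FacebookIE, ...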
diff --git a/hypervideo_dl/extractor/fc2.py b/hypervideo_dl/extractor/fc2.py
index 4d85e62..54a83aa 100644
--- a/hypervideo_dl/extractor/fc2.py
+++ b/hypervideo_dl/extractor/fc2.py
@@ -1,18 +1,23 @@
# coding: utf-8
from __future__ import unicode_literals
-import hashlib
+import re
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
- compat_urllib_request,
- compat_urlparse,
)
from ..utils import (
ExtractorError,
+ WebSocketsWrapper,
+ has_websockets,
+ js_to_json,
sanitized_Request,
+ std_headers,
+ traverse_obj,
+ update_url_query,
urlencode_postdata,
+ urljoin,
)
@@ -82,41 +87,33 @@ class FC2IE(InfoExtractor):
self._downloader.cookiejar.clear_session_cookies() # must clear
self._login()
- title = 'FC2 video %s' % video_id
- thumbnail = None
+ title, thumbnail, description = None, None, None
if webpage is not None:
- title = self._og_search_title(webpage)
+ title = self._html_search_regex(
+ (r'<h2\s+class="videoCnt_title">([^<]+?)</h2>',
+ r'\s+href="[^"]+"\s*title="([^"]+?)"\s*rel="nofollow">\s*<img',
+ # there are two matches in the webpage
+ r'\s+href="[^"]+"\s*title="([^"]+?)"\s*rel="nofollow">\s*\1'),
+ webpage,
+ 'title', fatal=False)
thumbnail = self._og_search_thumbnail(webpage)
- refer = url.replace('/content/', '/a/content/') if '/a/content/' not in url else url
-
- mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest()
-
- info_url = (
- 'http://video.fc2.com/ginfo.php?mimi={1:s}&href={2:s}&v={0:s}&fversion=WIN%2011%2C6%2C602%2C180&from=2&otag=0&upid={0:s}&tk=null&'.
- format(video_id, mimi, compat_urllib_request.quote(refer, safe=b'').replace('.', '%2E')))
-
- info_webpage = self._download_webpage(
- info_url, video_id, note='Downloading info page')
- info = compat_urlparse.parse_qs(info_webpage)
-
- if 'err_code' in info:
- # most of the time we can still download wideo even if err_code is 403 or 602
- self.report_warning(
- 'Error code was: %s... but still trying' % info['err_code'][0])
+ description = self._og_search_description(webpage, default=None)
- if 'filepath' not in info:
- raise ExtractorError('Cannot download file. Are you logged in?')
-
- video_url = info['filepath'][0] + '?mid=' + info['mid'][0]
- title_info = info.get('title')
- if title_info:
- title = title_info[0]
+ vidplaylist = self._download_json(
+ 'https://video.fc2.com/api/v3/videoplaylist/%s?sh=1&fs=0' % video_id, video_id,
+ note='Downloading info page')
+ vid_url = traverse_obj(vidplaylist, ('playlist', 'nq'))
+ if not vid_url:
+ raise ExtractorError('Unable to extract video URL')
+ vid_url = urljoin('https://video.fc2.com/', vid_url)
return {
'id': video_id,
'title': title,
- 'url': video_url,
- 'ext': 'flv',
+ 'url': vid_url,
+ 'ext': 'mp4',
+ 'protocol': 'm3u8_native',
+ 'description': description,
'thumbnail': thumbnail,
}
@@ -157,3 +154,145 @@ class FC2EmbedIE(InfoExtractor):
'title': title,
'thumbnail': thumbnail,
}
+
+
+class FC2LiveIE(InfoExtractor):
+ _VALID_URL = r'https?://live\.fc2\.com/(?P<id>\d+)'
+ IE_NAME = 'fc2:live'
+
+ _TESTS = [{
+ 'url': 'https://live.fc2.com/57892267/',
+ 'info_dict': {
+ 'id': '57892267',
+ 'title': 'どこまで・・・',
+ 'uploader': 'あつあげ',
+ 'uploader_id': '57892267',
+ 'thumbnail': r're:https?://.+fc2.+',
+ },
+ 'skip': 'livestream',
+ }]
+
+ def _real_extract(self, url):
+ if not has_websockets:
+ raise ExtractorError('websockets library is not available. Please install it.', expected=True)
+ video_id = self._match_id(url)
+ webpage = self._download_webpage('https://live.fc2.com/%s/' % video_id, video_id)
+
+ self._set_cookie('live.fc2.com', 'js-player_size', '1')
+
+ member_api = self._download_json(
+ 'https://live.fc2.com/api/memberApi.php', video_id, data=urlencode_postdata({
+ 'channel': '1',
+ 'profile': '1',
+ 'user': '1',
+ 'streamid': video_id
+ }), note='Requesting member info')
+
+ control_server = self._download_json(
+ 'https://live.fc2.com/api/getControlServer.php', video_id, note='Downloading ControlServer data',
+ data=urlencode_postdata({
+ 'channel_id': video_id,
+ 'mode': 'play',
+ 'orz': '',
+ 'channel_version': member_api['data']['channel_data']['version'],
+ 'client_version': '2.1.0\n [1]',
+ 'client_type': 'pc',
+ 'client_app': 'browser_hls',
+ 'ipv6': '',
+ }), headers={'X-Requested-With': 'XMLHttpRequest'})
+ self._set_cookie('live.fc2.com', 'l_ortkn', control_server['orz_raw'])
+
+ ws_url = update_url_query(control_server['url'], {'control_token': control_server['control_token']})
+ playlist_data = None
+
+ self.to_screen('%s: Fetching HLS playlist info via WebSocket' % video_id)
+ ws = WebSocketsWrapper(ws_url, {
+ 'Cookie': str(self._get_cookies('https://live.fc2.com/'))[12:],
+ 'Origin': 'https://live.fc2.com',
+ 'Accept': '*/*',
+ 'User-Agent': std_headers['User-Agent'],
+ })
+
+ self.write_debug('Sending HLS server request')
+
+ while True:
+ recv = ws.recv()
+ if not recv:
+ continue
+ data = self._parse_json(recv, video_id, fatal=False)
+ if not data or not isinstance(data, dict):
+ continue
+
+ if data.get('name') == 'connect_complete':
+ break
+ ws.send(r'{"name":"get_hls_information","arguments":{},"id":1}')
+
+ while True:
+ recv = ws.recv()
+ if not recv:
+ continue
+ data = self._parse_json(recv, video_id, fatal=False)
+ if not data or not isinstance(data, dict):
+ continue
+ if data.get('name') == '_response_' and data.get('id') == 1:
+ self.write_debug('Goodbye.')
+ playlist_data = data
+ break
+ elif self._downloader.params.get('verbose', False):
+ if len(recv) > 100:
+ recv = recv[:100] + '...'
+ self.to_screen('[debug] Server said: %s' % recv)
+
+ if not playlist_data:
+ raise ExtractorError('Unable to fetch HLS playlist info via WebSocket')
+
+ formats = []
+ for name, playlists in playlist_data['arguments'].items():
+ if not isinstance(playlists, list):
+ continue
+ for pl in playlists:
+ if pl.get('status') == 0 and 'master_playlist' in (pl.get('url') or ''):
+ formats.extend(self._extract_m3u8_formats(
+ pl['url'], video_id, ext='mp4', m3u8_id=name, live=True,
+ headers={
+ 'Origin': 'https://live.fc2.com',
+ 'Referer': url,
+ }))
+
+ self._sort_formats(formats)
+ for fmt in formats:
+ fmt.update({
+ 'protocol': 'fc2_live',
+ 'ws': ws,
+ })
+
+ title = self._html_search_meta(('og:title', 'twitter:title'), webpage, 'live title', fatal=False)
+ if not title:
+ title = self._html_extract_title(webpage, 'html title', fatal=False)
+ if title:
+ # remove service name in <title>
+ title = re.sub(r'\s+-\s+.+$', '', title)
+ uploader = None
+ if title:
+ match = self._search_regex(r'^(.+?)\s*\[(.+?)\]$', title, 'title and uploader', default=None, group=(1, 2))
+ if match and all(match):
+ title, uploader = match
+
+ live_info_view = self._search_regex(r'(?s)liveInfoView\s*:\s*({.+?}),\s*premiumStateView', webpage, 'user info', fatal=False) or None
+ if live_info_view:
+ # remove jQuery code from object literal
+ live_info_view = re.sub(r'\$\(.+?\)[^,]+,', '"",', live_info_view)
+ live_info_view = self._parse_json(js_to_json(live_info_view), video_id)
+
+ return {
+ 'id': video_id,
+ 'title': title or traverse_obj(live_info_view, 'title'),
+ 'description': self._html_search_meta(
+ ('og:description', 'twitter:description'),
+ webpage, 'live description', fatal=False) or traverse_obj(live_info_view, 'info'),
+ 'formats': formats,
+ 'uploader': uploader or traverse_obj(live_info_view, 'name'),
+ 'uploader_id': video_id,
+ 'thumbnail': traverse_obj(live_info_view, 'thumb'),
+ 'is_live': True,
+ }
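
The new FC2LiveIE drives a JSON-over-WebSocket handshake: wait for connect_complete, send get_hls_information, then collect the _response_ message with the matching id. A minimal standalone sketch of the same exchange using the websockets package (assumes ws_url was already built from getControlServer.php as above; error handling and the empty-message guard are omitted):

    import asyncio
    import json

    import websockets  # the third-party package the extractor checks for

    async def fetch_hls_info(ws_url):
        async with websockets.connect(ws_url, origin='https://live.fc2.com') as ws:
            # phase 1: drain messages until the server signals readiness
            while json.loads(await ws.recv()).get('name') != 'connect_complete':
                pass
            # phase 2: request playlists and wait for the matching response
            await ws.send(json.dumps(
                {'name': 'get_hls_information', 'arguments': {}, 'id': 1}))
            while True:
                data = json.loads(await ws.recv())
                if data.get('name') == '_response_' and data.get('id') == 1:
                    return data['arguments']  # per-quality playlist lists

    # playlists = asyncio.run(fetch_hls_info(ws_url))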
diff --git a/hypervideo_dl/extractor/filmon.py b/hypervideo_dl/extractor/filmon.py
index f775fe0..7b43ecc 100644
--- a/hypervideo_dl/extractor/filmon.py
+++ b/hypervideo_dl/extractor/filmon.py
@@ -170,7 +170,7 @@ class FilmOnChannelIE(InfoExtractor):
return {
'id': channel_id,
'display_id': channel_data.get('alias'),
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'description': channel_data.get('description'),
'thumbnails': thumbnails,
'formats': formats,
diff --git a/hypervideo_dl/extractor/fivetv.py b/hypervideo_dl/extractor/fivetv.py
index be81fcc..d6bebd1 100644
--- a/hypervideo_dl/extractor/fivetv.py
+++ b/hypervideo_dl/extractor/fivetv.py
@@ -75,8 +75,7 @@ class FiveTVIE(InfoExtractor):
r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'],
webpage, 'video url')
- title = self._og_search_title(webpage, default=None) or self._search_regex(
- r'<title>([^<]+)</title>', webpage, 'title')
+ title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage)
duration = int_or_none(self._og_search_property(
'video:duration', webpage, 'duration', default=None))
diff --git a/hypervideo_dl/extractor/flickr.py b/hypervideo_dl/extractor/flickr.py
index 6c82fae..2ed6c2b 100644
--- a/hypervideo_dl/extractor/flickr.py
+++ b/hypervideo_dl/extractor/flickr.py
@@ -7,6 +7,7 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ format_field,
int_or_none,
qualities,
)
@@ -95,7 +96,7 @@ class FlickrIE(InfoExtractor):
owner = video_info.get('owner', {})
uploader_id = owner.get('nsid')
uploader_path = owner.get('path_alias') or uploader_id
- uploader_url = 'https://www.flickr.com/photos/%s/' % uploader_path if uploader_path else None
+ uploader_url = format_field(uploader_path, template='https://www.flickr.com/photos/%s/')
return {
'id': video_id,
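
format_field collapses the previous "template % x if x else None" ternary into one call. Its core behaviour, stripped of the real helper's extra parameters, amounts to this simplified re-implementation:

    def format_field(variable, template='%s', default=None):
        # illustrative subset of hypervideo_dl.utils.format_field
        return template % variable if variable else default

    assert format_field('jane', template='https://www.flickr.com/photos/%s/') \
        == 'https://www.flickr.com/photos/jane/'
    assert format_field(None, template='https://www.flickr.com/photos/%s/') is None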
diff --git a/hypervideo_dl/extractor/fox.py b/hypervideo_dl/extractor/fox.py
index 04f4bdb..4c52b9a 100644
--- a/hypervideo_dl/extractor/fox.py
+++ b/hypervideo_dl/extractor/fox.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
import json
import uuid
-from .adobepass import AdobePassIE
+from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
compat_str,
@@ -20,7 +20,7 @@ from ..utils import (
)
-class FOXIE(AdobePassIE):
+class FOXIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)'
_TESTS = [{
# clip
@@ -37,6 +37,7 @@ class FOXIE(AdobePassIE):
'creator': 'FOX',
'series': 'Gotham',
'age_limit': 14,
+ 'episode': 'Aftermath: Bruce Wayne Develops Into The Dark Knight'
},
'params': {
'skip_download': True,
@@ -46,14 +47,15 @@ class FOXIE(AdobePassIE):
'url': 'https://www.fox.com/watch/087036ca7f33c8eb79b08152b4dd75c1/',
'only_matching': True,
}, {
- # episode, geo-restricted, tv provided required
- 'url': 'https://www.fox.com/watch/30056b295fb57f7452aeeb4920bc3024/',
+ # sports event, geo-restricted
+ 'url': 'https://www.fox.com/watch/b057484dade738d1f373b3e46216fa2c/',
'only_matching': True,
}]
_GEO_BYPASS = False
_HOME_PAGE_URL = 'https://www.fox.com/'
- _API_KEY = 'abdcbed02c124d393b39e818a4312055'
+ _API_KEY = '6E9S4bmcoNnZwVLOHywOv8PJEdu76cM9'
_access_token = None
+ _device_id = compat_str(uuid.uuid4())
def _call_api(self, path, video_id, data=None):
headers = {
@@ -63,7 +65,7 @@ class FOXIE(AdobePassIE):
headers['Authorization'] = 'Bearer ' + self._access_token
try:
return self._download_json(
- 'https://api2.fox.com/v2.0/' + path,
+ 'https://api3.fox.com/v2.0/' + path,
video_id, data=data, headers=headers)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
@@ -87,16 +89,37 @@ class FOXIE(AdobePassIE):
if not self._access_token:
self._access_token = self._call_api(
'login', None, json.dumps({
- 'deviceId': compat_str(uuid.uuid4()),
+ 'deviceId': self._device_id,
}).encode())['accessToken']
def _real_extract(self, url):
video_id = self._match_id(url)
- video = self._call_api('vodplayer/' + video_id, video_id)
+ self._access_token = self._call_api(
+ 'previewpassmvpd?device_id=%s&mvpd_id=TempPass_fbcfox_60min' % self._device_id,
+ video_id)['accessToken']
+
+ video = self._call_api('watch', video_id, data=json.dumps({
+ 'capabilities': ['drm/widevine', 'fsdk/yo'],
+ 'deviceWidth': 1280,
+ 'deviceHeight': 720,
+ 'maxRes': '720p',
+ 'os': 'macos',
+ 'osv': '',
+ 'provider': {
+ 'freewheel': {'did': self._device_id},
+ 'vdms': {'rays': ''},
+ 'dmp': {'kuid': '', 'seg': ''}
+ },
+ 'playlist': '',
+ 'privacy': {'us': '1---'},
+ 'siteSection': '',
+ 'streamType': 'vod',
+ 'streamId': video_id}).encode('utf-8'))
title = video['name']
release_url = video['url']
+
try:
m3u8_url = self._download_json(release_url, video_id)['playURL']
except ExtractorError as e:
diff --git a/hypervideo_dl/extractor/foxgay.py b/hypervideo_dl/extractor/foxgay.py
index 512a106..1c53e06 100644
--- a/hypervideo_dl/extractor/foxgay.py
+++ b/hypervideo_dl/extractor/foxgay.py
@@ -29,8 +29,7 @@ class FoxgayIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = remove_end(self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title'), ' - Foxgay.com')
+ title = remove_end(self._html_extract_title(webpage), ' - Foxgay.com')
description = get_element_by_id('inf_tit', webpage)
# The default user-agent with foxgay cookies leads to pages without videos
diff --git a/hypervideo_dl/extractor/fptplay.py b/hypervideo_dl/extractor/fptplay.py
new file mode 100644
index 0000000..a34e90b
--- /dev/null
+++ b/hypervideo_dl/extractor/fptplay.py
@@ -0,0 +1,102 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import time
+import urllib.parse
+
+from .common import InfoExtractor
+from ..utils import (
+ join_nonempty,
+)
+
+
+class FptplayIE(InfoExtractor):
+ _VALID_URL = r'https?://fptplay\.vn/(?P<type>xem-video)/[^/]+\-(?P<id>\w+)(?:/tap-(?P<episode>[^/]+)?/?(?:[?#]|$)|)'
+ _GEO_COUNTRIES = ['VN']
+ IE_NAME = 'fptplay'
+ IE_DESC = 'fptplay.vn'
+ _TESTS = [{
+ 'url': 'https://fptplay.vn/xem-video/nhan-duyen-dai-nhan-xin-dung-buoc-621a123016f369ebbde55945',
+ 'md5': 'ca0ee9bc63446c0c3e9a90186f7d6b33',
+ 'info_dict': {
+ 'id': '621a123016f369ebbde55945',
+ 'ext': 'mp4',
+ 'title': 'Nhân Duyên Đại Nhân Xin Dừng Bước - Ms. Cupid In Love',
+ 'description': 'md5:23cf7d1ce0ade8e21e76ae482e6a8c6c',
+ },
+ }, {
+ 'url': 'https://fptplay.vn/xem-video/ma-toi-la-dai-gia-61f3aa8a6b3b1d2e73c60eb5/tap-3',
+ 'md5': 'b35be968c909b3e4e1e20ca45dd261b1',
+ 'info_dict': {
+ 'id': '61f3aa8a6b3b1d2e73c60eb5',
+ 'ext': 'mp4',
+ 'title': 'Má Tôi Là Đại Gia - 3',
+ 'description': 'md5:ff8ba62fb6e98ef8875c42edff641d1c',
+ },
+ }, {
+ 'url': 'https://fptplay.vn/xem-video/nha-co-chuyen-hi-alls-well-ends-well-1997-6218995f6af792ee370459f0',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ type_url, video_id, episode = self._match_valid_url(url).group('type', 'id', 'episode')
+ webpage = self._download_webpage(url, video_id=video_id, fatal=False)
+ info = self._download_json(self.get_api_with_st_token(video_id, episode or 0), video_id)
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(info['data']['url'], video_id, 'mp4')
+ self._sort_formats(formats)
+ return {
+ 'id': video_id,
+ 'title': join_nonempty(
+ self._html_search_meta(('og:title', 'twitter:title'), webpage), episode, delim=' - '),
+ 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+ def get_api_with_st_token(self, video_id, episode):
+ path = f'/api/v6.2_w/stream/vod/{video_id}/{episode}/auto_vip'
+ timestamp = int(time.time()) + 10800
+
+ t = hashlib.md5(f'WEBv6Dkdsad90dasdjlALDDDS{timestamp}{path}'.encode()).hexdigest().upper()
+ r = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
+ n = [int(f'0x{t[2 * o: 2 * o + 2]}', 16) for o in range(len(t) // 2)]
+
+ def convert(e):
+ t = ''
+ n = 0
+ i = [0, 0, 0]
+ a = [0, 0, 0, 0]
+ s = len(e)
+ c = 0
+ for z in range(s, 0, -1):
+ if n <= 3:
+ i[n] = e[c]
+ n += 1
+ c += 1
+ if 3 == n:
+ a[0] = (252 & i[0]) >> 2
+ a[1] = ((3 & i[0]) << 4) + ((240 & i[1]) >> 4)
+ a[2] = ((15 & i[1]) << 2) + ((192 & i[2]) >> 6)
+ a[3] = (63 & i[2])
+ for v in range(4):
+ t += r[a[v]]
+ n = 0
+ if n:
+ for o in range(n, 3):
+ i[o] = 0
+
+ for o in range(n + 1):
+ a[0] = (252 & i[0]) >> 2
+ a[1] = ((3 & i[0]) << 4) + ((240 & i[1]) >> 4)
+ a[2] = ((15 & i[1]) << 2) + ((192 & i[2]) >> 6)
+ a[3] = (63 & i[2])
+ t += r[a[o]]
+ n += 1
+ while n < 3:
+ t += ''
+ n += 1
+ return t
+
+ st_token = convert(n).replace('+', '-').replace('/', '_').replace('=', '')
+ return f'https://api.fptplay.net{path}?{urllib.parse.urlencode({"st": st_token, "e": timestamp})}'
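
Read closely, convert() is a transliterated JavaScript base64 encoder over the MD5 digest bytes, and the trailing replace() calls turn the standard alphabet into the URL-safe one with padding stripped. If that reading is right, the whole token derivation collapses to a few standard-library calls; the equivalence is inferred from the code above, not from any FPT Play documentation:

    import base64
    import hashlib
    import time
    import urllib.parse

    def get_api_with_st_token(video_id, episode=0):
        path = f'/api/v6.2_w/stream/vod/{video_id}/{episode}/auto_vip'
        timestamp = int(time.time()) + 10800
        digest = hashlib.md5(
            f'WEBv6Dkdsad90dasdjlALDDDS{timestamp}{path}'.encode()).digest()
        # urlsafe_b64encode already maps '+'/'/' to '-'/'_'; drop '=' padding
        st = base64.urlsafe_b64encode(digest).decode().rstrip('=')
        return f'https://api.fptplay.net{path}?' + urllib.parse.urlencode(
            {'st': st, 'e': timestamp})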
diff --git a/hypervideo_dl/extractor/franceculture.py b/hypervideo_dl/extractor/franceculture.py
index 14f4cb4..9dc28d8 100644
--- a/hypervideo_dl/extractor/franceculture.py
+++ b/hypervideo_dl/extractor/franceculture.py
@@ -1,18 +1,45 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
extract_attributes,
int_or_none,
+ traverse_obj,
+ unified_strdate,
)
class FranceCultureIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
- 'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks',
+ # playlist
+ 'url': 'https://www.franceculture.fr/emissions/serie/hasta-dente',
+ 'playlist_count': 12,
+ 'info_dict': {
+ 'id': 'hasta-dente',
+ 'title': 'Hasta Dente',
+ 'description': 'md5:57479af50648d14e9bb649e6b1f8f911',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'upload_date': '20201024',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '3c1c2e55-41a0-11e5-9fe0-005056a87c89',
+ 'ext': 'mp3',
+ 'title': 'Jeudi, vous avez dit bizarre ?',
+ 'description': 'md5:47cf1e00cc21c86b0210279996a812c6',
+ 'duration': 604,
+ 'upload_date': '20201024',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'timestamp': 1603576680
+ },
+ },
+ ],
+ }, {
+ 'url': 'https://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks',
'info_dict': {
'id': 'rendez-vous-au-pays-des-geeks',
'display_id': 'rendez-vous-au-pays-des-geeks',
@@ -20,9 +47,9 @@ class FranceCultureIE(InfoExtractor):
'title': 'Rendez-vous au pays des geeks',
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20140301',
- 'timestamp': 1393700400,
'vcodec': 'none',
- }
+ 'duration': 3569,
+ },
}, {
# no thumbnail
'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018',
@@ -31,9 +58,54 @@ class FranceCultureIE(InfoExtractor):
def _real_extract(self, url):
display_id = self._match_id(url)
-
webpage = self._download_webpage(url, display_id)
+ info = {
+ 'id': display_id,
+ 'title': self._html_search_regex(
+ r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
+ webpage, 'title', default=self._og_search_title(webpage)),
+ 'description': self._html_search_regex(
+ r'(?s)<div[^>]+class="excerpt"[^>]*>(.*?)</div>', webpage, 'description', default=None),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'uploader': self._html_search_regex(
+ r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None),
+ 'upload_date': unified_strdate(self._html_search_regex(
+ r'(?s)class="teaser-text-date".*?(\d{2}/\d{2}/\d{4})', webpage, 'date', default=None)),
+ }
+
+ playlist_data = self._search_regex(
+ r'''(?sx)
+ <section[^>]+data-xiti-place="[^"]*?liste_episodes[^"?]*?"[^>]*>
+ (.*?)
+ </section>
+ ''',
+ webpage, 'playlist data', fatal=False, default=None)
+
+ if playlist_data:
+ entries = []
+ for item, item_description in re.findall(
+ r'(?s)(<button[^<]*class="[^"]*replay-button[^>]*>).*?<p[^>]*class="[^"]*teaser-text-chapo[^>]*>(.*?)</p>',
+ playlist_data):
+
+ item_attributes = extract_attributes(item)
+ entries.append({
+ 'id': item_attributes.get('data-emission-uuid'),
+ 'url': item_attributes.get('data-url'),
+ 'title': item_attributes.get('data-diffusion-title'),
+ 'duration': int_or_none(traverse_obj(item_attributes, 'data-duration-seconds')),
+ 'description': item_description,
+ 'timestamp': int_or_none(item_attributes.get('data-start-time')),
+ 'thumbnail': info['thumbnail'],
+ 'uploader': info['uploader'],
+ })
+
+ return {
+ '_type': 'playlist',
+ 'entries': entries,
+ **info
+ }
+
video_data = extract_attributes(self._search_regex(
r'''(?sx)
(?:
@@ -43,31 +115,14 @@ class FranceCultureIE(InfoExtractor):
(<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>)
''',
webpage, 'video data'))
-
- video_url = video_data.get('data-url') or video_data['data-asset-source']
- title = video_data.get('data-asset-title') or video_data.get('data-diffusion-title') or self._og_search_title(webpage)
-
- description = self._html_search_regex(
- r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>',
- webpage, 'description', default=None)
- thumbnail = self._search_regex(
- r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"',
- webpage, 'thumbnail', default=None)
- uploader = self._html_search_regex(
- r'(?s)<span class="author">(.*?)</span>',
- webpage, 'uploader', default=None)
+ video_url = traverse_obj(video_data, 'data-url', 'data-asset-source')
ext = determine_ext(video_url.lower())
return {
- 'id': display_id,
'display_id': display_id,
'url': video_url,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
'ext': ext,
'vcodec': 'none' if ext == 'mp3' else None,
- 'uploader': uploader,
- 'timestamp': int_or_none(video_data.get('data-start-time')) or int_or_none(video_data.get('data-asset-created-date')),
'duration': int_or_none(video_data.get('data-duration')),
+ **info
}
diff --git a/hypervideo_dl/extractor/francetv.py b/hypervideo_dl/extractor/francetv.py
index 3bbab69..347a766 100644
--- a/hypervideo_dl/extractor/francetv.py
+++ b/hypervideo_dl/extractor/francetv.py
@@ -185,9 +185,9 @@ class FranceTVIE(InfoExtractor):
'vcodec': 'none',
'ext': 'mhtml',
'protocol': 'mhtml',
- 'url': 'about:dummy',
+ 'url': 'about:invalid',
'fragments': [{
- 'path': sheet,
+ 'url': sheet,
# XXX: not entirely accurate; each spritesheet seems to be
# a 10×10 grid of thumbnails corresponding to approximately
# 2 seconds of the video; the last spritesheet may be shorter
@@ -203,7 +203,7 @@ class FranceTVIE(InfoExtractor):
return {
'id': video_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'thumbnail': image,
'duration': duration,
'timestamp': timestamp,
diff --git a/hypervideo_dl/extractor/frontendmasters.py b/hypervideo_dl/extractor/frontendmasters.py
index 40b8cb0..fc67a84 100644
--- a/hypervideo_dl/extractor/frontendmasters.py
+++ b/hypervideo_dl/extractor/frontendmasters.py
@@ -28,14 +28,7 @@ class FrontendMastersBaseIE(InfoExtractor):
'high': {'width': 1920, 'height': 1080}
}
- def _real_initialize(self):
- self._login()
-
- def _login(self):
- (username, password) = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
@@ -252,9 +245,9 @@ class FrontendMastersCourseIE(FrontendMastersPageBaseIE):
entries = []
for lesson in lessons:
lesson_name = lesson.get('slug')
- if not lesson_name:
- continue
lesson_id = lesson.get('hash') or lesson.get('statsId')
+ if not lesson_id or not lesson_name:
+ continue
entries.append(self._extract_lesson(chapters, lesson_id, lesson))
title = course.get('title')
diff --git a/hypervideo_dl/extractor/fujitv.py b/hypervideo_dl/extractor/fujitv.py
index a02a943..4fdfe12 100644
--- a/hypervideo_dl/extractor/fujitv.py
+++ b/hypervideo_dl/extractor/fujitv.py
@@ -1,35 +1,73 @@
# coding: utf-8
from __future__ import unicode_literals
-
+from ..utils import HEADRequest
from .common import InfoExtractor
class FujiTVFODPlus7IE(InfoExtractor):
- _VALID_URL = r'https?://i\.fod\.fujitv\.co\.jp/plus7/web/[0-9a-z]{4}/(?P<id>[0-9a-z]+)'
- _BASE_URL = 'http://i.fod.fujitv.co.jp/'
+ _VALID_URL = r'https?://fod\.fujitv\.co\.jp/title/(?P<sid>[0-9a-z]{4})/(?P<id>[0-9a-z]+)'
+ _BASE_URL = 'https://i.fod.fujitv.co.jp/'
_BITRATE_MAP = {
300: (320, 180),
800: (640, 360),
1200: (1280, 720),
2000: (1280, 720),
+ 4000: (1920, 1080),
}
+ _TESTS = [{
+ 'url': 'https://fod.fujitv.co.jp/title/5d40/5d40110076',
+ 'info_dict': {
+ 'id': '5d40110076',
+ 'ext': 'mp4',
+ 'title': '#1318 『まる子、まぼろしの洋館を見る』の巻',
+ 'series': 'ちびまる子ちゃん',
+ 'series_id': '5d40',
+ 'description': 'md5:b3f51dbfdda162ac4f789e0ff4d65750',
+ 'thumbnail': 'https://i.fod.fujitv.co.jp/img/program/5d40/episode/5d40110076_a.jpg',
+ },
+ }, {
+ 'url': 'https://fod.fujitv.co.jp/title/5d40/5d40810083',
+ 'info_dict': {
+ 'id': '5d40810083',
+ 'ext': 'mp4',
+ 'title': '#1324 『まる子とオニの子』の巻/『結成!2月をムダにしない会』の巻',
+ 'description': 'md5:3972d900b896adc8ab1849e310507efa',
+ 'series': 'ちびまる子ちゃん',
+ 'series_id': '5d40',
+ 'thumbnail': 'https://i.fod.fujitv.co.jp/img/program/5d40/episode/5d40810083_a.jpg'},
+ 'skip': 'Video only available for one week'
+ }]
+
def _real_extract(self, url):
- video_id = self._match_id(url)
- formats = self._extract_m3u8_formats(
- self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id, 'mp4')
- for f in formats:
- wh = self._BITRATE_MAP.get(f.get('tbr'))
- if wh:
- f.update({
- 'width': wh[0],
- 'height': wh[1],
- })
- self._sort_formats(formats)
+ series_id, video_id = self._match_valid_url(url).groups()
+ self._request_webpage(HEADRequest(url), video_id)
+ json_info = {}
+ token = self._get_cookies(url).get('CT')
+ if token:
+ json_info = self._download_json('https://fod-sp.fujitv.co.jp/apps/api/episode/detail/?ep_id=%s&is_premium=false' % video_id, video_id, headers={'x-authorization': f'Bearer {token.value}'}, fatal=False)
+ else:
+ self.report_warning(f'The token cookie is needed to extract video metadata. {self._LOGIN_HINTS["cookies"]}')
+ formats, subtitles = [], {}
+ src_json = self._download_json(f'{self._BASE_URL}abrjson_v2/tv_android/{video_id}', video_id)
+ for src in src_json['video_selector']:
+ if not src.get('url'):
+ continue
+ fmt, subs = self._extract_m3u8_formats_and_subtitles(src['url'], video_id, 'mp4')
+ for f in fmt:
+ f.update(dict(zip(('width', 'height'),
+ self._BITRATE_MAP.get(f.get('tbr'), ()))))
+ formats.extend(fmt)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ self._sort_formats(formats, ['tbr'])
return {
'id': video_id,
- 'title': video_id,
+ 'title': json_info.get('ep_title'),
+ 'series': json_info.get('lu_title'),
+ 'series_id': series_id,
+ 'description': json_info.get('ep_description'),
'formats': formats,
- 'thumbnail': self._BASE_URL + 'pc/image/wbtn/wbtn_%s.jpg' % video_id,
+ 'subtitles': subtitles,
+ 'thumbnail': f'{self._BASE_URL}img/program/{series_id}/episode/{video_id}_a.jpg',
}
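
Note that _BITRATE_MAP values are (width, height) pairs, matching how the old code assigned wh[0] to width. The dict(zip(...)) update also degrades gracefully: for a bitrate missing from the map, the default () makes zip() produce nothing and update() becomes a no-op. A quick illustration:

    _BITRATE_MAP = {2000: (1280, 720)}  # tbr -> (width, height)

    f = {'tbr': 2000}
    f.update(dict(zip(('width', 'height'), _BITRATE_MAP.get(f['tbr'], ()))))
    assert f == {'tbr': 2000, 'width': 1280, 'height': 720}

    f = {'tbr': 999}  # unknown bitrate: nothing to zip, dict unchanged
    f.update(dict(zip(('width', 'height'), _BITRATE_MAP.get(f['tbr'], ()))))
    assert f == {'tbr': 999}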
diff --git a/hypervideo_dl/extractor/funimation.py b/hypervideo_dl/extractor/funimation.py
index 382cbe1..6aa9bc9 100644
--- a/hypervideo_dl/extractor/funimation.py
+++ b/hypervideo_dl/extractor/funimation.py
@@ -10,6 +10,7 @@ from ..compat import compat_HTTPError
from ..utils import (
determine_ext,
int_or_none,
+ join_nonempty,
js_to_json,
orderedSet,
qualities,
@@ -35,9 +36,8 @@ class FunimationBaseIE(InfoExtractor):
note='Checking geo-location', errnote='Unable to fetch geo-location information'),
'region') or 'US'
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
+ def _perform_login(self, username, password):
+ if self._TOKEN:
return
try:
data = self._download_json(
@@ -46,7 +46,7 @@ class FunimationBaseIE(InfoExtractor):
'username': username,
'password': password,
}))
- return data['token']
+ FunimationBaseIE._TOKEN = data['token']
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
error = self._parse_json(e.cause.read().decode(), None)['error']
@@ -89,8 +89,6 @@ class FunimationPageIE(FunimationBaseIE):
def _real_initialize(self):
if not self._REGION:
FunimationBaseIE._REGION = self._get_region()
- if not self._TOKEN:
- FunimationBaseIE._TOKEN = self._login()
def _real_extract(self, url):
locale, show, episode = self._match_valid_url(url).group('lang', 'show', 'episode')
@@ -153,10 +151,6 @@ class FunimationIE(FunimationBaseIE):
},
}]
- def _real_initialize(self):
- if not self._TOKEN:
- FunimationBaseIE._TOKEN = self._login()
-
@staticmethod
def _get_experiences(episode):
for lang, lang_data in episode.get('languages', {}).items():
@@ -275,7 +269,7 @@ class FunimationIE(FunimationBaseIE):
def _get_subtitles(self, subtitles, experience_id, episode, display_id, format_name):
if isinstance(episode, str):
webpage = self._download_webpage(
- f'https://www.funimation.com/player/{experience_id}', display_id,
+ f'https://www.funimation.com/player/{experience_id}/', display_id,
fatal=False, note=f'Downloading player webpage for {format_name}')
episode, _, _ = self._get_episode(webpage, episode_id=episode, fatal=False)
@@ -288,10 +282,11 @@ class FunimationIE(FunimationBaseIE):
sub_type = sub_type if sub_type != 'FULL' else None
current_sub = {
'url': text_track['src'],
- 'name': ' '.join(filter(None, (version, text_track.get('label'), sub_type)))
+ 'name': join_nonempty(version, text_track.get('label'), sub_type, delim=' ')
}
- lang = '_'.join(filter(None, (
- text_track.get('language', 'und'), version if version != 'Simulcast' else None, sub_type)))
+ lang = join_nonempty(text_track.get('language', 'und'),
+ version if version != 'Simulcast' else None,
+ sub_type, delim='_')
if current_sub not in subtitles.get(lang, []):
subtitles.setdefault(lang, []).append(current_sub)
return subtitles
@@ -338,7 +333,7 @@ class FunimationShowIE(FunimationBaseIE):
'https://prod-api-funimationnow.dadcdigital.com/api/funimation/episodes/?limit=99999&title_id=%s'
% show_info.get('id'), display_id)
- vod_items = traverse_obj(items_info, ('items', ..., re.compile('(?i)mostRecent[AS]vod').match, 'item'))
+ vod_items = traverse_obj(items_info, ('items', ..., lambda k, _: re.match(r'(?i)mostRecent[AS]vod', k), 'item'))
return {
'_type': 'playlist',
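
join_nonempty, adopted twice in the subtitle code above, joins only the truthy pieces; it is a named replacement for the delim.join(filter(None, ...)) idiom it removes. Simplified core of the helper:

    def join_nonempty(*values, delim='-'):
        # illustrative subset of hypervideo_dl.utils.join_nonempty
        return delim.join(str(v) for v in values if v)

    assert join_nonempty('Simulcast', 'English', None, delim=' ') == 'Simulcast English'
    assert join_nonempty('ja', None, 'FULL', delim='_') == 'ja_FULL'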
diff --git a/hypervideo_dl/extractor/funk.py b/hypervideo_dl/extractor/funk.py
index e5e3260..2c5cfe8 100644
--- a/hypervideo_dl/extractor/funk.py
+++ b/hypervideo_dl/extractor/funk.py
@@ -11,7 +11,7 @@ from ..utils import (
class FunkIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?funk\.net/(?:channel|playlist)/[^/]+/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.|origin\.)?funk\.net/(?:channel|playlist)/[^/]+/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.funk.net/channel/ba-793/die-lustigsten-instrumente-aus-dem-internet-teil-2-1155821',
'md5': '8dd9d9ab59b4aa4173b3197f2ea48e81',
diff --git a/hypervideo_dl/extractor/gab.py b/hypervideo_dl/extractor/gab.py
index 25b5cb0..9ba0b1c 100644
--- a/hypervideo_dl/extractor/gab.py
+++ b/hypervideo_dl/extractor/gab.py
@@ -6,12 +6,16 @@ import re
from .common import InfoExtractor
from ..utils import (
clean_html,
+ int_or_none,
+ parse_codecs,
+ parse_duration,
str_to_int,
+ unified_timestamp
)
class GabTVIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)tv.gab.com/channel/[^/]+/view/(?P<id>[a-z0-9-]+)'
+ _VALID_URL = r'https?://tv\.gab\.com/channel/[^/]+/view/(?P<id>[a-z0-9-]+)'
_TESTS = [{
'url': 'https://tv.gab.com/channel/wurzelroot/view/why-was-america-in-afghanistan-61217eacea5665de450d0488',
'info_dict': {
@@ -32,8 +36,10 @@ class GabTVIE(InfoExtractor):
channel_name = self._search_regex(r'data-channel-name=\"(?P<channel_id>[^\"]+)', webpage, 'channel_name')
title = self._search_regex(r'data-episode-title=\"(?P<channel_id>[^\"]+)', webpage, 'title')
view_key = self._search_regex(r'data-view-key=\"(?P<channel_id>[^\"]+)', webpage, 'view_key')
- description = clean_html(self._html_search_regex(self._meta_regex('description'), webpage, 'description', group='content')) or None
- available_resolutions = re.findall(r'<a\ data-episode-id=\"%s\"\ data-resolution=\"(?P<resolution>[^\"]+)' % id, webpage)
+ description = clean_html(
+ self._html_search_regex(self._meta_regex('description'), webpage, 'description', group='content')) or None
+ available_resolutions = re.findall(r'<a\ data-episode-id=\"%s\"\ data-resolution=\"(?P<resolution>[^\"]+)' % id,
+ webpage)
formats = []
for resolution in available_resolutions:
@@ -62,3 +68,80 @@ class GabTVIE(InfoExtractor):
'uploader_id': channel_id,
'thumbnail': f'https://tv.gab.com/image/{id}',
}
+
+
+class GabIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?gab\.com/[^/]+/posts/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://gab.com/SomeBitchIKnow/posts/107163961867310434',
+ 'md5': '8ca34fb00f1e1033b5c5988d79ec531d',
+ 'info_dict': {
+ 'id': '107163961867310434-0',
+ 'ext': 'mp4',
+ 'title': 'L on Gab',
+ 'uploader_id': '946600',
+ 'uploader': 'SomeBitchIKnow',
+ 'description': 'md5:204055fafd5e1a519f5d6db953567ca3',
+ 'timestamp': 1635192289,
+ 'upload_date': '20211025',
+ }
+ }, {
+ 'url': 'https://gab.com/TheLonelyProud/posts/107045884469287653',
+ 'md5': 'f9cefcfdff6418e392611a828d47839d',
+ 'info_dict': {
+ 'id': '107045884469287653-0',
+ 'ext': 'mp4',
+ 'title': 'Jody Sadowski on Gab',
+ 'uploader_id': '1390705',
+ 'timestamp': 1633390571,
+ 'upload_date': '20211004',
+ 'uploader': 'TheLonelyProud',
+ }
+ }]
+
+ def _real_extract(self, url):
+ post_id = self._match_id(url)
+ json_data = self._download_json(f'https://gab.com/api/v1/statuses/{post_id}', post_id)
+
+ entries = []
+ for idx, media in enumerate(json_data['media_attachments']):
+ if media.get('type') not in ('video', 'gifv'):
+ continue
+ metadata = media['meta']
+ format_metadata = {
+ 'acodec': parse_codecs(metadata.get('audio_encode')).get('acodec'),
+ 'asr': int_or_none((metadata.get('audio_bitrate') or '').split(' ')[0]),
+ 'fps': metadata.get('fps'),
+ }
+
+ formats = [{
+ 'url': url,
+ 'width': f.get('width'),
+ 'height': f.get('height'),
+ 'tbr': int_or_none(f.get('bitrate'), scale=1000),
+ **format_metadata,
+ } for url, f in ((media.get('url'), metadata.get('original') or {}),
+ (media.get('source_mp4'), metadata.get('playable') or {})) if url]
+
+ self._sort_formats(formats)
+
+ author = json_data.get('account') or {}
+ entries.append({
+ 'id': f'{post_id}-{idx}',
+ 'title': f'{json_data["account"]["display_name"]} on Gab',
+ 'timestamp': unified_timestamp(json_data.get('created_at')),
+ 'formats': formats,
+ 'description': clean_html(json_data.get('content')),
+ 'duration': metadata.get('duration') or parse_duration(metadata.get('length')),
+ 'like_count': json_data.get('favourites_count'),
+ 'comment_count': json_data.get('replies_count'),
+ 'repost_count': json_data.get('reblogs_count'),
+ 'uploader': author.get('username'),
+ 'uploader_id': author.get('id'),
+ 'uploader_url': author.get('url'),
+ })
+
+ if len(entries) > 1:
+ return self.playlist_result(entries, post_id)
+
+ return entries[0]
diff --git a/hypervideo_dl/extractor/gaia.py b/hypervideo_dl/extractor/gaia.py
index 7821fb7..5b0195c 100644
--- a/hypervideo_dl/extractor/gaia.py
+++ b/hypervideo_dl/extractor/gaia.py
@@ -56,24 +56,22 @@ class GaiaIE(InfoExtractor):
def _real_initialize(self):
auth = self._get_cookies('https://www.gaia.com/').get('auth')
if auth:
- auth = self._parse_json(
- compat_urllib_parse_unquote(auth.value),
- None, fatal=False)
- if not auth:
- username, password = self._get_login_info()
- if username is None:
- return
- auth = self._download_json(
- 'https://auth.gaia.com/v1/login',
- None, data=urlencode_postdata({
- 'username': username,
- 'password': password
- }))
- if auth.get('success') is False:
- raise ExtractorError(', '.join(auth['messages']), expected=True)
- if auth:
+ auth = self._parse_json(compat_urllib_parse_unquote(auth.value), None, fatal=False)
self._jwt = auth.get('jwt')
+ def _perform_login(self, username, password):
+ if self._jwt:
+ return
+ auth = self._download_json(
+ 'https://auth.gaia.com/v1/login',
+ None, data=urlencode_postdata({
+ 'username': username,
+ 'password': password
+ }))
+ if auth.get('success') is False:
+ raise ExtractorError(', '.join(auth['messages']), expected=True)
+ self._jwt = auth.get('jwt')
+
def _real_extract(self, url):
display_id, vtype = self._match_valid_url(url).groups()
node_id = self._download_json(
diff --git a/hypervideo_dl/extractor/gamejolt.py b/hypervideo_dl/extractor/gamejolt.py
new file mode 100644
index 0000000..a13e528
--- /dev/null
+++ b/hypervideo_dl/extractor/gamejolt.py
@@ -0,0 +1,541 @@
+# coding: utf-8
+import itertools
+import json
+import math
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+from ..utils import (
+ determine_ext,
+ format_field,
+ int_or_none,
+ str_or_none,
+ traverse_obj,
+ try_get
+)
+
+
+class GameJoltBaseIE(InfoExtractor):
+ _API_BASE = 'https://gamejolt.com/site-api/'
+
+ def _call_api(self, endpoint, *args, **kwargs):
+ kwargs.setdefault('headers', {}).update({'Accept': 'image/webp,*/*'})
+ return self._download_json(self._API_BASE + endpoint, *args, **kwargs)['payload']
+
+ def _parse_content_as_text(self, content):
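+        # Lead/article content is a ProseMirror-style JSON document; a minimal
+        # sketch of the shape this walker assumes (illustrative only):
+        #   {"content": [{"type": "paragraph", "content": [
+        #       {"text": "Hello"}, {"type": "hardBreak"}, {"text": "world"}]}]}
+        # Paragraph children are flattened inline; other node types are recursed.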
+ outer_contents, joined_contents = content.get('content') or [], []
+ for outer_content in outer_contents:
+ if outer_content.get('type') != 'paragraph':
+ joined_contents.append(self._parse_content_as_text(outer_content))
+ continue
+ inner_contents, inner_content_text = outer_content.get('content') or [], ''
+ for inner_content in inner_contents:
+ if inner_content.get('text'):
+ inner_content_text += inner_content['text']
+ elif inner_content.get('type') == 'hardBreak':
+ inner_content_text += '\n'
+ joined_contents.append(inner_content_text)
+
+ return '\n'.join(joined_contents)
+
+ def _get_comments(self, post_num_id, post_hash_id):
+ sort_by, scroll_id = self._configuration_arg('comment_sort', ['hot'], ie_key=GameJoltIE.ie_key())[0], -1
+ is_scrolled = sort_by in ('new', 'you')
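+        # The 'new' and 'you' sorts are cursor-paginated: scroll_id carries the
+        # posted_on timestamp of the last comment of the previous page, while
+        # the default 'hot' sort uses plain page numbers.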
+ for page in itertools.count(1):
+ comments_data = self._call_api(
+ 'comments/Fireside_Post/%s/%s?%s=%d' % (
+ post_num_id, sort_by,
+ 'scroll_id' if is_scrolled else 'page', scroll_id if is_scrolled else page),
+ post_hash_id, note='Downloading comments list page %d' % page)
+ if not comments_data.get('comments'):
+ break
+ for comment in traverse_obj(comments_data, (('comments', 'childComments'), ...), expected_type=dict, default=[]):
+ yield {
+ 'id': comment['id'],
+ 'text': self._parse_content_as_text(
+ self._parse_json(comment['comment_content'], post_hash_id)),
+ 'timestamp': int_or_none(comment.get('posted_on'), scale=1000),
+ 'like_count': comment.get('votes'),
+ 'author': traverse_obj(comment, ('user', ('display_name', 'name')), expected_type=str_or_none, get_all=False),
+ 'author_id': traverse_obj(comment, ('user', 'username'), expected_type=str_or_none),
+ 'author_thumbnail': traverse_obj(comment, ('user', 'image_avatar'), expected_type=str_or_none),
+ 'parent': comment.get('parent_id') or None,
+ }
+ scroll_id = int_or_none(comments_data['comments'][-1].get('posted_on'))
+
+ def _parse_post(self, post_data):
+ post_id = post_data['hash']
+ lead_content = self._parse_json(post_data.get('lead_content') or '{}', post_id, fatal=False) or {}
+ description, full_description = post_data.get('leadStr') or self._parse_content_as_text(
+ self._parse_json(post_data.get('lead_content'), post_id)), None
+ if post_data.get('has_article'):
+ article_content = self._parse_json(
+ post_data.get('article_content')
+ or self._call_api(f'web/posts/article/{post_data.get("id", post_id)}', post_id,
+ note='Downloading article metadata', errnote='Unable to download article metadata', fatal=False).get('article'),
+ post_id, fatal=False)
+ full_description = self._parse_content_as_text(article_content)
+
+ user_data = post_data.get('user') or {}
+ info_dict = {
+ 'extractor_key': GameJoltIE.ie_key(),
+ 'extractor': 'GameJolt',
+ 'webpage_url': str_or_none(post_data.get('url')) or f'https://gamejolt.com/p/{post_id}',
+ 'id': post_id,
+ 'title': description,
+ 'description': full_description or description,
+ 'display_id': post_data.get('slug'),
+ 'uploader': user_data.get('display_name') or user_data.get('name'),
+ 'uploader_id': user_data.get('username'),
+ 'uploader_url': format_field(user_data, 'url', 'https://gamejolt.com%s'),
+ 'categories': [try_get(category, lambda x: '%s - %s' % (x['community']['name'], x['channel'].get('display_title') or x['channel']['title']))
+                       for category in post_data.get('communities') or []],
+ 'tags': traverse_obj(
+ lead_content, ('content', ..., 'content', ..., 'marks', ..., 'attrs', 'tag'), expected_type=str_or_none),
+ 'like_count': int_or_none(post_data.get('like_count')),
+ 'comment_count': int_or_none(post_data.get('comment_count'), default=0),
+ 'timestamp': int_or_none(post_data.get('added_on'), scale=1000),
+ 'release_timestamp': int_or_none(post_data.get('published_on'), scale=1000),
+ '__post_extractor': self.extract_comments(post_data.get('id'), post_id)
+ }
+
+ # TODO: Handle multiple videos/embeds?
+ video_data = traverse_obj(post_data, ('videos', ...), expected_type=dict, get_all=False) or {}
+ formats, subtitles, thumbnails = [], {}, []
+ for media in video_data.get('media') or []:
+ media_url, mimetype, ext, media_id = media['img_url'], media.get('filetype', ''), determine_ext(media['img_url']), media.get('type')
+ if mimetype == 'application/vnd.apple.mpegurl' or ext == 'm3u8':
+ hls_formats, hls_subs = self._extract_m3u8_formats_and_subtitles(media_url, post_id, 'mp4', m3u8_id=media_id)
+ formats.extend(hls_formats)
+ subtitles.update(hls_subs)
+ elif mimetype == 'application/dash+xml' or ext == 'mpd':
+ dash_formats, dash_subs = self._extract_mpd_formats_and_subtitles(media_url, post_id, mpd_id=media_id)
+ formats.extend(dash_formats)
+ subtitles.update(dash_subs)
+ elif 'image' in mimetype:
+ thumbnails.append({
+ 'id': media_id,
+ 'url': media_url,
+ 'width': media.get('width'),
+ 'height': media.get('height'),
+ 'filesize': media.get('filesize'),
+ })
+ else:
+ formats.append({
+ 'format_id': media_id,
+ 'url': media_url,
+ 'width': media.get('width'),
+ 'height': media.get('height'),
+ 'filesize': media.get('filesize'),
+ 'acodec': 'none' if 'video-card' in media_url else None,
+ })
+
+ if formats:
+ return {
+ **info_dict,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'thumbnails': thumbnails,
+ 'view_count': int_or_none(video_data.get('view_count')),
+ }
+
+ gif_entries = []
+ for media in post_data.get('media', []):
+ if determine_ext(media['img_url']) != 'gif' or 'gif' not in media.get('filetype', ''):
+ continue
+ gif_entries.append({
+ 'id': media['hash'],
+ 'title': media['filename'].split('.')[0],
+ 'formats': [{
+ 'format_id': url_key,
+ 'url': media[url_key],
+ 'width': media.get('width') if url_key == 'img_url' else None,
+ 'height': media.get('height') if url_key == 'img_url' else None,
+ 'filesize': media.get('filesize') if url_key == 'img_url' else None,
+ 'acodec': 'none',
+ } for url_key in ('img_url', 'mediaserver_url', 'mediaserver_url_mp4', 'mediaserver_url_webm') if media.get(url_key)]
+ })
+ if gif_entries:
+ return {
+ '_type': 'playlist',
+ **info_dict,
+ 'entries': gif_entries,
+ }
+
+ embed_url = traverse_obj(post_data, ('embeds', ..., 'url'), expected_type=str_or_none, get_all=False)
+ if embed_url:
+ return self.url_result(embed_url)
+ return info_dict
+
+
+class GameJoltIE(GameJoltBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/p/(?:[\w-]*-)?(?P<id>\w{8})'
+ _TESTS = [{
+ # No audio
+ 'url': 'https://gamejolt.com/p/introducing-ramses-jackson-some-fnf-himbo-i-ve-been-animating-fo-c6achnzu',
+ 'md5': 'cd5f733258f6678b0ce500dd88166d86',
+ 'info_dict': {
+ 'id': 'c6achnzu',
+ 'ext': 'mp4',
+ 'display_id': 'introducing-ramses-jackson-some-fnf-himbo-i-ve-been-animating-fo-c6achnzu',
+ 'title': 'Introducing Ramses Jackson, some FNF himbo I’ve been animating for the past few days, hehe.\n#fnfmod #fridaynightfunkin',
+ 'description': 'Introducing Ramses Jackson, some FNF himbo I’ve been animating for the past few days, hehe.\n#fnfmod #fridaynightfunkin',
+ 'uploader': 'Jakeneutron',
+ 'uploader_id': 'Jakeneutron',
+ 'uploader_url': 'https://gamejolt.com/@Jakeneutron',
+ 'categories': ['Friday Night Funkin\' - Videos'],
+ 'tags': ['fnfmod', 'fridaynightfunkin'],
+ 'timestamp': 1633499590,
+ 'upload_date': '20211006',
+ 'release_timestamp': 1633499655,
+ 'release_date': '20211006',
+ 'thumbnail': 're:^https?://.+wgch9mhq.png$',
+ 'like_count': int,
+ 'comment_count': int,
+ 'view_count': int,
+ }
+ }, {
+ # YouTube embed
+ 'url': 'https://gamejolt.com/p/hey-hey-if-there-s-anyone-who-s-looking-to-get-into-learning-a-n6g4jzpq',
+ 'md5': '79a931ff500a5c783ef6c3bda3272e32',
+ 'info_dict': {
+ 'id': 'XsNA_mzC0q4',
+ 'title': 'Adobe Animate CC 2021 Tutorial || Part 1 - The Basics',
+ 'description': 'md5:9d1ab9e2625b3fe1f42b2a44c67fdd13',
+ 'uploader': 'Jakeneutron',
+ 'uploader_id': 'Jakeneutron',
+ 'uploader_url': 'http://www.youtube.com/user/Jakeneutron',
+ 'ext': 'mp4',
+ 'duration': 1749,
+ 'tags': ['Adobe Animate CC', 'Tutorial', 'Animation', 'The Basics', 'For Beginners'],
+ 'like_count': int,
+ 'playable_in_embed': True,
+ 'categories': ['Education'],
+ 'availability': 'public',
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/XsNA_mzC0q4/maxresdefault.webp',
+ 'age_limit': 0,
+ 'live_status': 'not_live',
+ 'channel_url': 'https://www.youtube.com/channel/UC6_L7fnczNalFZyBthUE9oA',
+ 'channel': 'Jakeneutron',
+ 'channel_id': 'UC6_L7fnczNalFZyBthUE9oA',
+ 'upload_date': '20211015',
+ 'view_count': int,
+ 'chapters': 'count:18',
+ }
+ }, {
+ # Article
+ 'url': 'https://gamejolt.com/p/i-fuckin-broke-chaos-d56h3eue',
+ 'md5': '786c1ccf98fde02c03a2768acb4258d0',
+ 'info_dict': {
+ 'id': 'd56h3eue',
+ 'ext': 'mp4',
+ 'display_id': 'i-fuckin-broke-chaos-d56h3eue',
+ 'title': 'I fuckin broke Chaos.',
+ 'description': 'I moved my tab durning the cutscene so now it\'s stuck like this.',
+ 'uploader': 'Jeff____________',
+ 'uploader_id': 'The_Nyesh_Man',
+ 'uploader_url': 'https://gamejolt.com/@The_Nyesh_Man',
+ 'categories': ['Friday Night Funkin\' - Videos'],
+ 'timestamp': 1639800264,
+ 'upload_date': '20211218',
+ 'release_timestamp': 1639800330,
+ 'release_date': '20211218',
+ 'thumbnail': 're:^https?://.+euksy8bd.png$',
+ 'like_count': int,
+ 'comment_count': int,
+ 'view_count': int,
+ }
+ }, {
+ # Single GIF
+ 'url': 'https://gamejolt.com/p/hello-everyone-i-m-developing-a-pixel-art-style-mod-for-fnf-and-i-vs4gdrd8',
+ 'info_dict': {
+ 'id': 'vs4gdrd8',
+ 'display_id': 'hello-everyone-i-m-developing-a-pixel-art-style-mod-for-fnf-and-i-vs4gdrd8',
+ 'title': 'md5:cc3d8b031d9bc7ec2ec5a9ffc707e1f9',
+ 'description': 'md5:cc3d8b031d9bc7ec2ec5a9ffc707e1f9',
+ 'uploader': 'Quesoguy',
+ 'uploader_id': 'CheeseguyDev',
+ 'uploader_url': 'https://gamejolt.com/@CheeseguyDev',
+ 'categories': ['Game Dev - General', 'Arts n\' Crafts - Creations', 'Pixel Art - showcase',
+ 'Friday Night Funkin\' - Mods', 'Newgrounds - Friday Night Funkin (13+)'],
+ 'timestamp': 1639517122,
+ 'release_timestamp': 1639519966,
+ 'like_count': int,
+ 'comment_count': int,
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'dszyjnwi',
+ 'ext': 'webm',
+ 'title': 'gif-presentacion-mejorado-dszyjnwi',
+ 'n_entries': 1,
+ }
+ }]
+ }, {
+ # Multiple GIFs
+ 'url': 'https://gamejolt.com/p/gif-yhsqkumq',
+ 'playlist_count': 35,
+ 'info_dict': {
+ 'id': 'yhsqkumq',
+ 'display_id': 'gif-yhsqkumq',
+ 'title': 'GIF',
+ 'description': 'GIF',
+ 'uploader': 'DaniilTvman',
+ 'uploader_id': 'DaniilTvman',
+ 'uploader_url': 'https://gamejolt.com/@DaniilTvman',
+ 'categories': ['Five Nights At The AGK Studio Comunity - NEWS game'],
+ 'timestamp': 1638721559,
+ 'release_timestamp': 1638722276,
+ 'like_count': int,
+ 'comment_count': int,
+ },
+ }]
+
+ def _real_extract(self, url):
+ post_id = self._match_id(url)
+ post_data = self._call_api(
+ f'web/posts/view/{post_id}', post_id)['post']
+ return self._parse_post(post_data)
+
+
+class GameJoltPostListBaseIE(GameJoltBaseIE):
+ def _entries(self, endpoint, list_id, note='Downloading post list', errnote='Unable to download post list', initial_items=[]):
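+        # Post lists are cursor-paginated: the first page comes from
+        # initial_items or a plain GET, and every subsequent page is fetched by
+        # POSTing back the scroll_id of the last item received.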
+ page_num, scroll_id = 1, None
+ items = initial_items or self._call_api(endpoint, list_id, note=note, errnote=errnote)['items']
+ while items:
+ for item in items:
+ yield self._parse_post(item['action_resource_model'])
+ scroll_id = items[-1]['scroll_id']
+ page_num += 1
+ items = self._call_api(
+ endpoint, list_id, note=f'{note} page {page_num}', errnote=errnote, data=json.dumps({
+ 'scrollDirection': 'from',
+ 'scrollId': scroll_id,
+ }).encode('utf-8')).get('items')
+
+
+class GameJoltUserIE(GameJoltPostListBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/@(?P<id>[\w-]+)'
+ _TESTS = [{
+ 'url': 'https://gamejolt.com/@BlazikenSuperStar',
+ 'playlist_mincount': 1,
+ 'info_dict': {
+ 'id': '6116784',
+ 'title': 'S. Blaze',
+ 'description': 'md5:5ba7fbbb549e8ea2545aafbfe22eb03a',
+ },
+ 'params': {
+ 'ignore_no_formats_error': True,
+ },
+ 'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'],
+ }]
+
+ def _real_extract(self, url):
+ user_id = self._match_id(url)
+ user_data = self._call_api(
+ f'web/profile/@{user_id}', user_id, note='Downloading user info', errnote='Unable to download user info')['user']
+ bio = self._parse_content_as_text(
+ self._parse_json(user_data.get('bio_content', '{}'), user_id, fatal=False) or {})
+ return self.playlist_result(
+ self._entries(f'web/posts/fetch/user/@{user_id}?tab=active', user_id, 'Downloading user posts', 'Unable to download user posts'),
+ str_or_none(user_data.get('id')), user_data.get('display_name') or user_data.get('name'), bio)
+
+
+class GameJoltGameIE(GameJoltPostListBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/games/[\w-]+/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://gamejolt.com/games/Friday4Fun/655124',
+ 'playlist_mincount': 2,
+ 'info_dict': {
+ 'id': '655124',
+ 'title': 'Friday Night Funkin\': Friday 4 Fun',
+ 'description': 'md5:576a7dd87912a2dcf33c50d2bd3966d3'
+ },
+ 'params': {
+ 'ignore_no_formats_error': True,
+ },
+ 'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'],
+ }]
+
+ def _real_extract(self, url):
+ game_id = self._match_id(url)
+ game_data = self._call_api(
+ f'web/discover/games/{game_id}', game_id, note='Downloading game info', errnote='Unable to download game info')['game']
+ description = self._parse_content_as_text(
+ self._parse_json(game_data.get('description_content', '{}'), game_id, fatal=False) or {})
+ return self.playlist_result(
+ self._entries(f'web/posts/fetch/game/{game_id}', game_id, 'Downloading game posts', 'Unable to download game posts'),
+ game_id, game_data.get('title'), description)
+
+
+class GameJoltGameSoundtrackIE(GameJoltBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/get/soundtrack(?:\?|\#!?)(?:.*?[&;])??game=(?P<id>(?:\d+)+)'
+ _TESTS = [{
+ 'url': 'https://gamejolt.com/get/soundtrack?foo=bar&game=657899',
+ 'info_dict': {
+ 'id': '657899',
+ 'title': 'Friday Night Funkin\': Vs Oswald',
+ 'n_entries': None,
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '184434',
+ 'ext': 'mp3',
+ 'title': 'Gettin\' Lucky (Menu Music)',
+ 'url': r're:^https://.+vs-oswald-menu-music\.mp3$',
+ 'release_timestamp': 1635190816,
+ 'release_date': '20211025',
+ 'n_entries': 3,
+ }
+ }, {
+ 'info_dict': {
+ 'id': '184435',
+ 'ext': 'mp3',
+ 'title': 'Rabbit\'s Luck (Extended Version)',
+ 'url': r're:^https://.+rabbit-s-luck--full-version-\.mp3$',
+ 'release_timestamp': 1635190841,
+ 'release_date': '20211025',
+ 'n_entries': 3,
+ }
+ }, {
+ 'info_dict': {
+ 'id': '185228',
+ 'ext': 'mp3',
+ 'title': 'Last Straw',
+ 'url': r're:^https://.+last-straw\.mp3$',
+ 'release_timestamp': 1635881104,
+ 'release_date': '20211102',
+ 'n_entries': 3,
+ }
+ }]
+ }]
+
+ def _real_extract(self, url):
+ game_id = self._match_id(url)
+ game_overview = self._call_api(
+ f'web/discover/games/overview/{game_id}', game_id, note='Downloading soundtrack info', errnote='Unable to download soundtrack info')
+ return self.playlist_result([{
+ 'id': str_or_none(song.get('id')),
+ 'title': str_or_none(song.get('title')),
+ 'url': str_or_none(song.get('url')),
+ 'release_timestamp': int_or_none(song.get('posted_on'), scale=1000),
+ } for song in game_overview.get('songs') or []], game_id, traverse_obj(
+ game_overview, ('microdata', 'name'), (('twitter', 'fb'), 'title'), expected_type=str_or_none, get_all=False))
+
+
+class GameJoltCommunityIE(GameJoltPostListBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/c/(?P<id>(?P<community>[\w-]+)(?:/(?P<channel>[\w-]+))?)(?:(?:\?|\#!?)(?:.*?[&;])??sort=(?P<sort>\w+))?'
+ _TESTS = [{
+ 'url': 'https://gamejolt.com/c/fnf/videos',
+ 'playlist_mincount': 50,
+ 'info_dict': {
+ 'id': 'fnf/videos',
+ 'title': 'Friday Night Funkin\' - Videos',
+ 'description': 'md5:6d8c06f27460f7d35c1554757ffe53c8'
+ },
+ 'params': {
+ 'playlistend': 50,
+ 'ignore_no_formats_error': True,
+ },
+ 'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'],
+ }, {
+ 'url': 'https://gamejolt.com/c/youtubers',
+ 'playlist_mincount': 50,
+ 'info_dict': {
+ 'id': 'youtubers/featured',
+ 'title': 'Youtubers - featured',
+ 'description': 'md5:53e5582c93dcc467ab597bfca4db17d4'
+ },
+ 'params': {
+ 'playlistend': 50,
+ 'ignore_no_formats_error': True,
+ },
+ 'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'],
+ }]
+
+ def _real_extract(self, url):
+ display_id, community_id, channel_id, sort_by = self._match_valid_url(url).group('id', 'community', 'channel', 'sort')
+ channel_id, sort_by = channel_id or 'featured', sort_by or 'new'
+
+ community_data = self._call_api(
+ f'web/communities/view/{community_id}', display_id,
+ note='Downloading community info', errnote='Unable to download community info')['community']
+ channel_data = traverse_obj(self._call_api(
+ f'web/communities/view-channel/{community_id}/{channel_id}', display_id,
+ note='Downloading channel info', errnote='Unable to download channel info', fatal=False), 'channel') or {}
+
+ title = f'{community_data.get("name") or community_id} - {channel_data.get("display_title") or channel_id}'
+ description = self._parse_content_as_text(
+ self._parse_json(community_data.get('description_content') or '{}', display_id, fatal=False) or {})
+ return self.playlist_result(
+ self._entries(
+ f'web/posts/fetch/community/{community_id}?channels[]={sort_by}&channels[]={channel_id}',
+ display_id, 'Downloading community posts', 'Unable to download community posts'),
+ f'{community_id}/{channel_id}', title, description)
+
+
+class GameJoltSearchIE(GameJoltPostListBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/search(?:/(?P<filter>communities|users|games))?(?:\?|\#!?)(?:.*?[&;])??q=(?P<id>(?:[^&#]+)+)'
+ _URL_FORMATS = {
+ 'users': 'https://gamejolt.com/@{username}',
+ 'communities': 'https://gamejolt.com/c/{path}',
+ 'games': 'https://gamejolt.com/games/{slug}/{id}',
+ }
+ _TESTS = [{
+ 'url': 'https://gamejolt.com/search?foo=bar&q=%23fnf',
+ 'playlist_mincount': 50,
+ 'info_dict': {
+ 'id': '#fnf',
+ 'title': '#fnf',
+ },
+ 'params': {
+ 'playlistend': 50,
+ 'ignore_no_formats_error': True,
+ },
+ 'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'],
+ }, {
+ 'url': 'https://gamejolt.com/search/communities?q=cookie%20run',
+ 'playlist_mincount': 10,
+ 'info_dict': {
+ 'id': 'cookie run',
+ 'title': 'cookie run',
+ },
+ }, {
+ 'url': 'https://gamejolt.com/search/users?q=mlp',
+ 'playlist_mincount': 278,
+ 'info_dict': {
+ 'id': 'mlp',
+ 'title': 'mlp',
+ },
+ }, {
+ 'url': 'https://gamejolt.com/search/games?q=roblox',
+ 'playlist_mincount': 688,
+ 'info_dict': {
+ 'id': 'roblox',
+ 'title': 'roblox',
+ },
+ }]
+
+ def _search_entries(self, query, filter_mode, display_query):
+ initial_search_data = self._call_api(
+ f'web/search/{filter_mode}?q={query}', display_query,
+ note=f'Downloading {filter_mode} list', errnote=f'Unable to download {filter_mode} list')
+ entries_num = traverse_obj(initial_search_data, 'count', f'{filter_mode}Count')
+ if not entries_num:
+ return
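+        # The initial response reports the total hit count and page size, so
+        # the remaining pages can be fetched up to ceil(count / perPage).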
+ for page in range(1, math.ceil(entries_num / initial_search_data['perPage']) + 1):
+ search_results = self._call_api(
+ f'web/search/{filter_mode}?q={query}&page={page}', display_query,
+ note=f'Downloading {filter_mode} list page {page}', errnote=f'Unable to download {filter_mode} list')
+ for result in search_results[filter_mode]:
+ yield self.url_result(self._URL_FORMATS[filter_mode].format(**result))
+
+ def _real_extract(self, url):
+ filter_mode, query = self._match_valid_url(url).group('filter', 'id')
+ display_query = compat_urllib_parse_unquote(query)
+ return self.playlist_result(
+ self._search_entries(query, filter_mode, display_query) if filter_mode else self._entries(
+ f'web/posts/fetch/search/{query}', display_query, initial_items=self._call_api(
+ f'web/search?q={query}', display_query,
+ note='Downloading initial post list', errnote='Unable to download initial post list')['posts']),
+ display_query, display_query)
diff --git a/hypervideo_dl/extractor/generic.py b/hypervideo_dl/extractor/generic.py
index 8387646..03e6eb2 100644
--- a/hypervideo_dl/extractor/generic.py
+++ b/hypervideo_dl/extractor/generic.py
@@ -17,6 +17,7 @@ from ..compat import (
)
from ..utils import (
determine_ext,
+ dict_get,
ExtractorError,
float_or_none,
HEADRequest,
@@ -28,8 +29,10 @@ from ..utils import (
mimetype2ext,
orderedSet,
parse_duration,
+ parse_resolution,
sanitized_Request,
smuggle_url,
+ str_or_none,
unescapeHTML,
unified_timestamp,
unsmuggle_url,
@@ -56,7 +59,7 @@ from .sportbox import SportBoxIE
from .myvi import MyviIE
from .condenast import CondeNastIE
from .udn import UDNEmbedIE
-from .senateisvp import SenateISVPIE
+from .senategov import SenateISVPIE
from .svt import SVTIE
from .pornhub import PornHubIE
from .xhamster import XHamsterEmbedIE
@@ -100,6 +103,9 @@ from .ustream import UstreamIE
from .arte import ArteTVEmbedIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
+from .glomex import GlomexEmbedIE
+from .megatvcom import MegaTVComEmbedIE
+from .ant1newsgr import Ant1NewsGrEmbedIE
from .limelight import LimelightBaseIE
from .anvato import AnvatoIE
from .washingtonpost import WashingtonPostIE
@@ -112,6 +118,7 @@ from .channel9 import Channel9IE
from .vshare import VShareIE
from .mediasite import MediasiteIE
from .springboardplatform import SpringboardPlatformIE
+from .ted import TedEmbedIE
from .yapfiles import YapFilesIE
from .vice import ViceIE
from .xfileshare import XFileShareIE
@@ -135,12 +142,21 @@ from .arcpublishing import ArcPublishingIE
from .medialaan import MedialaanIE
from .simplecast import SimplecastIE
from .wimtv import WimTVIE
+from .tvopengr import TVOpenGrEmbedIE
+from .ertgr import ERTWebtvEmbedIE
+from .tvp import TVPEmbedIE
+from .blogger import BloggerIE
+from .mainstreaming import MainStreamingIE
+from .gfycat import GfycatIE
+from .panopto import PanoptoBaseIE
+from .ruutu import RuutuIE
class GenericIE(InfoExtractor):
IE_DESC = 'Generic downloader that works on some sites'
_VALID_URL = r'.*'
IE_NAME = 'generic'
+    _NETRC_MACHINE = False # Suppress username warning
_TESTS = [
# Direct link to a video
{
@@ -203,7 +219,7 @@ class GenericIE(InfoExtractor):
{
'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
'info_dict': {
- 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+ 'id': 'https://phihag.de/2014/youtube-dl/rss2.xml',
'title': 'Zero Punctuation',
'description': 're:.*groundbreaking video review series.*'
},
@@ -248,6 +264,9 @@ class GenericIE(InfoExtractor):
'episode_number': 1,
'season_number': 1,
'age_limit': 0,
+ 'season': 'Season 1',
+ 'direct': True,
+ 'episode': 'Episode 1',
},
}],
'params': {
@@ -264,6 +283,16 @@ class GenericIE(InfoExtractor):
},
'playlist_mincount': 100,
},
+ # RSS feed with guid
+ {
+ 'url': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss',
+ 'info_dict': {
+ 'id': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss',
+ 'description': 'md5:be809a44b63b0c56fb485caf68685520',
+ 'title': 'The Little Red Podcast',
+ },
+ 'playlist_mincount': 76,
+ },
# SMIL from http://videolectures.net/promogram_igor_mekjavic_eng
{
'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml',
@@ -359,9 +388,6 @@ class GenericIE(InfoExtractor):
'formats': 'mincount:9',
'upload_date': '20130904',
},
- 'params': {
- 'format': 'bestvideo',
- },
},
# m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8
{
@@ -1188,6 +1214,21 @@ class GenericIE(InfoExtractor):
},
'skip': 'Only has video a few mornings per month, see http://www.suffolk.edu/sjc/',
},
+ # jwplayer with only the json URL
+ {
+ 'url': 'https://www.hollywoodreporter.com/news/general-news/dunkirk-team-reveals-what-christopher-nolan-said-oscar-win-meet-your-oscar-winner-1092454',
+ 'info_dict': {
+ 'id': 'TljWkvWH',
+ 'ext': 'mp4',
+ 'upload_date': '20180306',
+ 'title': 'md5:91eb1862f6526415214f62c00b453936',
+ 'description': 'md5:73048ae50ae953da10549d1d2fe9b3aa',
+ 'timestamp': 1520367225,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
# Complex jwplayer
{
'url': 'http://www.indiedb.com/games/king-machine/videos',
@@ -1434,24 +1475,6 @@ class GenericIE(InfoExtractor):
'duration': 45.115,
},
},
- # 5min embed
- {
- 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
- 'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
- 'info_dict': {
- 'id': '518726732',
- 'ext': 'mp4',
- 'title': 'Facebook Creates "On This Day" | Crunch Report',
- 'description': 'Amazon updates Fire TV line, Tesla\'s Model X spotted in the wild',
- 'timestamp': 1427237531,
- 'uploader': 'Crunch Report',
- 'upload_date': '20150324',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- },
# Crooks and Liars embed
{
'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
@@ -1856,6 +1879,62 @@ class GenericIE(InfoExtractor):
'add_ie': [RutubeIE.ie_key()],
},
{
+ # glomex:embed
+ 'url': 'https://www.skai.gr/news/world/iatrikos-syllogos-tourkias-to-turkovac-aplo-dialyma-erntogan-eiste-apateones-kai-pseytes',
+ 'info_dict': {
+ 'id': 'v-ch2nkhcirwc9-sf',
+ 'ext': 'mp4',
+ 'title': 'md5:786e1e24e06c55993cee965ef853a0c1',
+ 'description': 'md5:8b517a61d577efe7e36fde72fd535995',
+ 'timestamp': 1641885019,
+ 'upload_date': '20220111',
+ 'duration': 460000,
+ 'thumbnail': 'https://i3thumbs.glomex.com/dC1idjJwdndiMjRzeGwvMjAyMi8wMS8xMS8wNy8xMF8zNV82MWRkMmQ2YmU5ZTgyLmpwZw==/profile:player-960x540',
+ },
+ },
+ {
+ # megatvcom:embed
+ 'url': 'https://www.in.gr/2021/12/18/greece/apokalypsi-mega-poios-parelave-tin-ereyna-tsiodra-ek-merous-tis-kyvernisis-o-prothypourgos-telika-gnorize/',
+ 'info_dict': {
+ 'id': 'apokalypsi-mega-poios-parelave-tin-ereyna-tsiodra-ek-merous-tis-kyvernisis-o-prothypourgos-telika-gnorize',
+ 'title': 'md5:5e569cf996ec111057c2764ec272848f',
+ },
+ 'playlist': [{
+ 'md5': '1afa26064ff00ccb91617957dbc73dc1',
+ 'info_dict': {
+ 'ext': 'mp4',
+ 'id': '564916',
+ 'display_id': 'md5:6cdf22d3a2e7bacb274b7295089a1770',
+ 'title': 'md5:33b9dd39584685b62873043670eb52a6',
+ 'description': 'md5:c1db7310f390518ac36dd69d947ef1a1',
+ 'timestamp': 1639753145,
+ 'upload_date': '20211217',
+ 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/12/prezerakos-1024x597.jpg',
+ },
+ }, {
+ 'md5': '4a1c220695f1ef865a8b7966a53e2474',
+ 'info_dict': {
+ 'ext': 'mp4',
+ 'id': '564905',
+ 'display_id': 'md5:ead15695e485e649aed2b81ebd699b88',
+ 'title': 'md5:2b71fd54249a3ca34609fe39ae31c47b',
+ 'description': 'md5:c42e12f638d0a97d6de4508e2c4df982',
+ 'timestamp': 1639753047,
+ 'upload_date': '20211217',
+ 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/12/tsiodras-mitsotakis-1024x545.jpg',
+ },
+ }]
+ },
+ {
+ 'url': 'https://www.ertnews.gr/video/manolis-goyalles-o-anthropos-piso-apo-ti-diadiktyaki-vasilopita/',
+ 'info_dict': {
+ 'id': '2022/tv/news-themata-ianouarios/20220114-apotis6-gouales-pita.mp4',
+ 'ext': 'mp4',
+ 'title': 'md5:df64f5b61c06d0e9556c0cdd5cf14464',
+ 'thumbnail': 'https://www.ert.gr/themata/photos/2021/20220114-apotis6-gouales-pita.jpg',
+ },
+ },
+ {
# ThePlatform embedded with whitespaces in URLs
'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm',
'only_matching': True,
@@ -2160,6 +2239,33 @@ class GenericIE(InfoExtractor):
'skip_download': True,
},
},
+ {
+ # tvopengr:embed
+ 'url': 'https://www.ethnos.gr/World/article/190604/hparosiaxekinoynoisynomiliessthgeneyhmethskiatoypolemoypanoapothnoykrania',
+ 'md5': 'eb0c3995d0a6f18f6538c8e057865d7d',
+ 'info_dict': {
+ 'id': '101119',
+ 'ext': 'mp4',
+ 'display_id': 'oikarpoitondiapragmateyseonhparosias',
+ 'title': 'md5:b979f4d640c568617d6547035528a149',
+ 'description': 'md5:e54fc1977c7159b01cc11cd7d9d85550',
+ 'timestamp': 1641772800,
+ 'upload_date': '20220110',
+ 'thumbnail': 'https://opentv-static.siliconweb.com/imgHandler/1920/70bc39fa-895b-4918-a364-c39d2135fc6d.jpg',
+
+ }
+ },
+ {
+ # blogger embed
+ 'url': 'https://blog.tomeuvizoso.net/2019/01/a-panfrost-milestone.html',
+ 'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac',
+ 'info_dict': {
+ 'id': 'BLOGGER-video-3c740e3a49197e16-796',
+ 'ext': 'mp4',
+ 'title': 'Blogger',
+ 'thumbnail': r're:^https?://.*',
+ },
+ },
# {
# # TODO: find another test
# # http://schema.org/VideoObject
@@ -2319,12 +2425,120 @@ class GenericIE(InfoExtractor):
'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg',
}
},
+ {
+ # KVS Player (for sites that serve kt_player.js via non-https urls)
+ 'url': 'http://www.camhub.world/embed/389508',
+ 'md5': 'fbe89af4cfb59c8fd9f34a202bb03e32',
+ 'info_dict': {
+ 'id': '389508',
+ 'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source',
+ 'ext': 'mp4',
+ 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
+ 'thumbnail': 'http://www.camhub.world/contents/videos_screenshots/389000/389508/preview.mp4.jpg',
+ }
+ },
+ {
+ # Reddit-hosted video that will redirect and be processed by RedditIE
+ # Redirects to https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/
+ 'url': 'https://v.redd.it/zv89llsvexdz',
+ 'md5': '87f5f02f6c1582654146f830f21f8662',
+ 'info_dict': {
+ 'id': 'zv89llsvexdz',
+ 'ext': 'mp4',
+ 'timestamp': 1501941939.0,
+ 'title': 'That small heart attack.',
+ 'upload_date': '20170805',
+ 'uploader': 'Antw87'
+ }
+ },
+ {
+ # 1080p Reddit-hosted video that will redirect and be processed by RedditIE
+ 'url': 'https://v.redd.it/33hgok7dfbz71/',
+ 'md5': '7a1d587940242c9bb3bd6eb320b39258',
+ 'info_dict': {
+ 'id': '33hgok7dfbz71',
+ 'ext': 'mp4',
+ 'title': "The game Didn't want me to Knife that Guy I guess",
+ 'uploader': 'paraf1ve',
+ 'timestamp': 1636788683.0,
+ 'upload_date': '20211113'
+ }
+ },
+ {
+ # MainStreaming player
+ 'url': 'https://www.lactv.it/2021/10/03/lac-news24-la-settimana-03-10-2021/',
+ 'info_dict': {
+ 'id': 'EUlZfGWkGpOd',
+ 'title': 'La Settimana ',
+ 'description': '03 Ottobre ore 02:00',
+ 'ext': 'mp4',
+ 'live_status': 'not_live',
+ 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+ 'duration': 1512
+ }
+ },
+ {
+ # Multiple gfycat iframe embeds
+ 'url': 'https://www.gezip.net/bbs/board.php?bo_table=entertaine&wr_id=613422',
+ 'info_dict': {
+ 'title': '재이, 윤, 세은 황금 드레스를 입고 빛난다',
+ 'id': 'board'
+ },
+ 'playlist_count': 8,
+ },
+ {
+ # Multiple gfycat gifs (direct links)
+ 'url': 'https://www.gezip.net/bbs/board.php?bo_table=entertaine&wr_id=612199',
+ 'info_dict': {
+ 'title': '옳게 된 크롭 니트 스테이씨 아이사',
+ 'id': 'board'
+ },
+ 'playlist_count': 6
+ },
+ {
+ # Multiple gfycat embeds, with uppercase "IFR" in urls
+ 'url': 'https://kkzz.kr/?vid=2295',
+ 'info_dict': {
+ 'title': '지방시 앰버서더 에스파 카리나 움짤',
+ 'id': '?vid=2295'
+ },
+ 'playlist_count': 9
+ },
+ {
+ # Panopto embeds
+ 'url': 'https://www.monash.edu/learning-teaching/teachhq/learning-technologies/panopto/how-to/insert-a-quiz-into-a-panopto-video',
+ 'info_dict': {
+ 'title': 'Insert a quiz into a Panopto video',
+ 'id': 'insert-a-quiz-into-a-panopto-video'
+ },
+ 'playlist_count': 1
+ },
+ {
+ # Ruutu embed
+ 'url': 'https://www.nelonen.fi/ohjelmat/madventures-suomi/2160731-riku-ja-tunna-lahtevat-peurajahtiin-tv-sta-tutun-biologin-kanssa---metsastysreissu-huipentuu-kasvissyojan-painajaiseen',
+ 'md5': 'a2513a98d3496099e6eced40f7e6a14b',
+ 'info_dict': {
+ 'id': '4044426',
+ 'ext': 'mp4',
+ 'title': 'Riku ja Tunna lähtevät peurajahtiin tv:stä tutun biologin kanssa – metsästysreissu huipentuu kasvissyöjän painajaiseen!',
+ 'thumbnail': r're:^https?://.+\.jpg$',
+ 'duration': 108,
+ 'series': 'Madventures Suomi',
+ 'description': 'md5:aa55b44bd06a1e337a6f1d0b46507381',
+ 'categories': ['Matkailu', 'Elämäntyyli'],
+ 'age_limit': 0,
+ 'upload_date': '20220308',
+ },
+ },
]
def report_following_redirect(self, new_url):
"""Report information extraction."""
self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
+ def report_detected(self, name):
+ self._downloader.write_debug(f'Identified a {name}')
+
def _extract_rss(self, url, video_id, doc):
playlist_title = doc.find('./channel/title').text
playlist_desc_el = doc.find('./channel/description')
@@ -2349,6 +2563,9 @@ class GenericIE(InfoExtractor):
if not next_url:
continue
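+            # Smuggle the feed's <guid> downstream so the resolved entry keeps
+            # a stable video id taken from the feed itself.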
+ if it.find('guid').text is not None:
+ next_url = smuggle_url(next_url, {'force_videoid': it.find('guid').text})
+
def itunes(key):
return xpath_text(
it, xpath_with_ns('./itunes:%s' % key, NS_MAP),
@@ -2540,10 +2757,13 @@ class GenericIE(InfoExtractor):
content_type = head_response.headers.get('Content-Type', '').lower()
m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
if m:
+ self.report_detected('direct video link')
format_id = compat_str(m.group('format_id'))
subtitles = {}
if format_id.endswith('mpegurl'):
formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
+ elif format_id.endswith('mpd') or format_id.endswith('dash+xml'):
+ formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id)
elif format_id == 'f4m':
formats = self._extract_f4m_formats(url, video_id)
else:
@@ -2580,6 +2800,7 @@ class GenericIE(InfoExtractor):
# Is it an M3U playlist?
if first_bytes.startswith(b'#EXTM3U'):
+ self.report_detected('M3U playlist')
info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
self._sort_formats(info_dict['formats'])
return info_dict
@@ -2610,16 +2831,20 @@ class GenericIE(InfoExtractor):
except compat_xml_parse_error:
doc = compat_etree_fromstring(webpage.encode('utf-8'))
if doc.tag == 'rss':
+ self.report_detected('RSS feed')
return self._extract_rss(url, video_id, doc)
elif doc.tag == 'SmoothStreamingMedia':
info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url)
+ self.report_detected('ISM manifest')
self._sort_formats(info_dict['formats'])
return info_dict
elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
smil = self._parse_smil(doc, url, video_id)
+ self.report_detected('SMIL file')
self._sort_formats(smil['formats'])
return smil
elif doc.tag == '{http://xspf.org/ns/0/}playlist':
+ self.report_detected('XSPF playlist')
return self.playlist_result(
self._parse_xspf(
doc, video_id, xspf_url=url,
@@ -2630,10 +2855,12 @@ class GenericIE(InfoExtractor):
doc,
mpd_base_url=full_response.geturl().rpartition('/')[0],
mpd_url=url)
+ self.report_detected('DASH manifest')
self._sort_formats(info_dict['formats'])
return info_dict
elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id)
+ self.report_detected('F4M manifest')
self._sort_formats(info_dict['formats'])
return info_dict
except compat_xml_parse_error:
@@ -2642,6 +2869,7 @@ class GenericIE(InfoExtractor):
# Is it a Camtasia project?
camtasia_res = self._extract_camtasia(url, video_id, webpage)
if camtasia_res is not None:
+ self.report_detected('Camtasia video')
return camtasia_res
# Sometimes embedded video player is hidden behind percent encoding
@@ -2663,10 +2891,8 @@ class GenericIE(InfoExtractor):
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
- video_title = self._og_search_title(
- webpage, default=None) or self._html_search_regex(
- r'(?s)<title>(.*?)</title>', webpage, 'video title',
- default='video')
+ video_title = (self._og_search_title(webpage, default=None)
+ or self._html_extract_title(webpage, 'video title', default='video'))
# Try to detect age limit automatically
age_limit = self._rta_search(webpage)
@@ -2692,6 +2918,8 @@ class GenericIE(InfoExtractor):
'age_limit': age_limit,
})
+ self._downloader.write_debug('Looking for video embeds')
+
# Look for Brightcove Legacy Studio embeds
bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
if bc_urls:
@@ -3002,10 +3230,9 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'), 'Tvigle')
# Look for embedded TED player
- mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
- if mobj is not None:
- return self.url_result(mobj.group('url'), 'TED')
+ ted_urls = TedEmbedIE._extract_urls(webpage)
+ if ted_urls:
+ return self.playlist_from_matches(ted_urls, video_id, video_title, ie=TedEmbedIE.ie_key())
# Look for embedded Ustream videos
ustream_url = UstreamIE._extract_url(webpage)
@@ -3138,12 +3365,6 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'))
- # Look for 5min embeds
- mobj = re.search(
- r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
- if mobj is not None:
- return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
-
# Look for Crooks and Liars embeds
mobj = re.search(
r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
@@ -3189,6 +3410,11 @@ class GenericIE(InfoExtractor):
if onionstudios_url:
return self.url_result(onionstudios_url)
+ # Look for Blogger embeds
+ blogger_urls = BloggerIE._extract_urls(webpage)
+ if blogger_urls:
+ return self.playlist_from_matches(blogger_urls, video_id, video_title, ie=BloggerIE.ie_key())
+
# Look for ViewLift embeds
viewlift_url = ViewLiftEmbedIE._extract_url(webpage)
if viewlift_url:
@@ -3336,6 +3562,24 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
+ # Look for Glomex embeds
+ glomex_urls = list(GlomexEmbedIE._extract_urls(webpage, url))
+ if glomex_urls:
+ return self.playlist_from_matches(
+ glomex_urls, video_id, video_title, ie=GlomexEmbedIE.ie_key())
+
+ # Look for megatv.com embeds
+ megatvcom_urls = list(MegaTVComEmbedIE._extract_urls(webpage))
+ if megatvcom_urls:
+ return self.playlist_from_matches(
+ megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key())
+
+ # Look for ant1news.gr embeds
+ ant1newsgr_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage))
+ if ant1newsgr_urls:
+ return self.playlist_from_matches(
+ ant1newsgr_urls, video_id, video_title, ie=Ant1NewsGrEmbedIE.ie_key())
+
# Look for WashingtonPost embeds
wapo_urls = WashingtonPostIE._extract_urls(webpage)
if wapo_urls:
@@ -3482,9 +3726,45 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key())
+ # Look for (tvopen|ethnos).gr embeds
+ tvopengr_urls = list(TVOpenGrEmbedIE._extract_urls(webpage))
+ if tvopengr_urls:
+ return self.playlist_from_matches(tvopengr_urls, video_id, video_title, ie=TVOpenGrEmbedIE.ie_key())
+
+ # Look for ert.gr webtv embeds
+ ertwebtv_urls = list(ERTWebtvEmbedIE._extract_urls(webpage))
+ if len(ertwebtv_urls) == 1:
+ return self.url_result(self._proto_relative_url(ertwebtv_urls[0]), video_title=video_title, url_transparent=True)
+ elif ertwebtv_urls:
+ return self.playlist_from_matches(ertwebtv_urls, video_id, video_title, ie=ERTWebtvEmbedIE.ie_key())
+
+ tvp_urls = TVPEmbedIE._extract_urls(webpage)
+ if tvp_urls:
+ return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key())
+
+ # Look for MainStreaming embeds
+ mainstreaming_urls = MainStreamingIE._extract_urls(webpage)
+ if mainstreaming_urls:
+ return self.playlist_from_matches(mainstreaming_urls, video_id, video_title, ie=MainStreamingIE.ie_key())
+
+ # Look for Gfycat Embeds
+ gfycat_urls = GfycatIE._extract_urls(webpage)
+ if gfycat_urls:
+ return self.playlist_from_matches(gfycat_urls, video_id, video_title, ie=GfycatIE.ie_key())
+
+ panopto_urls = PanoptoBaseIE._extract_urls(webpage)
+ if panopto_urls:
+ return self.playlist_from_matches(panopto_urls, video_id, video_title)
+
+ # Look for Ruutu embeds
+ ruutu_url = RuutuIE._extract_url(webpage)
+ if ruutu_url:
+ return self.url_result(ruutu_url, RuutuIE)
+
# Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries:
+ self.report_detected('HTML5 media')
if len(entries) == 1:
entries[0].update({
'id': video_id,
@@ -3503,9 +3783,18 @@ class GenericIE(InfoExtractor):
jwplayer_data = self._find_jwplayer_data(
webpage, video_id, transform_source=js_to_json)
if jwplayer_data:
+ if isinstance(jwplayer_data.get('playlist'), str):
+ self.report_detected('JW Player playlist')
+ return {
+ **info_dict,
+ '_type': 'url',
+ 'ie_key': JWPlatformIE.ie_key(),
+ 'url': jwplayer_data['playlist'],
+ }
try:
info = self._parse_jwplayer_data(
jwplayer_data, video_id, require_title=False, base_url=url)
+ self.report_detected('JW Player data')
return merge_dicts(info, info_dict)
except ExtractorError:
# See https://github.com/ytdl-org/youtube-dl/pull/16735
@@ -3513,11 +3802,12 @@ class GenericIE(InfoExtractor):
# Video.js embed
mobj = re.search(
- r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;',
+ r'(?s)\bvideojs\s*\(.+?([a-zA-Z0-9_$]+)\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;',
webpage)
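+        # The added capture group grabs the player variable name, e.g. 'player'
+        # in the (illustrative) snippet "videojs('#v'); player.src([...]);", so
+        # addRemoteTextTrack() calls on the same object can be located below.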
if mobj is not None:
+ varname = mobj.group(1)
sources = self._parse_json(
- mobj.group(1), video_id, transform_source=js_to_json,
+ mobj.group(2), video_id, transform_source=js_to_json,
fatal=False) or []
if not isinstance(sources, list):
sources = [sources]
@@ -3554,16 +3844,40 @@ class GenericIE(InfoExtractor):
'Referer': full_response.geturl(),
},
})
+ # https://docs.videojs.com/player#addRemoteTextTrack
+ # https://html.spec.whatwg.org/multipage/media.html#htmltrackelement
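+            # Illustrative call this matches:
+            #   player.addRemoteTextTrack({kind: 'captions', src: 'en.vtt',
+            #                              srclang: 'en', label: 'English'}, false)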
+ for sub_match in re.finditer(rf'(?s){re.escape(varname)}' r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage):
+ sub = self._parse_json(
+ sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {}
+ src = str_or_none(sub.get('src'))
+ if not src:
+ continue
+ subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({
+ 'url': compat_urlparse.urljoin(url, src),
+ 'name': sub.get('label'),
+ 'http_headers': {
+ 'Referer': full_response.geturl(),
+ },
+ })
if formats or subtitles:
+ self.report_detected('video.js embed')
self._sort_formats(formats)
info_dict['formats'] = formats
info_dict['subtitles'] = subtitles
return info_dict
# Looking for http://schema.org/VideoObject
- json_ld = self._search_json_ld(
- webpage, video_id, default={}, expected_type='VideoObject')
- if json_ld.get('url'):
+ json_ld = self._search_json_ld(webpage, video_id, default={})
+ if json_ld.get('url') not in (url, None):
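+            # A JSON-LD url identical to the page url would only re-enter this
+            # extractor, so such entries are ignored.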
+ self.report_detected('JSON LD')
+ if determine_ext(json_ld['url']) == 'm3u8':
+ json_ld['formats'], json_ld['subtitles'] = self._extract_m3u8_formats_and_subtitles(
+ json_ld['url'], video_id, 'mp4')
+ json_ld.pop('url')
+ self._sort_formats(json_ld['formats'])
+ else:
+ json_ld['_type'] = 'url_transparent'
+ json_ld['url'] = smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True})
return merge_dicts(json_ld, info_dict)
def check_video(vurl):
@@ -3572,15 +3886,17 @@ class GenericIE(InfoExtractor):
if RtmpIE.suitable(vurl):
return True
vpath = compat_urlparse.urlparse(vurl).path
- vext = determine_ext(vpath)
- return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml')
+ vext = determine_ext(vpath, None)
+ return vext not in (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml')
def filter_video(urls):
return list(filter(check_video, urls))
# Start with something easy: JW Player in SWFObject
found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
- if not found:
+ if found:
+            self.report_detected('JW Player in SWFObject')
+ else:
# Look for gorilla-vid style embedding
found = filter_video(re.findall(r'''(?sx)
(?:
@@ -3590,10 +3906,13 @@ class GenericIE(InfoExtractor):
)
.*?
['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
+ if found:
+ self.report_detected('JW Player embed')
if not found:
# Look for generic KVS player
- found = re.search(r'<script [^>]*?src="https://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>', webpage)
+ found = re.search(r'<script [^>]*?src="https?://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>', webpage)
if found:
+            self.report_detected('KVS Player')
if found.group('maj_ver') not in ['4', '5']:
self.report_warning('Untested major version (%s) in player engine--Download may fail.' % found.group('ver'))
flashvars = re.search(r'(?ms)<script.*?>.*?var\s+flashvars\s*=\s*(\{.*?\});.*?</script>', webpage)
@@ -3613,20 +3932,21 @@ class GenericIE(InfoExtractor):
protocol, _, _ = url.partition('/')
thumbnail = protocol + thumbnail
+ url_keys = list(filter(re.compile(r'video_url|video_alt_url\d*').fullmatch, flashvars.keys()))
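+                    # flashvars name the renditions video_url, video_alt_url,
+                    # video_alt_url2, ... each with a matching <key>_text label
+                    # (e.g. '720p') from which parse_resolution() can recover a height.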
formats = []
- for key in ('video_url', 'video_alt_url', 'video_alt_url2'):
- if key in flashvars and '/get_file/' in flashvars[key]:
- next_format = {
- 'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']),
- 'format_id': flashvars.get(key + '_text', key),
- 'ext': 'mp4',
- }
- height = re.search(r'%s_(\d+)p\.mp4(?:/[?].*)?$' % flashvars['video_id'], flashvars[key])
- if height:
- next_format['height'] = int(height.group(1))
- else:
- next_format['quality'] = 1
- formats.append(next_format)
+ for key in url_keys:
+ if '/get_file/' not in flashvars[key]:
+ continue
+ format_id = flashvars.get(f'{key}_text', key)
+ formats.append({
+ 'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']),
+ 'format_id': format_id,
+ 'ext': 'mp4',
+ **(parse_resolution(format_id) or parse_resolution(flashvars[key]))
+ })
+ if not formats[-1].get('height'):
+ formats[-1]['quality'] = 1
+
self._sort_formats(formats)
return {
@@ -3639,10 +3959,14 @@ class GenericIE(InfoExtractor):
if not found:
# Broaden the search a little bit
found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
+ if found:
+ self.report_detected('video file')
if not found:
# Broaden the findall a little bit: JWPlayer JS loader
found = filter_video(re.findall(
r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
+ if found:
+ self.report_detected('JW Player JS loader')
if not found:
# Flow player
found = filter_video(re.findall(r'''(?xs)
@@ -3651,10 +3975,14 @@ class GenericIE(InfoExtractor):
\s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
["']?url["']?\s*:\s*["']([^"']+)["']
''', webpage))
+ if found:
+ self.report_detected('Flow Player')
if not found:
# Cinerama player
found = re.findall(
r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
+ if found:
+ self.report_detected('Cinerama player')
if not found:
# Try to find twitter cards info
# twitter:player:stream should be checked before twitter:player since
@@ -3662,6 +3990,8 @@ class GenericIE(InfoExtractor):
# https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser)
found = filter_video(re.findall(
r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
+ if found:
+ self.report_detected('Twitter card')
if not found:
# We look for Open Graph info:
# We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
@@ -3669,6 +3999,8 @@ class GenericIE(InfoExtractor):
# We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
if m_video_type is not None:
found = filter_video(re.findall(r'<meta.*?property="og:(?:video|audio)".*?content="(.*?)"', webpage))
+ if found:
+ self.report_detected('Open Graph video info')
if not found:
REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
found = re.search(
@@ -3700,6 +4032,7 @@ class GenericIE(InfoExtractor):
# https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser)
embed_url = self._html_search_meta('twitter:player', webpage, default=None)
if embed_url and embed_url != url:
+ self.report_detected('twitter:player iframe')
return self.url_result(embed_url)
if not found:
@@ -3719,12 +4052,16 @@ class GenericIE(InfoExtractor):
# here's a fun little line of code for you:
video_id = os.path.splitext(video_id)[0]
+ headers = {
+ 'referer': full_response.geturl()
+ }
entry_info_dict = {
'id': video_id,
'uploader': video_uploader,
'title': video_title,
'age_limit': age_limit,
+ 'http_headers': headers,
}
if RtmpIE.suitable(video_url):
@@ -3742,11 +4079,11 @@ class GenericIE(InfoExtractor):
elif ext == 'xspf':
return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
elif ext == 'm3u8':
- entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4')
+ entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers)
elif ext == 'mpd':
- entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id)
+ entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers)
elif ext == 'f4m':
- entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id)
+ entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers)
elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url:
# Just matching .ism/manifest is not enough to be reliably sure
# whether it's actually an ISM manifest or some other streaming
diff --git a/hypervideo_dl/extractor/gettr.py b/hypervideo_dl/extractor/gettr.py
index aa50b2f..327a4d0 100644
--- a/hypervideo_dl/extractor/gettr.py
+++ b/hypervideo_dl/extractor/gettr.py
@@ -3,22 +3,30 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
+ bool_or_none,
ExtractorError,
dict_get,
float_or_none,
int_or_none,
- remove_end,
str_or_none,
+ traverse_obj,
try_get,
url_or_none,
urljoin,
)
-class GettrIE(InfoExtractor):
- _VALID_URL = r'https?://(www\.)?gettr\.com/post/(?P<id>[a-z0-9]+)'
+class GettrBaseIE(InfoExtractor):
+ _BASE_REGEX = r'https?://(www\.)?gettr\.com/'
_MEDIA_BASE_URL = 'https://media.gettr.com/'
+ def _call_api(self, path, video_id, *args, **kwargs):
+ return self._download_json(urljoin('https://api.gettr.com/u/', path), video_id, *args, **kwargs)['result']
+
+
+class GettrIE(GettrBaseIE):
+ _VALID_URL = GettrBaseIE._BASE_REGEX + r'post/(?P<id>[a-z0-9]+)'
+
_TESTS = [{
'url': 'https://www.gettr.com/post/pcf6uv838f',
'info_dict': {
@@ -28,9 +36,11 @@ class GettrIE(InfoExtractor):
'ext': 'mp4',
'uploader': 'EpochTV',
'uploader_id': 'epochtv',
+ 'upload_date': '20210927',
'thumbnail': r're:^https?://.+/out\.jpg',
- 'timestamp': 1632782451058,
+ 'timestamp': 1632782451.058,
'duration': 58.5585,
+ 'tags': ['hornofafrica', 'explorations'],
}
}, {
'url': 'https://gettr.com/post/p4iahp',
@@ -41,43 +51,69 @@ class GettrIE(InfoExtractor):
'ext': 'mp4',
'uploader': 'Neues Forum Freiheit',
'uploader_id': 'nf_freiheit',
+ 'upload_date': '20210718',
'thumbnail': r're:^https?://.+/out\.jpg',
- 'timestamp': 1626594455017,
+ 'timestamp': 1626594455.017,
'duration': 23,
+ 'tags': 'count:12',
}
+ }, {
+ # quote post
+ 'url': 'https://gettr.com/post/pxn5b743a9',
+ 'only_matching': True,
+ }, {
+ # quote with video
+ 'url': 'https://gettr.com/post/pxtiiz5ca2',
+ 'only_matching': True,
+ }, {
+ # streaming embed
+ 'url': 'https://gettr.com/post/pxlu8p3b13',
+ 'only_matching': True,
+ }, {
+ # youtube embed
+ 'url': 'https://gettr.com/post/pv6wp9e24c',
+ 'only_matching': True,
+ 'add_ie': ['Youtube'],
}]
def _real_extract(self, url):
post_id = self._match_id(url)
webpage = self._download_webpage(url, post_id)
+ api_data = self._call_api('post/%s?incl="poststats|userinfo"' % post_id, post_id)
+
+ post_data = api_data.get('data')
+ user_data = try_get(api_data, lambda x: x['aux']['uinf'][post_data['uid']], dict) or {}
+
+ vid = post_data.get('vid')
+ ovid = post_data.get('ovid')
- api_data = self._download_json(
- 'https://api.gettr.com/u/post/%s?incl="poststats|userinfo"' % post_id, post_id)
+ if post_data.get('p_type') == 'stream':
+ return self.url_result(f'https://gettr.com/streaming/{post_id}', ie='GettrStreaming', video_id=post_id)
- post_data = try_get(api_data, lambda x: x['result']['data'])
- user_data = try_get(api_data, lambda x: x['result']['aux']['uinf'][post_data['uid']]) or {}
+ if not (ovid or vid):
+ embed_url = url_or_none(post_data.get('prevsrc'))
+ shared_post_id = traverse_obj(api_data, ('aux', 'shrdpst', '_id'), ('data', 'rpstIds', 0), expected_type=str)
- if post_data.get('nfound'):
- raise ExtractorError(post_data.get('txt'), expected=True)
+ if embed_url:
+ return self.url_result(embed_url)
+ elif shared_post_id:
+ return self.url_result(f'https://gettr.com/post/{shared_post_id}', ie='Gettr', video_id=shared_post_id)
+ else:
+ raise ExtractorError('There\'s no video in this post.')
title = description = str_or_none(
post_data.get('txt') or self._og_search_description(webpage))
uploader = str_or_none(
user_data.get('nickname')
- or remove_end(self._og_search_title(webpage), ' on GETTR'))
+ or self._search_regex(r'^(.+?) on GETTR', self._og_search_title(webpage, default=''), 'uploader', fatal=False))
+
if uploader:
title = '%s - %s' % (uploader, title)
- if not dict_get(post_data, ['vid', 'ovid']):
- raise ExtractorError('There\'s no video in this post.')
-
- vid = post_data.get('vid')
- ovid = post_data.get('ovid')
-
- formats = self._extract_m3u8_formats(
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
urljoin(self._MEDIA_BASE_URL, vid), post_id, 'mp4',
- entry_protocol='m3u8_native', m3u8_id='hls') if vid else []
+ entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) if vid else ([], {})
if ovid:
formats.append({
@@ -86,8 +122,6 @@ class GettrIE(InfoExtractor):
'ext': 'mp4',
'width': int_or_none(post_data.get('vid_wid')),
'height': int_or_none(post_data.get('vid_hgt')),
- 'source_preference': 1,
- 'quality': 1,
})
self._sort_formats(formats)
@@ -96,15 +130,84 @@ class GettrIE(InfoExtractor):
'id': post_id,
'title': title,
'description': description,
- 'thumbnail': url_or_none(
- urljoin(self._MEDIA_BASE_URL, post_data.get('main'))
- or self._og_search_thumbnail(webpage)),
- 'timestamp': int_or_none(post_data.get('cdate')),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'uploader': uploader,
'uploader_id': str_or_none(
dict_get(user_data, ['_id', 'username'])
or post_data.get('uid')),
- 'uploader': uploader,
- 'formats': formats,
+ 'thumbnail': url_or_none(
+ urljoin(self._MEDIA_BASE_URL, post_data.get('main'))
+ or self._html_search_meta(['og:image', 'image'], webpage, 'thumbnail', fatal=False)),
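+ # cdate/udate are unix timestamps in milliseconds; scale=1000 converts to seconds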
+ 'timestamp': float_or_none(dict_get(post_data, ['cdate', 'udate']), scale=1000),
'duration': float_or_none(post_data.get('vid_dur')),
'tags': post_data.get('htgs'),
}
+
+
+class GettrStreamingIE(GettrBaseIE):
+ _VALID_URL = GettrBaseIE._BASE_REGEX + r'streaming/(?P<id>[a-z0-9]+)'
+
+ _TESTS = [{
+ 'url': 'https://gettr.com/streaming/psoiulc122',
+ 'info_dict': {
+ 'id': 'psoiulc122',
+ 'ext': 'mp4',
+ 'description': 'md5:56bca4b8f48f1743d9fd03d49c723017',
+ 'view_count': int,
+ 'uploader': 'Corona Investigative Committee',
+ 'uploader_id': 'coronacommittee',
+ 'duration': 5180.184,
+ 'thumbnail': r're:^https?://.+',
+ 'title': 'Day 1: Opening Session of the Grand Jury Proceeding',
+ 'timestamp': 1644080997.164,
+ 'upload_date': '20220205',
+ }
+ }, {
+ 'url': 'https://gettr.com/streaming/psfmeefcc1',
+ 'info_dict': {
+ 'id': 'psfmeefcc1',
+ 'ext': 'mp4',
+ 'title': 'Session 90: "The Virus Of Power"',
+ 'view_count': int,
+ 'uploader_id': 'coronacommittee',
+ 'description': 'md5:98986acdf656aa836bf36f9c9704c65b',
+ 'uploader': 'Corona Investigative Committee',
+ 'thumbnail': r're:^https?://.+',
+ 'duration': 21872.507,
+ 'timestamp': 1643976662.858,
+ 'upload_date': '20220204',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ video_info = self._call_api('live/join/%s' % video_id, video_id, data={})
+
+ live_info = video_info['broadcast']
+ live_url = url_or_none(live_info.get('url'))
+
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ live_url, video_id, ext='mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) if live_url else ([], {})
+
+ thumbnails = [{
+ 'url': urljoin(self._MEDIA_BASE_URL, thumbnail),
+ } for thumbnail in try_get(video_info, lambda x: x['postData']['imgs'], list) or []]
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': try_get(video_info, lambda x: x['postData']['ttl'], str),
+ 'description': try_get(video_info, lambda x: x['postData']['dsc'], str),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'thumbnails': thumbnails,
+ 'uploader': try_get(video_info, lambda x: x['liveHostInfo']['nickname'], str),
+ 'uploader_id': try_get(video_info, lambda x: x['liveHostInfo']['_id'], str),
+ 'view_count': int_or_none(live_info.get('viewsCount')),
+ 'timestamp': float_or_none(live_info.get('startAt'), scale=1000),
+ 'duration': float_or_none(live_info.get('duration'), scale=1000),
+ 'is_live': bool_or_none(live_info.get('isLive')),
+ }
diff --git a/hypervideo_dl/extractor/gfycat.py b/hypervideo_dl/extractor/gfycat.py
index 18a30fe..2ad03e2 100644
--- a/hypervideo_dl/extractor/gfycat.py
+++ b/hypervideo_dl/extractor/gfycat.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
int_or_none,
@@ -11,7 +13,7 @@ from ..utils import (
class GfycatIE(InfoExtractor):
- _VALID_URL = r'https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\.]+)'
+ _VALID_URL = r'(?i)https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\."\']+)'
_TESTS = [{
'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher',
'info_dict': {
@@ -24,9 +26,10 @@ class GfycatIE(InfoExtractor):
'duration': 10.4,
'view_count': int,
'like_count': int,
- 'dislike_count': int,
'categories': list,
'age_limit': 0,
+ 'uploader_id': 'anonymous',
+ 'description': '',
}
}, {
'url': 'http://gfycat.com/ifr/JauntyTimelyAmazontreeboa',
@@ -40,9 +43,27 @@ class GfycatIE(InfoExtractor):
'duration': 3.52,
'view_count': int,
'like_count': int,
- 'dislike_count': int,
'categories': list,
'age_limit': 0,
+ 'uploader_id': 'anonymous',
+ 'description': '',
+ }
+ }, {
+ 'url': 'https://gfycat.com/alienatedsolidgreathornedowl',
+ 'info_dict': {
+ 'id': 'alienatedsolidgreathornedowl',
+ 'ext': 'mp4',
+ 'upload_date': '20211226',
+ 'uploader_id': 'reactions',
+ 'timestamp': 1640536930,
+ 'like_count': int,
+ 'description': '',
+ 'title': 'Ingrid Michaelson, Zooey Deschanel - Merry Christmas Happy New Year',
+ 'categories': list,
+ 'age_limit': 0,
+ 'duration': 2.9583333333333335,
+ 'uploader': 'Reaction GIFs',
+ 'view_count': int,
}
}, {
'url': 'https://gfycat.com/ru/RemarkableDrearyAmurstarfish',
@@ -59,8 +80,19 @@ class GfycatIE(InfoExtractor):
}, {
'url': 'https://giant.gfycat.com/acceptablehappygoluckyharborporpoise.mp4',
'only_matching': True
+ }, {
+ 'url': 'http://gfycat.com/IFR/JauntyTimelyAmazontreeboa',
+ 'only_matching': True
}]
+ @staticmethod
+ def _extract_urls(webpage):
+ return [
+ mobj.group('url')
+ for mobj in re.finditer(
+ r'<(?:iframe|source)[^>]+\bsrc=["\'](?P<url>%s)' % GfycatIE._VALID_URL,
+ webpage)]
+
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -74,7 +106,7 @@ class GfycatIE(InfoExtractor):
title = gfy.get('title') or gfy['gfyName']
description = gfy.get('description')
timestamp = int_or_none(gfy.get('createDate'))
- uploader = gfy.get('userName')
+ uploader = gfy.get('userName') or gfy.get('username')
view_count = int_or_none(gfy.get('views'))
like_count = int_or_none(gfy.get('likes'))
dislike_count = int_or_none(gfy.get('dislikes'))
@@ -114,7 +146,8 @@ class GfycatIE(InfoExtractor):
'title': title,
'description': description,
'timestamp': timestamp,
- 'uploader': uploader,
+ 'uploader': gfy.get('userDisplayName') or uploader,
+ 'uploader_id': uploader,
'duration': duration,
'view_count': view_count,
'like_count': like_count,
diff --git a/hypervideo_dl/extractor/glide.py b/hypervideo_dl/extractor/glide.py
index d94dfbf..12af859 100644
--- a/hypervideo_dl/extractor/glide.py
+++ b/hypervideo_dl/extractor/glide.py
@@ -23,9 +23,7 @@ class GlideIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(
- r'<title>(.+?)</title>', webpage,
- 'title', default=None) or self._og_search_title(webpage)
+ title = self._html_extract_title(webpage, default=None) or self._og_search_title(webpage)
video_url = self._proto_relative_url(self._search_regex(
r'<source[^>]+src=(["\'])(?P<url>.+?)\1',
webpage, 'video URL', default=None,
diff --git a/hypervideo_dl/extractor/globo.py b/hypervideo_dl/extractor/globo.py
index a3f0241..f6aaae1 100644
--- a/hypervideo_dl/extractor/globo.py
+++ b/hypervideo_dl/extractor/globo.py
@@ -12,6 +12,7 @@ from ..compat import (
compat_str,
)
from ..utils import (
+ HEADRequest,
ExtractorError,
float_or_none,
orderedSet,
@@ -67,11 +68,28 @@ class GloboIE(InfoExtractor):
}, {
'url': 'globo:3607726',
'only_matching': True,
+ }, {
+ 'url': 'https://globoplay.globo.com/v/10248083/',
+ 'info_dict': {
+ 'id': '10248083',
+ 'ext': 'mp4',
+ 'title': 'Melhores momentos: Equador 1 x 1 Brasil pelas Eliminatórias da Copa do Mundo 2022',
+ 'duration': 530.964,
+ 'uploader': 'SporTV',
+ 'uploader_id': '698',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
video_id = self._match_id(url)
+ self._request_webpage(
+ HEADRequest('https://globo-ab.globo.com/v2/selected-alternatives?experiments=player-isolated-experiment-02&skipImpressions=true'),
+ video_id, 'Getting cookies')
+
video = self._download_json(
'http://api.globovideos.com/videos/%s/playlist' % video_id,
video_id)['videos'][0]
@@ -82,7 +100,7 @@ class GloboIE(InfoExtractor):
formats = []
security = self._download_json(
- 'https://playback.video.globo.com/v1/video-session', video_id, 'Downloading security hash for %s' % video_id,
+ 'https://playback.video.globo.com/v2/video-session', video_id, 'Downloading security hash for %s' % video_id,
headers={'content-type': 'application/json'}, data=json.dumps({
"player_type": "desktop",
"video_id": video_id,
@@ -92,7 +110,9 @@ class GloboIE(InfoExtractor):
"tz": "-3.0:00"
}).encode())
- security_hash = security['source']['token']
+ self._request_webpage(HEADRequest(security['sources'][0]['url_template']), video_id, 'Getting locksession cookie')
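+ # The HEAD request against the template URL sets the locksession cookie
+ # that the signed playback URL appears to require.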
+
+ security_hash = security['sources'][0]['token']
if not security_hash:
message = security.get('message')
if message:
@@ -115,15 +135,15 @@ class GloboIE(InfoExtractor):
md5_data = (received_md5 + padded_sign_time + '0xAC10FD').encode()
signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=')
signed_hash = hash_prefix + padded_sign_time + signed_md5
- source = security['source']['url_parts']
+ source = security['sources'][0]['url_parts']
resource_url = source['scheme'] + '://' + source['domain'] + source['path']
signed_url = '%s?h=%s&k=html5&a=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A')
- formats.extend(self._extract_m3u8_formats(
- signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+ fmts, subtitles = self._extract_m3u8_formats_and_subtitles(
+ signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+ formats.extend(fmts)
self._sort_formats(formats)
- subtitles = {}
for resource in video['resources']:
if resource.get('type') == 'subtitle':
subtitles.setdefault(resource.get('language') or 'por', []).append({
@@ -166,6 +186,7 @@ class GloboArticleIE(InfoExtractor):
r'\bvideosIDs\s*:\s*["\']?(\d{7,})',
r'\bdata-id=["\'](\d{7,})',
r'<div[^>]+\bid=["\'](\d{7,})',
+ r'<bs-player[^>]+\bvideoid=["\'](\d{8,})',
]
_TESTS = [{
@@ -193,6 +214,14 @@ class GloboArticleIE(InfoExtractor):
}, {
'url': 'http://oglobo.globo.com/rio/a-amizade-entre-um-entregador-de-farmacia-um-piano-19946271',
'only_matching': True,
+ }, {
+ 'url': 'https://ge.globo.com/video/ta-na-area-como-foi-assistir-ao-jogo-do-palmeiras-que-a-globo-nao-passou-10287094.ghtml',
+ 'info_dict': {
+ 'id': 'ta-na-area-como-foi-assistir-ao-jogo-do-palmeiras-que-a-globo-nao-passou-10287094',
+ 'title': 'Tá na Área: como foi assistir ao jogo do Palmeiras que a Globo não passou',
+ 'description': 'md5:2d089d036c4c9675117d3a56f8c61739',
+ },
+ 'playlist_count': 1,
}]
@classmethod
@@ -208,6 +237,6 @@ class GloboArticleIE(InfoExtractor):
entries = [
self.url_result('globo:%s' % video_id, GloboIE.ie_key())
for video_id in orderedSet(video_ids)]
- title = self._og_search_title(webpage, fatal=False)
+ title = self._og_search_title(webpage)
description = self._html_search_meta('description', webpage)
return self.playlist_result(entries, display_id, title, description)
diff --git a/hypervideo_dl/extractor/glomex.py b/hypervideo_dl/extractor/glomex.py
new file mode 100644
index 0000000..d9ef433
--- /dev/null
+++ b/hypervideo_dl/extractor/glomex.py
@@ -0,0 +1,220 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import urllib.parse
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ extract_attributes,
+ ExtractorError,
+ int_or_none,
+ parse_qs,
+ smuggle_url,
+ unescapeHTML,
+ unsmuggle_url,
+)
+
+
+class GlomexBaseIE(InfoExtractor):
+ _DEFAULT_ORIGIN_URL = 'https://player.glomex.com/'
+ _API_URL = 'https://integration-cloudfront-eu-west-1.mes.glomex.cloud/'
+
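+ # The referring page ('origin') is passed to the API as current_url, so it
+ # is smuggled into player URLs and recovered again at extraction time.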
+ @staticmethod
+ def _smuggle_origin_url(url, origin_url):
+ if origin_url is None:
+ return url
+ return smuggle_url(url, {'origin': origin_url})
+
+ @classmethod
+ def _unsmuggle_origin_url(cls, url, fallback_origin_url=None):
+ defaults = {'origin': fallback_origin_url or cls._DEFAULT_ORIGIN_URL}
+ unsmuggled_url, data = unsmuggle_url(url, default=defaults)
+ return unsmuggled_url, data['origin']
+
+ def _get_videoid_type(self, video_id):
+ _VIDEOID_TYPES = {
+ 'v': 'video',
+ 'pl': 'playlist',
+ 'rl': 'related videos playlist',
+ 'cl': 'curated playlist',
+ }
+ prefix = video_id.split('-')[0]
+ return _VIDEOID_TYPES.get(prefix, 'unknown type')
+
+ def _download_api_data(self, video_id, integration, current_url=None):
+ query = {
+ 'integration_id': integration,
+ 'playlist_id': video_id,
+ 'current_url': current_url or self._DEFAULT_ORIGIN_URL,
+ }
+ video_id_type = self._get_videoid_type(video_id)
+ return self._download_json(
+ self._API_URL,
+ video_id, 'Downloading %s JSON' % video_id_type,
+ 'Unable to download %s JSON' % video_id_type,
+ query=query)
+
+ def _download_and_extract_api_data(self, video_id, integration, current_url):
+ api_data = self._download_api_data(video_id, integration, current_url)
+ videos = api_data['videos']
+ if not videos:
+ raise ExtractorError('no videos found for %s' % video_id)
+ videos = [self._extract_api_data(video, video_id) for video in videos]
+ return videos[0] if len(videos) == 1 else self.playlist_result(videos, video_id)
+
+ def _extract_api_data(self, video, video_id):
+ if video.get('error_code') == 'contentGeoblocked':
+ self.raise_geo_restricted(countries=video['geo_locations'])
+
+ formats, subs = [], {}
+ for format_id, format_url in video['source'].items():
+ ext = determine_ext(format_url)
+ if ext == 'm3u8':
+ formats_, subs_ = self._extract_m3u8_formats_and_subtitles(
+ format_url, video_id, 'mp4', m3u8_id=format_id,
+ fatal=False)
+ formats.extend(formats_)
+ self._merge_subtitles(subs_, target=subs)
+ else:
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ })
+ if video.get('language'):
+ for fmt in formats:
+ fmt['language'] = video['language']
+ self._sort_formats(formats)
+
+ images = (video.get('images') or []) + [video.get('image') or {}]
+ thumbnails = [{
+ 'id': image.get('id'),
+ 'url': f'{image["url"]}/profile:player-960x540',
+ 'width': 960,
+ 'height': 540,
+ } for image in images if image.get('url')]
+ self._remove_duplicate_formats(thumbnails)
+
+ return {
+ 'id': video.get('clip_id') or video_id,
+ 'title': video.get('title'),
+ 'description': video.get('description'),
+ 'thumbnails': thumbnails,
+ 'duration': int_or_none(video.get('clip_duration')),
+ 'timestamp': video.get('created_at'),
+ 'formats': formats,
+ 'subtitles': subs,
+ }
+
+
+class GlomexIE(GlomexBaseIE):
+ IE_NAME = 'glomex'
+ IE_DESC = 'Glomex videos'
+ _VALID_URL = r'https?://video\.glomex\.com/[^/]+/(?P<id>v-[^-]+)'
+ _INTEGRATION_ID = '19syy24xjn1oqlpc'
+
+ _TESTS = [{
+ 'url': 'https://video.glomex.com/sport/v-cb24uwg77hgh-nach-2-0-sieg-guardiola-mit-mancity-vor-naechstem-titel',
+ 'md5': 'cec33a943c4240c9cb33abea8c26242e',
+ 'info_dict': {
+ 'id': 'v-cb24uwg77hgh',
+ 'ext': 'mp4',
+ 'title': 'md5:38a90cedcfadd72982c81acf13556e0c',
+ 'description': 'md5:1ea6b6caff1443fcbbba159e432eedb8',
+ 'duration': 29600,
+ 'timestamp': 1619895017,
+ 'upload_date': '20210501',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ return self.url_result(
+ GlomexEmbedIE.build_player_url(video_id, self._INTEGRATION_ID, url),
+ GlomexEmbedIE.ie_key(), video_id)
+
+
+class GlomexEmbedIE(GlomexBaseIE):
+ IE_NAME = 'glomex:embed'
+ IE_DESC = 'Glomex embedded videos'
+ _BASE_PLAYER_URL = '//player.glomex.com/integration/1/iframe-player.html'
+ _BASE_PLAYER_URL_RE = re.escape(_BASE_PLAYER_URL).replace('/1/', r'/[^/]/')
+ _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?playlistId=(?P<id>[^#&]+)'
+
+ _TESTS = [{
+ 'url': 'https://player.glomex.com/integration/1/iframe-player.html?integrationId=4059a013k56vb2yd&playlistId=v-cfa6lye0dkdd-sf',
+ 'md5': '68f259b98cc01918ac34180142fce287',
+ 'info_dict': {
+ 'id': 'v-cfa6lye0dkdd-sf',
+ 'ext': 'mp4',
+ 'timestamp': 1635337199,
+ 'duration': 133080,
+ 'upload_date': '20211027',
+ 'description': 'md5:e741185fc309310ff5d0c789b437be66',
+ 'title': 'md5:35647293513a6c92363817a0fb0a7961',
+ },
+ }, {
+ 'url': 'https://player.glomex.com/integration/1/iframe-player.html?origin=fullpage&integrationId=19syy24xjn1oqlpc&playlistId=rl-vcb49w1fb592p&playlistIndex=0',
+ 'info_dict': {
+ 'id': 'rl-vcb49w1fb592p',
+ },
+ 'playlist_count': 100,
+ }, {
+ 'url': 'https://player.glomex.com/integration/1/iframe-player.html?playlistId=cl-bgqaata6aw8x&integrationId=19syy24xjn1oqlpc',
+ 'info_dict': {
+ 'id': 'cl-bgqaata6aw8x',
+ },
+ 'playlist_mincount': 2,
+ }]
+
+ @classmethod
+ def build_player_url(cls, video_id, integration, origin_url=None):
+ query_string = urllib.parse.urlencode({
+ 'playlistId': video_id,
+ 'integrationId': integration,
+ })
+ return cls._smuggle_origin_url(f'https:{cls._BASE_PLAYER_URL}?{query_string}', origin_url)
+
+ @classmethod
+ def _extract_urls(cls, webpage, origin_url):
+ # https://docs.glomex.com/publisher/video-player-integration/javascript-api/
+ quot_re = r'["\']'
+
+ regex = fr'''(?x)
+ <iframe[^>]+?src=(?P<q>{quot_re})(?P<url>
+ (?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=q)).)+
+ )(?P=q)'''
+ for mobj in re.finditer(regex, webpage):
+ url = unescapeHTML(mobj.group('url'))
+ if cls.suitable(url):
+ yield cls._smuggle_origin_url(url, origin_url)
+
+ regex = fr'''(?x)
+ <glomex-player [^>]+?>|
+ <div[^>]* data-glomex-player=(?P<q>{quot_re})true(?P=q)[^>]*>'''
+ for mobj in re.finditer(regex, webpage):
+ attrs = extract_attributes(mobj.group(0))
+ if attrs.get('data-integration-id') and attrs.get('data-playlist-id'):
+ yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], origin_url)
+
+ # naive parsing of inline scripts for hard-coded integration parameters
+ regex = fr'''(?x)
+ (?P<is_js>dataset\.)?%s\s*(?(is_js)=|:)\s*
+ (?P<q>{quot_re})(?P<id>(?:(?!(?P=q)).)+)(?P=q)\s'''
+ for mobj in re.finditer(r'(?x)<script[^<]*>.+?</script>', webpage):
+ script = mobj.group(0)
+ integration_id = re.search(regex % 'integrationId', script)
+ if not integration_id:
+ continue
+ playlist_id = re.search(regex % 'playlistId', script)
+ if playlist_id:
+ yield cls.build_player_url(playlist_id.group('id'), integration_id.group('id'), origin_url)

+
+ def _real_extract(self, url):
+ url, origin_url = self._unsmuggle_origin_url(url)
+ playlist_id = self._match_id(url)
+ integration = parse_qs(url).get('integrationId', [None])[0]
+ if not integration:
+ raise ExtractorError('No integrationId in URL', expected=True)
+ return self._download_and_extract_api_data(playlist_id, integration, origin_url)
diff --git a/hypervideo_dl/extractor/go.py b/hypervideo_dl/extractor/go.py
index 2ccc6df..f92e166 100644
--- a/hypervideo_dl/extractor/go.py
+++ b/hypervideo_dl/extractor/go.py
@@ -217,6 +217,7 @@ class GoIE(AdobePassIE):
title = video_data['title']
formats = []
+ subtitles = {}
for asset in video_data.get('assets', {}).get('asset', []):
asset_url = asset.get('value')
if not asset_url:
@@ -256,8 +257,10 @@ class GoIE(AdobePassIE):
error_message = ', '.join([error['message'] for error in errors])
raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True)
asset_url += '?' + entitlement['uplynkData']['sessionKey']
- formats.extend(self._extract_m3u8_formats(
- asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False))
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
+ asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
else:
f = {
'format_id': format_id,
@@ -281,7 +284,6 @@ class GoIE(AdobePassIE):
formats.append(f)
self._sort_formats(formats)
- subtitles = {}
for cc in video_data.get('closedcaption', {}).get('src', []):
cc_url = cc.get('value')
if not cc_url:
diff --git a/hypervideo_dl/extractor/gofile.py b/hypervideo_dl/extractor/gofile.py
new file mode 100644
index 0000000..62d778c
--- /dev/null
+++ b/hypervideo_dl/extractor/gofile.py
@@ -0,0 +1,83 @@
+# coding: utf-8
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ try_get
+)
+
+
+class GofileIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?gofile\.io/d/(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'https://gofile.io/d/AMZyDw',
+ 'info_dict': {
+ 'id': 'AMZyDw',
+ },
+ 'playlist_mincount': 2,
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'de571ac1-5edc-42e2-8ec2-bdac83ad4a31',
+ 'filesize': 928116,
+ 'ext': 'mp4',
+ 'title': 'nuuh'
+ }
+ }]
+ }, { # URL to test mixed file types
+ 'url': 'https://gofile.io/d/avt34h',
+ 'info_dict': {
+ 'id': 'avt34h',
+ },
+ 'playlist_mincount': 1,
+ }, { # URL to test no video/audio error
+ 'url': 'https://gofile.io/d/aB03lZ',
+ 'info_dict': {
+ 'id': 'aB03lZ',
+ },
+ 'playlist_count': 0,
+ 'skip': 'No video/audio found at provided URL.',
+ }]
+ _TOKEN = None
+
+ def _real_initialize(self):
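+ # Reuse an existing accountToken cookie when available; otherwise create
+ # a throwaway guest account and persist its token as a cookie.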
+ token = self._get_cookies('https://gofile.io/').get('accountToken')
+ if token:
+ self._TOKEN = token.value
+ return
+
+ account_data = self._download_json(
+ 'https://api.gofile.io/createAccount', None, note='Getting a new guest account')
+ self._TOKEN = account_data['data']['token']
+ self._set_cookie('gofile.io', 'accountToken', self._TOKEN)
+
+ def _entries(self, file_id):
+ files = self._download_json(
+ f'https://api.gofile.io/getContent?contentId={file_id}&token={self._TOKEN}&websiteToken=websiteToken&cache=true',
+ 'Gofile', note='Getting filelist')
+
+ status = files['status']
+ if status != 'ok':
+ raise ExtractorError(f'{self.IE_NAME} said: status {status}', expected=True)
+
+ found_files = False
+ for file in (try_get(files, lambda x: x['data']['contents'], dict) or {}).values():
+ file_type, file_format = file.get('mimetype').split('/', 1)
+ if file_type not in ('video', 'audio') and file_format != 'vnd.mts':
+ continue
+
+ found_files = True
+ file_url = file.get('directLink')
+ if file_url:
+ yield {
+ 'id': file['id'],
+ 'title': file['name'].rsplit('.', 1)[0],
+ 'url': file_url,
+ 'filesize': file.get('size'),
+ 'release_timestamp': file.get('createTime')
+ }
+
+ if not found_files:
+ raise ExtractorError('No video/audio found at provided URL.', expected=True)
+
+ def _real_extract(self, url):
+ file_id = self._match_id(url)
+ return self.playlist_result(self._entries(file_id), playlist_id=file_id)
diff --git a/hypervideo_dl/extractor/googlesearch.py b/hypervideo_dl/extractor/googlesearch.py
index f605c0c..4b8b1bc 100644
--- a/hypervideo_dl/extractor/googlesearch.py
+++ b/hypervideo_dl/extractor/googlesearch.py
@@ -8,36 +8,33 @@ from .common import SearchInfoExtractor
class GoogleSearchIE(SearchInfoExtractor):
IE_DESC = 'Google Video search'
- _MAX_RESULTS = 1000
IE_NAME = 'video.google:search'
_SEARCH_KEY = 'gvsearch'
- _WORKING = False
- _TEST = {
+ _TESTS = [{
'url': 'gvsearch15:python language',
'info_dict': {
'id': 'python language',
'title': 'python language',
},
'playlist_count': 15,
- }
+ }]
+ _PAGE_SIZE = 100
def _search_results(self, query):
for pagenum in itertools.count():
webpage = self._download_webpage(
- 'http://www.google.com/search',
- 'gvsearch:' + query,
- note='Downloading result page %s' % (pagenum + 1),
+ 'http://www.google.com/search', f'gvsearch:{query}',
+ note=f'Downloading result page {pagenum + 1}',
query={
'tbm': 'vid',
'q': query,
- 'start': pagenum * 10,
+ 'start': pagenum * self._PAGE_SIZE,
+ 'num': self._PAGE_SIZE,
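+ # 'num' raises the results per page from the default 10 to _PAGE_SIZE,
+ # so 'start' must advance by the same stride.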
'hl': 'en',
})
- for hit_idx, mobj in enumerate(re.finditer(
- r'<h3 class="r"><a href="([^"]+)"', webpage)):
- if re.search(f'id="vidthumb{hit_idx + 1}"', webpage):
- yield self.url_result(mobj.group(1))
+ for url in re.findall(r'<div[^>]* class="dXiKIc"[^>]*><a href="([^"]+)"', webpage):
+ yield self.url_result(url)
if not re.search(r'id="pnnext"', webpage):
return
diff --git a/hypervideo_dl/extractor/gronkh.py b/hypervideo_dl/extractor/gronkh.py
index a7792a5..c9f1dd2 100644
--- a/hypervideo_dl/extractor/gronkh.py
+++ b/hypervideo_dl/extractor/gronkh.py
@@ -6,7 +6,7 @@ from ..utils import unified_strdate
class GronkhIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)(?:www\.)?gronkh\.tv/stream/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?gronkh\.tv/(?:watch/)?stream/(?P<id>\d+)'
_TESTS = [{
'url': 'https://gronkh.tv/stream/536',
@@ -19,6 +19,9 @@ class GronkhIE(InfoExtractor):
'upload_date': '20211001'
},
'params': {'skip_download': True}
+ }, {
+ 'url': 'https://gronkh.tv/watch/stream/546',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/hypervideo_dl/extractor/hellporno.py b/hypervideo_dl/extractor/hellporno.py
index fae4251..92d32cd 100644
--- a/hypervideo_dl/extractor/hellporno.py
+++ b/hypervideo_dl/extractor/hellporno.py
@@ -38,8 +38,7 @@ class HellPornoIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
- title = remove_end(self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title'), ' - Hell Porno')
+ title = remove_end(self._html_extract_title(webpage), ' - Hell Porno')
info = self._parse_html5_media_entries(url, webpage, display_id)[0]
self._sort_formats(info['formats'])
diff --git a/hypervideo_dl/extractor/hidive.py b/hypervideo_dl/extractor/hidive.py
index 15bd444..46d7d62 100644
--- a/hypervideo_dl/extractor/hidive.py
+++ b/hypervideo_dl/extractor/hidive.py
@@ -35,18 +35,14 @@ class HiDiveIE(InfoExtractor):
'skip': 'Requires Authentication',
}]
- def _real_initialize(self):
- email, password = self._get_login_info()
- if email is None:
- return
-
+ def _perform_login(self, username, password):
webpage = self._download_webpage(self._LOGIN_URL, None)
form = self._search_regex(
r'(?s)<form[^>]+action="/account/login"[^>]*>(.+?)</form>',
webpage, 'login form')
data = self._hidden_inputs(form)
data.update({
- 'Email': email,
+ 'Email': username,
'Password': password,
})
self._download_webpage(
diff --git a/hypervideo_dl/extractor/hitbox.py b/hypervideo_dl/extractor/hitbox.py
index 3e5ff26..0470d0a 100644
--- a/hypervideo_dl/extractor/hitbox.py
+++ b/hypervideo_dl/extractor/hitbox.py
@@ -209,6 +209,6 @@ class HitboxLiveIE(HitboxIE):
'https://www.smashcast.tv/api/media/live', video_id)
metadata['formats'] = formats
metadata['is_live'] = True
- metadata['title'] = self._live_title(metadata.get('title'))
+ metadata['title'] = metadata.get('title')
return metadata
diff --git a/hypervideo_dl/extractor/hotstar.py b/hypervideo_dl/extractor/hotstar.py
index 74e2728..d55a79b 100644
--- a/hypervideo_dl/extractor/hotstar.py
+++ b/hypervideo_dl/extractor/hotstar.py
@@ -203,6 +203,9 @@ class HotStarIE(HotStarBaseIE):
format_url = re.sub(
r'(?<=//staragvod)(\d)', r'web\1', format_url)
tags = str_or_none(playback_set.get('tagsCombination')) or ''
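+ # Let users skip unwanted playback sets via extractor-args (res/vcodec/dr),
+ # matched against the set's tagsCombination string.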
+ ignored_res, ignored_vcodec, ignored_dr = self._configuration_arg('res'), self._configuration_arg('vcodec'), self._configuration_arg('dr')
+ if any(f'resolution:{ig_res}' in tags for ig_res in ignored_res) or any(f'video_codec:{ig_vc}' in tags for ig_vc in ignored_vcodec) or any(f'dynamic_range:{ig_dr}' in tags for ig_dr in ignored_dr):
+ continue
ext = determine_ext(format_url)
current_formats, current_subs = [], {}
try:
@@ -230,6 +233,11 @@ class HotStarIE(HotStarBaseIE):
if tags and 'encryption:plain' not in tags:
for f in current_formats:
f['has_drm'] = True
+ if tags and 'language' in tags:
+ lang = re.search(r'language:(?P<lang>[a-z]+)', tags).group('lang')
+ for f in current_formats:
+ if not f.get('language'):
+ f['language'] = lang
formats.extend(current_formats)
subs = self._merge_subtitles(subs, current_subs)
if not formats and geo_restricted:
@@ -291,7 +299,7 @@ class HotStarPlaylistIE(HotStarBaseIE):
class HotStarSeriesIE(HotStarBaseIE):
IE_NAME = 'hotstar:series'
- _VALID_URL = r'(?P<url>(?:https?://)(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P<id>\d+))'
+ _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P<id>\d+))'
_TESTS = [{
'url': 'https://www.hotstar.com/in/tv/radhakrishn/1260000646',
'info_dict': {
diff --git a/hypervideo_dl/extractor/hrfensehen.py b/hypervideo_dl/extractor/hrfensehen.py
index 2a994d4..e39ded2 100644
--- a/hypervideo_dl/extractor/hrfensehen.py
+++ b/hypervideo_dl/extractor/hrfensehen.py
@@ -26,13 +26,7 @@ class HRFernsehenIE(InfoExtractor):
}]},
'timestamp': 1598470200,
'upload_date': '20200826',
- 'thumbnails': [{
- 'url': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9.jpg',
- 'id': '0'
- }, {
- 'url': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg',
- 'id': '1'
- }],
+ 'thumbnail': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg',
'title': 'hessenschau vom 26.08.2020'
}
}, {
@@ -81,7 +75,7 @@ class HRFernsehenIE(InfoExtractor):
description = self._html_search_meta(
['description'], webpage)
- loader_str = unescapeHTML(self._search_regex(r"data-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader"))
+ loader_str = unescapeHTML(self._search_regex(r"data-new-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader"))
loader_data = json.loads(loader_str)
info = {
diff --git a/hypervideo_dl/extractor/hrti.py b/hypervideo_dl/extractor/hrti.py
index dc5b967..36d6007 100644
--- a/hypervideo_dl/extractor/hrti.py
+++ b/hypervideo_dl/extractor/hrti.py
@@ -27,8 +27,9 @@ class HRTiBaseIE(InfoExtractor):
_APP_VERSION = '1.1'
_APP_PUBLICATION_ID = 'all_in_one'
_API_URL = 'http://clientapi.hrt.hr/client_api.php/config/identify/format/json'
+ _token = None
- def _initialize_api(self):
+ def _initialize_pre_login(self):
init_data = {
'application_publication_id': self._APP_PUBLICATION_ID
}
@@ -64,12 +65,7 @@ class HRTiBaseIE(InfoExtractor):
self._logout_url = modules['user']['resources']['logout']['uri']
- def _login(self):
- username, password = self._get_login_info()
- # TODO: figure out authentication with cookies
- if username is None or password is None:
- self.raise_login_required()
-
+ def _perform_login(self, username, password):
auth_data = {
'username': username,
'password': password,
@@ -94,8 +90,9 @@ class HRTiBaseIE(InfoExtractor):
self._token = auth_info['secure_streaming_token']
def _real_initialize(self):
- self._initialize_api()
- self._login()
+ if not self._token:
+ # TODO: figure out authentication with cookies
+ self.raise_login_required(method='password')
class HRTiIE(HRTiBaseIE):
diff --git a/hypervideo_dl/extractor/hse.py b/hypervideo_dl/extractor/hse.py
new file mode 100644
index 0000000..9144ff8
--- /dev/null
+++ b/hypervideo_dl/extractor/hse.py
@@ -0,0 +1,95 @@
+# coding: utf-8
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ traverse_obj,
+ unified_timestamp,
+)
+
+
+class HSEShowBaseInfoExtractor(InfoExtractor):
+ _GEO_COUNTRIES = ['DE']
+
+ def _extract_redux_data(self, url, video_id):
+ webpage = self._download_webpage(url, video_id)
+ redux = self._html_search_regex(
+ r'window\.__REDUX_DATA__\s*=\s*({.*});?', webpage, 'redux data')
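+ # the blob can contain raw newlines; strip them so it parses as JSON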
+ return self._parse_json(redux.replace('\n', ''), video_id)
+
+ def _extract_formats_and_subtitles(self, sources, video_id):
+ if not sources:
+ raise ExtractorError('No video found', expected=True, video_id=video_id)
+ formats, subtitles = [], {}
+ for src in sources:
+ if src['mimetype'] != 'application/x-mpegURL':
+ continue
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(src['url'], video_id, ext='mp4')
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ self._sort_formats(formats)
+ return formats, subtitles
+
+
+class HSEShowIE(HSEShowBaseInfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?hse\.de/dpl/c/tv-shows/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'https://www.hse.de/dpl/c/tv-shows/505350',
+ 'info_dict': {
+ 'id': '505350',
+ 'ext': 'mp4',
+ 'title': 'Pfeffinger Mode & Accessoires',
+ 'timestamp': 1638810000,
+ 'upload_date': '20211206',
+ 'channel': 'HSE24',
+ 'uploader': 'Arina Pirayesh'
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ json_data = self._extract_redux_data(url, video_id)
+ formats, subtitles = self._extract_formats_and_subtitles(
+ traverse_obj(json_data, ('tvShowPage', 'tvShowVideo', 'sources')), video_id)
+
+ show = traverse_obj(json_data, ('tvShowPage', 'tvShow')) or {}
+ return {
+ 'id': video_id,
+ 'title': show.get('title') or video_id,
+ 'formats': formats,
+ 'timestamp': unified_timestamp(f'{show.get("date")} {show.get("hour")}:00'),
+ 'thumbnail': traverse_obj(json_data, ('tvShowVideo', 'poster')),
+ 'channel': self._search_regex(
+ r'tvShow \| ([A-Z0-9]+)_', show.get('actionFieldText') or '', video_id, fatal=False),
+ 'uploader': show.get('presenter'),
+ 'subtitles': subtitles,
+ }
+
+
+class HSEProductIE(HSEShowBaseInfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?hse\.de/dpl/p/product/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'https://www.hse.de/dpl/p/product/408630',
+ 'info_dict': {
+ 'id': '408630',
+ 'ext': 'mp4',
+ 'title': 'Hose im Ponte-Mix',
+ 'uploader': 'Judith Williams'
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ json_data = self._extract_redux_data(url, video_id)
+ video = traverse_obj(json_data, ('productContent', 'productContent', 'videos', 0)) or {}
+ formats, subtitles = self._extract_formats_and_subtitles(video.get('sources'), video_id)
+
+ return {
+ 'id': video_id,
+ 'title': traverse_obj(json_data, ('productDetail', 'product', 'name', 'short')) or video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'thumbnail': video.get('poster'),
+ 'uploader': traverse_obj(json_data, ('productDetail', 'product', 'brand', 'brandName')),
+ }
diff --git a/hypervideo_dl/extractor/huffpost.py b/hypervideo_dl/extractor/huffpost.py
index 97e36f0..54385ba 100644
--- a/hypervideo_dl/extractor/huffpost.py
+++ b/hypervideo_dl/extractor/huffpost.py
@@ -80,9 +80,6 @@ class HuffPostIE(InfoExtractor):
'vcodec': 'none' if key.startswith('audio/') else None,
})
- if not formats and data.get('fivemin_id'):
- return self.url_result('5min:%s' % data['fivemin_id'])
-
self._sort_formats(formats)
return {
diff --git a/hypervideo_dl/extractor/huya.py b/hypervideo_dl/extractor/huya.py
new file mode 100644
index 0000000..4e96f22
--- /dev/null
+++ b/hypervideo_dl/extractor/huya.py
@@ -0,0 +1,137 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import random
+
+from ..compat import compat_urlparse, compat_b64decode
+
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ js_to_json,
+ str_or_none,
+ try_get,
+ unescapeHTML,
+ update_url_query,
+)
+
+from .common import InfoExtractor
+
+
+class HuyaLiveIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.|m\.)?huya\.com/(?P<id>[^/#?&]+)(?:\D|$)'
+ IE_NAME = 'huya:live'
+ IE_DESC = 'huya.com'
+ _TESTS = [{
+ 'url': 'https://www.huya.com/572329',
+ 'info_dict': {
+ 'id': '572329',
+ 'title': str,
+ 'description': str,
+ 'is_live': True,
+ 'view_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.huya.com/xiaoyugame',
+ 'only_matching': True
+ }]
+
+ _RESOLUTION = {
+ '蓝光4M': {
+ 'width': 1920,
+ 'height': 1080,
+ },
+ '超清': {
+ 'width': 1280,
+ 'height': 720,
+ },
+ '流畅': {
+ 'width': 800,
+ 'height': 480
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id=video_id)
+ json_stream = self._search_regex(r'"stream":\s+"([a-zA-Z0-9+=/]+)"', webpage, 'stream', default=None)
+ if not json_stream:
+ raise ExtractorError('Video is offline', expected=True)
+ stream_data = self._parse_json(compat_b64decode(json_stream).decode(), video_id=video_id,
+ transform_source=js_to_json)
+ room_info = try_get(stream_data, lambda x: x['data'][0]['gameLiveInfo'])
+ if not room_info:
+ raise ExtractorError('Can not extract the room info', expected=True)
+ title = room_info.get('roomName') or room_info.get('introduction') or self._html_extract_title(webpage)
+ screen_type = room_info.get('screenType')
+ live_source_type = room_info.get('liveSourceType')
+ stream_info_list = stream_data['data'][0]['gameStreamInfoList']
+ formats = []
+ for stream_info in stream_info_list:
+ stream_url = stream_info.get('sFlvUrl')
+ if not stream_url:
+ continue
+ stream_name = stream_info.get('sStreamName')
+ re_secret = not screen_type and live_source_type in (0, 8, 13)
+ params = dict(compat_urlparse.parse_qsl(unescapeHTML(stream_info['sFlvAntiCode'])))
+ fm, ss = '', ''
+ if re_secret:
+ fm, ss = self.encrypt(params, stream_info, stream_name)
+ for si in stream_data.get('vMultiStreamInfo'):
+ rate = si.get('iBitRate')
+ if rate:
+ params['ratio'] = rate
+ else:
+ params.pop('ratio', None)
+ if re_secret:
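+ # wsSecret is an md5 over fm, u, stream name, ss and wsTime joined by '_',
+ # apparently mirroring the web player's anti-leech scheme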
+ params['wsSecret'] = hashlib.md5(
+ '_'.join([fm, params['u'], stream_name, ss, params['wsTime']]).encode()).hexdigest()
+ formats.append({
+ 'ext': stream_info.get('sFlvUrlSuffix'),
+ 'format_id': str_or_none(stream_info.get('iLineIndex')),
+ 'tbr': rate,
+ 'url': update_url_query(f'{stream_url}/{stream_name}.{stream_info.get("sFlvUrlSuffix")}',
+ query=params),
+ **self._RESOLUTION.get(si.get('sDisplayName'), {}),
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'view_count': room_info.get('totalCount'),
+ 'thumbnail': room_info.get('screenshot'),
+ 'description': room_info.get('contentIntro'),
+ 'http_headers': {
+ 'Origin': 'https://www.huya.com',
+ 'Referer': 'https://www.huya.com/',
+ },
+ }
+
+ def encrypt(self, params, stream_info, stream_name):
+ ct = int_or_none(params.get('wsTime'), 16) + random.random()
+ presenter_uid = stream_info['lPresenterUid']
+ if not stream_name.startswith(str(presenter_uid)):
+ uid = presenter_uid
+ else:
+ uid = int_or_none(ct % 1e7 * 1e6 % 0xffffffff)
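+ # reassemble uid bit fields into the anti-leech 'u' parameter, assumed
+ # from the site player's obfuscated JS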
+ u1 = uid & 0xffffffff00000000
+ u2 = uid & 0xffffffff
+ u3 = uid & 0xffffff
+ u = u1 | u2 >> 24 | u3 << 8
+ params.update({
+ 'u': str_or_none(u),
+ 'seqid': str_or_none(int_or_none(ct * 1000) + uid),
+ 'ver': '1',
+ 'uuid': int_or_none(ct % 1e7 * 1e6 % 0xffffffff),
+ 't': '100',
+ })
+ fm = compat_b64decode(params['fm']).decode().split('_', 1)[0]
+ ss = hashlib.md5('|'.join([params['seqid'], params['ctype'], params['t']]).encode()).hexdigest()
+ return fm, ss
diff --git a/hypervideo_dl/extractor/imdb.py b/hypervideo_dl/extractor/imdb.py
index a313019..96cee2e 100644
--- a/hypervideo_dl/extractor/imdb.py
+++ b/hypervideo_dl/extractor/imdb.py
@@ -7,9 +7,10 @@ import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
+ int_or_none,
mimetype2ext,
- parse_duration,
qualities,
+ traverse_obj,
try_get,
url_or_none,
)
@@ -28,6 +29,17 @@ class ImdbIE(InfoExtractor):
'title': 'No. 2',
'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7',
'duration': 152,
+ 'thumbnail': r're:^https?://.+\.jpg',
+ }
+ }, {
+ 'url': 'https://www.imdb.com/video/vi3516832537',
+ 'info_dict': {
+ 'id': '3516832537',
+ 'ext': 'mp4',
+ 'title': 'Paul: U.S. Trailer #1',
+ 'description': 'md5:17fcc4fe11ec29b4399be9d4c5ef126c',
+ 'duration': 153,
+ 'thumbnail': r're:^https?://.+\.jpg',
}
}, {
'url': 'http://www.imdb.com/video/_/vi2524815897',
@@ -51,8 +63,13 @@ class ImdbIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
-
- data = self._download_json(
+ webpage = self._download_webpage(f'https://www.imdb.com/video/vi{video_id}', video_id)
+ info = self._search_nextjs_data(webpage, video_id)
+ video_info = traverse_obj(info, ('props', 'pageProps', 'videoPlaybackData', 'video'), default={})
+ title = (traverse_obj(video_info, ('name', 'value'), ('primaryTitle', 'titleText', 'text'))
+ or self._html_search_meta(('og:title', 'twitter:title'), webpage, default=None)
+ or self._html_extract_title(webpage))
+ data = video_info.get('playbackURLs') or try_get(self._download_json(
'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id,
query={
'key': base64.b64encode(json.dumps({
@@ -60,11 +77,10 @@ class ImdbIE(InfoExtractor):
'subType': 'FORCE_LEGACY',
'id': 'vi%s' % video_id,
}).encode()).decode(),
- })[0]
-
+ }), lambda x: x[0]['videoLegacyEncodings'])
quality = qualities(('SD', '480p', '720p', '1080p'))
- formats = []
- for encoding in data['videoLegacyEncodings']:
+ formats, subtitles = [], {}
+ for encoding in data:
if not encoding or not isinstance(encoding, dict):
continue
video_url = url_or_none(encoding.get('url'))
@@ -73,11 +89,13 @@ class ImdbIE(InfoExtractor):
ext = mimetype2ext(encoding.get(
'mimeType')) or determine_ext(video_url)
if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
video_url, video_id, 'mp4', entry_protocol='m3u8_native',
- preference=1, m3u8_id='hls', fatal=False))
+ preference=1, m3u8_id='hls', fatal=False)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ formats.extend(fmts)
continue
- format_id = encoding.get('definition')
+ format_id = traverse_obj(encoding, ('displayName', 'value'), 'definition')
formats.append({
'format_id': format_id,
'url': video_url,
@@ -86,33 +104,15 @@ class ImdbIE(InfoExtractor):
})
self._sort_formats(formats)
- webpage = self._download_webpage(
- 'https://www.imdb.com/video/vi' + video_id, video_id)
- video_metadata = self._parse_json(self._search_regex(
- r'args\.push\(\s*({.+?})\s*\)\s*;', webpage,
- 'video metadata'), video_id)
-
- video_info = video_metadata.get('VIDEO_INFO')
- if video_info and isinstance(video_info, dict):
- info = try_get(
- video_info, lambda x: x[list(video_info.keys())[0]][0], dict)
- else:
- info = {}
-
- title = self._html_search_meta(
- ['og:title', 'twitter:title'], webpage) or self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'title',
- default=None) or info['videoTitle']
-
return {
'id': video_id,
'title': title,
'alt_title': info.get('videoSubTitle'),
'formats': formats,
- 'description': info.get('videoDescription'),
- 'thumbnail': url_or_none(try_get(
- video_metadata, lambda x: x['videoSlate']['source'])),
- 'duration': parse_duration(info.get('videoRuntime')),
+ 'description': try_get(video_info, lambda x: x['description']['value']),
+ 'thumbnail': url_or_none(try_get(video_info, lambda x: x['thumbnail']['url'])),
+ 'duration': int_or_none(try_get(video_info, lambda x: x['runtime']['value'])),
+ 'subtitles': subtitles,
}
diff --git a/hypervideo_dl/extractor/imggaming.py b/hypervideo_dl/extractor/imggaming.py
index ef20a4b..ce7b21a 100644
--- a/hypervideo_dl/extractor/imggaming.py
+++ b/hypervideo_dl/extractor/imggaming.py
@@ -21,25 +21,26 @@ class ImgGamingBaseIE(InfoExtractor):
_REALM = None
_VALID_URL_TEMPL = r'https?://(?P<domain>%s)/(?P<type>live|playlist|video)/(?P<id>\d+)(?:\?.*?\bplaylistId=(?P<playlist_id>\d+))?'
- def _real_initialize(self):
+ def _initialize_pre_login(self):
self._HEADERS = {
'Realm': 'dce.' + self._REALM,
'x-api-key': self._API_KEY,
}
- email, password = self._get_login_info()
- if email is None:
- self.raise_login_required()
-
+ def _perform_login(self, username, password):
p_headers = self._HEADERS.copy()
p_headers['Content-Type'] = 'application/json'
self._HEADERS['Authorization'] = 'Bearer ' + self._download_json(
self._API_BASE + 'login',
None, 'Logging in', data=json.dumps({
- 'id': email,
+ 'id': username,
'secret': password,
}).encode(), headers=p_headers)['authorisationToken']
+ def _real_initialize(self):
+ if not self._HEADERS.get('Authorization'):
+ self.raise_login_required(method='password')
+
def _call_api(self, path, media_id):
return self._download_json(
self._API_BASE + path + media_id, media_id, headers=self._HEADERS)
@@ -64,10 +65,7 @@ class ImgGamingBaseIE(InfoExtractor):
domain, media_type, media_id, playlist_id = self._match_valid_url(url).groups()
if playlist_id:
- if self.get_param('noplaylist'):
- self.to_screen('Downloading just video %s because of --no-playlist' % media_id)
- else:
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id)
+ if self._yes_playlist(playlist_id, media_id):
media_type, media_id = 'playlist', playlist_id
if media_type == 'playlist':
@@ -88,7 +86,7 @@ class ImgGamingBaseIE(InfoExtractor):
video_data = self._download_json(dve_api_url, media_id)
is_live = media_type == 'live'
if is_live:
- title = self._live_title(self._call_api('event/', media_id)['title'])
+ title = self._call_api('event/', media_id)['title']
else:
title = video_data['name']
@@ -99,7 +97,7 @@ class ImgGamingBaseIE(InfoExtractor):
continue
if proto == 'hls':
m3u8_formats = self._extract_m3u8_formats(
- media_url, media_id, 'mp4', 'm3u8' if is_live else 'm3u8_native',
+ media_url, media_id, 'mp4', live=is_live,
m3u8_id='hls', fatal=False, headers=self._MANIFEST_HEADERS)
for f in m3u8_formats:
f.setdefault('http_headers', {}).update(self._MANIFEST_HEADERS)
diff --git a/hypervideo_dl/extractor/infoq.py b/hypervideo_dl/extractor/infoq.py
index 0a70a1f..347cc51 100644
--- a/hypervideo_dl/extractor/infoq.py
+++ b/hypervideo_dl/extractor/infoq.py
@@ -115,7 +115,7 @@ class InfoQIE(BokeCCBaseIE):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title')
+ video_title = self._html_extract_title(webpage)
video_description = self._html_search_meta('description', webpage, 'description')
if '/cn/' in url:
diff --git a/hypervideo_dl/extractor/instagram.py b/hypervideo_dl/extractor/instagram.py
index 3801c7a..970f2c8 100644
--- a/hypervideo_dl/extractor/instagram.py
+++ b/hypervideo_dl/extractor/instagram.py
@@ -1,32 +1,202 @@
-from __future__ import unicode_literals
+# coding: utf-8
import itertools
import hashlib
import json
import re
+import time
from .common import InfoExtractor
from ..compat import (
- compat_str,
compat_HTTPError,
)
from ..utils import (
ExtractorError,
+ format_field,
float_or_none,
get_element_by_attribute,
int_or_none,
lowercase_escape,
- std_headers,
- try_get,
+ str_or_none,
+ str_to_int,
+ traverse_obj,
url_or_none,
- variadic,
+ urlencode_postdata,
)
-class InstagramIE(InfoExtractor):
- _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))'
+class InstagramBaseIE(InfoExtractor):
+ _NETRC_MACHINE = 'instagram'
+ _IS_LOGGED_IN = False
+
+ def _perform_login(self, username, password):
+ if self._IS_LOGGED_IN:
+ return
+
+ login_webpage = self._download_webpage(
+ 'https://www.instagram.com/accounts/login/', None,
+ note='Downloading login webpage', errnote='Failed to download login webpage')
+
+ shared_data = self._parse_json(
+ self._search_regex(
+ r'window\._sharedData\s*=\s*({.+?});',
+ login_webpage, 'shared data', default='{}'),
+ None)
+
+ login = self._download_json('https://www.instagram.com/accounts/login/ajax/', None, note='Logging in', headers={
+ 'Accept': '*/*',
+ 'X-IG-App-ID': '936619743392459',
+ 'X-ASBD-ID': '198387',
+ 'X-IG-WWW-Claim': '0',
+ 'X-Requested-With': 'XMLHttpRequest',
+ 'X-CSRFToken': shared_data['config']['csrf_token'],
+ 'X-Instagram-AJAX': shared_data['rollout_hash'],
+ 'Referer': 'https://www.instagram.com/',
+ }, data=urlencode_postdata({
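+ # '#PWD_INSTAGRAM_BROWSER:0:<time>:<password>' is the plaintext (version 0)
+ # password format accepted by the web login endpoint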
+ 'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}',
+ 'username': username,
+ 'queryParams': '{}',
+ 'optIntoOneTap': 'false',
+ 'stopDeletionNonce': '',
+ 'trustedDeviceRecords': '{}',
+ }))
+
+ if not login.get('authenticated'):
+ if login.get('message'):
+ raise ExtractorError(f'Unable to login: {login["message"]}')
+ elif login.get('user'):
+ raise ExtractorError('Unable to login: Sorry, your password was incorrect. Please double-check your password.', expected=True)
+ elif login.get('user') is False:
+ raise ExtractorError('Unable to login: The username you entered doesn\'t belong to an account. Please check your username and try again.', expected=True)
+ raise ExtractorError('Unable to login')
+ InstagramBaseIE._IS_LOGGED_IN = True
+
+ def _get_count(self, media, kind, *keys):
+ return traverse_obj(
+ media, (kind, 'count'), *((f'edge_media_{key}', 'count') for key in keys),
+ expected_type=int_or_none)
+
+ def _get_dimension(self, name, media, webpage=None):
+ return (
+ traverse_obj(media, ('dimensions', name), expected_type=int_or_none)
+ or int_or_none(self._html_search_meta(
+ (f'og:video:{name}', f'video:{name}'), webpage or '', default=None)))
+
+ def _extract_nodes(self, nodes, is_direct=False):
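+ # With is_direct=True the node itself carries a playable video_url;
+ # otherwise emit a url result so the shortcode is re-resolved by this IE.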
+ for idx, node in enumerate(nodes, start=1):
+ if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True:
+ continue
+
+ video_id = node.get('shortcode')
+
+ if is_direct:
+ info = {
+ 'id': video_id or node['id'],
+ 'url': node.get('video_url'),
+ 'width': self._get_dimension('width', node),
+ 'height': self._get_dimension('height', node),
+ 'http_headers': {
+ 'Referer': 'https://www.instagram.com/',
+ }
+ }
+ elif not video_id:
+ continue
+ else:
+ info = {
+ '_type': 'url',
+ 'ie_key': 'Instagram',
+ 'id': video_id,
+ 'url': f'https://instagram.com/p/{video_id}',
+ }
+
+ yield {
+ **info,
+ 'title': node.get('title') or (f'Video {idx}' if is_direct else None),
+ 'description': traverse_obj(
+ node, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str),
+ 'thumbnail': traverse_obj(
+ node, 'display_url', 'thumbnail_src', 'display_src', expected_type=url_or_none),
+ 'duration': float_or_none(node.get('video_duration')),
+ 'timestamp': int_or_none(node.get('taken_at_timestamp')),
+ 'view_count': int_or_none(node.get('video_view_count')),
+ 'comment_count': self._get_count(node, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'),
+ 'like_count': self._get_count(node, 'likes', 'preview_like'),
+ }
+
+ def _extract_product_media(self, product_media):
+ media_id = product_media.get('code') or product_media.get('id')
+ vcodec = product_media.get('video_codec')
+ dash_manifest_raw = product_media.get('video_dash_manifest')
+ videos_list = product_media.get('video_versions')
+ if not (dash_manifest_raw or videos_list):
+ return {}
+
+ formats = [{
+ 'format_id': format.get('id'),
+ 'url': format.get('url'),
+ 'width': format.get('width'),
+ 'height': format.get('height'),
+ 'vcodec': vcodec,
+ } for format in videos_list or []]
+ if dash_manifest_raw:
+ formats.extend(self._parse_mpd_formats(self._parse_xml(dash_manifest_raw, media_id), mpd_id='dash'))
+ self._sort_formats(formats)
+
+ thumbnails = [{
+ 'url': thumbnail.get('url'),
+ 'width': thumbnail.get('width'),
+ 'height': thumbnail.get('height')
+ } for thumbnail in traverse_obj(product_media, ('image_versions2', 'candidates')) or []]
+ return {
+ 'id': media_id,
+ 'duration': float_or_none(product_media.get('video_duration')),
+ 'formats': formats,
+ 'thumbnails': thumbnails
+ }
+
+ def _extract_product(self, product_info):
+ if isinstance(product_info, list):
+ product_info = product_info[0]
+
+ user_info = product_info.get('user') or {}
+ info_dict = {
+ 'id': product_info.get('code') or product_info.get('id'),
+ 'title': product_info.get('title') or f'Video by {user_info.get("username")}',
+ 'description': traverse_obj(product_info, ('caption', 'text'), expected_type=str_or_none),
+ 'timestamp': int_or_none(product_info.get('taken_at')),
+ 'channel': user_info.get('username'),
+ 'uploader': user_info.get('full_name'),
+ 'uploader_id': str_or_none(user_info.get('pk')),
+ 'view_count': int_or_none(product_info.get('view_count')),
+ 'like_count': int_or_none(product_info.get('like_count')),
+ 'comment_count': int_or_none(product_info.get('comment_count')),
+ 'http_headers': {
+ 'Referer': 'https://www.instagram.com/',
+ }
+ }
+ carousel_media = product_info.get('carousel_media')
+ if carousel_media:
+ return {
+ '_type': 'playlist',
+ **info_dict,
+ 'title': f'Post by {user_info.get("username")}',
+ 'entries': [{
+ **info_dict,
+ **self._extract_product_media(product_media),
+ } for product_media in carousel_media],
+ }
+
+ return {
+ **info_dict,
+ **self._extract_product_media(product_info)
+ }
+
+
+class InstagramIOSIE(InfoExtractor):
+ IE_DESC = 'IOS instagram:// URL'
+ _VALID_URL = r'instagram://media\?id=(?P<id>[\d_]+)'
_TESTS = [{
- 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
+ 'url': 'instagram://media?id=482584233761418119',
'md5': '0d2da106a9d2631273e192b372806516',
'info_dict': {
'id': 'aye83DjauH',
@@ -43,6 +213,49 @@ class InstagramIE(InfoExtractor):
'comment_count': int,
'comments': list,
},
+ 'add_ie': ['Instagram']
+ }]
+
+ def _get_id(self, id):
+ """Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id"""
+ chrs = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
+ media_id = int(id.split('_')[0])
+ shortened_id = ''
+ while media_id > 0:
+ r = media_id % 64
+ media_id = (media_id - r) // 64
+ shortened_id = chrs[r] + shortened_id
+ return shortened_id
+
+ def _real_extract(self, url):
+ return {
+ '_type': 'url_transparent',
+ 'url': f'http://instagram.com/tv/{self._get_id(self._match_id(url))}/',
+ 'ie_key': 'Instagram',
+ }
+
+
+class InstagramIE(InstagramBaseIE):
+ _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com(?:/[^/]+)?/(?:p|tv|reel)/(?P<id>[^/?#&]+))'
+ _TESTS = [{
+ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
+ 'md5': '0d2da106a9d2631273e192b372806516',
+ 'info_dict': {
+ 'id': 'aye83DjauH',
+ 'ext': 'mp4',
+ 'title': 'Video by naomipq',
+ 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 0,
+ 'timestamp': 1371748545,
+ 'upload_date': '20130620',
+ 'uploader_id': '2815873',
+ 'uploader': 'B E A U T Y F O R A S H E S',
+ 'channel': 'naomipq',
+ 'like_count': int,
+ 'comment_count': int,
+ 'comments': list,
+ },
}, {
# missing description
'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears',
@@ -54,8 +267,9 @@ class InstagramIE(InfoExtractor):
'duration': 0,
'timestamp': 1453760977,
'upload_date': '20160125',
- 'uploader_id': 'britneyspears',
+ 'uploader_id': '12246775',
'uploader': 'Britney Spears',
+ 'channel': 'britneyspears',
'like_count': int,
'comment_count': int,
'comments': list,
@@ -101,8 +315,9 @@ class InstagramIE(InfoExtractor):
'duration': 53.83,
'timestamp': 1530032919,
'upload_date': '20180626',
- 'uploader_id': 'instagram',
+ 'uploader_id': '25025320',
'uploader': 'Instagram',
+ 'channel': 'instagram',
'like_count': int,
'comment_count': int,
'comments': list,
@@ -120,6 +335,9 @@ class InstagramIE(InfoExtractor):
}, {
'url': 'https://www.instagram.com/reel/CDUMkliABpa/',
'only_matching': True,
+ }, {
+ 'url': 'https://www.instagram.com/marvelskies.fc/reel/CWqAgUZgCku/',
+ 'only_matching': True,
}]
@staticmethod
@@ -141,154 +359,114 @@ class InstagramIE(InfoExtractor):
return mobj.group('link')
def _real_extract(self, url):
- mobj = self._match_valid_url(url)
- video_id = mobj.group('id')
- url = mobj.group('url')
-
+ video_id, url = self._match_valid_url(url).group('id', 'url')
webpage, urlh = self._download_webpage_handle(url, video_id)
- if 'www.instagram.com/accounts/login' in urlh.geturl().rstrip('/'):
- self.raise_login_required('You need to log in to access this content', method='cookies')
-
- (media, video_url, description, thumbnail, timestamp, uploader,
- uploader_id, like_count, comment_count, comments, height,
- width) = [None] * 12
+ if 'www.instagram.com/accounts/login' in urlh.geturl():
+ self.report_warning('Main webpage is locked behind the login page. '
+ 'Retrying with embed webpage (Note that some metadata might be missing)')
+ webpage = self._download_webpage(
+ 'https://www.instagram.com/p/%s/embed/' % video_id, video_id, note='Downloading embed webpage')
shared_data = self._parse_json(
self._search_regex(
r'window\._sharedData\s*=\s*({.+?});',
webpage, 'shared data', default='{}'),
video_id, fatal=False)
- if shared_data:
- media = try_get(
- shared_data,
- (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'],
- lambda x: x['entry_data']['PostPage'][0]['media']),
- dict)
+ media = traverse_obj(
+ shared_data,
+ ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'),
+ ('entry_data', 'PostPage', 0, 'media'),
+ expected_type=dict)
+
# _sharedData.entry_data.PostPage is empty when authenticated (see
# https://github.com/ytdl-org/youtube-dl/pull/22880)
if not media:
additional_data = self._parse_json(
self._search_regex(
- r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;',
+ r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\);',
webpage, 'additional data', default='{}'),
video_id, fatal=False)
- if additional_data:
- media = try_get(
- additional_data, lambda x: x['graphql']['shortcode_media'],
- dict)
- if media:
- video_url = media.get('video_url')
- height = int_or_none(media.get('dimensions', {}).get('height'))
- width = int_or_none(media.get('dimensions', {}).get('width'))
- description = try_get(
- media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
- compat_str) or media.get('caption')
- title = media.get('title')
- thumbnail = media.get('display_src') or media.get('display_url')
- duration = float_or_none(media.get('video_duration'))
- timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date'))
- uploader = media.get('owner', {}).get('full_name')
- uploader_id = media.get('owner', {}).get('username')
-
- def get_count(keys, kind):
- for key in variadic(keys):
- count = int_or_none(try_get(
- media, (lambda x: x['edge_media_%s' % key]['count'],
- lambda x: x['%ss' % kind]['count'])))
- if count is not None:
- return count
-
- like_count = get_count('preview_like', 'like')
- comment_count = get_count(
- ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment')
-
- comments = []
- for comment in try_get(media, lambda x: x['edge_media_to_parent_comment']['edges']):
- comment_dict = comment.get('node', {})
- comment_text = comment_dict.get('text')
- if comment_text:
- comments.append({
- 'author': try_get(comment_dict, lambda x: x['owner']['username']),
- 'author_id': try_get(comment_dict, lambda x: x['owner']['id']),
- 'id': comment_dict.get('id'),
- 'text': comment_text,
- 'timestamp': int_or_none(comment_dict.get('created_at')),
- })
- if not video_url:
- edges = try_get(
- media, lambda x: x['edge_sidecar_to_children']['edges'],
- list) or []
- if edges:
- entries = []
- for edge_num, edge in enumerate(edges, start=1):
- node = try_get(edge, lambda x: x['node'], dict)
- if not node:
- continue
- node_video_url = url_or_none(node.get('video_url'))
- if not node_video_url:
- continue
- entries.append({
- 'id': node.get('shortcode') or node['id'],
- 'title': node.get('title') or 'Video %d' % edge_num,
- 'url': node_video_url,
- 'thumbnail': node.get('display_url'),
- 'duration': float_or_none(node.get('video_duration')),
- 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])),
- 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])),
- 'view_count': int_or_none(node.get('video_view_count')),
- })
- return self.playlist_result(
- entries, video_id,
- 'Post by %s' % uploader_id if uploader_id else None,
- description)
+ product_item = traverse_obj(additional_data, ('items', 0), expected_type=dict)
+ if product_item:
+ return self._extract_product(product_item)
+ media = traverse_obj(additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {}
- if not video_url:
- video_url = self._og_search_video_url(webpage, secure=False)
+ if not media and 'www.instagram.com/accounts/login' in urlh.geturl():
+ self.raise_login_required('You need to log in to access this content')
- formats = [{
- 'url': video_url,
- 'width': width,
- 'height': height,
- }]
-
- if not uploader_id:
- uploader_id = self._search_regex(
- r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"',
- webpage, 'uploader id', fatal=False)
+ username = traverse_obj(media, ('owner', 'username')) or self._search_regex(
+ r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'username', fatal=False)
+ description = (
+ traverse_obj(media, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str)
+ or media.get('caption'))
if not description:
description = self._search_regex(
r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None)
if description is not None:
description = lowercase_escape(description)
- if not thumbnail:
- thumbnail = self._og_search_thumbnail(webpage)
+ video_url = media.get('video_url')
+ if not video_url:
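+ # carousel/sidecar posts have no top-level video_url; their media sit under edge_sidecar_to_children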
+ nodes = traverse_obj(media, ('edge_sidecar_to_children', 'edges', ..., 'node'), expected_type=dict) or []
+ if nodes:
+ return self.playlist_result(
+ self._extract_nodes(nodes, True), video_id,
+ format_field(username, template='Post by %s'), description)
+
+ video_url = self._og_search_video_url(webpage, secure=False)
+
+ formats = [{
+ 'url': video_url,
+ 'width': self._get_dimension('width', media, webpage),
+ 'height': self._get_dimension('height', media, webpage),
+ }]
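+ # some posts also carry an inline DASH manifest with additional qualities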
+ dash = traverse_obj(media, ('dash_info', 'video_dash_manifest'))
+ if dash:
+ formats.extend(self._parse_mpd_formats(self._parse_xml(dash, video_id), mpd_id='dash'))
+ self._sort_formats(formats)
+
+ comment_data = traverse_obj(media, ('edge_media_to_parent_comment', 'edges'))
+ comments = [{
+ 'author': traverse_obj(comment_dict, ('node', 'owner', 'username')),
+ 'author_id': traverse_obj(comment_dict, ('node', 'owner', 'id')),
+ 'id': traverse_obj(comment_dict, ('node', 'id')),
+ 'text': traverse_obj(comment_dict, ('node', 'text')),
+ 'timestamp': traverse_obj(comment_dict, ('node', 'created_at'), expected_type=int_or_none),
+ } for comment_dict in comment_data] if comment_data else None
+
+ display_resources = (
+ media.get('display_resources')
+ or [{'src': media.get(key)} for key in ('display_src', 'display_url')]
+ or [{'src': self._og_search_thumbnail(webpage)}])
+ thumbnails = [{
+ 'url': thumbnail['src'],
+ 'width': thumbnail.get('config_width'),
+ 'height': thumbnail.get('config_height'),
+ } for thumbnail in display_resources if thumbnail.get('src')]
return {
'id': video_id,
'formats': formats,
- 'ext': 'mp4',
- 'title': title or 'Video by %s' % uploader_id,
+ 'title': media.get('title') or 'Video by %s' % username,
'description': description,
- 'duration': duration,
- 'thumbnail': thumbnail,
- 'timestamp': timestamp,
- 'uploader_id': uploader_id,
- 'uploader': uploader,
- 'like_count': like_count,
- 'comment_count': comment_count,
+ 'duration': float_or_none(media.get('video_duration')),
+ 'timestamp': traverse_obj(media, 'taken_at_timestamp', 'date', expected_type=int_or_none),
+ 'uploader_id': traverse_obj(media, ('owner', 'id')),
+ 'uploader': traverse_obj(media, ('owner', 'full_name')),
+ 'channel': username,
+ 'like_count': self._get_count(media, 'likes', 'preview_like') or str_to_int(self._search_regex(
+ r'data-log-event="likeCountClick"[^>]*>[^\d]*([\d,\.]+)', webpage, 'like count', fatal=False)),
+ 'comment_count': self._get_count(media, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'),
'comments': comments,
+ 'thumbnails': thumbnails,
'http_headers': {
'Referer': 'https://www.instagram.com/',
}
}
-class InstagramPlaylistIE(InfoExtractor):
- # A superclass for handling any kind of query based on GraphQL which
- # results in a playlist.
-
+class InstagramPlaylistBaseIE(InstagramBaseIE):
_gis_tmpl = None # used to cache GIS request type
def _parse_graphql(self, webpage, item_id):
@@ -300,10 +478,6 @@ class InstagramPlaylistIE(InfoExtractor):
def _extract_graphql(self, data, url):
# Parses GraphQL queries containing videos and generates a playlist.
- def get_count(suffix):
- return int_or_none(try_get(
- node, lambda x: x['edge_media_' + suffix]['count']))
-
uploader_id = self._match_id(url)
csrf_token = data['config']['csrf_token']
rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8'
@@ -324,7 +498,7 @@ class InstagramPlaylistIE(InfoExtractor):
'%s' % rhx_gis,
'',
'%s:%s' % (rhx_gis, csrf_token),
- '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']),
+ '%s:%s:%s' % (rhx_gis, csrf_token, self.get_param('http_headers')['User-Agent']),
]
# try all of the ways to generate a GIS query, and not only use the
@@ -352,55 +526,14 @@ class InstagramPlaylistIE(InfoExtractor):
continue
raise
- edges = media.get('edges')
- if not edges or not isinstance(edges, list):
+ nodes = traverse_obj(media, ('edges', ..., 'node'), expected_type=dict) or []
+ if not nodes:
break
+ yield from self._extract_nodes(nodes)
- for edge in edges:
- node = edge.get('node')
- if not node or not isinstance(node, dict):
- continue
- if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True:
- continue
- video_id = node.get('shortcode')
- if not video_id:
- continue
-
- info = self.url_result(
- 'https://instagram.com/p/%s/' % video_id,
- ie=InstagramIE.ie_key(), video_id=video_id)
-
- description = try_get(
- node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
- compat_str)
- thumbnail = node.get('thumbnail_src') or node.get('display_src')
- timestamp = int_or_none(node.get('taken_at_timestamp'))
-
- comment_count = get_count('to_comment')
- like_count = get_count('preview_like')
- view_count = int_or_none(node.get('video_view_count'))
-
- info.update({
- 'description': description,
- 'thumbnail': thumbnail,
- 'timestamp': timestamp,
- 'comment_count': comment_count,
- 'like_count': like_count,
- 'view_count': view_count,
- })
-
- yield info
-
- page_info = media.get('page_info')
- if not page_info or not isinstance(page_info, dict):
- break
-
- has_next_page = page_info.get('has_next_page')
- if not has_next_page:
- break
-
- cursor = page_info.get('end_cursor')
- if not cursor or not isinstance(cursor, compat_str):
+ has_next_page = traverse_obj(media, ('page_info', 'has_next_page'))
+ cursor = traverse_obj(media, ('page_info', 'end_cursor'), expected_type=str)
+ if not has_next_page or not cursor:
break
def _real_extract(self, url):
@@ -414,11 +547,11 @@ class InstagramPlaylistIE(InfoExtractor):
self._extract_graphql(data, url), user_or_tag, user_or_tag)
-class InstagramUserIE(InstagramPlaylistIE):
+class InstagramUserIE(InstagramPlaylistBaseIE):
_VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])'
IE_DESC = 'Instagram user profile'
IE_NAME = 'instagram:user'
- _TEST = {
+ _TESTS = [{
'url': 'https://instagram.com/porsche',
'info_dict': {
'id': 'porsche',
@@ -430,7 +563,7 @@ class InstagramUserIE(InstagramPlaylistIE):
'skip_download': True,
'playlistend': 5,
}
- }
+ }]
_QUERY_HASH = '42323d64886122307be10013ad2dcc44',
@@ -448,11 +581,11 @@ class InstagramUserIE(InstagramPlaylistIE):
}
-class InstagramTagIE(InstagramPlaylistIE):
+class InstagramTagIE(InstagramPlaylistBaseIE):
_VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)'
- IE_DESC = 'Instagram hashtag search'
+ IE_DESC = 'Instagram hashtag search URLs'
IE_NAME = 'instagram:tag'
- _TEST = {
+ _TESTS = [{
'url': 'https://instagram.com/explore/tags/lolcats',
'info_dict': {
'id': 'lolcats',
@@ -464,7 +597,7 @@ class InstagramTagIE(InstagramPlaylistIE):
'skip_download': True,
'playlistend': 50,
}
- }
+ }]
_QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314',
@@ -481,3 +614,58 @@ class InstagramTagIE(InstagramPlaylistIE):
'tag_name':
data['entry_data']['TagPage'][0]['graphql']['hashtag']['name']
}
+
+
+class InstagramStoryIE(InstagramBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?instagram\.com/stories/(?P<user>[^/]+)/(?P<id>\d+)'
+ IE_NAME = 'instagram:story'
+
+ _TESTS = [{
+ 'url': 'https://www.instagram.com/stories/highlights/18090946048123978/',
+ 'info_dict': {
+ 'id': '18090946048123978',
+ 'title': 'Rare',
+ },
+ 'playlist_mincount': 50
+ }]
+
+ def _real_extract(self, url):
+ username, story_id = self._match_valid_url(url).groups()
+
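+ # highlight stories are addressed by the story id, regular user stories by username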
+ story_info_url = f'{username}/{story_id}/?__a=1' if username == 'highlights' else f'{username}/?__a=1'
+ story_info = self._download_json(f'https://www.instagram.com/stories/{story_info_url}', story_id, headers={
+ 'X-IG-App-ID': 936619743392459,
+ 'X-ASBD-ID': 198387,
+ 'X-IG-WWW-Claim': 0,
+ 'X-Requested-With': 'XMLHttpRequest',
+ 'Referer': url,
+ })
+ user_id = story_info['user']['id']
+ highlight_title = traverse_obj(story_info, ('highlight', 'title'))
+
+ story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}'
+ videos = self._download_json(f'https://i.instagram.com/api/v1/feed/reels_media/?reel_ids={story_info_url}', story_id, headers={
+ 'X-IG-App-ID': 936619743392459,
+ 'X-ASBD-ID': 198387,
+ 'X-IG-WWW-Claim': 0,
+ })['reels']
+
+ full_name = traverse_obj(videos, ('user', 'full_name'))
+
+ user_info = {}
+ if not (username and username != 'highlights' and full_name):
+ user_info = self._download_json(
+ f'https://i.instagram.com/api/v1/users/{user_id}/info/', story_id, headers={
+ 'User-Agent': 'Mozilla/5.0 (Linux; Android 11; SM-A505F Build/RP1A.200720.012; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/96.0.4664.45 Mobile Safari/537.36 Instagram 214.1.0.29.120 Android (30/11; 450dpi; 1080x2122; samsung; SM-A505F; a50; exynos9610; en_US; 333717274)',
+ }, note='Downloading user info')
+
+ username = traverse_obj(user_info, ('user', 'username')) or username
+ full_name = traverse_obj(user_info, ('user', 'full_name')) or full_name
+
+ highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (str(user_id), 'items'))
+ return self.playlist_result([{
+ **self._extract_product(highlight),
+ 'title': f'Story by {username}',
+ 'uploader': full_name,
+ 'uploader_id': user_id,
+ } for highlight in highlights], playlist_id=story_id, playlist_title=highlight_title)
diff --git a/hypervideo_dl/extractor/internazionale.py b/hypervideo_dl/extractor/internazionale.py
index 676e8e2..45e2af6 100644
--- a/hypervideo_dl/extractor/internazionale.py
+++ b/hypervideo_dl/extractor/internazionale.py
@@ -20,9 +20,6 @@ class InternazionaleIE(InfoExtractor):
'upload_date': '20150219',
'thumbnail': r're:^https?://.*\.jpg$',
},
- 'params': {
- 'format': 'bestvideo',
- },
}, {
'url': 'https://www.internazionale.it/video/2018/08/29/telefono-stare-con-noi-stessi',
'md5': '9db8663704cab73eb972d1cee0082c79',
@@ -36,9 +33,6 @@ class InternazionaleIE(InfoExtractor):
'upload_date': '20180829',
'thumbnail': r're:^https?://.*\.jpg$',
},
- 'params': {
- 'format': 'bestvideo',
- },
}]
def _real_extract(self, url):
diff --git a/hypervideo_dl/extractor/iprima.py b/hypervideo_dl/extractor/iprima.py
index 28e6609..1a20384 100644
--- a/hypervideo_dl/extractor/iprima.py
+++ b/hypervideo_dl/extractor/iprima.py
@@ -8,12 +8,19 @@ from .common import InfoExtractor
from ..utils import (
determine_ext,
js_to_json,
+ urlencode_postdata,
+ ExtractorError,
+ parse_qs
)
class IPrimaIE(InfoExtractor):
- _VALID_URL = r'https?://(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?!cnn)(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_GEO_BYPASS = False
+ _NETRC_MACHINE = 'iprima'
+ _LOGIN_URL = 'https://auth.iprima.cz/oauth2/login'
+ _TOKEN_URL = 'https://auth.iprima.cz/oauth2/token'
+ access_token = None
_TESTS = [{
'url': 'https://prima.iprima.cz/particka/92-epizoda',
@@ -22,16 +29,8 @@ class IPrimaIE(InfoExtractor):
'ext': 'mp4',
'title': 'Partička (92)',
'description': 'md5:859d53beae4609e6dd7796413f1b6cac',
- },
- 'params': {
- 'skip_download': True, # m3u8 download
- },
- }, {
- 'url': 'https://cnn.iprima.cz/videa/70-epizoda',
- 'info_dict': {
- 'id': 'p681554',
- 'ext': 'mp4',
- 'title': 'HLAVNÍ ZPRÁVY 3.5.2020',
+ 'upload_date': '20201103',
+ 'timestamp': 1604437480,
},
'params': {
'skip_download': True, # m3u8 download
@@ -44,11 +43,9 @@ class IPrimaIE(InfoExtractor):
'url': 'http://play.iprima.cz/closer-nove-pripady/closer-nove-pripady-iv-1',
'only_matching': True,
}, {
- # iframe api.play-backend.iprima.cz
'url': 'https://prima.iprima.cz/my-little-pony/mapa-znameni-2-2',
'only_matching': True,
}, {
- # iframe prima.iprima.cz
'url': 'https://prima.iprima.cz/porady/jak-se-stavi-sen/rodina-rathousova-praha',
'only_matching': True,
}, {
@@ -66,9 +63,125 @@ class IPrimaIE(InfoExtractor):
}, {
'url': 'https://love.iprima.cz/laska-az-za-hrob/slib-dany-bratrovi',
'only_matching': True,
- }, {
- 'url': 'https://autosalon.iprima.cz/motorsport/7-epizoda-1',
- 'only_matching': True,
+ }]
+
+ def _perform_login(self, username, password):
+ if self.access_token:
+ return
+
+ login_page = self._download_webpage(
+ self._LOGIN_URL, None, note='Downloading login page',
+ errnote='Downloading login page failed')
+
+ login_form = self._hidden_inputs(login_page)
+
+ login_form.update({
+ '_email': username,
+ '_password': password})
+
+ _, login_handle = self._download_webpage_handle(
+ self._LOGIN_URL, None, data=urlencode_postdata(login_form),
+ note='Logging in')
+
+ code = parse_qs(login_handle.geturl()).get('code', [None])[0]
+ if not code:
+ raise ExtractorError('Login failed', expected=True)
+
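+ # exchange the OAuth2 authorization code for an access token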
+ token_request_data = {
+ 'scope': 'openid+email+profile+phone+address+offline_access',
+ 'client_id': 'prima_sso',
+ 'grant_type': 'authorization_code',
+ 'code': code,
+ 'redirect_uri': 'https://auth.iprima.cz/sso/auth-check'}
+
+ token_data = self._download_json(
+ self._TOKEN_URL, None,
+ note='Downloading token', errnote='Downloading token failed',
+ data=urlencode_postdata(token_request_data))
+
+ self.access_token = token_data.get('access_token')
+ if self.access_token is None:
+ raise ExtractorError('Getting token failed', expected=True)
+
+ def _real_initialize(self):
+ if not self.access_token:
+ self.raise_login_required('Login is required to access any iPrima content', method='password')
+
+ def _raise_access_error(self, error_code):
+ if error_code == 'PLAY_GEOIP_DENIED':
+ self.raise_geo_restricted(countries=['CZ'], metadata_available=True)
+ elif error_code is not None:
+ self.raise_no_formats('Access to stream info forbidden', expected=True)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_meta(
+ ['og:title', 'twitter:title'],
+ webpage, 'title', default=None)
+
+ video_id = self._search_regex((
+ r'productId\s*=\s*([\'"])(?P<id>p\d+)\1',
+ r'pproduct_id\s*=\s*([\'"])(?P<id>p\d+)\1'),
+ webpage, 'real id', group='id')
+
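+ # the play endpoint answers 403 with a JSON body whose errorCode is inspected below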
+ metadata = self._download_json(
+ f'https://api.play-backend.iprima.cz/api/v1//products/id-{video_id}/play',
+ video_id, note='Getting manifest URLs', errnote='Failed to get manifest URLs',
+ headers={'X-OTT-Access-Token': self.access_token},
+ expected_status=403)
+
+ self._raise_access_error(metadata.get('errorCode'))
+
+ stream_infos = metadata.get('streamInfos')
+ formats = []
+ if stream_infos is None:
+ self.raise_no_formats('Reading stream info failed', expected=True)
+ else:
+ for manifest in stream_infos:
+ manifest_type = manifest.get('type')
+ manifest_url = manifest.get('url')
+ ext = determine_ext(manifest_url)
+ if manifest_type == 'HLS' or ext == 'm3u8':
+ formats += self._extract_m3u8_formats(
+ manifest_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False)
+ elif manifest_type == 'DASH' or ext == 'mpd':
+ formats += self._extract_mpd_formats(
+ manifest_url, video_id, mpd_id='dash', fatal=False)
+ self._sort_formats(formats)
+
+ final_result = self._search_json_ld(webpage, video_id) or {}
+ final_result.update({
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': self._html_search_meta(
+ ['thumbnail', 'og:image', 'twitter:image'],
+ webpage, 'thumbnail', default=None),
+ 'formats': formats,
+ 'description': self._html_search_meta(
+ ['description', 'og:description', 'twitter:description'],
+ webpage, 'description', default=None)})
+
+ return final_result
+
+
+class IPrimaCNNIE(InfoExtractor):
+ _VALID_URL = r'https?://cnn\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _GEO_BYPASS = False
+
+ _TESTS = [{
+ 'url': 'https://cnn.iprima.cz/porady/strunc/24072020-koronaviru-mam-plne-zuby-strasit-druhou-vlnou-je-absurdni-rika-senatorka-dernerova',
+ 'info_dict': {
+ 'id': 'p716177',
+ 'ext': 'mp4',
+ 'title': 'md5:277c6b1ed0577e51b40ddd35602ff43e',
+ },
+ 'params': {
+ 'skip_download': 'm3u8'
+ }
}]
def _real_extract(self, url):
diff --git a/hypervideo_dl/extractor/iqiyi.py b/hypervideo_dl/extractor/iqiyi.py
index b13b9f4..d07b39d 100644
--- a/hypervideo_dl/extractor/iqiyi.py
+++ b/hypervideo_dl/extractor/iqiyi.py
@@ -9,14 +9,28 @@ import time
from .common import InfoExtractor
from ..compat import (
compat_str,
+ compat_urllib_parse_unquote
)
+from .openload import PhantomJSwrapper
from ..utils import (
clean_html,
+ ExtractorError,
+ float_or_none,
+ format_field,
get_element_by_id,
get_element_by_attribute,
- ExtractorError,
+ int_or_none,
+ js_to_json,
ohdave_rsa_encrypt,
+ parse_age_limit,
+ parse_duration,
+ parse_iso8601,
+ parse_resolution,
+ qualities,
remove_start,
+ str_or_none,
+ traverse_obj,
+ urljoin,
)
@@ -96,9 +110,6 @@ class IqiyiIE(InfoExtractor):
'18': 7, # 1080p
}
- def _real_initialize(self):
- self._login()
-
@staticmethod
def _rsa_fun(data):
# public key extracted from http://static.iqiyi.com/js/qiyiV2/20160129180840/jobs/i18n/i18nIndex.js
@@ -107,7 +118,7 @@ class IqiyiIE(InfoExtractor):
return ohdave_rsa_encrypt(data, e, N)
- def _login(self):
+ def _perform_login(self, username, password):
raise ExtractorError("iQiyi's non-free authentication algorithm has made login impossible", expected=True)
def get_raw_data(self, tvid, video_id):
@@ -217,3 +228,359 @@ class IqiyiIE(InfoExtractor):
'title': title,
'formats': formats,
}
+
+
+class IqIE(InfoExtractor):
+ IE_NAME = 'iq.com'
+ IE_DESC = 'International version of iQiyi'
+ _VALID_URL = r'https?://(?:www\.)?iq\.com/play/(?:[\w%-]*-)?(?P<id>\w+)'
+ _TESTS = [{
+ 'url': 'https://www.iq.com/play/one-piece-episode-1000-1ma1i6ferf4',
+ 'md5': '2d7caf6eeca8a32b407094b33b757d39',
+ 'info_dict': {
+ 'ext': 'mp4',
+ 'id': '1ma1i6ferf4',
+ 'title': '航海王 第1000集',
+ 'description': 'Subtitle available on Sunday 4PM(GMT+8).',
+ 'duration': 1430,
+ 'timestamp': 1637488203,
+ 'upload_date': '20211121',
+ 'episode_number': 1000,
+ 'episode': 'Episode 1000',
+ 'series': 'One Piece',
+ 'age_limit': 13,
+ 'average_rating': float,
+ },
+ 'params': {
+ 'format': '500',
+ },
+ 'expected_warnings': ['format is restricted']
+ }, {
+ # VIP-restricted video
+ 'url': 'https://www.iq.com/play/mermaid-in-the-fog-2021-gbdpx13bs4',
+ 'only_matching': True
+ }]
+ _BID_TAGS = {
+ '100': '240P',
+ '200': '360P',
+ '300': '480P',
+ '500': '720P',
+ '600': '1080P',
+ '610': '1080P50',
+ '700': '2K',
+ '800': '4K',
+ }
+ _LID_TAGS = {
+ '1': 'zh_CN',
+ '2': 'zh_TW',
+ '3': 'en',
+ '18': 'th',
+ '21': 'my',
+ '23': 'vi',
+ '24': 'id',
+ '26': 'es',
+ '28': 'ar',
+ }
+
+ _DASH_JS = '''
+ console.log(page.evaluate(function() {
+ var tvid = "%(tvid)s"; var vid = "%(vid)s"; var src = "%(src)s";
+ var uid = "%(uid)s"; var dfp = "%(dfp)s"; var mode = "%(mode)s"; var lang = "%(lang)s";
+ var bid_list = %(bid_list)s; var ut_list = %(ut_list)s; var tm = new Date().getTime();
+ var cmd5x_func = %(cmd5x_func)s; var cmd5x_exporter = {}; cmd5x_func({}, cmd5x_exporter, {}); var cmd5x = cmd5x_exporter.cmd5x;
+ var authKey = cmd5x(cmd5x('') + tm + '' + tvid);
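+ // authKey chains cmd5x over an empty-string digest, the current timestamp and the tvid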
+ var k_uid = Array.apply(null, Array(32)).map(function() {return Math.floor(Math.random() * 15).toString(16)}).join('');
+ var dash_paths = {};
+ bid_list.forEach(function(bid) {
+ var query = {
+ 'tvid': tvid,
+ 'bid': bid,
+ 'ds': 1,
+ 'vid': vid,
+ 'src': src,
+ 'vt': 0,
+ 'rs': 1,
+ 'uid': uid,
+ 'ori': 'pcw',
+ 'ps': 1,
+ 'k_uid': k_uid,
+ 'pt': 0,
+ 'd': 0,
+ 's': '',
+ 'lid': '',
+ 'slid': 0,
+ 'cf': '',
+ 'ct': '',
+ 'authKey': authKey,
+ 'k_tag': 1,
+ 'ost': 0,
+ 'ppt': 0,
+ 'dfp': dfp,
+ 'prio': JSON.stringify({
+ 'ff': 'f4v',
+ 'code': 2
+ }),
+ 'k_err_retries': 0,
+ 'up': '',
+ 'su': 2,
+ 'applang': lang,
+ 'sver': 2,
+ 'X-USER-MODE': mode,
+ 'qd_v': 2,
+ 'tm': tm,
+ 'qdy': 'a',
+ 'qds': 0,
+ 'k_ft1': 141287244169348,
+ 'k_ft4': 34359746564,
+ 'k_ft5': 1,
+ 'bop': JSON.stringify({
+ 'version': '10.0',
+ 'dfp': dfp
+ }),
+ };
+ var enc_params = [];
+ for (var prop in query) {
+ enc_params.push(encodeURIComponent(prop) + '=' + encodeURIComponent(query[prop]));
+ }
+ ut_list.forEach(function(ut) {
+ enc_params.push('ut=' + ut);
+ })
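+ // the vf parameter signs the assembled /dash query string with the same cmd5x function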
+ var dash_path = '/dash?' + enc_params.join('&'); dash_path += '&vf=' + cmd5x(dash_path);
+ dash_paths[bid] = dash_path;
+ });
+ return JSON.stringify(dash_paths);
+ }));
+ saveAndExit();
+ '''
+
+ def _extract_vms_player_js(self, webpage, video_id):
+ player_js_cache = self._downloader.cache.load('iq', 'player_js')
+ if player_js_cache:
+ return player_js_cache
+ webpack_js_url = self._proto_relative_url(self._search_regex(
+ r'<script src="((?:https?)?//stc.iqiyipic.com/_next/static/chunks/webpack-\w+\.js)"', webpage, 'webpack URL'))
+ webpack_js = self._download_webpage(webpack_js_url, video_id, note='Downloading webpack JS', errnote='Unable to download webpack JS')
+ webpack_map1, webpack_map2 = [self._parse_json(js_map, video_id, transform_source=js_to_json) for js_map in self._search_regex(
+ r'\(({[^}]*})\[\w+\][^\)]*\)\s*\+\s*["\']\.["\']\s*\+\s*({[^}]*})\[\w+\]\+["\']\.js', webpack_js, 'JS locations', group=(1, 2))]
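+ # the two webpack maps translate chunk ids into filename parts; each candidate module is probed until one contains the VMS request code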
+ for module_index in reversed(list(webpack_map2.keys())):
+ module_js = self._download_webpage(
+ f'https://stc.iqiyipic.com/_next/static/chunks/{webpack_map1.get(module_index, module_index)}.{webpack_map2[module_index]}.js',
+ video_id, note=f'Downloading #{module_index} module JS', errnote='Unable to download module JS', fatal=False) or ''
+ if 'vms request' in module_js:
+ self._downloader.cache.store('iq', 'player_js', module_js)
+ return module_js
+ raise ExtractorError('Unable to extract player JS')
+
+ def _extract_cmd5x_function(self, webpage, video_id):
+ return self._search_regex(r',\s*(function\s*\([^\)]*\)\s*{\s*var _qda.+_qdc\(\)\s*})\s*,',
+ self._extract_vms_player_js(webpage, video_id), 'signature function')
+
+ def _update_bid_tags(self, webpage, video_id):
+ extracted_bid_tags = self._parse_json(
+ self._search_regex(
+ r'arguments\[1\][^,]*,\s*function\s*\([^\)]*\)\s*{\s*"use strict";?\s*var \w=({.+}})\s*,\s*\w\s*=\s*{\s*getNewVd',
+ self._extract_vms_player_js(webpage, video_id), 'video tags', default=''),
+ video_id, transform_source=js_to_json, fatal=False)
+ if not extracted_bid_tags:
+ return
+ self._BID_TAGS = {
+ bid: traverse_obj(extracted_bid_tags, (bid, 'value'), expected_type=str, default=self._BID_TAGS.get(bid))
+ for bid in extracted_bid_tags.keys()
+ }
+
+ def _get_cookie(self, name, default=None):
+ cookie = self._get_cookies('https://iq.com/').get(name)
+ return cookie.value if cookie else default
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ self._update_bid_tags(webpage, video_id)
+
+ next_props = self._search_nextjs_data(webpage, video_id)['props']
+ page_data = next_props['initialState']['play']
+ video_info = page_data['curVideoInfo']
+
+ uid = traverse_obj(
+ self._parse_json(
+ self._get_cookie('I00002', '{}'), video_id, transform_source=compat_urllib_parse_unquote, fatal=False),
+ ('data', 'uid'), default=0)
+
+ if uid:
+ vip_data = self._download_json(
+ 'https://pcw-api.iq.com/api/vtype', video_id, note='Downloading VIP data', errnote='Unable to download VIP data', query={
+ 'batch': 1,
+ 'platformId': 3,
+ 'modeCode': self._get_cookie('mod', 'intl'),
+ 'langCode': self._get_cookie('lang', 'en_us'),
+ 'deviceId': self._get_cookie('QC005', '')
+ }, fatal=False)
+ ut_list = traverse_obj(vip_data, ('data', 'all_vip', ..., 'vipType'), expected_type=str_or_none, default=[])
+ else:
+ ut_list = ['0']
+
+ # bid 0 as an initial format checker
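+ # (the bid-0 response lists which real bids are available before each one is fetched)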
+ dash_paths = self._parse_json(PhantomJSwrapper(self).get(
+ url, html='<!DOCTYPE html>', video_id=video_id, note2='Executing signature code', jscode=self._DASH_JS % {
+ 'tvid': video_info['tvId'],
+ 'vid': video_info['vid'],
+ 'src': traverse_obj(next_props, ('initialProps', 'pageProps', 'ptid'),
+ expected_type=str, default='04022001010011000000'),
+ 'uid': uid,
+ 'dfp': self._get_cookie('dfp', ''),
+ 'mode': self._get_cookie('mod', 'intl'),
+ 'lang': self._get_cookie('lang', 'en_us'),
+ 'bid_list': '[' + ','.join(['0', *self._BID_TAGS.keys()]) + ']',
+ 'ut_list': '[' + ','.join(ut_list) + ']',
+ 'cmd5x_func': self._extract_cmd5x_function(webpage, video_id),
+ })[1].strip(), video_id)
+
+ formats, subtitles = [], {}
+ initial_format_data = self._download_json(
+ urljoin('https://cache-video.iq.com', dash_paths['0']), video_id,
+ note='Downloading initial video format info', errnote='Unable to download initial video format info')['data']
+
+ preview_time = traverse_obj(
+ initial_format_data, ('boss_ts', (None, 'data'), ('previewTime', 'rtime')), expected_type=float_or_none, get_all=False)
+ if traverse_obj(initial_format_data, ('boss_ts', 'data', 'prv'), expected_type=int_or_none):
+ self.report_warning('This preview video is limited%s' % format_field(preview_time, template=' to %s seconds'))
+
+ # TODO: Extract audio-only formats
+ for bid in set(traverse_obj(initial_format_data, ('program', 'video', ..., 'bid'), expected_type=str_or_none, default=[])):
+ dash_path = dash_paths.get(bid)
+ if not dash_path:
+ self.report_warning(f'Unknown format id: {bid}. It is currently not being extracted')
+ continue
+ format_data = traverse_obj(self._download_json(
+ urljoin('https://cache-video.iq.com', dash_path), video_id,
+ note=f'Downloading format data for {self._BID_TAGS[bid]}', errnote='Unable to download format data',
+ fatal=False), 'data', expected_type=dict)
+
+ video_format = traverse_obj(format_data, ('program', 'video', lambda _, v: str(v['bid']) == bid),
+ expected_type=dict, default=[], get_all=False) or {}
+ extracted_formats = []
+ if video_format.get('m3u8Url'):
+ extracted_formats.extend(self._extract_m3u8_formats(
+ urljoin(format_data.get('dm3u8', 'https://cache-m.iq.com/dc/dt/'), video_format['m3u8Url']),
+ video_id, 'mp4', m3u8_id=bid, fatal=False))
+ if video_format.get('mpdUrl'):
+ # TODO: Properly extract mpd hostname
+ extracted_formats.extend(self._extract_mpd_formats(
+ urljoin(format_data.get('dm3u8', 'https://cache-m.iq.com/dc/dt/'), video_format['mpdUrl']),
+ video_id, mpd_id=bid, fatal=False))
+ if video_format.get('m3u8'):
+ ff = video_format.get('ff', 'ts')
+ if ff == 'ts':
+ m3u8_formats, _ = self._parse_m3u8_formats_and_subtitles(
+ video_format['m3u8'], ext='mp4', m3u8_id=bid, fatal=False)
+ extracted_formats.extend(m3u8_formats)
+ elif ff == 'm4s':
+ mpd_data = traverse_obj(
+ self._parse_json(video_format['m3u8'], video_id, fatal=False), ('payload', ..., 'data'), expected_type=str, get_all=False)
+ if not mpd_data:
+ continue
+ mpd_formats, _ = self._parse_mpd_formats_and_subtitles(
+ self._parse_xml(mpd_data, video_id), bid, format_data.get('dm3u8', 'https://cache-m.iq.com/dc/dt/'))
+ extracted_formats.extend(mpd_formats)
+ else:
+ self.report_warning(f'{ff} formats are currently not supported')
+
+ if not extracted_formats:
+ if video_format.get('s'):
+ self.report_warning(f'{self._BID_TAGS[bid]} format is restricted')
+ else:
+ self.report_warning(f'Unable to extract {self._BID_TAGS[bid]} format')
+ for f in extracted_formats:
+ f.update({
+ 'quality': qualities(list(self._BID_TAGS.keys()))(bid),
+ 'format_note': self._BID_TAGS[bid],
+ **parse_resolution(video_format.get('scrsz'))
+ })
+ formats.extend(extracted_formats)
+
+ self._sort_formats(formats)
+
+ for sub_format in traverse_obj(initial_format_data, ('program', 'stl', ...), expected_type=dict, default=[]):
+ lang = self._LID_TAGS.get(str_or_none(sub_format.get('lid')), sub_format.get('_name'))
+ subtitles.setdefault(lang, []).extend([{
+ 'ext': format_ext,
+ 'url': urljoin(initial_format_data.get('dstl', 'http://meta.video.iqiyi.com'), sub_format[format_key])
+ } for format_key, format_ext in [('srt', 'srt'), ('webvtt', 'vtt')] if sub_format.get(format_key)])
+
+ extra_metadata = page_data.get('albumInfo') if video_info.get('albumId') and page_data.get('albumInfo') else video_info
+ return {
+ 'id': video_id,
+ 'title': video_info['name'],
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'description': video_info.get('mergeDesc'),
+ 'duration': parse_duration(video_info.get('len')),
+ 'age_limit': parse_age_limit(video_info.get('rating')),
+ 'average_rating': traverse_obj(page_data, ('playScoreInfo', 'score'), expected_type=float_or_none),
+ 'timestamp': parse_iso8601(video_info.get('isoUploadDate')),
+ 'categories': traverse_obj(extra_metadata, ('videoTagMap', ..., ..., 'name'), expected_type=str),
+ 'cast': traverse_obj(extra_metadata, ('actorArr', ..., 'name'), expected_type=str),
+ 'episode_number': int_or_none(video_info.get('order')) or None,
+ 'series': video_info.get('albumName'),
+ }
+
+
+class IqAlbumIE(InfoExtractor):
+ IE_NAME = 'iq.com:album'
+ _VALID_URL = r'https?://(?:www\.)?iq\.com/album/(?:[\w%-]*-)?(?P<id>\w+)'
+ _TESTS = [{
+ 'url': 'https://www.iq.com/album/one-piece-1999-1bk9icvr331',
+ 'info_dict': {
+ 'id': '1bk9icvr331',
+ 'title': 'One Piece',
+ 'description': 'Subtitle available on Sunday 4PM(GMT+8).'
+ },
+ 'playlist_mincount': 238
+ }, {
+ # Movie/single video
+ 'url': 'https://www.iq.com/album/九龙城寨-2021-22yjnij099k',
+ 'info_dict': {
+ 'ext': 'mp4',
+ 'id': '22yjnij099k',
+ 'title': '九龙城寨',
+ 'description': 'md5:8a09f50b8ba0db4dc69bc7c844228044',
+ 'duration': 5000,
+ 'timestamp': 1641911371,
+ 'upload_date': '20220111',
+ 'series': '九龙城寨',
+ 'cast': ['Shi Yan Neng', 'Yu Lang', 'Peter lv', 'Sun Zi Jun', 'Yang Xiao Bo'],
+ 'age_limit': 13,
+ 'average_rating': float,
+ },
+ 'expected_warnings': ['format is restricted']
+ }]
+
+ def _entries(self, album_id_num, page_ranges, album_id=None, mode_code='intl', lang_code='en_us'):
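+ # each page_range is expected to carry 'from'/'to' order bounds and an optional 'msg' label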
+ for page_range in page_ranges:
+ page = self._download_json(
+ f'https://pcw-api.iq.com/api/episodeListSource/{album_id_num}', album_id,
+ note=f'Downloading video list episodes {page_range.get("msg", "")}',
+ errnote='Unable to download video list', query={
+ 'platformId': 3,
+ 'modeCode': mode_code,
+ 'langCode': lang_code,
+ 'endOrder': page_range['to'],
+ 'startOrder': page_range['from']
+ })
+ for video in page['data']['epg']:
+ yield self.url_result('https://www.iq.com/play/%s' % (video.get('playLocSuffix') or video['qipuIdStr']),
+ IqIE.ie_key(), video.get('qipuIdStr'), video.get('name'))
+
+ def _real_extract(self, url):
+ album_id = self._match_id(url)
+ webpage = self._download_webpage(url, album_id)
+ next_data = self._search_nextjs_data(webpage, album_id)
+ album_data = next_data['props']['initialState']['album']['videoAlbumInfo']
+
+ if album_data.get('videoType') == 'singleVideo':
+ return self.url_result('https://www.iq.com/play/%s' % album_id, IqIE.ie_key())
+ return self.playlist_result(
+ self._entries(album_data['albumId'], album_data['totalPageRange'], album_id,
+ traverse_obj(next_data, ('props', 'initialProps', 'pageProps', 'modeCode')),
+ traverse_obj(next_data, ('props', 'initialProps', 'pageProps', 'langCode'))),
+ album_id, album_data.get('name'), album_data.get('desc'))
diff --git a/hypervideo_dl/extractor/itprotv.py b/hypervideo_dl/extractor/itprotv.py
new file mode 100644
index 0000000..64cb4e6
--- /dev/null
+++ b/hypervideo_dl/extractor/itprotv.py
@@ -0,0 +1,141 @@
+# coding: utf-8
+
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+ int_or_none,
+ str_or_none,
+ traverse_obj,
+ urljoin
+)
+
+
+class ITProTVBaseIE(InfoExtractor):
+ _ENDPOINTS = {
+ 'course': 'course?url={}&brand=00002560-0000-3fa9-0000-1d61000035f3',
+ 'episode': 'brand/00002560-0000-3fa9-0000-1d61000035f3/episode?url={}'
+ }
+
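+ # every API call is authorized with a JWT ("passedToken") scraped from the webpage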
+ def _call_api(self, ep, item_id, webpage):
+ return self._download_json(
+ f'https://api.itpro.tv/api/urza/v3/consumer-web/{self._ENDPOINTS[ep].format(item_id)}',
+ item_id, note=f'Fetching {ep} data API',
+ headers={'Authorization': f'Bearer {self._fetch_jwt(webpage)}'})[ep]
+
+ def _fetch_jwt(self, webpage):
+ return self._search_regex(r'{"passedToken":"([\w-]+\.[\w-]+\.[\w-]+)",', webpage, 'jwt')
+
+ def _check_if_logged_in(self, webpage):
+ if re.search(r'{\s*member\s*:\s*null', webpage):
+ self.raise_login_required()
+
+
+class ITProTVIE(ITProTVBaseIE):
+ _VALID_URL = r'https://app.itpro.tv/course/(?P<course>[\w-]+)/(?P<id>[\w-]+)'
+ _TESTS = [{
+ 'url': 'https://app.itpro.tv/course/guided-tour/introductionitprotv',
+ 'md5': 'bca4a28c2667fd1a63052e71a94bb88c',
+ 'info_dict': {
+ 'id': 'introductionitprotv',
+ 'ext': 'mp4',
+ 'title': 'An Introduction to ITProTV 101',
+ 'thumbnail': 'https://itprotv-image-bucket.s3.amazonaws.com/getting-started/itprotv-101-introduction-PGM.11_39_56_02.Still001.png',
+ 'description': 'md5:b175c2c3061ce35a4dd33865b2c1da4e',
+ 'duration': 269,
+ 'series': 'ITProTV 101',
+ 'series_id': 'guided-tour',
+ 'availability': 'needs_auth',
+ 'chapter': 'ITProTV 101',
+ 'chapter_number': 1,
+ 'chapter_id': '5dbb3de426b46c0010b5d1b6'
+ },
+ },
+ {
+ 'url': 'https://app.itpro.tv/course/beyond-tech/job-interview-tips',
+ 'md5': '101a299b98c47ccf4c67f9f0951defa8',
+ 'info_dict': {
+ 'id': 'job-interview-tips',
+ 'ext': 'mp4',
+ 'title': 'Job Interview Tips',
+ 'thumbnail': 'https://s3.amazonaws.com:443/production-itprotv-thumbnails/2f370bf5-294d-4bbe-ab80-c0b5781630ea.png',
+ 'description': 'md5:30d8ba483febdf89ec85623aad3c3cb6',
+ 'duration': 267,
+ 'series': 'Beyond Tech',
+ 'series_id': 'beyond-tech',
+ 'availability': 'needs_auth',
+ 'chapter': 'Job Development',
+ 'chapter_number': 2,
+ 'chapter_id': '5f7c78d424330c000edf04d9'
+ },
+ }]
+
+ def _real_extract(self, url):
+ episode_id, course_name = self._match_valid_url(url).group('id', 'course')
+ webpage = self._download_webpage(url, episode_id)
+ self._check_if_logged_in(webpage)
+ course = self._call_api('course', course_name, webpage)
+ episode = self._call_api('episode', episode_id, webpage)
+
+ chapter_number, chapter = next((
+ (i, topic) for i, topic in enumerate(course.get('topics') or [], 1)
+ if traverse_obj(topic, 'id') == episode.get('topic')), (None, {}))
+
+ return {
+ 'id': episode_id,
+ 'title': episode.get('title'),
+ 'description': episode.get('description'),
+ 'thumbnail': episode.get('thumbnail'),
+ 'formats': [
+ {'url': episode[f'jwVideo{h}Embed'], 'height': h}
+ for h in (320, 480, 720, 1080) if episode.get(f'jwVideo{h}Embed')
+ ],
+ 'duration': int_or_none(episode.get('length')),
+ 'series': course.get('name'),
+ 'series_id': course.get('url'),
+ 'chapter': str_or_none(chapter.get('title')),
+ 'chapter_number': chapter_number,
+ 'chapter_id': str_or_none(chapter.get('id')),
+ 'subtitles': {
+ 'en': [{'ext': 'vtt', 'data': episode['enCaptionData']}]
+ } if episode.get('enCaptionData') else None,
+ }
+
+
+class ITProTVCourseIE(ITProTVBaseIE):
+ _VALID_URL = r'https?://app.itpro.tv/course/(?P<id>[\w-]+)/?(?:$|[#?])'
+ _TESTS = [
+ {
+ 'url': 'https://app.itpro.tv/course/guided-tour',
+ 'info_dict': {
+ 'id': 'guided-tour',
+ 'description': 'md5:b175c2c3061ce35a4dd33865b2c1da4e',
+ 'title': 'ITProTV 101',
+ },
+ 'playlist_count': 6
+ },
+ {
+ 'url': 'https://app.itpro.tv/course/beyond-tech',
+ 'info_dict': {
+ 'id': 'beyond-tech',
+ 'description': 'md5:44cd99855e7f81a15ce1269bd0621fed',
+ 'title': 'Beyond Tech'
+ },
+ 'playlist_count': 15
+ },
+ ]
+
+ def _real_extract(self, url):
+ course_id = self._match_id(url)
+ webpage = self._download_webpage(url, course_id)
+ self._check_if_logged_in(webpage)
+ course = self._call_api('course', course_id, webpage)
+
+ entries = [self.url_result(
+ urljoin(url, f'{course_id}/{episode["url"]}'), ITProTVIE,
+ episode['url'], episode.get('title'), url_transparent=True)
+ for episode in course['episodes']]
+
+ return self.playlist_result(
+ entries, course_id, course.get('name'), course.get('description'))
diff --git a/hypervideo_dl/extractor/itv.py b/hypervideo_dl/extractor/itv.py
index 4cd34a2..66705a2 100644
--- a/hypervideo_dl/extractor/itv.py
+++ b/hypervideo_dl/extractor/itv.py
@@ -117,7 +117,7 @@ class ITVIE(InfoExtractor):
# See: https://github.com/hypervideo/hypervideo/issues/986
platform_tag_subs, featureset_subs = next(
((platform_tag, featureset)
- for platform_tag, featuresets in reversed(variants.items()) for featureset in featuresets
+ for platform_tag, featuresets in reversed(list(variants.items())) for featureset in featuresets
if try_get(featureset, lambda x: x[2]) == 'outband-webvtt'),
(None, None))
@@ -146,8 +146,8 @@ class ITVIE(InfoExtractor):
# See: https://github.com/hypervideo/hypervideo/issues/986
platform_tag_video, featureset_video = next(
((platform_tag, featureset)
- for platform_tag, featuresets in reversed(variants.items()) for featureset in featuresets
- if try_get(featureset, lambda x: x[:2]) == ['hls', 'aes']),
+ for platform_tag, featuresets in reversed(list(variants.items())) for featureset in featuresets
+ if set(try_get(featureset, lambda x: x[:2]) or []) == {'aes', 'hls'}),
(None, None))
if not platform_tag_video or not featureset_video:
raise ExtractorError('No downloads available', expected=True, video_id=video_id)
@@ -220,35 +220,42 @@ class ITVIE(InfoExtractor):
class ITVBTCCIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?itv\.com/(?:news|btcc)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _TESTS = [{
'url': 'https://www.itv.com/btcc/articles/btcc-2019-brands-hatch-gp-race-action',
'info_dict': {
'id': 'btcc-2019-brands-hatch-gp-race-action',
'title': 'BTCC 2019: Brands Hatch GP race action',
},
'playlist_count': 12,
- }
- BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s'
+ }, {
+ 'url': 'https://www.itv.com/news/2021-10-27/i-have-to-protect-the-country-says-rishi-sunak-as-uk-faces-interest-rate-hike',
+ 'info_dict': {
+ 'id': 'i-have-to-protect-the-country-says-rishi-sunak-as-uk-faces-interest-rate-hike',
+ 'title': 'md5:6ef054dd9f069330db3dcc66cb772d32'
+ },
+ 'playlist_count': 4
+ }]
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
- json_map = try_get(self._parse_json(self._html_search_regex(
- '(?s)<script[^>]+id=[\'"]__NEXT_DATA__[^>]*>([^<]+)</script>', webpage, 'json_map'), playlist_id),
+ json_map = try_get(
+ self._search_nextjs_data(webpage, playlist_id),
lambda x: x['props']['pageProps']['article']['body']['content']) or []
- # Discard empty objects
- video_ids = []
+ entries = []
for video in json_map:
- if video['data'].get('id'):
- video_ids.append(video['data']['id'])
-
- entries = [
- self.url_result(
- smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {
+ if not any(video['data'].get(attr) == 'Brightcove' for attr in ('name', 'type')):
+ continue
+ video_id = video['data']['id']
+ account_id = video['data']['accountId']
+ player_id = video['data']['playerId']
+ entries.append(self.url_result(
+ smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), {
# ITV does not like some GB IP ranges, so here are some
# IP blocks it accepts
'geo_ip_blocks': [
@@ -256,8 +263,7 @@ class ITVBTCCIE(InfoExtractor):
],
'referrer': url,
}),
- ie=BrightcoveNewIE.ie_key(), video_id=video_id)
- for video_id in video_ids]
+ ie=BrightcoveNewIE.ie_key(), video_id=video_id))
title = self._og_search_title(webpage, fatal=False)
diff --git a/hypervideo_dl/extractor/ivideon.py b/hypervideo_dl/extractor/ivideon.py
index 01e7b22..44b2208 100644
--- a/hypervideo_dl/extractor/ivideon.py
+++ b/hypervideo_dl/extractor/ivideon.py
@@ -75,7 +75,7 @@ class IvideonIE(InfoExtractor):
return {
'id': server_id,
- 'title': self._live_title(camera_name or server_id),
+ 'title': camera_name or server_id,
'description': description,
'is_live': True,
'formats': formats,
diff --git a/hypervideo_dl/extractor/iwara.py b/hypervideo_dl/extractor/iwara.py
index 254d986..c0e01e3 100644
--- a/hypervideo_dl/extractor/iwara.py
+++ b/hypervideo_dl/extractor/iwara.py
@@ -76,8 +76,7 @@ class IwaraIE(InfoExtractor):
'age_limit': age_limit,
}
- title = remove_end(self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title'), ' | Iwara')
+ title = remove_end(self._html_extract_title(webpage), ' | Iwara')
thumbnail = self._html_search_regex(
r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None)
diff --git a/hypervideo_dl/extractor/jamendo.py b/hypervideo_dl/extractor/jamendo.py
index 1db7c64..755d970 100644
--- a/hypervideo_dl/extractor/jamendo.py
+++ b/hypervideo_dl/extractor/jamendo.py
@@ -59,7 +59,7 @@ class JamendoIE(InfoExtractor):
})[0]
def _real_extract(self, url):
- track_id, display_id = self._VALID_URL_RE.match(url).groups()
+ track_id, display_id = self._match_valid_url(url).groups()
# webpage = self._download_webpage(
# 'https://www.jamendo.com/track/' + track_id, track_id)
# models = self._parse_json(self._html_search_regex(
diff --git a/hypervideo_dl/extractor/joj.py b/hypervideo_dl/extractor/joj.py
index 6376181..7350f53 100644
--- a/hypervideo_dl/extractor/joj.py
+++ b/hypervideo_dl/extractor/joj.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
+ format_field,
int_or_none,
js_to_json,
try_get,
@@ -72,7 +73,7 @@ class JojIE(InfoExtractor):
r'(\d+)[pP]\.', format_url, 'height', default=None)
formats.append({
'url': format_url,
- 'format_id': '%sp' % height if height else None,
+ 'format_id': format_field(height, template='%sp'),
'height': int(height),
})
if not formats:
diff --git a/hypervideo_dl/extractor/kakao.py b/hypervideo_dl/extractor/kakao.py
index 97c986d..483ab71 100644
--- a/hypervideo_dl/extractor/kakao.py
+++ b/hypervideo_dl/extractor/kakao.py
@@ -3,10 +3,12 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import compat_HTTPError
from ..utils import (
+ ExtractorError,
int_or_none,
strip_or_none,
+ str_or_none,
traverse_obj,
unified_timestamp,
)
@@ -24,10 +26,17 @@ class KakaoIE(InfoExtractor):
'id': '301965083',
'ext': 'mp4',
'title': '乃木坂46 バナナマン 「3期生紹介コーナーが始動!顔高低差GPも!」 『乃木坂工事中』',
- 'uploader_id': 2671005,
+ 'description': '',
+ 'uploader_id': '2671005',
'uploader': '그랑그랑이',
'timestamp': 1488160199,
'upload_date': '20170227',
+ 'like_count': int,
+ 'thumbnail': r're:http://.+/thumb\.png',
+ 'tags': ['乃木坂'],
+ 'view_count': int,
+ 'duration': 1503,
+ 'comment_count': int,
}
}, {
'url': 'http://tv.kakao.com/channel/2653210/cliplink/300103180',
@@ -37,11 +46,21 @@ class KakaoIE(InfoExtractor):
'ext': 'mp4',
'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회',
'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)',
- 'uploader_id': 2653210,
+ 'uploader_id': '2653210',
'uploader': '쇼! 음악중심',
'timestamp': 1485684628,
'upload_date': '20170129',
+ 'like_count': int,
+ 'thumbnail': r're:http://.+/thumb\.png',
+ 'tags': 'count:28',
+ 'view_count': int,
+ 'duration': 184,
+ 'comment_count': int,
}
+ }, {
+ # geo restricted
+ 'url': 'https://tv.kakao.com/channel/3643855/cliplink/412069491',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -73,19 +92,24 @@ class KakaoIE(InfoExtractor):
title = clip.get('title') or clip_link.get('displayTitle')
formats = []
- for fmt in clip.get('videoOutputList', []):
+ for fmt in clip.get('videoOutputList') or []:
profile_name = fmt.get('profile')
if not profile_name or profile_name == 'AUDIO':
continue
query.update({
'profile': profile_name,
- 'fields': '-*,url',
+ 'fields': '-*,code,message,url',
})
+ try:
+ fmt_url_json = self._download_json(
+ cdn_api_base, video_id, query=query,
+ note='Downloading video URL for profile %s' % profile_name)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ resp = self._parse_json(e.cause.read().decode(), video_id)
+ if resp.get('code') == 'GeoBlocked':
+ self.raise_geo_restricted()
+ raise
- fmt_url_json = self._download_json(
- cdn_api_base, video_id,
- 'Downloading video URL for profile %s' % profile_name,
- query=query, fatal=False)
fmt_url = traverse_obj(fmt_url_json, ('videoLocation', 'url'))
if not fmt_url:
continue
@@ -105,7 +129,7 @@ class KakaoIE(InfoExtractor):
for thumb in clip.get('clipChapterThumbnailList') or []:
thumbs.append({
'url': thumb.get('thumbnailUrl'),
- 'id': compat_str(thumb.get('timeInSec')),
+ 'id': str(thumb.get('timeInSec')),
'preference': -1 if thumb.get('isDefault') else 0
})
top_thumbnail = clip.get('thumbnailUrl')
@@ -120,7 +144,7 @@ class KakaoIE(InfoExtractor):
'title': title,
'description': strip_or_none(clip.get('description')),
'uploader': traverse_obj(clip_link, ('channel', 'name')),
- 'uploader_id': clip_link.get('channelId'),
+ 'uploader_id': str_or_none(clip_link.get('channelId')),
'thumbnails': thumbs,
'timestamp': unified_timestamp(clip_link.get('createTime')),
'duration': int_or_none(clip.get('duration')),
diff --git a/hypervideo_dl/extractor/kaltura.py b/hypervideo_dl/extractor/kaltura.py
index c8f60ef..f6dfc9c 100644
--- a/hypervideo_dl/extractor/kaltura.py
+++ b/hypervideo_dl/extractor/kaltura.py
@@ -12,6 +12,7 @@ from ..compat import (
from ..utils import (
clean_html,
ExtractorError,
+ format_field,
int_or_none,
unsmuggle_url,
smuggle_url,
@@ -300,6 +301,7 @@ class KalturaIE(InfoExtractor):
data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url)
formats = []
+ subtitles = {}
for f in flavor_assets:
# Continue if asset is not ready
if f.get('status') != 2:
@@ -343,13 +345,14 @@ class KalturaIE(InfoExtractor):
if '/playManifest/' in data_url:
m3u8_url = sign_url(data_url.replace(
'format/url', 'format/applehttp'))
- formats.extend(self._extract_m3u8_formats(
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
m3u8_url, entry_id, 'mp4', 'm3u8_native',
- m3u8_id='hls', fatal=False))
+ m3u8_id='hls', fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
self._sort_formats(formats)
- subtitles = {}
if captions:
for caption in captions.get('objects', []):
# Continue if caption is not ready
@@ -372,6 +375,6 @@ class KalturaIE(InfoExtractor):
'thumbnail': info.get('thumbnailUrl'),
'duration': info.get('duration'),
'timestamp': info.get('createdAt'),
- 'uploader_id': info.get('userId') if info.get('userId') != 'None' else None,
+ 'uploader_id': format_field(info, 'userId', ignore=('None', None)),
'view_count': info.get('plays'),
}
diff --git a/hypervideo_dl/extractor/keezmovies.py b/hypervideo_dl/extractor/keezmovies.py
index 027f43c..06dbcbb 100644
--- a/hypervideo_dl/extractor/keezmovies.py
+++ b/hypervideo_dl/extractor/keezmovies.py
@@ -8,6 +8,7 @@ from ..compat import compat_urllib_parse_unquote
from ..utils import (
determine_ext,
ExtractorError,
+ format_field,
int_or_none,
str_to_int,
strip_or_none,
@@ -69,7 +70,7 @@ class KeezMoviesIE(InfoExtractor):
video_url, title, 32).decode('utf-8')
formats.append({
'url': format_url,
- 'format_id': '%dp' % height if height else None,
+ 'format_id': format_field(height, template='%dp'),
'height': height,
'tbr': tbr,
})
diff --git a/hypervideo_dl/extractor/kelbyone.py b/hypervideo_dl/extractor/kelbyone.py
new file mode 100644
index 0000000..20c26cf
--- /dev/null
+++ b/hypervideo_dl/extractor/kelbyone.py
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class KelbyOneIE(InfoExtractor):
+ _VALID_URL = r'https?://members\.kelbyone\.com/course/(?P<id>[^$&?#/]+)'
+
+ _TESTS = [{
+ 'url': 'https://members.kelbyone.com/course/glyn-dewis-mastering-selections/',
+ 'playlist_mincount': 1,
+ 'info_dict': {
+ 'id': 'glyn-dewis-mastering-selections',
+ 'title': 'Trailer - Mastering Selections in Photoshop',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'MkiOnLqK',
+ 'ext': 'mp4',
+ 'title': 'Trailer - Mastering Selections in Photoshop',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'thumbnail': 'https://content.jwplatform.com/v2/media/MkiOnLqK/poster.jpg?width=720',
+ 'timestamp': 1601568639,
+ 'duration': 90,
+ 'upload_date': '20201001',
+ },
+ }]
+ }]
+
+ def _entries(self, playlist):
+ for item in playlist:
+ video_id = item['mediaid']
+ thumbnails = [{
+ 'url': image.get('src'),
+ 'width': int_or_none(image.get('width')),
+ } for image in item.get('images') or []]
+ formats, subtitles = [], {}
+ for source in item.get('sources') or []:
+ if not source.get('file'):
+ continue
+ if source.get('type') == 'application/vnd.apple.mpegurl':
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(source['file'], video_id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subs, subtitles)
+ elif source.get('type') == 'audio/mp4':
+ formats.append({
+ 'format_id': source.get('label'),
+ 'url': source['file'],
+ 'vcodec': 'none',
+ })
+ else:
+ formats.append({
+ 'format_id': source.get('label'),
+ 'height': source.get('height'),
+ 'width': source.get('width'),
+ 'url': source['file'],
+ })
+ for track in item.get('tracks') or []:
+ if track.get('kind') == 'captions' and track.get('file'):
+ subtitles.setdefault('en', []).append({
+ 'url': track['file'],
+ })
+ self._sort_formats(formats)
+ yield {
+ 'id': video_id,
+ 'title': item['title'],
+ 'description': item.get('description'),
+ 'thumbnails': thumbnails,
+ 'thumbnail': item.get('image'),
+ 'timestamp': item.get('pubdate'),
+ 'duration': item.get('duration'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+ def _real_extract(self, url):
+ item_id = self._match_id(url)
+ webpage = self._download_webpage(url, item_id)
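+ # the playlist URL is JSON-escaped in the page source, hence the backslash strip below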
+ playlist_url = self._html_search_regex(r'playlist"\:"(https.*content\.jwplatform\.com.*json)"', webpage, 'playlist url').replace('\\', '')
+ course_data = self._download_json(playlist_url, item_id)
+ return self.playlist_result(self._entries(course_data['playlist']), item_id,
+ course_data.get('title'), course_data.get('description'))
diff --git a/hypervideo_dl/extractor/kinopoisk.py b/hypervideo_dl/extractor/kinopoisk.py
index 9e8d01f..cdbb642 100644
--- a/hypervideo_dl/extractor/kinopoisk.py
+++ b/hypervideo_dl/extractor/kinopoisk.py
@@ -23,9 +23,6 @@ class KinoPoiskIE(InfoExtractor):
'duration': 4533,
'age_limit': 12,
},
- 'params': {
- 'format': 'bestvideo',
- },
}, {
'url': 'https://www.kinopoisk.ru/film/81041',
'only_matching': True,
diff --git a/hypervideo_dl/extractor/koo.py b/hypervideo_dl/extractor/koo.py
index 8154ba7..2d6ed3b 100644
--- a/hypervideo_dl/extractor/koo.py
+++ b/hypervideo_dl/extractor/koo.py
@@ -8,7 +8,7 @@ from ..utils import (
class KooIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)(?:www\.)?kooapp\.com/koo/[^/]+/(?P<id>[^/&#$?]+)'
+ _VALID_URL = r'https?://(?:www\.)?kooapp\.com/koo/[^/]+/(?P<id>[^/&#$?]+)'
_TESTS = [{ # Test for video in the comments
'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/946c4189-bc2d-4524-b95b-43f641e2adde',
'info_dict': {
diff --git a/hypervideo_dl/extractor/la7.py b/hypervideo_dl/extractor/la7.py
index 363fbd6..de985e4 100644
--- a/hypervideo_dl/extractor/la7.py
+++ b/hypervideo_dl/extractor/la7.py
@@ -7,8 +7,9 @@ from .common import InfoExtractor
from ..utils import (
determine_ext,
float_or_none,
+ HEADRequest,
+ int_or_none,
parse_duration,
- smuggle_url,
unified_strdate,
)
@@ -25,19 +26,38 @@ class LA7IE(InfoExtractor):
'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722',
'md5': '8b613ffc0c4bf9b9e377169fc19c214c',
'info_dict': {
- 'id': '0_42j6wd36',
+ 'id': 'inccool8-02-10-2015-163722',
'ext': 'mp4',
'title': 'Inc.Cool8',
'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico',
'thumbnail': 're:^https?://.*',
- 'uploader_id': 'kdla7pillole@iltrovatore.it',
- 'timestamp': 1443814869,
'upload_date': '20151002',
},
}, {
'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077',
'only_matching': True,
}]
+ _HOST = 'https://awsvodpkg.iltrovatore.it'
+
+ def _generate_mp4_url(self, quality, m3u8_formats):
+ for f in m3u8_formats:
+ if f['vcodec'] != 'none' and quality in f['url']:
+ http_url = '%s%s.mp4' % (self._HOST, quality)
+
+ urlh = self._request_webpage(
+ HEADRequest(http_url), quality,
+ note='Check filesize', fatal=False)
+ if urlh:
+ http_f = f.copy()
+ del http_f['manifest_url']
+ http_f.update({
+ 'format_id': http_f['format_id'].replace('hls-', 'https-'),
+ 'url': http_url,
+ 'protocol': 'https',
+ 'filesize_approx': int_or_none(urlh.headers.get('Content-Length', None)),
+ })
+ return http_f
+ return None
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -46,22 +66,30 @@ class LA7IE(InfoExtractor):
url = '%s//%s' % (self.http_scheme(), url)
webpage = self._download_webpage(url, video_id)
+ video_path = self._search_regex(r'(/content/.*?)\.mp4', webpage, 'video_path')
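+ # video_path is a comma-separated "urlset" of quality variants; each is also probed as a plain MP4 below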
- player_data = self._search_regex(
- [r'(?s)videoParams\s*=\s*({.+?});', r'videoLa7\(({[^;]+})\);'],
- webpage, 'player data')
- vid = self._search_regex(r'vid\s*:\s*"(.+?)",', player_data, 'vid')
+ formats = self._extract_mpd_formats(
+ f'{self._HOST}/local/dash/,{video_path}.mp4.urlset/manifest.mpd',
+ video_id, mpd_id='dash', fatal=False)
+ m3u8_formats = self._extract_m3u8_formats(
+ f'{self._HOST}/local/hls/,{video_path}.mp4.urlset/master.m3u8',
+ video_id, 'mp4', m3u8_id='hls', fatal=False)
+ formats.extend(m3u8_formats)
+
+ for q in filter(None, video_path.split(',')):
+ http_f = self._generate_mp4_url(q, m3u8_formats)
+ if http_f:
+ formats.append(http_f)
+
+ self._sort_formats(formats)
return {
- '_type': 'url_transparent',
- 'url': smuggle_url('kaltura:103:%s' % vid, {
- 'service_url': 'http://nkdam.iltrovatore.it',
- }),
'id': video_id,
'title': self._og_search_title(webpage, default=None),
'description': self._og_search_description(webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage, default=None),
- 'ie_key': 'Kaltura',
+ 'formats': formats,
+ 'upload_date': unified_strdate(self._search_regex(r'datetime="(.+?)"', webpage, 'upload_date', fatal=False))
}
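
The rewritten LA7 extractor drops the Kaltura hand-off: it derives DASH and HLS manifests from the `/content/...` path found in the page, and `_generate_mp4_url` probes a progressive MP4 twin of each HLS variant with a HEAD request so the format can carry a `filesize_approx`. A minimal stdlib sketch of that probing step, under the assumption that the variant simply may not exist (the URL below is a made-up placeholder):

    import urllib.request

    def probe_progressive(url, timeout=10):
        # HEAD the candidate MP4; if the server answers, the variant exists
        # and Content-Length gives an approximate filesize.
        req = urllib.request.Request(url, method='HEAD')
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                length = resp.headers.get('Content-Length')
                return int(length) if length and length.isdigit() else None
        except OSError:
            return None  # no progressive twin; the caller keeps HLS only

    # probe_progressive('https://awsvodpkg.iltrovatore.it/content/example/1200.mp4')
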
diff --git a/hypervideo_dl/extractor/laola1tv.py b/hypervideo_dl/extractor/laola1tv.py
index fa21736..b5d27c2 100644
--- a/hypervideo_dl/extractor/laola1tv.py
+++ b/hypervideo_dl/extractor/laola1tv.py
@@ -112,7 +112,7 @@ class Laola1TvEmbedIE(InfoExtractor):
return {
'id': video_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'upload_date': unified_strdate(_v('time_date')),
'uploader': _v('meta_organisation'),
'categories': categories,
@@ -161,7 +161,7 @@ class Laola1TvBaseIE(Laola1TvEmbedIE):
return {
'id': video_id,
'display_id': display_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'description': video_data.get('description'),
'thumbnail': video_data.get('image'),
'categories': categories,
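
The `self._live_title(title) if is_live else title` pattern disappears here and in several extractors below (line, livestream, mangomolo, matchtv): the helper only suffixed live titles with the current date and time, and that decoration is now applied centrally for `is_live` entries rather than by each extractor. For reference, the retired helper amounted to roughly this:

    import datetime

    def live_title(name):
        # what the removed helper did: append the current date/time
        return name + ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M')

    # e.g. live_title('Matches live feed') -> 'Matches live feed 2022-04-05 21:37'
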
diff --git a/hypervideo_dl/extractor/lastfm.py b/hypervideo_dl/extractor/lastfm.py
new file mode 100644
index 0000000..5215717
--- /dev/null
+++ b/hypervideo_dl/extractor/lastfm.py
@@ -0,0 +1,129 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none, format_field
+
+
+class LastFMPlaylistBaseIE(InfoExtractor):
+ def _entries(self, url, playlist_id):
+ webpage = self._download_webpage(url, playlist_id)
+ start_page_number = int_or_none(self._search_regex(
+ r'\bpage=(\d+)', url, 'page', default=None)) or 1
+ last_page_number = int_or_none(self._search_regex(
+ r'>(\d+)</a>[^<]*</li>[^<]*<li[^>]+class="pagination-next', webpage, 'last_page', default=None))
+
+ for page_number in range(start_page_number, (last_page_number or start_page_number) + 1):
+ webpage = self._download_webpage(
+ url, playlist_id,
+ note='Downloading page %d%s' % (page_number, format_field(last_page_number, template=' of %d')),
+ query={'page': page_number})
+ page_entries = [
+ self.url_result(player_url, 'Youtube')
+ for player_url in set(re.findall(r'data-youtube-url="([^"]+)"', webpage))
+ ]
+
+ for e in page_entries:
+ yield e
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+ return self.playlist_result(self._entries(url, playlist_id), playlist_id)
+
+
+class LastFMPlaylistIE(LastFMPlaylistBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?last\.fm/(music|tag)/(?P<id>[^/]+)(?:/[^/]+)?/?(?:[?#]|$)'
+ _TESTS = [{
+ 'url': 'https://www.last.fm/music/Oasis/(What%27s+the+Story)+Morning+Glory%3F',
+ 'info_dict': {
+ 'id': 'Oasis',
+ },
+ 'playlist_count': 11,
+ }, {
+ 'url': 'https://www.last.fm/music/Oasis',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.last.fm/music/Oasis/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.last.fm/music/Oasis?top_tracks_date_preset=ALL#top-tracks',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.last.fm/music/Oasis/+tracks',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.last.fm/music/Oasis/+tracks?page=2',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.last.fm/music/Oasis/+tracks?date_preset=LAST_90_DAYS#top-tracks',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.last.fm/tag/rock',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.last.fm/tag/rock/tracks',
+ 'only_matching': True,
+ }]
+
+
+class LastFMUserIE(LastFMPlaylistBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?last\.fm/user/[^/]+/playlists/(?P<id>[^/#?]+)'
+ _TESTS = [{
+ 'url': 'https://www.last.fm/user/mehq/playlists/12319471',
+ 'info_dict': {
+ 'id': '12319471',
+ },
+ 'playlist_count': 30,
+ }]
+
+
+class LastFMIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?last\.fm/music(?:/[^/]+){2}/(?P<id>[^/#?]+)'
+ _TESTS = [{
+ 'url': 'https://www.last.fm/music/Oasis/_/Wonderwall',
+ 'md5': '9c4a70c2e84c03d54fe24229b9e13b7b',
+ 'info_dict': {
+ 'id': '6hzrDeceEKc',
+ 'ext': 'mp4',
+ 'title': 'Oasis - Wonderwall (Official Video)',
+ 'thumbnail': r're:^https?://i.ytimg.com/.*\.jpg$',
+ 'description': 'md5:0848669853c10687cc28e88b5756738f',
+ 'uploader': 'Oasis',
+ 'uploader_id': 'oasisinetofficial',
+ 'upload_date': '20080207',
+ 'album': '(What\'s The Story) Morning Glory? (Remastered)',
+ 'track': 'Wonderwall (Remastered)',
+ 'channel_id': 'UCUDVBtnOQi4c7E8jebpjc9Q',
+ 'view_count': int,
+ 'live_status': 'not_live',
+ 'channel_url': 'https://www.youtube.com/channel/UCUDVBtnOQi4c7E8jebpjc9Q',
+ 'tags': 'count:39',
+ 'creator': 'Oasis',
+ 'uploader_url': 're:^https?://www.youtube.com/user/oasisinetofficial',
+ 'duration': 279,
+ 'alt_title': 'Wonderwall (Remastered)',
+ 'age_limit': 0,
+ 'channel': 'Oasis',
+ 'channel_follower_count': int,
+ 'categories': ['Music'],
+ 'availability': 'public',
+ 'like_count': int,
+ 'playable_in_embed': True,
+ 'artist': 'Oasis',
+ },
+ 'add_ie': ['Youtube'],
+ }, {
+ 'url': 'https://www.last.fm/music/Oasis/_/Don%27t+Look+Back+In+Anger+-+Remastered/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.last.fm/music/Guns+N%27+Roses/_/Sweet+Child+o%27+Mine',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ player_url = self._search_regex(r'(?s)class="header-new-playlink"\s+href="([^"]+)"', webpage, 'player_url')
+ return self.url_result(player_url, 'Youtube')
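
The playlist base class above discovers the page range by scraping last.fm's pagination widget, then walks the pages and yields every distinct `data-youtube-url` it finds. A simplified sketch of that loop, with `fetch` standing in for `_download_webpage`:

    import re

    def iter_youtube_urls(fetch, start_page=1, last_page=None):
        # When the pagination widget is absent, last_page is None and only
        # the starting page is fetched -- the same fallback as in _entries.
        for page in range(start_page, (last_page or start_page) + 1):
            html = fetch(page)
            # set() drops players embedded more than once on the same page
            for url in set(re.findall(r'data-youtube-url="([^"]+)"', html)):
                yield url
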
diff --git a/hypervideo_dl/extractor/lbry.py b/hypervideo_dl/extractor/lbry.py
index 0f87bf1..5d5457c 100644
--- a/hypervideo_dl/extractor/lbry.py
+++ b/hypervideo_dl/extractor/lbry.py
@@ -17,6 +17,7 @@ from ..utils import (
parse_qs,
OnDemandPagedList,
try_get,
+ UnsupportedError,
urljoin,
)
@@ -184,28 +185,38 @@ class LBRYIE(LBRYBaseIE):
display_id = compat_urllib_parse_unquote(display_id)
uri = 'lbry://' + display_id
result = self._resolve_url(uri, display_id, 'stream')
- result_value = result['value']
- if result_value.get('stream_type') not in self._SUPPORTED_STREAM_TYPES:
- raise ExtractorError('Unsupported URL', expected=True)
- claim_id = result['claim_id']
- title = result_value['title']
- streaming_url = self._call_api_proxy(
- 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url']
+ if result['value'].get('stream_type') in self._SUPPORTED_STREAM_TYPES:
+ claim_id, is_live, headers = result['claim_id'], False, None
+ streaming_url = self._call_api_proxy(
+ 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url']
+ final_url = self._request_webpage(
+ streaming_url, display_id, note='Downloading streaming redirect url info').geturl()
+ elif result.get('value_type') == 'stream':
+ claim_id, is_live = result['signing_channel']['claim_id'], True
+ headers = {'referer': 'https://player.odysee.live/'}
+ live_data = self._download_json(
+ f'https://api.live.odysee.com/v1/odysee/live/{claim_id}', claim_id,
+ note='Downloading livestream JSON metadata')['data']
+ streaming_url = final_url = live_data.get('url')
+ if not final_url and not live_data.get('live'):
+ self.raise_no_formats('This stream is not live', True, claim_id)
+ else:
+ raise UnsupportedError(url)
+
info = self._parse_stream(result, url)
- urlh = self._request_webpage(
- streaming_url, display_id, note='Downloading streaming redirect url info')
- if determine_ext(urlh.geturl()) == 'm3u8':
+ if determine_ext(final_url) == 'm3u8':
info['formats'] = self._extract_m3u8_formats(
- urlh.geturl(), display_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls')
+ final_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', live=is_live, headers=headers)
self._sort_formats(info['formats'])
else:
info['url'] = streaming_url
- info.update({
+ return {
+ **info,
'id': claim_id,
- 'title': title,
- })
- return info
+ 'title': result['value']['title'],
+ 'is_live': is_live,
+ 'http_headers': headers,
+ }
class LBRYChannelIE(LBRYBaseIE):
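
The reworked LBRY extractor now branches three ways: a claim whose `stream_type` is supported resolves to a VOD streaming URL through the API proxy; a bare `value_type == 'stream'` claim is treated as an Odysee livestream and queried at `api.live.odysee.com`; anything else raises `UnsupportedError` instead of a generic `ExtractorError`. A rough model of that decision, not the real implementation, over the claim dicts the resolve call returns:

    def classify_claim(result, supported_types=('video', 'audio')):
        value = result.get('value') or {}
        if value.get('stream_type') in supported_types:
            return 'vod'   # -> API proxy 'get' for the streaming URL
        if result.get('value_type') == 'stream':
            return 'live'  # -> odysee live endpoint via the signing channel
        return None        # -> UnsupportedError(url)

    assert classify_claim({'value': {'stream_type': 'video'}}) == 'vod'
    assert classify_claim({'value': {}, 'value_type': 'stream'}) == 'live'
    assert classify_claim({'value': {}}) is None
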
diff --git a/hypervideo_dl/extractor/lecturio.py b/hypervideo_dl/extractor/lecturio.py
index 9d22287..0ee1eeb 100644
--- a/hypervideo_dl/extractor/lecturio.py
+++ b/hypervideo_dl/extractor/lecturio.py
@@ -22,14 +22,7 @@ class LecturioBaseIE(InfoExtractor):
_LOGIN_URL = 'https://app.lecturio.com/en/login'
_NETRC_MACHINE = 'lecturio'
- def _real_initialize(self):
- self._login()
-
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
# Sets some cookies
_, urlh = self._download_webpage_handle(
self._LOGIN_URL, None, 'Downloading login popup')
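
This lecturio change is one instance of a refactor repeated below for LinkedIn, Linux Academy and Lynda: the per-extractor `_real_initialize`/`_login` boilerplate goes away because the framework now calls `_perform_login(username, password)` itself, and only when credentials were actually supplied. A condensed model of the hook, not the actual base class in `extractor/common.py`:

    class InfoExtractorSketch:
        def _get_login_info(self):
            # stand-in for the --username/--password and .netrc lookup
            return None, None

        def initialize(self):
            # the "no credentials, skip login" guard now lives here once,
            # instead of being copy-pasted into every extractor
            username, password = self._get_login_info()
            if username is not None:
                self._perform_login(username, password)

        def _perform_login(self, username, password):
            raise NotImplementedError('extractors override only this hook')
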
diff --git a/hypervideo_dl/extractor/lego.py b/hypervideo_dl/extractor/lego.py
index b9d8b16..901f43b 100644
--- a/hypervideo_dl/extractor/lego.py
+++ b/hypervideo_dl/extractor/lego.py
@@ -8,6 +8,7 @@ from ..compat import compat_HTTPError
from ..utils import (
ExtractorError,
int_or_none,
+ join_nonempty,
qualities,
)
@@ -102,12 +103,8 @@ class LEGOIE(InfoExtractor):
m3u8_id=video_source_format, fatal=False))
else:
video_source_quality = video_source.get('Quality')
- format_id = []
- for v in (video_source_format, video_source_quality):
- if v:
- format_id.append(v)
f = {
- 'format_id': '-'.join(format_id),
+ 'format_id': join_nonempty(video_source_format, video_source_quality),
'quality': q(video_source_quality),
'url': video_source_url,
}
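
`join_nonempty` replaces the build-a-list-then-join pattern here and again in the MDR hunk further down. A simplified model of the helper and the behaviour being relied on (the real utility takes a few more options):

    def join_nonempty(*values, delim='-'):
        # skip empty parts, join the rest
        return delim.join(str(v) for v in values if v)

    assert join_nonempty('hls', 720) == 'hls-720'
    assert join_nonempty('progressive', None) == 'progressive'
    assert join_nonempty(None, '') == ''
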
diff --git a/hypervideo_dl/extractor/limelight.py b/hypervideo_dl/extractor/limelight.py
index 369141d..b20681a 100644
--- a/hypervideo_dl/extractor/limelight.py
+++ b/hypervideo_dl/extractor/limelight.py
@@ -194,7 +194,7 @@ class LimelightBaseIE(InfoExtractor):
cc_url = cc.get('webvttFileUrl')
if not cc_url:
continue
- lang = cc.get('languageCode') or self._search_regex(r'/[a-z]{2}\.vtt', cc_url, 'lang', default='en')
+ lang = cc.get('languageCode') or self._search_regex(r'/([a-z]{2})\.vtt', cc_url, 'lang', default='en')
subtitles.setdefault(lang, []).append({
'url': cc_url,
})
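
The one-character limelight fix matters because `_search_regex` returns the first capture group when the pattern defines one, and the whole match otherwise: the old pattern therefore yielded `/en.vtt` as a "language code". Plain `re` shows the difference (illustrative URL):

    import re

    cc_url = 'https://cdn.example/captions/en.vtt'
    # old pattern: no group, so the full match was used as the language
    assert re.search(r'/[a-z]{2}\.vtt', cc_url).group(0) == '/en.vtt'
    # fixed pattern: only the two-letter code is captured
    assert re.search(r'/([a-z]{2})\.vtt', cc_url).group(1) == 'en'
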
diff --git a/hypervideo_dl/extractor/line.py b/hypervideo_dl/extractor/line.py
index d4bcae6..987c434 100644
--- a/hypervideo_dl/extractor/line.py
+++ b/hypervideo_dl/extractor/line.py
@@ -5,95 +5,12 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
+ format_field,
int_or_none,
- js_to_json,
str_or_none,
)
-class LineTVIE(InfoExtractor):
- _VALID_URL = r'https?://tv\.line\.me/v/(?P<id>\d+)_[^/]+-(?P<segment>ep\d+-\d+)'
-
- _TESTS = [{
- 'url': 'https://tv.line.me/v/793123_goodbye-mrblack-ep1-1/list/69246',
- 'info_dict': {
- 'id': '793123_ep1-1',
- 'ext': 'mp4',
- 'title': 'Goodbye Mr.Black | EP.1-1',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 998.509,
- 'view_count': int,
- },
- }, {
- 'url': 'https://tv.line.me/v/2587507_%E6%B4%BE%E9%81%A3%E5%A5%B3%E9%86%ABx-ep1-02/list/185245',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- series_id, segment = self._match_valid_url(url).groups()
- video_id = '%s_%s' % (series_id, segment)
-
- webpage = self._download_webpage(url, video_id)
-
- player_params = self._parse_json(self._search_regex(
- r'naver\.WebPlayer\(({[^}]+})\)', webpage, 'player parameters'),
- video_id, transform_source=js_to_json)
-
- video_info = self._download_json(
- 'https://global-nvapis.line.me/linetv/rmcnmv/vod_play_videoInfo.json',
- video_id, query={
- 'videoId': player_params['videoId'],
- 'key': player_params['key'],
- })
-
- stream = video_info['streams'][0]
- extra_query = '?__gda__=' + stream['key']['value']
- formats = self._extract_m3u8_formats(
- stream['source'] + extra_query, video_id, ext='mp4',
- entry_protocol='m3u8_native', m3u8_id='hls')
-
- for a_format in formats:
- a_format['url'] += extra_query
-
- duration = None
- for video in video_info.get('videos', {}).get('list', []):
- encoding_option = video.get('encodingOption', {})
- abr = video['bitrate']['audio']
- vbr = video['bitrate']['video']
- tbr = abr + vbr
- formats.append({
- 'url': video['source'],
- 'format_id': 'http-%d' % int(tbr),
- 'height': encoding_option.get('height'),
- 'width': encoding_option.get('width'),
- 'abr': abr,
- 'vbr': vbr,
- 'filesize': video.get('size'),
- })
- if video.get('duration') and duration is None:
- duration = video['duration']
-
- self._sort_formats(formats)
-
- if formats and not formats[0].get('width'):
- formats[0]['vcodec'] = 'none'
-
- title = self._og_search_title(webpage)
-
- # like_count requires an additional API request https://tv.line.me/api/likeit/getCount
-
- return {
- 'id': video_id,
- 'title': title,
- 'formats': formats,
- 'extra_param_to_segment_url': extra_query[1:],
- 'duration': duration,
- 'thumbnails': [{'url': thumbnail['source']}
- for thumbnail in video_info.get('thumbnails', {}).get('list', [])],
- 'view_count': video_info.get('meta', {}).get('count'),
- }
-
-
class LineLiveBaseIE(InfoExtractor):
_API_BASE_URL = 'https://live-api.line-apps.com/web/v4.0/channel/'
@@ -116,12 +33,12 @@ class LineLiveBaseIE(InfoExtractor):
return {
'id': broadcast_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'thumbnails': thumbnails,
'timestamp': int_or_none(item.get('createdAt')),
'channel': channel.get('name'),
'channel_id': channel_id,
- 'channel_url': 'https://live.line.me/channels/' + channel_id if channel_id else None,
+ 'channel_url': format_field(channel_id, template='https://live.line.me/channels/%s'),
'duration': int_or_none(item.get('archiveDuration')),
'view_count': int_or_none(item.get('viewerCount')),
'comment_count': int_or_none(item.get('chatCount')),
@@ -132,16 +49,19 @@ class LineLiveBaseIE(InfoExtractor):
class LineLiveIE(LineLiveBaseIE):
_VALID_URL = r'https?://live\.line\.me/channels/(?P<channel_id>\d+)/broadcast/(?P<id>\d+)'
_TESTS = [{
- 'url': 'https://live.line.me/channels/4867368/broadcast/16331360',
- 'md5': 'bc931f26bf1d4f971e3b0982b3fab4a3',
+ 'url': 'https://live.line.me/channels/5833718/broadcast/18373277',
+ 'md5': '2c15843b8cb3acd55009ddcb2db91f7c',
'info_dict': {
- 'id': '16331360',
- 'title': '振りコピ講座😙😙😙',
+ 'id': '18373277',
+ 'title': '2021/12/05 (15分犬)定例譲渡会🐶',
'ext': 'mp4',
- 'timestamp': 1617095132,
- 'upload_date': '20210330',
- 'channel': '白川ゆめか',
- 'channel_id': '4867368',
+ 'timestamp': 1638674925,
+ 'upload_date': '20211205',
+ 'thumbnail': 'md5:e1f5817e60f4a72b7e43377cf308d7ef',
+ 'channel_url': 'https://live.line.me/channels/5833718',
+ 'channel': 'Yahooニュース掲載🗞プロフ見てね🐕🐕',
+ 'channel_id': '5833718',
+ 'duration': 937,
'view_count': int,
'comment_count': int,
'is_live': False,
@@ -193,8 +113,8 @@ class LineLiveChannelIE(LineLiveBaseIE):
'url': 'https://live.line.me/channels/5893542',
'info_dict': {
'id': '5893542',
- 'title': 'いくらちゃん',
- 'description': 'md5:c3a4af801f43b2fac0b02294976580be',
+ 'title': 'いくらちゃんだよぉ🦒',
+ 'description': 'md5:4d418087973ad081ceb1b3481f0b1816',
},
'playlist_mincount': 29
}
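
With LineTVIE gone, the remaining LINE Live code builds `channel_url` through `format_field`, which collapses the `template % value if value else None` pattern used before. A simplified model of the helper as it is used here and in the lnkgo and medaltv hunks below (defaults shown are illustrative, not the real signature):

    def format_field(obj, field=None, template='%s', ignore=(None, ''), default=None):
        # pick the value (obj itself when field is None) and only apply
        # the template when the value is meaningful
        value = obj if field is None else (obj or {}).get(field)
        return default if value in ignore else template % value

    assert format_field('5833718', template='https://live.line.me/channels/%s') \
        == 'https://live.line.me/channels/5833718'
    assert format_field(None, template='https://live.line.me/channels/%s') is None
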
diff --git a/hypervideo_dl/extractor/linkedin.py b/hypervideo_dl/extractor/linkedin.py
index 3ce906e..0f57bfa 100644
--- a/hypervideo_dl/extractor/linkedin.py
+++ b/hypervideo_dl/extractor/linkedin.py
@@ -6,18 +6,51 @@ import re
from .common import InfoExtractor
from ..utils import (
+ clean_html,
+ extract_attributes,
ExtractorError,
float_or_none,
+ get_element_by_class,
int_or_none,
srt_subtitles_timecode,
+ strip_or_none,
+ mimetype2ext,
try_get,
urlencode_postdata,
urljoin,
)
-class LinkedInLearningBaseIE(InfoExtractor):
+class LinkedInBaseIE(InfoExtractor):
_NETRC_MACHINE = 'linkedin'
+ _logged_in = False
+
+ def _perform_login(self, username, password):
+ if self._logged_in:
+ return
+
+ login_page = self._download_webpage(
+ self._LOGIN_URL, None, 'Downloading login page')
+ action_url = urljoin(self._LOGIN_URL, self._search_regex(
+ r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url',
+ default='https://www.linkedin.com/uas/login-submit', group='url'))
+ data = self._hidden_inputs(login_page)
+ data.update({
+ 'session_key': username,
+ 'session_password': password,
+ })
+ login_submit_page = self._download_webpage(
+ action_url, None, 'Logging in',
+ data=urlencode_postdata(data))
+ error = self._search_regex(
+ r'<span[^>]+class="error"[^>]*>\s*(.+?)\s*</span>',
+ login_submit_page, 'error', default=None)
+ if error:
+ raise ExtractorError(error, expected=True)
+ LinkedInBaseIE._logged_in = True
+
+
+class LinkedInLearningBaseIE(LinkedInBaseIE):
_LOGIN_URL = 'https://www.linkedin.com/uas/login?trk=learning'
def _call_api(self, course_slug, fields, video_slug=None, resolution=None):
@@ -34,6 +67,8 @@ class LinkedInLearningBaseIE(InfoExtractor):
})
sub = ' %dp' % resolution
api_url = 'https://www.linkedin.com/learning-api/detailedCourses'
+ if not self._get_cookies(api_url).get('JSESSIONID'):
+ self.raise_login_required()
return self._download_json(
api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={
'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value,
@@ -49,29 +84,47 @@ class LinkedInLearningBaseIE(InfoExtractor):
def _get_video_id(self, video_data, course_slug, video_slug):
return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug)
- def _real_initialize(self):
- email, password = self._get_login_info()
- if email is None:
- return
- login_page = self._download_webpage(
- self._LOGIN_URL, None, 'Downloading login page')
- action_url = urljoin(self._LOGIN_URL, self._search_regex(
- r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url',
- default='https://www.linkedin.com/uas/login-submit', group='url'))
- data = self._hidden_inputs(login_page)
- data.update({
- 'session_key': email,
- 'session_password': password,
- })
- login_submit_page = self._download_webpage(
- action_url, None, 'Logging in',
- data=urlencode_postdata(data))
- error = self._search_regex(
- r'<span[^>]+class="error"[^>]*>\s*(.+?)\s*</span>',
- login_submit_page, 'error', default=None)
- if error:
- raise ExtractorError(error, expected=True)
+class LinkedInIE(LinkedInBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?linkedin\.com/posts/.+?(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.linkedin.com/posts/mishalkhawaja_sendinblueviews-toronto-digitalmarketing-ugcPost-6850898786781339649-mM20',
+ 'info_dict': {
+ 'id': '6850898786781339649',
+ 'ext': 'mp4',
+ 'title': 'Mishal K. on LinkedIn: #sendinblueviews #toronto #digitalmarketing',
+ 'description': 'md5:be125430bab1c574f16aeb186a4d5b19',
+ 'creator': 'Mishal K.'
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_extract_title(webpage)
+ description = clean_html(get_element_by_class('share-update-card__update-text', webpage))
+ like_count = int_or_none(get_element_by_class('social-counts-reactions__social-counts-numRections', webpage))
+ creator = strip_or_none(clean_html(get_element_by_class('comment__actor-name', webpage)))
+
+ sources = self._parse_json(extract_attributes(self._search_regex(r'(<video[^>]+>)', webpage, 'video'))['data-sources'], video_id)
+ formats = [{
+ 'url': source['src'],
+ 'ext': mimetype2ext(source.get('type')),
+ 'tbr': float_or_none(source.get('data-bitrate'), scale=1000),
+ } for source in sources]
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': title,
+ 'like_count': like_count,
+ 'creator': creator,
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'description': description,
+ }
class LinkedInLearningIE(LinkedInLearningBaseIE):
@@ -102,7 +155,6 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
def _real_extract(self, url):
course_slug, video_slug = self._match_valid_url(url).groups()
- video_data = None
formats = []
for width, height in ((640, 360), (960, 540), (1280, 720)):
video_data = self._call_api(
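
The new LinkedInIE reads formats straight out of the post's `<video>` tag: the `data-sources` attribute holds a JSON array of `{src, type, data-bitrate}` entries, so format extraction is attribute parsing plus a MIME-type lookup. A minimal sketch of the same idea with stdlib tools; the markup, URL and bitrate are made-up examples, and the `mimetype2ext`/`float_or_none` stand-ins are deliberately crude:

    import json
    import re

    html = ('<video data-sources=\''
            '[{"src": "https://cdn.example/v.mp4", "type": "video/mp4", "data-bitrate": 2500000}]'
            '\'></video>')
    tag = re.search(r'<video[^>]+>', html).group(0)
    sources = json.loads(re.search(r"data-sources='([^']+)'", tag).group(1))
    formats = [{
        'url': s['src'],
        'ext': s.get('type', '').split('/')[-1] or None,   # crude mimetype2ext
        'tbr': s.get('data-bitrate', 0) / 1000 or None,    # bps -> kbps
    } for s in sources]
    assert formats[0]['url'] == 'https://cdn.example/v.mp4'
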
diff --git a/hypervideo_dl/extractor/linuxacademy.py b/hypervideo_dl/extractor/linuxacademy.py
index 2053970..6aff88e 100644
--- a/hypervideo_dl/extractor/linuxacademy.py
+++ b/hypervideo_dl/extractor/linuxacademy.py
@@ -75,14 +75,7 @@ class LinuxAcademyIE(InfoExtractor):
_CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
_NETRC_MACHINE = 'linuxacademy'
- def _real_initialize(self):
- self._login()
-
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
def random_string():
return ''.join([
random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~')
diff --git a/hypervideo_dl/extractor/litv.py b/hypervideo_dl/extractor/litv.py
index 18d237e..16b475a 100644
--- a/hypervideo_dl/extractor/litv.py
+++ b/hypervideo_dl/extractor/litv.py
@@ -7,6 +7,7 @@ from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
+ traverse_obj,
smuggle_url,
unsmuggle_url,
)
@@ -55,9 +56,6 @@ class LiTVIE(InfoExtractor):
episode_title = program_info['title']
content_id = season_list['contentId']
- if prompt:
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (content_id, video_id))
-
all_episodes = [
self.url_result(smuggle_url(
self._URL_TEMPLATE % (program_info['contentType'], episode['contentId']),
@@ -67,16 +65,10 @@ class LiTVIE(InfoExtractor):
return self.playlist_result(all_episodes, content_id, episode_title)
def _real_extract(self, url):
- url, data = unsmuggle_url(url, {})
+ url, smuggled_data = unsmuggle_url(url, {})
video_id = self._match_id(url)
- noplaylist = self.get_param('noplaylist')
- noplaylist_prompt = True
- if 'force_noplaylist' in data:
- noplaylist = data['force_noplaylist']
- noplaylist_prompt = False
-
webpage = self._download_webpage(url, video_id)
program_info = self._parse_json(self._search_regex(
@@ -84,14 +76,9 @@ class LiTVIE(InfoExtractor):
video_id)
season_list = list(program_info.get('seasonList', {}).values())
- if season_list:
- if not noplaylist:
- return self._extract_playlist(
- season_list[0], video_id, program_info,
- prompt=noplaylist_prompt)
-
- if noplaylist_prompt:
- self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+ playlist_id = traverse_obj(season_list, 0, 'contentId')
+ if self._yes_playlist(playlist_id, video_id, smuggled_data):
+ return self._extract_playlist(season_list[0], video_id, program_info)
# In browsers `getMainUrl` request is always issued. Usually this
# endpoint gives the same result as the data embedded in the webpage.
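
The LiTV change swaps hand-rolled `--no-playlist` handling for the shared `_yes_playlist` helper, which also honours a `force_noplaylist` flag smuggled into the episode URLs produced by `_extract_playlist`. A condensed decision model (the real helper additionally prints the usual "Downloading playlist ..." messages):

    def yes_playlist(playlist_id, video_id, smuggled_data=None, noplaylist=False):
        if smuggled_data and 'force_noplaylist' in smuggled_data:
            return not smuggled_data['force_noplaylist']
        return bool(playlist_id) and not noplaylist

    # episode URLs smuggle force_noplaylist=True, so following an episode
    # link never recurses back into the playlist:
    assert yes_playlist('0001', '3116640', {'force_noplaylist': True}) is False
    assert yes_playlist(None, '3116640') is False
    assert yes_playlist('0001', '3116640') is True
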
diff --git a/hypervideo_dl/extractor/livestream.py b/hypervideo_dl/extractor/livestream.py
index f591289..45bf26d 100644
--- a/hypervideo_dl/extractor/livestream.py
+++ b/hypervideo_dl/extractor/livestream.py
@@ -176,7 +176,7 @@ class LivestreamIE(InfoExtractor):
return {
'id': broadcast_id,
'formats': formats,
- 'title': self._live_title(stream_info['stream_title']) if is_live else stream_info['stream_title'],
+ 'title': stream_info['stream_title'],
'thumbnail': stream_info.get('thumbnail_url'),
'is_live': is_live,
}
@@ -344,7 +344,7 @@ class LivestreamOriginalIE(InfoExtractor):
is_live = video_data.get('isLive')
info.update({
'id': content_id,
- 'title': self._live_title(info['title']) if is_live else info['title'],
+ 'title': info['title'],
'formats': self._extract_video_formats(video_data, content_id),
'is_live': is_live,
})
diff --git a/hypervideo_dl/extractor/lnkgo.py b/hypervideo_dl/extractor/lnkgo.py
index 1467596..bd2dffa 100644
--- a/hypervideo_dl/extractor/lnkgo.py
+++ b/hypervideo_dl/extractor/lnkgo.py
@@ -6,8 +6,10 @@ from .common import InfoExtractor
from ..utils import (
clean_html,
compat_str,
+ format_field,
int_or_none,
parse_iso8601,
+ unified_strdate,
)
@@ -71,17 +73,97 @@ class LnkGoIE(InfoExtractor):
video_id, 'mp4', 'm3u8_native')
self._sort_formats(formats)
- poster_image = video_info.get('posterImage')
-
return {
'id': video_id,
'display_id': display_id,
'title': title,
'formats': formats,
- 'thumbnail': 'https://lnk.lt/all-images/' + poster_image if poster_image else None,
+ 'thumbnail': format_field(video_info, 'posterImage', 'https://lnk.lt/all-images/%s'),
'duration': int_or_none(video_info.get('duration')),
'description': clean_html(video_info.get('htmlDescription')),
'age_limit': self._AGE_LIMITS.get(video_info.get('pgRating'), 0),
'timestamp': parse_iso8601(video_info.get('airDate')),
'view_count': int_or_none(video_info.get('viewsCount')),
}
+
+
+class LnkIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?lnk\.lt/[^/]+/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://lnk.lt/zinios/79791',
+ 'info_dict': {
+ 'id': '79791',
+ 'ext': 'mp4',
+ 'title': 'LNK.lt: Viešintų gyventojai sukilo prieš radijo bangų siųstuvą',
+ 'description': 'Svarbiausios naujienos trumpai, LNK žinios ir Info dienos pokalbiai.',
+ 'view_count': int,
+ 'duration': 233,
+ 'upload_date': '20191123',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'episode_number': 13431,
+ 'series': 'Naujausi žinių reportažai',
+ 'episode': 'Episode 13431'
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://lnk.lt/istorijos-trumpai/152546',
+ 'info_dict': {
+ 'id': '152546',
+ 'ext': 'mp4',
+ 'title': 'Radžio koncertas gaisre ',
+ 'description': 'md5:0666b5b85cb9fc7c1238dec96f71faba',
+ 'view_count': int,
+ 'duration': 54,
+ 'upload_date': '20220105',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'episode_number': 1036,
+ 'series': 'Istorijos trumpai',
+ 'episode': 'Episode 1036'
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://lnk.lt/gyvunu-pasaulis/151549',
+ 'info_dict': {
+ 'id': '151549',
+ 'ext': 'mp4',
+ 'title': 'Gyvūnų pasaulis',
+ 'description': '',
+ 'view_count': int,
+ 'duration': 1264,
+ 'upload_date': '20220108',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'episode_number': 16,
+ 'series': 'Gyvūnų pasaulis',
+ 'episode': 'Episode 16'
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ video_json = self._download_json(f'https://lnk.lt/api/video/video-config/{id}', id)['videoInfo']
+ formats, subtitles = [], {}
+ if video_json.get('videoUrl'):
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(video_json['videoUrl'], id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ if video_json.get('videoFairplayUrl') and not video_json.get('drm'):
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(video_json['videoFairplayUrl'], id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+
+ self._sort_formats(formats)
+ return {
+ 'id': id,
+ 'title': video_json.get('title'),
+ 'description': video_json.get('description'),
+ 'view_count': video_json.get('viewsCount'),
+ 'duration': video_json.get('duration'),
+ 'upload_date': unified_strdate(video_json.get('airDate')),
+ 'thumbnail': format_field(video_json, 'posterImage', 'https://lnk.lt/all-images/%s'),
+ 'episode_number': int_or_none(video_json.get('episodeNumber')),
+ 'series': video_json.get('programTitle'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
diff --git a/hypervideo_dl/extractor/lynda.py b/hypervideo_dl/extractor/lynda.py
index 58cf172..ce30474 100644
--- a/hypervideo_dl/extractor/lynda.py
+++ b/hypervideo_dl/extractor/lynda.py
@@ -21,9 +21,6 @@ class LyndaBaseIE(InfoExtractor):
_ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
_NETRC_MACHINE = 'lynda'
- def _real_initialize(self):
- self._login()
-
@staticmethod
def _check_error(json_string, key_or_keys):
keys = [key_or_keys] if isinstance(key_or_keys, compat_str) else key_or_keys
@@ -32,7 +29,7 @@ class LyndaBaseIE(InfoExtractor):
if error:
raise ExtractorError('Unable to login: %s' % error, expected=True)
- def _login_step(self, form_html, fallback_action_url, extra_form_data, note, referrer_url):
+ def _perform_login_step(self, form_html, fallback_action_url, extra_form_data, note, referrer_url):
action_url = self._search_regex(
r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_html,
'post url', default=fallback_action_url, group='url')
@@ -55,11 +52,7 @@ class LyndaBaseIE(InfoExtractor):
return response, action_url
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
# Step 1: download signin page
signin_page = self._download_webpage(
self._SIGNIN_URL, None, 'Downloading signin page')
diff --git a/hypervideo_dl/extractor/mainstreaming.py b/hypervideo_dl/extractor/mainstreaming.py
new file mode 100644
index 0000000..0f349a7
--- /dev/null
+++ b/hypervideo_dl/extractor/mainstreaming.py
@@ -0,0 +1,219 @@
+# coding: utf-8
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+ int_or_none,
+ js_to_json,
+ parse_duration,
+ traverse_obj,
+ try_get,
+ urljoin
+)
+
+
+class MainStreamingIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:webtools-?)?(?P<host>[A-Za-z0-9-]*\.msvdn.net)/(?:embed|amp_embed|content)/(?P<id>\w+)'
+ IE_DESC = 'MainStreaming Player'
+
+ _TESTS = [
+ {
+ # Live stream offline, has alternative content id
+ 'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/53EN6GxbWaJC',
+ 'info_dict': {
+ 'id': '53EN6GxbWaJC',
+ 'title': 'Diretta homepage 2021-12-31 12:00',
+ 'description': '',
+ 'live_status': 'was_live',
+ 'ext': 'mp4',
+ 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+ },
+ 'expected_warnings': [
+ 'Ignoring alternative content ID: WDAF1KOWUpH3',
+ 'MainStreaming said: Live event is OFFLINE'
+ ],
+ 'skip': 'live stream offline'
+ }, {
+ # playlist
+ 'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/WDAF1KOWUpH3',
+ 'info_dict': {
+ 'id': 'WDAF1KOWUpH3',
+ 'title': 'Playlist homepage',
+ },
+ 'playlist_mincount': 2
+ }, {
+ # livestream
+ 'url': 'https://webtools-859c1818ed614cc5b0047439470927b0.msvdn.net/embed/tDoFkZD3T1Lw',
+ 'info_dict': {
+ 'id': 'tDoFkZD3T1Lw',
+ 'title': r're:Class CNBC Live \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+ 'live_status': 'is_live',
+ 'ext': 'mp4',
+ 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+ },
+ 'skip': 'live stream'
+ }, {
+ 'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/EUlZfGWkGpOd?autoPlay=false',
+ 'info_dict': {
+ 'id': 'EUlZfGWkGpOd',
+ 'title': 'La Settimana ',
+ 'description': '03 Ottobre ore 02:00',
+ 'ext': 'mp4',
+ 'live_status': 'not_live',
+ 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+ 'duration': 1512
+ }
+ }, {
+ # video without webtools- prefix
+ 'url': 'https://f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/MfuWmzL2lGkA?autoplay=false&T=1635860445',
+ 'info_dict': {
+ 'id': 'MfuWmzL2lGkA',
+ 'title': 'TG Mattina',
+ 'description': '06 Ottobre ore 08:00',
+ 'ext': 'mp4',
+ 'live_status': 'not_live',
+ 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+ 'duration': 789.04
+ }
+ }, {
+ # always-on livestream with DVR
+ 'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/HVvPMzy',
+ 'info_dict': {
+ 'id': 'HVvPMzy',
+ 'title': r're:^Diretta LaC News24 \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+ 'description': 'canale all news',
+ 'live_status': 'is_live',
+ 'ext': 'mp4',
+ 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # no host
+ 'url': 'https://webtools.msvdn.net/embed/MfuWmzL2lGkA',
+ 'only_matching': True
+ }, {
+ 'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/amp_embed/tDoFkZD3T1Lw',
+ 'only_matching': True
+ }, {
+ 'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/content/tDoFkZD3T1Lw#',
+ 'only_matching': True
+ }
+ ]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ mobj = re.findall(
+ r'<iframe[^>]+?src=["\']?(?P<url>%s)["\']?' % MainStreamingIE._VALID_URL, webpage)
+ if mobj:
+ return [group[0] for group in mobj]
+
+ def _playlist_entries(self, host, playlist_content):
+ for entry in playlist_content:
+ content_id = entry.get('contentID')
+ yield {
+ '_type': 'url',
+ 'ie_key': MainStreamingIE.ie_key(),
+ 'id': content_id,
+ 'duration': int_or_none(traverse_obj(entry, ('duration', 'totalSeconds'))),
+ 'title': entry.get('title'),
+ 'url': f'https://{host}/embed/{content_id}'
+ }
+
+ @staticmethod
+ def _get_webtools_host(host):
+ if not host.startswith('webtools'):
+ host = 'webtools' + ('-' if not host.startswith('.') else '') + host
+ return host
+
+ def _get_webtools_base_url(self, host):
+ return f'{self.http_scheme()}//{self._get_webtools_host(host)}'
+
+ def _call_api(self, host: str, path: str, item_id: str, query=None, note='Downloading API JSON', fatal=False):
+ # JSON API, does not appear to be documented
+ return self._call_webtools_api(host, '/api/v2/' + path, item_id, query, note, fatal)
+
+ def _call_webtools_api(self, host: str, path: str, item_id: str, query=None, note='Downloading webtools API JSON', fatal=False):
+ # webtools docs: https://webtools.msvdn.net/
+ return self._download_json(
+ urljoin(self._get_webtools_base_url(host), path), item_id, query=query, note=note, fatal=fatal)
+
+ def _real_extract(self, url):
+ host, video_id = self._match_valid_url(url).groups()
+ content_info = try_get(
+ self._call_api(
+ host, f'content/{video_id}', video_id, note='Downloading content info API JSON'), lambda x: x['playerContentInfo'])
+ # Fallback
+ if not content_info:
+ webpage = self._download_webpage(url, video_id)
+ player_config = self._parse_json(
+ self._search_regex(
+ r'config\s*=\s*({.+?})\s*;', webpage, 'mainstreaming player config',
+ default='{}', flags=re.DOTALL),
+ video_id, transform_source=js_to_json, fatal=False) or {}
+ content_info = player_config['contentInfo']
+
+ host = content_info.get('host') or host
+ video_id = content_info.get('contentID') or video_id
+ title = content_info.get('title')
+ description = traverse_obj(content_info, 'longDescription', 'shortDescription', expected_type=str)
+ live_status = 'not_live'
+ if content_info.get('drmEnabled'):
+ self.report_drm(video_id)
+
+ alternative_content_id = content_info.get('alternativeContentID')
+ if alternative_content_id:
+ self.report_warning(f'Ignoring alternative content ID: {alternative_content_id}')
+
+ content_type = int_or_none(content_info.get('contentType'))
+ format_base_url = None
+ formats = []
+ subtitles = {}
+ # Live content
+ if content_type == 20:
+ dvr_enabled = traverse_obj(content_info, ('playerSettings', 'dvrEnabled'), expected_type=bool)
+ format_base_url = f"https://{host}/live/{content_info['liveSourceID']}/{video_id}/%s{'?DVR' if dvr_enabled else ''}"
+ live_status = 'is_live'
+ heartbeat = self._call_api(host, f'heartbeat/{video_id}', video_id, note='Checking stream status') or {}
+ if heartbeat.get('heartBeatUp') is False:
+ self.raise_no_formats(f'MainStreaming said: {heartbeat.get("responseMessage")}', expected=True)
+ live_status = 'was_live'
+
+ # Playlist
+ elif content_type == 31:
+ return self.playlist_result(
+ self._playlist_entries(host, content_info.get('playlistContents')), video_id, title, description)
+ # Normal video content?
+ elif content_type == 10:
+ format_base_url = f'https://{host}/vod/{video_id}/%s'
+ # Progressive format
+ # Note: https://webtools.msvdn.net/loader/playerV2.js mentions an original.mp3 format,
+ # though it appears to be identical to original.mp4
+ formats.append({'url': format_base_url % 'original.mp4', 'format_note': 'original', 'quality': 1})
+ else:
+ self.raise_no_formats(f'Unknown content type {content_type}')
+
+ if format_base_url:
+ m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
+ format_base_url % 'playlist.m3u8', video_id=video_id, fatal=False)
+ mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
+ format_base_url % 'manifest.mpd', video_id=video_id, fatal=False)
+
+ subtitles = self._merge_subtitles(m3u8_subs, mpd_subs)
+ formats.extend(m3u8_formats + mpd_formats)
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'formats': formats,
+ 'live_status': live_status,
+ 'duration': parse_duration(content_info.get('duration')),
+ 'tags': content_info.get('tags'),
+ 'subtitles': subtitles,
+ 'thumbnail': urljoin(self._get_webtools_base_url(host), f'image/{video_id}/poster')
+ }
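
The new MainStreaming extractor keys everything off the numeric `contentType` reported by the API: 20 is live (media under `/live/`, optionally with `?DVR`), 31 is a playlist (no direct media), and 10 is plain VOD under `/vod/`, after which HLS and DASH manifests are filled into the same `%s` template. A compact illustration of that mapping; the host is made up:

    def media_url_template(host, video_id, content_type,
                           live_source_id=None, dvr=False):
        # contentType 20 -> live, 10 -> VOD, 31 (playlist) -> no media URL
        if content_type == 20:
            suffix = '?DVR' if dvr else ''
            return f'https://{host}/live/{live_source_id}/{video_id}/%s' + suffix
        if content_type == 10:
            return f'https://{host}/vod/{video_id}/%s'
        return None

    base = media_url_template('example.msvdn.net', 'MfuWmzL2lGkA', 10)
    assert base % 'playlist.m3u8' == 'https://example.msvdn.net/vod/MfuWmzL2lGkA/playlist.m3u8'
    # base % 'manifest.mpd' gives the matching DASH manifest, and
    # base % 'original.mp4' the progressive download tried above
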
diff --git a/hypervideo_dl/extractor/mangomolo.py b/hypervideo_dl/extractor/mangomolo.py
index acee370..68ce138 100644
--- a/hypervideo_dl/extractor/mangomolo.py
+++ b/hypervideo_dl/extractor/mangomolo.py
@@ -33,7 +33,7 @@ class MangomoloBaseIE(InfoExtractor):
return {
'id': page_id,
- 'title': self._live_title(page_id) if self._IS_LIVE else page_id,
+ 'title': page_id,
'uploader_id': hidden_inputs.get('userid'),
'duration': int_or_none(hidden_inputs.get('duration')),
'is_live': self._IS_LIVE,
diff --git a/hypervideo_dl/extractor/manyvids.py b/hypervideo_dl/extractor/manyvids.py
index e8d7163..bd24f88 100644
--- a/hypervideo_dl/extractor/manyvids.py
+++ b/hypervideo_dl/extractor/manyvids.py
@@ -89,4 +89,5 @@ class ManyVidsIE(InfoExtractor):
'view_count': view_count,
'like_count': like_count,
'formats': formats,
+ 'uploader': self._html_search_regex(r'<meta[^>]+name="author"[^>]*>([^<]+)', webpage, 'uploader'),
}
diff --git a/hypervideo_dl/extractor/matchtv.py b/hypervideo_dl/extractor/matchtv.py
index bc9933a..e003b8d 100644
--- a/hypervideo_dl/extractor/matchtv.py
+++ b/hypervideo_dl/extractor/matchtv.py
@@ -49,7 +49,7 @@ class MatchTVIE(InfoExtractor):
self._sort_formats(formats)
return {
'id': video_id,
- 'title': self._live_title('Матч ТВ - Прямой эфир'),
+ 'title': 'Матч ТВ - Прямой эфир',
'is_live': True,
'formats': formats,
}
diff --git a/hypervideo_dl/extractor/mdr.py b/hypervideo_dl/extractor/mdr.py
index 0bdd626..3ca174c 100644
--- a/hypervideo_dl/extractor/mdr.py
+++ b/hypervideo_dl/extractor/mdr.py
@@ -2,13 +2,11 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import (
- compat_str,
- compat_urlparse,
-)
+from ..compat import compat_urlparse
from ..utils import (
determine_ext,
int_or_none,
+ join_nonempty,
parse_duration,
parse_iso8601,
url_or_none,
@@ -148,13 +146,9 @@ class MDRIE(InfoExtractor):
abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000)
filesize = int_or_none(xpath_text(asset, './fileSize', 'file size'))
- format_id = [media_type]
- if vbr or abr:
- format_id.append(compat_str(vbr or abr))
-
f = {
'url': video_url,
- 'format_id': '-'.join(format_id),
+ 'format_id': join_nonempty(media_type, vbr or abr),
'filesize': filesize,
'abr': abr,
'vbr': vbr,
diff --git a/hypervideo_dl/extractor/medaltv.py b/hypervideo_dl/extractor/medaltv.py
index 2ece5aa..59cc307 100644
--- a/hypervideo_dl/extractor/medaltv.py
+++ b/hypervideo_dl/extractor/medaltv.py
@@ -7,6 +7,7 @@ from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
+ format_field,
float_or_none,
int_or_none,
str_or_none,
@@ -118,7 +119,7 @@ class MedalTVIE(InfoExtractor):
author = try_get(
hydration_data, lambda x: list(x['profiles'].values())[0], dict) or {}
author_id = str_or_none(author.get('id'))
- author_url = 'https://medal.tv/users/{0}'.format(author_id) if author_id else None
+ author_url = format_field(author_id, template='https://medal.tv/users/%s')
return {
'id': video_id,
diff --git a/hypervideo_dl/extractor/mediaklikk.py b/hypervideo_dl/extractor/mediaklikk.py
index b9b6d73..18ff3be 100644
--- a/hypervideo_dl/extractor/mediaklikk.py
+++ b/hypervideo_dl/extractor/mediaklikk.py
@@ -12,8 +12,8 @@ from ..compat import (
class MediaKlikkIE(InfoExtractor):
- _VALID_URL = r'''(?x)^https?:\/\/(?:www\.)?
- (?:mediaklikk|m4sport|hirado|petofilive)\.hu\/.*?videok?\/
+ _VALID_URL = r'''(?x)https?://(?:www\.)?
+ (?:mediaklikk|m4sport|hirado|petofilive)\.hu/.*?(?:videok?|cikk)/
(?:(?P<year>[0-9]{4})/(?P<month>[0-9]{1,2})/(?P<day>[0-9]{1,2})/)?
(?P<id>[^/#?_]+)'''
diff --git a/hypervideo_dl/extractor/mediaset.py b/hypervideo_dl/extractor/mediaset.py
index 26e7abc..d6b456c 100644
--- a/hypervideo_dl/extractor/mediaset.py
+++ b/hypervideo_dl/extractor/mediaset.py
@@ -1,13 +1,18 @@
# coding: utf-8
from __future__ import unicode_literals
+import functools
import re
from .theplatform import ThePlatformBaseIE
from ..utils import (
ExtractorError,
+ GeoRestrictedError,
int_or_none,
+ OnDemandPagedList,
parse_qs,
+ try_get,
+ urljoin,
update_url_query,
)
@@ -33,7 +38,7 @@ class MediasetIE(ThePlatformBaseIE):
'id': 'F310575103000102',
'ext': 'mp4',
'title': 'Episodio 1',
- 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'description': 'md5:e8017b7d7194e9bfb75299c2b8d81e02',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 2682.0,
'upload_date': '20210530',
@@ -41,6 +46,11 @@ class MediasetIE(ThePlatformBaseIE):
'timestamp': 1622413946,
'uploader': 'Canale 5',
'uploader_id': 'C5',
+ 'season': 'Season 1',
+ 'episode': 'Episode 1',
+ 'season_number': 1,
+ 'episode_number': 1,
+ 'chapters': [{'start_time': 0.0, 'end_time': 439.88}, {'start_time': 439.88, 'end_time': 1685.84}, {'start_time': 1685.84, 'end_time': 2682.0}],
},
}, {
'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501',
@@ -49,7 +59,7 @@ class MediasetIE(ThePlatformBaseIE):
'id': 'F309013801000501',
'ext': 'mp4',
'title': 'Puntata del 25 maggio',
- 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'description': 'md5:ee2e456e3eb1dba5e814596655bb5296',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 6565.008,
'upload_date': '20200903',
@@ -57,6 +67,11 @@ class MediasetIE(ThePlatformBaseIE):
'timestamp': 1599172492,
'uploader': 'Canale 5',
'uploader_id': 'C5',
+ 'season': 'Season 5',
+ 'episode': 'Episode 5',
+ 'season_number': 5,
+ 'episode_number': 5,
+ 'chapters': [{'start_time': 0.0, 'end_time': 3409.08}, {'start_time': 3409.08, 'end_time': 6565.008}],
},
}, {
'url': 'https://www.mediasetplay.mediaset.it/video/cameracafe5/episodio-69-pezzo-di-luna_F303843101017801',
@@ -65,7 +80,7 @@ class MediasetIE(ThePlatformBaseIE):
'id': 'F303843101017801',
'ext': 'mp4',
'title': 'Episodio 69 - Pezzo di luna',
- 'description': '',
+ 'description': 'md5:7c32c8ec4118b72588b9412f11353f73',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 263.008,
'upload_date': '20200902',
@@ -73,6 +88,11 @@ class MediasetIE(ThePlatformBaseIE):
'timestamp': 1599064700,
'uploader': 'Italia 1',
'uploader_id': 'I1',
+ 'season': 'Season 5',
+ 'episode': 'Episode 178',
+ 'season_number': 5,
+ 'episode_number': 178,
+ 'chapters': [{'start_time': 0.0, 'end_time': 261.88}, {'start_time': 261.88, 'end_time': 263.008}],
},
}, {
'url': 'https://www.mediasetplay.mediaset.it/video/cameracafe5/episodio-51-tu-chi-sei_F303843107000601',
@@ -81,7 +101,7 @@ class MediasetIE(ThePlatformBaseIE):
'id': 'F303843107000601',
'ext': 'mp4',
'title': 'Episodio 51 - Tu chi sei?',
- 'description': '',
+ 'description': 'md5:42ef006e56824cc31787a547590923f4',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 367.021,
'upload_date': '20200902',
@@ -89,6 +109,28 @@ class MediasetIE(ThePlatformBaseIE):
'timestamp': 1599069817,
'uploader': 'Italia 1',
'uploader_id': 'I1',
+ 'season': 'Season 5',
+ 'episode': 'Episode 6',
+ 'season_number': 5,
+ 'episode_number': 6,
+ 'chapters': [{'start_time': 0.0, 'end_time': 358.68}, {'start_time': 358.68, 'end_time': 367.021}],
+ },
+ }, {
+ # movie
+ 'url': 'https://www.mediasetplay.mediaset.it/movie/selvaggi/selvaggi_F006474501000101',
+ 'md5': '720440187a2ae26af8148eb9e6b901ed',
+ 'info_dict': {
+ 'id': 'F006474501000101',
+ 'ext': 'mp4',
+ 'title': 'Selvaggi',
+ 'description': 'md5:cfdedbbfdd12d4d0e5dcf1fa1b75284f',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 5233.01,
+ 'upload_date': '20210729',
+ 'timestamp': 1627594716,
+ 'uploader': 'Cine34',
+ 'uploader_id': 'B6',
+ 'chapters': [{'start_time': 0.0, 'end_time': 1938.56}, {'start_time': 1938.56, 'end_time': 5233.01}],
},
}, {
# clip
@@ -156,6 +198,22 @@ class MediasetIE(ThePlatformBaseIE):
video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?.mpd)\?.+', r'\1\2', video.attrib['src'])
return super(MediasetIE, self)._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url)
+ def _check_drm_formats(self, tp_formats, video_id):
+ has_nondrm, drm_manifest = False, ''
+ for f in tp_formats:
+ if '_sampleaes/' in (f.get('manifest_url') or ''):
+ drm_manifest = drm_manifest or f['manifest_url']
+ f['has_drm'] = True
+ if not f.get('has_drm') and f.get('manifest_url'):
+ has_nondrm = True
+
+ nodrm_manifest = re.sub(r'_sampleaes/(\w+)_fp_', r'/\1_no_', drm_manifest)
+ if has_nondrm or nodrm_manifest == drm_manifest:
+ return
+
+ tp_formats.extend(self._extract_m3u8_formats(
+ nodrm_manifest, video_id, m3u8_id='hls', fatal=False) or [])
+
def _real_extract(self, url):
guid = self._match_id(url)
tp_path = 'PR1GhC/media/guid/2702976343/' + guid
@@ -163,10 +221,10 @@ class MediasetIE(ThePlatformBaseIE):
formats = []
subtitles = {}
- first_e = None
+ first_e = geo_e = None
asset_type = 'geoNo:HD,browser,geoIT|geoNo:HD,geoIT|geoNo:SD,browser,geoIT|geoNo:SD,geoIT|geoNo|HD|SD'
# TODO: fixup ISM+none manifest URLs
- for f in ('MPEG4', 'MPEG-DASH+none', 'M3U+none'):
+ for f in ('MPEG4', 'M3U'):
try:
tp_formats, tp_subtitles = self._extract_theplatform_smil(
update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), {
@@ -175,13 +233,19 @@ class MediasetIE(ThePlatformBaseIE):
'assetTypes': asset_type,
}), guid, 'Downloading %s SMIL data' % (f.split('+')[0]))
except ExtractorError as e:
+ if not geo_e and isinstance(e, GeoRestrictedError):
+ geo_e = e
if not first_e:
first_e = e
- break
+ continue
+ self._check_drm_formats(tp_formats, guid)
formats.extend(tp_formats)
subtitles = self._merge_subtitles(subtitles, tp_subtitles)
- if first_e and not formats:
- raise first_e
+
+ # check for errors and report them
+ if (first_e or geo_e) and not formats:
+ raise geo_e or first_e
+
self._sort_formats(formats)
feed_data = self._download_json(
@@ -197,18 +261,95 @@ class MediasetIE(ThePlatformBaseIE):
break
info.update({
- 'episode_number': int_or_none(feed_data.get('tvSeasonEpisodeNumber')),
- 'season_number': int_or_none(feed_data.get('tvSeasonNumber')),
- 'series': feed_data.get('mediasetprogram$brandTitle'),
+ 'description': info.get('description') or feed_data.get('description') or feed_data.get('longDescription'),
'uploader': publish_info.get('description'),
'uploader_id': publish_info.get('channel'),
'view_count': int_or_none(feed_data.get('mediasetprogram$numberOfViews')),
'thumbnail': thumbnail,
})
+ if feed_data.get('programType') == 'episode':
+ info.update({
+ 'episode_number': int_or_none(
+ feed_data.get('tvSeasonEpisodeNumber')),
+ 'season_number': int_or_none(
+ feed_data.get('tvSeasonNumber')),
+ 'series': feed_data.get('mediasetprogram$brandTitle'),
+ })
+
info.update({
'id': guid,
'formats': formats,
'subtitles': subtitles,
})
return info
+
+
+class MediasetShowIE(MediasetIE):
+ _VALID_URL = r'''(?x)
+ (?:
+ https?://
+ (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/
+ (?:
+ (?:fiction|programmi-tv|serie-tv|kids)/(?:.+?/)?
+ (?:[a-z-]+)_SE(?P<id>\d{12})
+ (?:,ST(?P<st>\d{12}))?
+ (?:,sb(?P<sb>\d{9}))?$
+ )
+ )
+ '''
+ _TESTS = [{
+ # TV Show webpage (general webpage)
+ 'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/leiene_SE000000000061',
+ 'info_dict': {
+ 'id': '000000000061',
+ 'title': 'Le Iene',
+ },
+ 'playlist_mincount': 7,
+ }, {
+ # TV Show webpage (specific season)
+ 'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/leiene_SE000000000061,ST000000002763',
+ 'info_dict': {
+ 'id': '000000002763',
+ 'title': 'Le Iene',
+ },
+ 'playlist_mincount': 7,
+ }, {
+ # TV Show specific playlist (with multiple pages)
+ 'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/iservizi_SE000000000061,ST000000002763,sb100013375',
+ 'info_dict': {
+ 'id': '100013375',
+ 'title': 'I servizi',
+ },
+ 'playlist_mincount': 50,
+ }]
+
+ _BY_SUBBRAND = 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs-v2?byCustomValue={subBrandId}{%s}&sort=:publishInfo_lastPublished|desc,tvSeasonEpisodeNumber|desc&range=%d-%d'
+ _PAGE_SIZE = 25
+
+ def _fetch_page(self, sb, page):
+ lower_limit = page * self._PAGE_SIZE + 1
+ upper_limit = lower_limit + self._PAGE_SIZE - 1
+ content = self._download_json(
+ self._BY_SUBBRAND % (sb, lower_limit, upper_limit), sb)
+ for entry in content.get('entries') or []:
+ yield self.url_result(
+ 'mediaset:' + entry['guid'],
+ playlist_title=entry['mediasetprogram$subBrandDescription'])
+
+ def _real_extract(self, url):
+ playlist_id, st, sb = self._match_valid_url(url).group('id', 'st', 'sb')
+ if not sb:
+ page = self._download_webpage(url, st or playlist_id)
+ entries = [self.url_result(urljoin('https://www.mediasetplay.mediaset.it', url))
+ for url in re.findall(r'href="([^<>=]+SE\d{12},ST\d{12},sb\d{9})">[^<]+<', page)]
+ title = (self._html_search_regex(r'(?s)<h1[^>]*>(.+?)</h1>', page, 'title', default=None)
+ or self._og_search_title(page))
+ return self.playlist_result(entries, st or playlist_id, title)
+
+ entries = OnDemandPagedList(
+ functools.partial(self._fetch_page, sb),
+ self._PAGE_SIZE)
+ title = try_get(entries, lambda x: x[0]['playlist_title'])
+
+ return self.playlist_result(entries, sb, title)
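
Two details of the Mediaset rework deserve a note. `_check_drm_formats` guesses a clear manifest by rewriting the SAMPLE-AES URL and only adds it when no unencrypted format was found; the rewrite is the single `re.sub` shown here on a hypothetical manifest URL. Separately, `MediasetShowIE` pages through the feed 25 entries at a time (`range=1-25`, `26-50`, ...) via `OnDemandPagedList`.

    import re

    drm = 'https://vod.example/hls_sampleaes/video_fp_1080.m3u8'  # hypothetical
    nodrm = re.sub(r'_sampleaes/(\w+)_fp_', r'/\1_no_', drm)
    assert nodrm == 'https://vod.example/hls/video_no_1080.m3u8'
    # if the substitution changes nothing, there is no clear variant to try:
    assert re.sub(r'_sampleaes/(\w+)_fp_', r'/\1_no_', nodrm) == nodrm
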
diff --git a/hypervideo_dl/extractor/mediasite.py b/hypervideo_dl/extractor/mediasite.py
index ace86c2..fbf9223 100644
--- a/hypervideo_dl/extractor/mediasite.py
+++ b/hypervideo_dl/extractor/mediasite.py
@@ -14,6 +14,7 @@ from ..utils import (
float_or_none,
mimetype2ext,
str_or_none,
+ try_call,
try_get,
unescapeHTML,
unsmuggle_url,
@@ -145,11 +146,11 @@ class MediasiteIE(InfoExtractor):
'duration': slide['Time'] / 1000,
})
- next_time = try_get(None, [
- lambda _: Stream['Slides'][i + 1]['Time'],
- lambda _: duration,
- lambda _: slide['Time'],
- ], expected_type=(int, float))
+ next_time = try_call(
+ lambda: Stream['Slides'][i + 1]['Time'],
+ lambda: duration,
+ lambda: slide['Time'],
+ expected_type=(int, float))
fragments.append({
'path': fname_template.format(slide.get('Number', i + 1)),
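
The mediasite change is a drop-in: `try_call` evaluates each lambda in turn and returns the first result that neither raises nor fails the type check, which is exactly what the old `try_get(None, [...])` contortion emulated. A simplified model (the real helper catches a couple more exception types):

    def try_call(*funcs, expected_type=None):
        for f in funcs:
            try:
                val = f()
            except (AttributeError, KeyError, TypeError, IndexError):
                continue
            if expected_type is None or isinstance(val, expected_type):
                return val

    slides = [{'Time': 0}, {'Time': 4000}]
    duration = None
    # each slide looks ahead to the next; past the end it falls back
    assert try_call(lambda: slides[0 + 1]['Time'], lambda: duration,
                    lambda: slides[0]['Time'], expected_type=(int, float)) == 4000
    assert try_call(lambda: slides[1 + 1]['Time'], lambda: duration,
                    lambda: slides[1]['Time'], expected_type=(int, float)) == 4000
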
diff --git a/hypervideo_dl/extractor/megatvcom.py b/hypervideo_dl/extractor/megatvcom.py
new file mode 100644
index 0000000..0d6793a
--- /dev/null
+++ b/hypervideo_dl/extractor/megatvcom.py
@@ -0,0 +1,173 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ determine_ext,
+ ExtractorError,
+ extract_attributes,
+ get_element_by_class,
+ get_element_html_by_id,
+ HEADRequest,
+ parse_qs,
+ unescapeHTML,
+ unified_timestamp,
+)
+
+
+class MegaTVComBaseIE(InfoExtractor):
+ _PLAYER_DIV_ID = 'player_div_id'
+
+ def _extract_player_attrs(self, webpage):
+ player_el = get_element_html_by_id(self._PLAYER_DIV_ID, webpage)
+ return {
+ re.sub(r'^data-(?:kwik_)?', '', k): v
+ for k, v in extract_attributes(player_el).items()
+ if k not in ('id',)
+ }
+
+
+class MegaTVComIE(MegaTVComBaseIE):
+ IE_NAME = 'megatvcom'
+ IE_DESC = 'megatv.com videos'
+ _VALID_URL = r'https?://(?:www\.)?megatv\.com/(?:\d{4}/\d{2}/\d{2}|[^/]+/(?P<id>\d+))/(?P<slug>[^/]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.megatv.com/2021/10/23/egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia/',
+ 'md5': '6546a1a37fff0dd51c9dce5f490b7d7d',
+ 'info_dict': {
+ 'id': '520979',
+ 'ext': 'mp4',
+ 'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
+ 'description': 'md5:0209fa8d318128569c0d256a5c404db1',
+ 'timestamp': 1634975747,
+ 'upload_date': '20211023',
+ 'display_id': 'egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia',
+ 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/10/ΠΕΙΡΑΙΑΣ-1024x450.jpg',
+ },
+ }, {
+ 'url': 'https://www.megatv.com/tvshows/527800/epeisodio-65-12/',
+ 'md5': 'cba2085d45c1abeb8e7e9b7e1d6c0072',
+ 'info_dict': {
+ 'id': '527800',
+ 'ext': 'mp4',
+ 'title': 'md5:fc322cb51f682eecfe2f54cd5ab3a157',
+ 'description': 'md5:b2b7ed3690a78f2a0156eb790fdc00df',
+ 'timestamp': 1636048859,
+ 'upload_date': '20211104',
+ 'display_id': 'epeisodio-65-12',
+ 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/11/16-1-1.jpg',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id, display_id = self._match_valid_url(url).group('id', 'slug')
+ _is_article = video_id is None
+ webpage = self._download_webpage(url, video_id or display_id)
+ if _is_article:
+ video_id = self._search_regex(
+ r'<article[^>]*\sid=["\']Article_(\d+)["\']', webpage, 'article id')
+ player_attrs = self._extract_player_attrs(webpage)
+ title = player_attrs.get('label') or self._og_search_title(webpage)
+ description = get_element_by_class(
+ 'article-wrapper' if _is_article else 'story_content',
+ webpage)
+ description = clean_html(re.sub(r'<script[^>]*>[^<]+</script>', '', description))
+ if not description:
+ description = self._og_search_description(webpage)
+ thumbnail = player_attrs.get('image') or self._og_search_thumbnail(webpage)
+ timestamp = unified_timestamp(self._html_search_meta(
+ 'article:published_time', webpage))
+ source = player_attrs.get('source')
+ if not source:
+ raise ExtractorError('No source found', video_id=video_id)
+ if determine_ext(source) == 'm3u8':
+ formats, subs = self._extract_m3u8_formats_and_subtitles(source, video_id, 'mp4')
+ else:
+ formats, subs = [{'url': source}], {}
+ if player_attrs.get('subs'):
+ self._merge_subtitles({'und': [{'url': player_attrs['subs']}]}, target=subs)
+ self._sort_formats(formats)
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ 'subtitles': subs,
+ }
+
+
+class MegaTVComEmbedIE(MegaTVComBaseIE):
+ IE_NAME = 'megatvcom:embed'
+ IE_DESC = 'megatv.com embedded videos'
+ _VALID_URL = r'(?:https?:)?//(?:www\.)?megatv\.com/embed/?\?p=(?P<id>\d+)'
+ _EMBED_RE = re.compile(rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''')
+
+ _TESTS = [{
+ 'url': 'https://www.megatv.com/embed/?p=2020520979',
+ 'md5': '6546a1a37fff0dd51c9dce5f490b7d7d',
+ 'info_dict': {
+ 'id': '520979',
+ 'ext': 'mp4',
+ 'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
+ 'description': 'md5:0209fa8d318128569c0d256a5c404db1',
+ 'timestamp': 1634975747,
+ 'upload_date': '20211023',
+ 'display_id': 'egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia',
+ 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/10/ΠΕΙΡΑΙΑΣ-1024x450.jpg',
+ },
+ }, {
+ 'url': 'https://www.megatv.com/embed/?p=2020534081',
+ 'md5': '6ac8b3ce4dc6120c802f780a1e6b3812',
+ 'info_dict': {
+ 'id': '534081',
+ 'ext': 'mp4',
+ 'title': 'md5:062e9d5976ef854d8bdc1f5724d9b2d0',
+ 'description': 'md5:36dbe4c3762d2ede9513eea8d07f6d52',
+ 'timestamp': 1636376351,
+ 'upload_date': '20211108',
+ 'display_id': 'neo-rekor-stin-timi-tou-ilektrikou-reymatos-pano-apo-ta-200e-i-xondriki-timi-tou-ilektrikou',
+ 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/11/Capture-266.jpg',
+ },
+ }]
+
+ @classmethod
+ def _extract_urls(cls, webpage):
+ for mobj in cls._EMBED_RE.finditer(webpage):
+ yield unescapeHTML(mobj.group('url'))
+
+ def _match_canonical_url(self, webpage):
+ LINK_RE = r'''(?x)
+ <link(?:
+ rel=(?P<_q1>["'])(?P<canonical>canonical)(?P=_q1)|
+ href=(?P<_q2>["'])(?P<href>(?:(?!(?P=_q2)).)+)(?P=_q2)|
+ [^>]*?
+ )+>
+ '''
+ for mobj in re.finditer(LINK_RE, webpage):
+ canonical, href = mobj.group('canonical', 'href')
+ if canonical and href:
+ return unescapeHTML(href)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ player_attrs = self._extract_player_attrs(webpage)
+ canonical_url = player_attrs.get('share_url') or self._match_canonical_url(webpage)
+ if not canonical_url:
+ raise ExtractorError('canonical URL not found')
+ video_id = parse_qs(canonical_url)['p'][0]
+
+ # Defer to megatvcom, as the metadata extracted from the embeddable page
+ # sometimes differs slightly for the same video
+ canonical_url = self._request_webpage(
+ HEADRequest(canonical_url), video_id,
+ note='Resolve canonical URL',
+ errnote='Could not resolve canonical URL').geturl()
+ return self.url_result(canonical_url, MegaTVComIE.ie_key(), video_id)
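
MegaTVComEmbedIE deliberately extracts no media itself: it recovers the article's canonical URL (the player's `share_url` attribute, else the `<link rel="canonical">` tag), pulls the post id out of its `p` query parameter, resolves redirects with a cheap HEAD request, and defers to MegaTVComIE. The id step in isolation, on a made-up share URL:

    from urllib.parse import parse_qs, urlparse

    def post_id(share_url):
        # the WordPress post id rides in the ?p= query parameter
        return parse_qs(urlparse(share_url).query)['p'][0]

    assert post_id('https://www.megatv.com/?p=520979') == '520979'
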
diff --git a/hypervideo_dl/extractor/mgtv.py b/hypervideo_dl/extractor/mgtv.py
index cab3aa0..4ac70ea 100644
--- a/hypervideo_dl/extractor/mgtv.py
+++ b/hypervideo_dl/extractor/mgtv.py
@@ -13,12 +13,15 @@ from ..compat import (
from ..utils import (
ExtractorError,
int_or_none,
+ try_get,
+ url_or_none,
)
class MGTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html'
IE_DESC = '芒果TV'
+ IE_NAME = 'MangoTV'
_TESTS = [{
'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html',
@@ -31,6 +34,32 @@ class MGTVIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.jpg$',
},
}, {
+ 'url': 'https://w.mgtv.com/b/427837/15588271.html',
+ 'info_dict': {
+ 'id': '15588271',
+ 'ext': 'mp4',
+ 'title': '春日迟迟再出发 沉浸版',
+ 'description': 'md5:a7a05a05b1aa87bd50cae619b19bbca6',
+ 'thumbnail': r're:^https?://.+\.jpg',
+ 'duration': 4026,
+ },
+ }, {
+ 'url': 'https://w.mgtv.com/b/333652/7329822.html',
+ 'info_dict': {
+ 'id': '7329822',
+ 'ext': 'mp4',
+ 'title': '拜托,请你爱我',
+ 'description': 'md5:cd81be6499bafe32e4d143abd822bf9c',
+ 'thumbnail': r're:^https?://.+\.jpg',
+ 'duration': 2656,
+ },
+ }, {
+ 'url': 'https://w.mgtv.com/b/427837/15591647.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://w.mgtv.com/b/388252/15634192.html?fpa=33318&fpos=4&lastp=ch_home',
+ 'only_matching': True,
+ }, {
'url': 'http://www.mgtv.com/b/301817/3826653.html',
'only_matching': True,
}, {
@@ -40,12 +69,14 @@ class MGTVIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- tk2 = base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1]
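+ # tk2 is a reversed, urlsafe-base64 blob of device/client parameters expected by the player API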
+ tk2 = base64.urlsafe_b64encode(
+ f'did={compat_str(uuid.uuid4())}|pno=1030|ver=0.3.0301|clit={int(time.time())}'.encode())[::-1]
try:
api_data = self._download_json(
'https://pcweb.api.mgtv.com/player/video', video_id, query={
'tk2': tk2,
'video_id': video_id,
+ 'type': 'pch5'
}, headers=self.geo_verification_headers())['data']
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
@@ -61,6 +92,7 @@ class MGTVIE(InfoExtractor):
'pm2': api_data['atc']['pm2'],
'tk2': tk2,
'video_id': video_id,
+ 'src': 'intelmgtv',
}, headers=self.geo_verification_headers())['data']
stream_domain = stream_data['stream_domain'][0]
@@ -71,7 +103,7 @@ class MGTVIE(InfoExtractor):
continue
format_data = self._download_json(
stream_domain + stream_path, video_id,
- note='Download video info for format #%d' % idx)
+ note=f'Download video info for format #{idx}')
format_url = format_data.get('info')
if not format_url:
continue
@@ -79,7 +111,7 @@ class MGTVIE(InfoExtractor):
r'_(\d+)_mp4/', format_url, 'tbr', default=None))
formats.append({
'format_id': compat_str(tbr or idx),
- 'url': format_url,
+ 'url': url_or_none(format_url),
'ext': 'mp4',
'tbr': tbr,
'protocol': 'm3u8_native',
@@ -97,4 +129,25 @@ class MGTVIE(InfoExtractor):
'description': info.get('desc'),
'duration': int_or_none(info.get('duration')),
'thumbnail': info.get('thumb'),
+ 'subtitles': self.extract_subtitles(video_id, stream_domain),
}
+
+ def _get_subtitles(self, video_id, domain):
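+ # Two-step lookup: the title endpoint lists caption tracks, and each track URL is resolved against the stream domain to yield the actual SRT link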
+ info = self._download_json(f'https://pcweb.api.mgtv.com/video/title?videoId={video_id}',
+ video_id, fatal=False) or {}
+ subtitles = {}
+ for sub in try_get(info, lambda x: x['data']['title']) or []:
+ url_sub = sub.get('url')
+ if not url_sub:
+ continue
+ locale = sub.get('captionCountrySimpleName')
+ sub = self._download_json(f'{domain}{url_sub}', video_id, fatal=False,
+ note=f'Download subtitle for locale {sub.get("name")} ({locale})') or {}
+ sub_url = url_or_none(sub.get('info'))
+ if not sub_url:
+ continue
+ subtitles.setdefault(locale or 'en', []).append({
+ 'url': sub_url,
+ 'ext': 'srt'
+ })
+ return subtitles
diff --git a/hypervideo_dl/extractor/miaopai.py b/hypervideo_dl/extractor/miaopai.py
index f9e35ac..cf0610b 100644
--- a/hypervideo_dl/extractor/miaopai.py
+++ b/hypervideo_dl/extractor/miaopai.py
@@ -24,8 +24,7 @@ class MiaoPaiIE(InfoExtractor):
webpage = self._download_webpage(
url, video_id, headers={'User-Agent': self._USER_AGENT_IPAD})
- title = self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
thumbnail = self._html_search_regex(
r'<div[^>]+class=(?P<q1>[\'"]).*\bvideo_img\b.*(?P=q1)[^>]+data-url=(?P<q2>[\'"])(?P<url>[^\'"]+)(?P=q2)',
webpage, 'thumbnail', fatal=False, group='url')
diff --git a/hypervideo_dl/extractor/microsoftstream.py b/hypervideo_dl/extractor/microsoftstream.py
new file mode 100644
index 0000000..4d5a9df
--- /dev/null
+++ b/hypervideo_dl/extractor/microsoftstream.py
@@ -0,0 +1,125 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from base64 import b64decode
+
+from .common import InfoExtractor
+from ..utils import (
+ merge_dicts,
+ parse_iso8601,
+ parse_duration,
+ parse_resolution,
+ try_get,
+ url_basename,
+)
+
+
+class MicrosoftStreamIE(InfoExtractor):
+ IE_NAME = 'microsoftstream'
+ IE_DESC = 'Microsoft Stream'
+ _VALID_URL = r'https?://(?:web|www|msit)\.microsoftstream\.com/video/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+
+ _TESTS = [{
+ 'url': 'https://web.microsoftstream.com/video/6e51d928-4f46-4f1c-b141-369925e37b62?list=user&userId=f5491e02-e8fe-4e34-b67c-ec2e79a6ecc0',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://msit.microsoftstream.com/video/b60f5987-aabd-4e1c-a42f-c559d138f2ca',
+ 'only_matching': True,
+ }]
+
+ def _get_all_subtitles(self, api_url, video_id, headers):
+ subtitles = {}
+ automatic_captions = {}
+ text_tracks = self._download_json(
+ f'{api_url}/videos/{video_id}/texttracks', video_id,
+ note='Downloading subtitles JSON', fatal=False, headers=headers,
+ query={'api-version': '1.4-private'}).get('value') or []
+ for track in text_tracks:
+ if not track.get('language') or not track.get('url'):
+ continue
+ sub_dict = automatic_captions if track.get('autoGenerated') else subtitles
+ sub_dict.setdefault(track['language'], []).append({
+ 'ext': 'vtt',
+ 'url': track.get('url')
+ })
+ return {
+ 'subtitles': subtitles,
+ 'automatic_captions': automatic_captions
+ }
+
+ def extract_all_subtitles(self, *args, **kwargs):
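+ # Only hit the texttracks endpoint when the user actually asked for subtitles or captions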
+ if (self.get_param('writesubtitles', False)
+ or self.get_param('writeautomaticsub', False)
+ or self.get_param('listsubtitles')):
+ return self._get_all_subtitles(*args, **kwargs)
+ return {}
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ if '<title>Microsoft Stream</title>' not in webpage:
+ self.raise_login_required(method='cookies')
+
+ access_token = self._html_search_regex(r'"AccessToken":"(.+?)"', webpage, 'access token')
+ api_url = self._html_search_regex(r'"ApiGatewayUri":"(.+?)"', webpage, 'api url')
+
+ headers = {'Authorization': f'Bearer {access_token}'}
+
+ video_data = self._download_json(
+ f'{api_url}/videos/{video_id}', video_id,
+ headers=headers, query={
+ '$expand': 'creator,tokens,status,liveEvent,extensions',
+ 'api-version': '1.4-private'
+ })
+ video_id = video_data.get('id') or video_id
+ language = video_data.get('language')
+
+ thumbnails = []
+ for thumbnail_id in ('extraSmall', 'small', 'medium', 'large'):
+ thumbnail_url = try_get(video_data, lambda x: x['posterImage'][thumbnail_id]['url'], str)
+ if not thumbnail_url:
+ continue
+ thumb = {
+ 'id': thumbnail_id,
+ 'url': thumbnail_url,
+ }
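+ # The thumbnail basename is unpadded base64; restore '=' padding to a multiple of 4, then parse the "WxH" resolution out of the decoded name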
+ thumb_name = url_basename(thumbnail_url)
+ thumb_name = b64decode(thumb_name + '=' * (-len(thumb_name) % 4)).decode('utf-8', 'replace')
+ thumb.update(parse_resolution(thumb_name))
+ thumbnails.append(thumb)
+
+ formats = []
+ for playlist in video_data['playbackUrls']:
+ if playlist['mimeType'] == 'application/vnd.apple.mpegurl':
+ formats.extend(self._extract_m3u8_formats(
+ playlist['playbackUrl'], video_id,
+ ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls',
+ fatal=False, headers=headers))
+ elif playlist['mimeType'] == 'application/dash+xml':
+ formats.extend(self._extract_mpd_formats(
+ playlist['playbackUrl'], video_id, mpd_id='dash',
+ fatal=False, headers=headers))
+ elif playlist['mimeType'] == 'application/vnd.ms-sstr+xml':
+ formats.extend(self._extract_ism_formats(
+ playlist['playbackUrl'], video_id, ism_id='mss',
+ fatal=False, headers=headers))
+ formats = [merge_dicts(f, {'language': language}) for f in formats]
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video_data['name'],
+ 'description': video_data.get('description'),
+ 'uploader': try_get(video_data, lambda x: x['creator']['name'], str),
+ 'uploader_id': try_get(video_data, (lambda x: x['creator']['mail'],
+ lambda x: x['creator']['id']), str),
+ 'thumbnails': thumbnails,
+ **self.extract_all_subtitles(api_url, video_id, headers),
+ 'timestamp': parse_iso8601(video_data.get('created')),
+ 'duration': parse_duration(try_get(video_data, lambda x: x['media']['duration'])),
+ 'webpage_url': f'https://web.microsoftstream.com/video/{video_id}',
+ 'view_count': try_get(video_data, lambda x: x['metrics']['views'], int),
+ 'like_count': try_get(video_data, lambda x: x['metrics']['likes'], int),
+ 'comment_count': try_get(video_data, lambda x: x['metrics']['comments'], int),
+ 'formats': formats,
+ }
diff --git a/hypervideo_dl/extractor/mildom.py b/hypervideo_dl/extractor/mildom.py
index c147cbb..5f2df29 100644
--- a/hypervideo_dl/extractor/mildom.py
+++ b/hypervideo_dl/extractor/mildom.py
@@ -1,92 +1,42 @@
# coding: utf-8
from __future__ import unicode_literals
-import base64
-from datetime import datetime
-import itertools
+import functools
import json
from .common import InfoExtractor
from ..utils import (
- std_headers,
- update_url_query,
+ determine_ext,
+ dict_get,
+ ExtractorError,
+ float_or_none,
+ OnDemandPagedList,
random_uuidv4,
- try_get,
-)
-from ..compat import (
- compat_str,
+ traverse_obj,
)
class MildomBaseIE(InfoExtractor):
_GUEST_ID = None
- _DISPATCHER_CONFIG = None
-
- def _call_api(self, url, video_id, query={}, note='Downloading JSON metadata', init=False):
- url = update_url_query(url, self._common_queries(query, init=init))
- return self._download_json(url, video_id, note=note)['body']
-
- def _common_queries(self, query={}, init=False):
- dc = self._fetch_dispatcher_config()
- r = {
- 'timestamp': self.iso_timestamp(),
- '__guest_id': '' if init else self.guest_id(),
- '__location': dc['location'],
- '__country': dc['country'],
- '__cluster': dc['cluster'],
- '__platform': 'web',
- '__la': self.lang_code(),
- '__pcv': 'v2.9.44',
- 'sfr': 'pc',
- 'accessToken': '',
- }
- r.update(query)
- return r
-
- def _fetch_dispatcher_config(self):
- if not self._DISPATCHER_CONFIG:
- tmp = self._download_json(
- 'https://disp.mildom.com/serverListV2', 'initialization',
- note='Downloading dispatcher_config', data=json.dumps({
- 'protover': 0,
- 'data': base64.b64encode(json.dumps({
- 'fr': 'web',
- 'sfr': 'pc',
- 'devi': 'Windows',
- 'la': 'ja',
- 'gid': None,
- 'loc': '',
- 'clu': '',
- 'wh': '1919*810',
- 'rtm': self.iso_timestamp(),
- 'ua': std_headers['User-Agent'],
- }).encode('utf8')).decode('utf8').replace('\n', ''),
- }).encode('utf8'))
- self._DISPATCHER_CONFIG = self._parse_json(base64.b64decode(tmp['data']), 'initialization')
- return self._DISPATCHER_CONFIG
-
- @staticmethod
- def iso_timestamp():
- 'new Date().toISOString()'
- return datetime.utcnow().isoformat()[0:-3] + 'Z'
-
- def guest_id(self):
- 'getGuestId'
- if self._GUEST_ID:
- return self._GUEST_ID
- self._GUEST_ID = try_get(
- self, (
- lambda x: x._call_api(
- 'https://cloudac.mildom.com/nonolive/gappserv/guest/h5init', 'initialization',
- note='Downloading guest token', init=True)['guest_id'] or None,
- lambda x: x._get_cookies('https://www.mildom.com').get('gid').value,
- lambda x: x._get_cookies('https://m.mildom.com').get('gid').value,
- ), compat_str) or ''
- return self._GUEST_ID
-
- def lang_code(self):
- 'getCurrentLangCode'
- return 'ja'
+
+ def _call_api(self, url, video_id, query=None, note='Downloading JSON metadata', body=None):
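+ # A locally generated guest id replaces the old serverListV2/h5init handshake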
+ if not self._GUEST_ID:
+ self._GUEST_ID = f'pc-gp-{random_uuidv4()}'
+
+ content = self._download_json(
+ url, video_id, note=note, data=json.dumps(body).encode() if body else None,
+ headers={'Content-Type': 'application/json'} if body else {},
+ query={
+ '__guest_id': self._GUEST_ID,
+ '__platform': 'web',
+ **(query or {}),
+ })
+
+ if content['code'] != 0:
+ raise ExtractorError(
+ f'Mildom says: {content["message"]} (code {content["code"]})',
+ expected=True)
+ return content['body']
class MildomIE(MildomBaseIE):
@@ -96,31 +46,13 @@ class MildomIE(MildomBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url)
- url = 'https://www.mildom.com/%s' % video_id
-
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(f'https://www.mildom.com/{video_id}', video_id)
enterstudio = self._call_api(
'https://cloudac.mildom.com/nonolive/gappserv/live/enterstudio', video_id,
note='Downloading live metadata', query={'user_id': video_id})
result_video_id = enterstudio.get('log_id', video_id)
- title = try_get(
- enterstudio, (
- lambda x: self._html_search_meta('twitter:description', webpage),
- lambda x: x['anchor_intro'],
- ), compat_str)
- description = try_get(
- enterstudio, (
- lambda x: x['intro'],
- lambda x: x['live_intro'],
- ), compat_str)
- uploader = try_get(
- enterstudio, (
- lambda x: self._html_search_meta('twitter:title', webpage),
- lambda x: x['loginname'],
- ), compat_str)
-
servers = self._call_api(
'https://cloudac.mildom.com/nonolive/gappserv/live/liveserver', result_video_id,
note='Downloading live server list', query={
@@ -128,17 +60,20 @@ class MildomIE(MildomBaseIE):
'live_server_type': 'hls',
})
- stream_query = self._common_queries({
- 'streamReqId': random_uuidv4(),
- 'is_lhls': '0',
- })
- m3u8_url = update_url_query(servers['stream_server'] + '/%s_master.m3u8' % video_id, stream_query)
- formats = self._extract_m3u8_formats(m3u8_url, result_video_id, 'mp4', headers={
- 'Referer': 'https://www.mildom.com/',
- 'Origin': 'https://www.mildom.com',
- }, note='Downloading m3u8 information')
-
- del stream_query['streamReqId'], stream_query['timestamp']
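+ # The HLS master playlist now requires a per-host playback token; the token endpoint returns a list under "data" and the first available token is used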
+ playback_token = self._call_api(
+ 'https://cloudac.mildom.com/nonolive/gappserv/live/token', result_video_id,
+ note='Obtaining live playback token', body={'host_id': video_id, 'type': 'hls'})
+ playback_token = traverse_obj(playback_token, ('data', ..., 'token'), get_all=False)
+ if not playback_token:
+ raise ExtractorError('Failed to obtain live playback token')
+
+ formats = self._extract_m3u8_formats(
+ f'{servers["stream_server"]}/{video_id}_master.m3u8?{playback_token}',
+ result_video_id, 'mp4', headers={
+ 'Referer': 'https://www.mildom.com/',
+ 'Origin': 'https://www.mildom.com',
+ })
+
for fmt in formats:
fmt.setdefault('http_headers', {})['Referer'] = 'https://www.mildom.com/'
@@ -146,9 +81,10 @@ class MildomIE(MildomBaseIE):
return {
'id': result_video_id,
- 'title': title,
- 'description': description,
- 'uploader': uploader,
+ 'title': self._html_search_meta('twitter:description', webpage, default=None) or traverse_obj(enterstudio, 'anchor_intro'),
+ 'description': traverse_obj(enterstudio, 'intro', 'live_intro', expected_type=str),
+ 'timestamp': float_or_none(enterstudio.get('live_start_ms'), scale=1000),
+ 'uploader': self._html_search_meta('twitter:title', webpage, default=None) or traverse_obj(enterstudio, 'loginname'),
'uploader_id': video_id,
'formats': formats,
'is_live': True,
@@ -157,15 +93,55 @@ class MildomIE(MildomBaseIE):
class MildomVodIE(MildomBaseIE):
IE_NAME = 'mildom:vod'
- IE_DESC = 'Download a VOD in Mildom'
- _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P<user_id>\d+)/(?P<id>(?P=user_id)-[a-zA-Z0-9]+)'
+ IE_DESC = 'VOD in Mildom'
+ _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P<user_id>\d+)/(?P<id>(?P=user_id)-[a-zA-Z0-9]+-?[0-9]*)'
+ _TESTS = [{
+ 'url': 'https://www.mildom.com/playback/10882672/10882672-1597662269',
+ 'info_dict': {
+ 'id': '10882672-1597662269',
+ 'ext': 'mp4',
+ 'title': '始めてのミルダム配信じゃぃ!',
+ 'thumbnail': r're:^https?://.*\.(png|jpg)$',
+ 'upload_date': '20200817',
+ 'duration': 4138.37,
+ 'description': 'ゲームをしたくて!',
+ 'timestamp': 1597662269.0,
+ 'uploader_id': '10882672',
+ 'uploader': 'kson組長(けいそん)',
+ },
+ }, {
+ 'url': 'https://www.mildom.com/playback/10882672/10882672-1597758589870-477',
+ 'info_dict': {
+ 'id': '10882672-1597758589870-477',
+ 'ext': 'mp4',
+ 'title': '【kson】感染メイズ!麻酔銃で無双する',
+ 'thumbnail': r're:^https?://.*\.(png|jpg)$',
+ 'timestamp': 1597759093.0,
+ 'uploader': 'kson組長(けいそん)',
+ 'duration': 4302.58,
+ 'uploader_id': '10882672',
+ 'description': 'このステージ絶対乗り越えたい',
+ 'upload_date': '20200818',
+ },
+ }, {
+ 'url': 'https://www.mildom.com/playback/10882672/10882672-buha9td2lrn97fk2jme0',
+ 'info_dict': {
+ 'id': '10882672-buha9td2lrn97fk2jme0',
+ 'ext': 'mp4',
+ 'title': '【kson組長】CART RACER!!!',
+ 'thumbnail': r're:^https?://.*\.(png|jpg)$',
+ 'uploader_id': '10882672',
+ 'uploader': 'kson組長(けいそん)',
+ 'upload_date': '20201104',
+ 'timestamp': 1604494797.0,
+ 'duration': 4657.25,
+ 'description': 'WTF',
+ },
+ }]
def _real_extract(self, url):
- m = self._match_valid_url(url)
- user_id, video_id = m.group('user_id'), m.group('id')
- url = 'https://www.mildom.com/playback/%s/%s' % (user_id, video_id)
-
- webpage = self._download_webpage(url, video_id)
+ user_id, video_id = self._match_valid_url(url).group('user_id', 'id')
+ webpage = self._download_webpage(f'https://www.mildom.com/playback/{user_id}/{video_id}', video_id)
autoplay = self._call_api(
'https://cloudac.mildom.com/nonolive/videocontent/playback/getPlaybackDetail', video_id,
@@ -173,20 +149,6 @@ class MildomVodIE(MildomBaseIE):
'v_id': video_id,
})['playback']
- title = try_get(
- autoplay, (
- lambda x: self._html_search_meta('og:description', webpage),
- lambda x: x['title'],
- ), compat_str)
- description = try_get(
- autoplay, (
- lambda x: x['video_intro'],
- ), compat_str)
- uploader = try_get(
- autoplay, (
- lambda x: x['author_info']['login_name'],
- ), compat_str)
-
formats = [{
'url': autoplay['audio_url'],
'format_id': 'audio',
@@ -211,14 +173,81 @@ class MildomVodIE(MildomBaseIE):
return {
'id': video_id,
- 'title': title,
- 'description': description,
- 'uploader': uploader,
+ 'title': self._html_search_meta(('og:description', 'description'), webpage, default=None) or autoplay.get('title'),
+ 'description': traverse_obj(autoplay, 'video_intro'),
+ 'timestamp': float_or_none(autoplay.get('publish_time'), scale=1000),
+ 'duration': float_or_none(autoplay.get('video_length'), scale=1000),
+ 'thumbnail': dict_get(autoplay, ('upload_pic', 'video_pic')),
+ 'uploader': traverse_obj(autoplay, ('author_info', 'login_name')),
'uploader_id': user_id,
'formats': formats,
}
+class MildomClipIE(MildomBaseIE):
+ IE_NAME = 'mildom:clip'
+ IE_DESC = 'Clip in Mildom'
+ _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/clip/(?P<id>(?P<user_id>\d+)-[a-zA-Z0-9]+)'
+ _TESTS = [{
+ 'url': 'https://www.mildom.com/clip/10042245-63921673e7b147ebb0806d42b5ba5ce9',
+ 'info_dict': {
+ 'id': '10042245-63921673e7b147ebb0806d42b5ba5ce9',
+ 'title': '全然違ったよ',
+ 'timestamp': 1619181890,
+ 'duration': 59,
+ 'thumbnail': r're:https?://.+',
+ 'uploader': 'ざきんぽ',
+ 'uploader_id': '10042245',
+ },
+ }, {
+ 'url': 'https://www.mildom.com/clip/10111524-ebf4036e5aa8411c99fb3a1ae0902864',
+ 'info_dict': {
+ 'id': '10111524-ebf4036e5aa8411c99fb3a1ae0902864',
+ 'title': 'かっこいい',
+ 'timestamp': 1621094003,
+ 'duration': 59,
+ 'thumbnail': r're:https?://.+',
+ 'uploader': '(ルーキー',
+ 'uploader_id': '10111524',
+ },
+ }, {
+ 'url': 'https://www.mildom.com/clip/10660174-2c539e6e277c4aaeb4b1fbe8d22cb902',
+ 'info_dict': {
+ 'id': '10660174-2c539e6e277c4aaeb4b1fbe8d22cb902',
+ 'title': 'あ',
+ 'timestamp': 1614769431,
+ 'duration': 31,
+ 'thumbnail': r're:https?://.+',
+ 'uploader': 'ドルゴルスレンギーン=ダグワドルジ',
+ 'uploader_id': '10660174',
+ },
+ }]
+
+ def _real_extract(self, url):
+ user_id, video_id = self._match_valid_url(url).group('user_id', 'id')
+ webpage = self._download_webpage(f'https://www.mildom.com/clip/{video_id}', video_id)
+
+ clip_detail = self._call_api(
+ 'https://cloudac-cf-jp.mildom.com/nonolive/videocontent/clip/detail', video_id,
+ note='Downloading playback metadata', query={
+ 'clip_id': video_id,
+ })
+
+ return {
+ 'id': video_id,
+ 'title': self._html_search_meta(
+ ('og:description', 'description'), webpage, default=None) or clip_detail.get('title'),
+ 'timestamp': float_or_none(clip_detail.get('create_time')),
+ 'duration': float_or_none(clip_detail.get('length')),
+ 'thumbnail': clip_detail.get('cover'),
+ 'uploader': traverse_obj(clip_detail, ('user_info', 'loginname')),
+ 'uploader_id': user_id,
+
+ 'url': clip_detail['url'],
+ 'ext': determine_ext(clip_detail.get('url'), 'mp4'),
+ }
+
+
class MildomUserVodIE(MildomBaseIE):
IE_NAME = 'mildom:user:vod'
IE_DESC = 'Download all VODs from specific user in Mildom'
@@ -229,22 +258,32 @@ class MildomUserVodIE(MildomBaseIE):
'id': '10093333',
'title': 'Uploads from ねこばたけ',
},
- 'playlist_mincount': 351,
+ 'playlist_mincount': 732,
+ }, {
+ 'url': 'https://www.mildom.com/profile/10882672',
+ 'info_dict': {
+ 'id': '10882672',
+ 'title': 'Uploads from kson組長(けいそん)',
+ },
+ 'playlist_mincount': 201,
}]
- def _entries(self, user_id):
- for page in itertools.count(1):
- reply = self._call_api(
- 'https://cloudac.mildom.com/nonolive/videocontent/profile/playbackList',
- user_id, note='Downloading page %d' % page, query={
- 'user_id': user_id,
- 'page': page,
- 'limit': '30',
- })
- if not reply:
- break
- for x in reply:
- yield self.url_result('https://www.mildom.com/playback/%s/%s' % (user_id, x['v_id']))
+ def _fetch_page(self, user_id, page):
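+ # OnDemandPagedList passes 0-based page indices, while Mildom's API expects 1-based pages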
+ page += 1
+ reply = self._call_api(
+ 'https://cloudac.mildom.com/nonolive/videocontent/profile/playbackList',
+ user_id, note=f'Downloading page {page}', query={
+ 'user_id': user_id,
+ 'page': page,
+ 'limit': '30',
+ })
+ if not reply:
+ return
+ for x in reply:
+ v_id = x.get('v_id')
+ if not v_id:
+ continue
+ yield self.url_result(f'https://www.mildom.com/playback/{user_id}/{v_id}')
def _real_extract(self, url):
user_id = self._match_id(url)
@@ -255,4 +294,5 @@ class MildomUserVodIE(MildomBaseIE):
query={'user_id': user_id}, note='Downloading user profile')['user_info']
return self.playlist_result(
- self._entries(user_id), user_id, 'Uploads from %s' % profile['loginname'])
+ OnDemandPagedList(functools.partial(self._fetch_page, user_id), 30),
+ user_id, f'Uploads from {profile["loginname"]}')
diff --git a/hypervideo_dl/extractor/minds.py b/hypervideo_dl/extractor/minds.py
index 8e9f0f8..9da0720 100644
--- a/hypervideo_dl/extractor/minds.py
+++ b/hypervideo_dl/extractor/minds.py
@@ -5,6 +5,7 @@ from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
clean_html,
+ format_field,
int_or_none,
str_or_none,
strip_or_none,
@@ -120,7 +121,7 @@ class MindsIE(MindsBaseIE):
'timestamp': int_or_none(entity.get('time_created')),
'uploader': strip_or_none(owner.get('name')),
'uploader_id': uploader_id,
- 'uploader_url': 'https://www.minds.com/' + uploader_id if uploader_id else None,
+ 'uploader_url': format_field(uploader_id, template='https://www.minds.com/%s'),
'view_count': int_or_none(entity.get('play:count')),
'like_count': int_or_none(entity.get('thumbs:up:count')),
'dislike_count': int_or_none(entity.get('thumbs:down:count')),
diff --git a/hypervideo_dl/extractor/mirrativ.py b/hypervideo_dl/extractor/mirrativ.py
index 81aea54..2111de6 100644
--- a/hypervideo_dl/extractor/mirrativ.py
+++ b/hypervideo_dl/extractor/mirrativ.py
@@ -19,9 +19,25 @@ class MirrativBaseIE(InfoExtractor):
class MirrativIE(MirrativBaseIE):
IE_NAME = 'mirrativ'
_VALID_URL = r'https?://(?:www\.)?mirrativ\.com/live/(?P<id>[^/?#&]+)'
- LIVE_API_URL = 'https://www.mirrativ.com/api/live/live?live_id=%s'
TESTS = [{
+ 'url': 'https://mirrativ.com/live/UQomuS7EMgHoxRHjEhNiHw',
+ 'info_dict': {
+ 'id': 'UQomuS7EMgHoxRHjEhNiHw',
+ 'title': 'ねむいぃ、。『参加型』🔰jcが初めてやるCOD✨初見さん大歓迎💗',
+ 'is_live': True,
+ 'description': 'md5:bfcd8f77f2fab24c3c672e5620f3f16e',
+ 'thumbnail': r're:https?://.+',
+ 'uploader': '# あ ち ゅ 。💡',
+ 'uploader_id': '118572165',
+ 'duration': None,
+ 'view_count': 1241,
+ 'release_timestamp': 1646229192,
+ 'timestamp': 1646229167,
+ 'was_live': False,
+ },
+ 'skip': 'livestream',
+ }, {
'url': 'https://mirrativ.com/live/POxyuG1KmW2982lqlDTuPw',
'only_matching': True,
}]
@@ -29,12 +45,11 @@ class MirrativIE(MirrativBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage('https://www.mirrativ.com/live/%s' % video_id, video_id)
- live_response = self._download_json(self.LIVE_API_URL % video_id, video_id)
+ live_response = self._download_json(f'https://www.mirrativ.com/api/live/live?live_id={video_id}', video_id)
self.assert_error(live_response)
hls_url = dict_get(live_response, ('archive_url_hls', 'streaming_url_hls'))
is_live = bool(live_response.get('is_live'))
- was_live = bool(live_response.get('is_archive'))
if not hls_url:
raise ExtractorError('Neither archive nor live is available.', expected=True)
@@ -42,55 +57,29 @@ class MirrativIE(MirrativBaseIE):
hls_url, video_id,
ext='mp4', entry_protocol='m3u8_native',
m3u8_id='hls', live=is_live)
- rtmp_url = live_response.get('streaming_url_edge')
- if rtmp_url:
- keys_to_copy = ('width', 'height', 'vcodec', 'acodec', 'tbr')
- fmt = {
- 'format_id': 'rtmp',
- 'url': rtmp_url,
- 'protocol': 'rtmp',
- 'ext': 'mp4',
- }
- fmt.update({k: traverse_obj(formats, (0, k)) for k in keys_to_copy})
- formats.append(fmt)
self._sort_formats(formats)
- title = self._og_search_title(webpage, default=None) or self._search_regex(
- r'<title>\s*(.+?) - Mirrativ\s*</title>', webpage) or live_response.get('title')
- description = live_response.get('description')
- thumbnail = live_response.get('image_url')
-
- duration = try_get(live_response, lambda x: x['ended_at'] - x['started_at'])
- view_count = live_response.get('total_viewer_num')
- release_timestamp = live_response.get('started_at')
- timestamp = live_response.get('created_at')
-
- owner = live_response.get('owner', {})
- uploader = owner.get('name')
- uploader_id = owner.get('user_id')
-
return {
'id': video_id,
- 'title': title,
+ 'title': self._og_search_title(webpage, default=None) or self._search_regex(
+ r'<title>\s*(.+?) - Mirrativ\s*</title>', webpage) or live_response.get('title'),
'is_live': is_live,
- 'description': description,
+ 'description': live_response.get('description'),
'formats': formats,
- 'thumbnail': thumbnail,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'duration': duration,
- 'view_count': view_count,
- 'release_timestamp': release_timestamp,
- 'timestamp': timestamp,
- 'was_live': was_live,
+ 'thumbnail': live_response.get('image_url'),
+ 'uploader': traverse_obj(live_response, ('owner', 'name')),
+ 'uploader_id': traverse_obj(live_response, ('owner', 'user_id')),
+ 'duration': try_get(live_response, lambda x: x['ended_at'] - x['started_at']) if not is_live else None,
+ 'view_count': live_response.get('total_viewer_num'),
+ 'release_timestamp': live_response.get('started_at'),
+ 'timestamp': live_response.get('created_at'),
+ 'was_live': bool(live_response.get('is_archive')),
}
class MirrativUserIE(MirrativBaseIE):
IE_NAME = 'mirrativ:user'
_VALID_URL = r'https?://(?:www\.)?mirrativ\.com/user/(?P<id>\d+)'
- LIVE_HISTORY_API_URL = 'https://www.mirrativ.com/api/live/live_history?user_id=%s&page=%d'
- USER_INFO_API_URL = 'https://www.mirrativ.com/api/user/profile?user_id=%s'
_TESTS = [{
# Live archive is available up to 3 days
@@ -104,8 +93,8 @@ class MirrativUserIE(MirrativBaseIE):
page = 1
while page is not None:
api_response = self._download_json(
- self.LIVE_HISTORY_API_URL % (user_id, page), user_id,
- note='Downloading page %d' % page)
+ f'https://www.mirrativ.com/api/live/live_history?user_id={user_id}&page={page}', user_id,
+ note=f'Downloading page {page}')
self.assert_error(api_response)
lives = api_response.get('lives')
if not lives:
@@ -123,12 +112,10 @@ class MirrativUserIE(MirrativBaseIE):
def _real_extract(self, url):
user_id = self._match_id(url)
user_info = self._download_json(
- self.USER_INFO_API_URL % user_id, user_id,
+ f'https://www.mirrativ.com/api/user/profile?user_id={user_id}', user_id,
note='Downloading user info', fatal=False)
self.assert_error(user_info)
- uploader = user_info.get('name')
- description = user_info.get('description')
-
- entries = self._entries(user_id)
- return self.playlist_result(entries, user_id, uploader, description)
+ return self.playlist_result(
+ self._entries(user_id), user_id,
+ user_info.get('name'), user_info.get('description'))
diff --git a/hypervideo_dl/extractor/mixch.py b/hypervideo_dl/extractor/mixch.py
new file mode 100644
index 0000000..31f450d
--- /dev/null
+++ b/hypervideo_dl/extractor/mixch.py
@@ -0,0 +1,85 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ traverse_obj,
+)
+
+
+class MixchIE(InfoExtractor):
+ IE_NAME = 'mixch'
+ _VALID_URL = r'https?://(?:www\.)?mixch\.tv/u/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://mixch.tv/u/16236849/live',
+ 'skip': 'don\'t know if this live persists',
+ 'info_dict': {
+ 'id': '16236849',
+ 'title': '24配信シェア⭕️投票🙏💦',
+ 'comment_count': 13145,
+ 'view_count': 28348,
+ 'timestamp': 1636189377,
+ 'uploader': '🦥伊咲👶🏻#フレアワ',
+ 'uploader_id': '16236849',
+ }
+ }, {
+ 'url': 'https://mixch.tv/u/16137876/live',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(f'https://mixch.tv/u/{video_id}/live', video_id)
+
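+ # The player state is embedded as window.__INITIAL_JS_STATE__ = {...}; its liveInfo key disappears once the stream ends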
+ initial_js_state = self._parse_json(self._search_regex(
+ r'(?m)^\s*window\.__INITIAL_JS_STATE__\s*=\s*(\{.+?\});\s*$', webpage, 'initial JS state'), video_id)
+ if not initial_js_state.get('liveInfo'):
+ raise ExtractorError('Livestream has ended.', expected=True)
+
+ return {
+ 'id': video_id,
+ 'title': traverse_obj(initial_js_state, ('liveInfo', 'title')),
+ 'comment_count': traverse_obj(initial_js_state, ('liveInfo', 'comments')),
+ 'view_count': traverse_obj(initial_js_state, ('liveInfo', 'visitor')),
+ 'timestamp': traverse_obj(initial_js_state, ('liveInfo', 'created')),
+ 'uploader': traverse_obj(initial_js_state, ('broadcasterInfo', 'name')),
+ 'uploader_id': video_id,
+ 'formats': [{
+ 'format_id': 'hls',
+ 'url': traverse_obj(initial_js_state, ('liveInfo', 'hls')) or 'https://d1hd0ww6piyb43.cloudfront.net/hls/torte_%s.m3u8' % video_id,
+ 'ext': 'mp4',
+ 'protocol': 'm3u8',
+ }],
+ 'is_live': True,
+ }
+
+
+class MixchArchiveIE(InfoExtractor):
+ IE_NAME = 'mixch:archive'
+ _VALID_URL = r'https?://(?:www\.)?mixch\.tv/archive/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://mixch.tv/archive/421',
+ 'skip': 'paid video, no DRM. expires at Jan 23',
+ 'info_dict': {
+ 'id': '421',
+ 'title': '96NEKO SHOW TIME',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
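+ # The page renders <video-js> custom elements; renaming them to <video> lets the generic HTML5 media parser find the HLS source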
+ html5_videos = self._parse_html5_media_entries(
+ url, webpage.replace('video-js', 'video'), video_id, 'hls')
+ if not html5_videos:
+ self.raise_login_required(method='cookies')
+ infodict = html5_videos[0]
+ infodict.update({
+ 'id': video_id,
+ 'title': self._html_search_regex(r'class="archive-title">(.+?)</', webpage, 'title')
+ })
+
+ return infodict
diff --git a/hypervideo_dl/extractor/mixcloud.py b/hypervideo_dl/extractor/mixcloud.py
index a0c043d..c2dd078 100644
--- a/hypervideo_dl/extractor/mixcloud.py
+++ b/hypervideo_dl/extractor/mixcloud.py
@@ -12,6 +12,7 @@ from ..compat import (
compat_zip
)
from ..utils import (
+ ExtractorError,
int_or_none,
parse_iso8601,
strip_or_none,
@@ -125,7 +126,20 @@ class MixcloudIE(MixcloudBaseIE):
tag {
name
}
- }''', track_id, username, slug)
+ }
+ restrictedReason
+ id''', track_id, username, slug)
+
+ if not cloudcast:
+ raise ExtractorError('Track not found', expected=True)
+
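+ # Map known restrictedReason values to clearer user-facing errors before falling back to a generic message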
+ reason = cloudcast.get('restrictedReason')
+ if reason == 'tracklist':
+ raise ExtractorError('Track unavailable in your country due to licensing restrictions', expected=True)
+ elif reason == 'repeat_play':
+ raise ExtractorError('You have reached your play limit for this track', expected=True)
+ elif reason:
+ raise ExtractorError('Track is restricted', expected=True)
title = cloudcast['name']
diff --git a/hypervideo_dl/extractor/mlssoccer.py b/hypervideo_dl/extractor/mlssoccer.py
new file mode 100644
index 0000000..1d6d4b8
--- /dev/null
+++ b/hypervideo_dl/extractor/mlssoccer.py
@@ -0,0 +1,117 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class MLSSoccerIE(InfoExtractor):
+ _VALID_DOMAINS = r'(?:(?:cfmontreal|intermiamicf|lagalaxy|lafc|houstondynamofc|dcunited|atlutd|mlssoccer|fcdallas|columbuscrew|coloradorapids|fccincinnati|chicagofirefc|austinfc|nashvillesc|whitecapsfc|sportingkc|soundersfc|sjearthquakes|rsl|timbers|philadelphiaunion|orlandocitysc|newyorkredbulls|nycfc)\.com|(?:torontofc)\.ca|(?:revolutionsoccer)\.net)'
+ _VALID_URL = r'https?://(?:www\.)?%s/video/#?(?P<id>[^/&$#?]+)' % _VALID_DOMAINS
+
+ _TESTS = [{
+ 'url': 'https://www.mlssoccer.com/video/the-octagon-can-alphonso-davies-lead-canada-to-first-world-cup-since-1986#the-octagon-can-alphonso-davies-lead-canada-to-first-world-cup-since-1986',
+ 'info_dict': {
+ 'id': '6276033198001',
+ 'ext': 'mp4',
+ 'title': 'The Octagon | Can Alphonso Davies lead Canada to first World Cup since 1986?',
+ 'description': 'md5:f0a883ee33592a0221798f451a98be8f',
+ 'thumbnail': 'https://cf-images.us-east-1.prod.boltdns.net/v1/static/5530036772001/1bbc44f6-c63c-4981-82fa-46b0c1f891e0/5c1ca44a-a033-4e98-b531-ff24c4947608/160x90/match/image.jpg',
+ 'duration': 350.165,
+ 'timestamp': 1633627291,
+ 'uploader_id': '5530036772001',
+ 'tags': ['club/canada'],
+ 'is_live': False,
+ 'upload_date': '20211007',
+ 'filesize_approx': 255193528.83200002
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://www.whitecapsfc.com/video/highlights-san-jose-earthquakes-vs-vancouver-whitecaps-fc-october-23-2021#highlights-san-jose-earthquakes-vs-vancouver-whitecaps-fc-october-23-2021',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.torontofc.ca/video/highlights-toronto-fc-vs-cf-montreal-october-23-2021-x6733#highlights-toronto-fc-vs-cf-montreal-october-23-2021-x6733',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.sportingkc.com/video/post-match-press-conference-john-pulskamp-oct-27-2021#post-match-press-conference-john-pulskamp-oct-27-2021',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.soundersfc.com/video/highlights-seattle-sounders-fc-vs-sporting-kansas-city-october-23-2021',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.sjearthquakes.com/video/#highlights-austin-fc-vs-san-jose-earthquakes-june-19-2021',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.rsl.com/video/2021-u-of-u-health-mic-d-up-vs-colorado-10-16-21#2021-u-of-u-health-mic-d-up-vs-colorado-10-16-21',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.timbers.com/video/highlights-d-chara-asprilla-with-goals-in-portland-timbers-2-0-win-over-san-jose#highlights-d-chara-asprilla-with-goals-in-portland-timbers-2-0-win-over-san-jose',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.philadelphiaunion.com/video/highlights-torvphi',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.orlandocitysc.com/video/highlight-columbus-crew-vs-orlando-city-sc',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.newyorkredbulls.com/video/all-access-matchday-double-derby-week#all-access-matchday-double-derby-week',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.nycfc.com/video/highlights-nycfc-1-0-chicago-fire-fc#highlights-nycfc-1-0-chicago-fire-fc',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.revolutionsoccer.net/video/two-minute-highlights-revs-1-rapids-0-october-27-2021#two-minute-highlights-revs-1-rapids-0-october-27-2021',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.nashvillesc.com/video/goal-c-j-sapong-nashville-sc-92nd-minute',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.cfmontreal.com/video/faits-saillants-tor-v-mtl#faits-saillants-orl-v-mtl-x5645',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.intermiamicf.com/video/all-access-victory-vs-nashville-sc-by-ukg#all-access-victory-vs-nashville-sc-by-ukg',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.lagalaxy.com/video/#moment-of-the-month-presented-by-san-manuel-casino-rayan-raveloson-scores-his-se',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.lafc.com/video/breaking-down-lafc-s-final-6-matches-of-the-2021-mls-regular-season#breaking-down-lafc-s-final-6-matches-of-the-2021-mls-regular-season',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.houstondynamofc.com/video/postgame-press-conference-michael-nelson-presented-by-coushatta-casino-res-x9660#postgame-press-conference-michael-nelson-presented-by-coushatta-casino-res-x9660',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.dcunited.com/video/tony-alfaro-my-family-pushed-me-to-believe-everything-was-possible',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.fcdallas.com/video/highlights-fc-dallas-vs-minnesota-united-fc-october-02-2021#highlights-fc-dallas-vs-minnesota-united-fc-october-02-2021',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.columbuscrew.com/video/match-rewind-columbus-crew-vs-new-york-red-bulls-october-23-2021',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.coloradorapids.com/video/postgame-reaction-robin-fraser-october-27#postgame-reaction-robin-fraser-october-27',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.fccincinnati.com/video/#keeping-cincy-chill-presented-by-coors-lite',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.chicagofirefc.com/video/all-access-fire-score-dramatic-road-win-in-cincy#all-access-fire-score-dramatic-road-win-in-cincy',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.austinfc.com/video/highlights-colorado-rapids-vs-austin-fc-september-29-2021#highlights-colorado-rapids-vs-austin-fc-september-29-2021',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.atlutd.com/video/goal-josef-martinez-scores-in-the-73rd-minute#goal-josef-martinez-scores-in-the-73rd-minute',
+ 'only_matching': True
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ data_json = self._parse_json(self._html_search_regex(r'data-options\=\"([^\"]+)\"', webpage, 'json'), id)['videoList'][0]
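+ # data-options carries the Brightcove account and video ids; playback is delegated to the BrightcoveNew extractor via the standard player URL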
+ return {
+ 'id': id,
+ '_type': 'url',
+ 'url': 'https://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (data_json['accountId'], data_json['videoId']),
+ 'ie_key': 'BrightcoveNew',
+ }
diff --git a/hypervideo_dl/extractor/mojvideo.py b/hypervideo_dl/extractor/mojvideo.py
index 0421f3f..16d9405 100644
--- a/hypervideo_dl/extractor/mojvideo.py
+++ b/hypervideo_dl/extractor/mojvideo.py
@@ -38,8 +38,7 @@ class MojvideoIE(InfoExtractor):
r'<errordesc>([^<]*)</errordesc>', playerapi, 'error description', fatal=False)
raise ExtractorError('%s said: %s' % (self.IE_NAME, error_desc), expected=True)
- title = self._html_search_regex(
- r'<title>([^<]+)</title>', playerapi, 'title')
+ title = self._html_extract_title(playerapi)
video_url = self._html_search_regex(
r'<file>([^<]+)</file>', playerapi, 'video URL')
thumbnail = self._html_search_regex(
diff --git a/hypervideo_dl/extractor/mtv.py b/hypervideo_dl/extractor/mtv.py
index e060884..be5de0a 100644
--- a/hypervideo_dl/extractor/mtv.py
+++ b/hypervideo_dl/extractor/mtv.py
@@ -15,6 +15,7 @@ from ..utils import (
float_or_none,
HEADRequest,
int_or_none,
+ join_nonempty,
RegexNotFoundError,
sanitized_Request,
strip_or_none,
@@ -99,9 +100,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
formats.extend([{
'ext': 'flv' if rtmp_video_url.startswith('rtmp') else ext,
'url': rtmp_video_url,
- 'format_id': '-'.join(filter(None, [
+ 'format_id': join_nonempty(
'rtmp' if rtmp_video_url.startswith('rtmp') else None,
- rendition.get('bitrate')])),
+ rendition.get('bitrate')),
'width': int(rendition.get('width')),
'height': int(rendition.get('height')),
}])
@@ -311,11 +312,17 @@ class MTVServicesInfoExtractor(InfoExtractor):
main_container = self._extract_child_with_type(data, 'MainContainer')
ab_testing = self._extract_child_with_type(main_container, 'ABTesting')
video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer')
- mgid = video_player['props']['media']['video']['config']['uri']
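+ # Newer pages nest the player under FlexWrapper/AuthSuiteWrapper instead of a top-level VideoPlayer component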
+ mgid = None
+ if video_player:
+ mgid = try_get(video_player, lambda x: x['props']['media']['video']['config']['uri'])
+ else:
+ flex_wrapper = self._extract_child_with_type(ab_testing or main_container, 'FlexWrapper')
+ auth_suite_wrapper = self._extract_child_with_type(flex_wrapper, 'AuthSuiteWrapper')
+ player = self._extract_child_with_type(auth_suite_wrapper or flex_wrapper, 'Player')
+ if player:
+ mgid = try_get(player, lambda x: x['props']['videoDetail']['mgid'])
if not mgid:
- mgid = self._search_regex(
- r'"media":{"video":{"config":{"uri":"(mgid:.*?)"', webpage, 'mgid', default=None)
+ raise ExtractorError('Could not extract mgid')
return mgid
diff --git a/hypervideo_dl/extractor/muenchentv.py b/hypervideo_dl/extractor/muenchentv.py
index d256236..a53929e 100644
--- a/hypervideo_dl/extractor/muenchentv.py
+++ b/hypervideo_dl/extractor/muenchentv.py
@@ -33,7 +33,7 @@ class MuenchenTVIE(InfoExtractor):
display_id = 'live'
webpage = self._download_webpage(url, display_id)
- title = self._live_title(self._og_search_title(webpage))
+ title = self._og_search_title(webpage)
data_js = self._search_regex(
r'(?s)\nplaylist:\s*(\[.*?}\]),',
diff --git a/hypervideo_dl/extractor/murrtube.py b/hypervideo_dl/extractor/murrtube.py
new file mode 100644
index 0000000..1eb5de6
--- /dev/null
+++ b/hypervideo_dl/extractor/murrtube.py
@@ -0,0 +1,165 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import functools
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ OnDemandPagedList,
+ determine_ext,
+ int_or_none,
+ try_get,
+)
+
+
+class MurrtubeIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ (?:
+ murrtube:|
+ https?://murrtube\.net/videos/(?P<slug>[a-z0-9\-]+)\-
+ )
+ (?P<id>[a-f0-9]{8}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{12})
+ '''
+ _TEST = {
+ 'url': 'https://murrtube.net/videos/inferno-x-skyler-148b6f2a-fdcc-4902-affe-9c0f41aaaca0',
+ 'md5': '169f494812d9a90914b42978e73aa690',
+ 'info_dict': {
+ 'id': '148b6f2a-fdcc-4902-affe-9c0f41aaaca0',
+ 'ext': 'mp4',
+ 'title': 'Inferno X Skyler',
+ 'description': 'Humping a very good slutty sheppy (roomate)',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 284,
+ 'uploader': 'Inferno Wolf',
+ 'age_limit': 18,
+ 'comment_count': int,
+ 'view_count': int,
+ 'like_count': int,
+ 'tags': ['hump', 'breed', 'Fursuit', 'murrsuit', 'bareback'],
+ }
+ }
+
+ def _download_gql(self, video_id, op, note=None, fatal=True):
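+ # POST the GraphQL operation as JSON and unwrap the top-level "data" envelope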
+ result = self._download_json(
+ 'https://murrtube.net/graphql',
+ video_id, note, data=json.dumps(op).encode(), fatal=fatal,
+ headers={'Content-Type': 'application/json'})
+ return result['data']
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ data = self._download_gql(video_id, {
+ 'operationName': 'Medium',
+ 'variables': {
+ 'id': video_id,
+ },
+ 'query': '''\
+query Medium($id: ID!) {
+ medium(id: $id) {
+ title
+ description
+ key
+ duration
+ commentsCount
+ likesCount
+ viewsCount
+ thumbnailKey
+ tagList
+ user {
+ name
+ __typename
+ }
+ __typename
+ }
+}'''})
+ meta = data['medium']
+
+ storage_url = 'https://storage.murrtube.net/murrtube/'
+ format_url = storage_url + meta.get('key', '')
+ thumbnail = storage_url + meta.get('thumbnailKey', '')
+
+ if determine_ext(format_url) == 'm3u8':
+ formats = self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', entry_protocol='m3u8_native', fatal=False)
+ else:
+ formats = [{'url': format_url}]
+
+ return {
+ 'id': video_id,
+ 'title': meta.get('title'),
+ 'description': meta.get('description'),
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'duration': int_or_none(meta.get('duration')),
+ 'uploader': try_get(meta, lambda x: x['user']['name']),
+ 'view_count': meta.get('viewsCount'),
+ 'like_count': meta.get('likesCount'),
+ 'comment_count': meta.get('commentsCount'),
+ 'tags': meta.get('tagList'),
+ 'age_limit': 18,
+ }
+
+
+class MurrtubeUserIE(MurrtubeIE):
+ IE_DESC = 'Murrtube user profile'
+ _VALID_URL = r'https?://murrtube\.net/(?P<id>[^/]+)$'
+ _TEST = {
+ 'url': 'https://murrtube.net/stormy',
+ 'info_dict': {
+ 'id': 'stormy',
+ },
+ 'playlist_mincount': 27,
+ }
+ _PAGE_SIZE = 10
+
+ def _fetch_page(self, username, user_id, page):
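+ # Offset-based pagination: page N maps to offset N * _PAGE_SIZE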
+ data = self._download_gql(username, {
+ 'operationName': 'Media',
+ 'variables': {
+ 'limit': self._PAGE_SIZE,
+ 'offset': page * self._PAGE_SIZE,
+ 'sort': 'latest',
+ 'userId': user_id,
+ },
+ 'query': '''\
+query Media($q: String, $sort: String, $userId: ID, $offset: Int!, $limit: Int!) {
+ media(q: $q, sort: $sort, userId: $userId, offset: $offset, limit: $limit) {
+ id
+ __typename
+ }
+}'''},
+ f'Downloading page {page + 1}')
+ if data is None:
+ raise ExtractorError(f'Failed to retrieve video list for page {page + 1}')
+
+ media = data['media']
+
+ for entry in media:
+ yield self.url_result(f'murrtube:{entry["id"]}', MurrtubeIE.ie_key())
+
+ def _real_extract(self, url):
+ username = self._match_id(url)
+ data = self._download_gql(username, {
+ 'operationName': 'User',
+ 'variables': {
+ 'id': username,
+ },
+ 'query': '''\
+query User($id: ID!) {
+ user(id: $id) {
+ id
+ __typename
+ }
+}'''},
+ 'Downloading user info')
+ if data is None:
+ raise ExtractorError('Failed to fetch user info')
+
+ user = data['user']
+
+ entries = OnDemandPagedList(functools.partial(
+ self._fetch_page, username, user.get('id')), self._PAGE_SIZE)
+
+ return self.playlist_result(entries, username)
diff --git a/hypervideo_dl/extractor/musescore.py b/hypervideo_dl/extractor/musescore.py
index dcd2638..09fadf8 100644
--- a/hypervideo_dl/extractor/musescore.py
+++ b/hypervideo_dl/extractor/musescore.py
@@ -5,7 +5,7 @@ from .common import InfoExtractor
class MuseScoreIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)(?:www\.)?musescore\.com/(?:user/\d+|[^/]+)(?:/scores)?/(?P<id>[^#&?]+)'
+ _VALID_URL = r'https?://(?:www\.)?musescore\.com/(?:user/\d+|[^/]+)(?:/scores)?/(?P<id>[^#&?]+)'
_TESTS = [{
'url': 'https://musescore.com/user/73797/scores/142975',
'info_dict': {
@@ -13,7 +13,7 @@ class MuseScoreIE(InfoExtractor):
'ext': 'mp3',
'title': 'WA Mozart Marche Turque (Turkish March fingered)',
'description': 'md5:7ede08230e4eaabd67a4a98bb54d07be',
- 'thumbnail': r're:(?:https?://)(?:www\.)?musescore\.com/.*\.png[^$]+',
+ 'thumbnail': r're:https?://(?:www\.)?musescore\.com/.*\.png[^$]+',
'uploader': 'PapyPiano',
'creator': 'Wolfgang Amadeus Mozart',
}
@@ -24,7 +24,7 @@ class MuseScoreIE(InfoExtractor):
'ext': 'mp3',
'title': 'Sweet Child O\' Mine – Guns N\' Roses sweet child',
'description': 'md5:4dca71191c14abc312a0a4192492eace',
- 'thumbnail': r're:(?:https?://)(?:www\.)?musescore\.com/.*\.png[^$]+',
+ 'thumbnail': r're:https?://(?:www\.)?musescore\.com/.*\.png[^$]+',
'uploader': 'roxbelviolin',
'creator': 'Guns N´Roses Arr. Roxbel Violin',
}
@@ -35,7 +35,7 @@ class MuseScoreIE(InfoExtractor):
'ext': 'mp3',
'title': 'Für Elise – Beethoven',
'description': 'md5:49515a3556d5ecaf9fa4b2514064ac34',
- 'thumbnail': r're:(?:https?://)(?:www\.)?musescore\.com/.*\.png[^$]+',
+ 'thumbnail': r're:https?://(?:www\.)?musescore\.com/.*\.png[^$]+',
'uploader': 'ClassicMan',
'creator': 'Ludwig van Beethoven (1770–1827)',
}
diff --git a/hypervideo_dl/extractor/musicdex.py b/hypervideo_dl/extractor/musicdex.py
new file mode 100644
index 0000000..05f7220
--- /dev/null
+++ b/hypervideo_dl/extractor/musicdex.py
@@ -0,0 +1,175 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ date_from_str,
+ format_field,
+ try_get,
+ unified_strdate,
+)
+
+
+class MusicdexBaseIE(InfoExtractor):
+ def _return_info(self, track_json, album_json, id):
+ return {
+ 'id': str(id),
+ 'title': track_json.get('name'),
+ 'track': track_json.get('name'),
+ 'description': track_json.get('description'),
+ 'track_number': track_json.get('number'),
+ 'url': format_field(track_json, 'url', 'https://www.musicdex.org/%s'),
+ 'duration': track_json.get('duration'),
+ 'genre': [genre.get('name') for genre in track_json.get('genres') or []],
+ 'like_count': track_json.get('likes_count'),
+ 'view_count': track_json.get('plays'),
+ 'artist': [artist.get('name') for artist in track_json.get('artists') or []],
+ 'album_artist': [artist.get('name') for artist in album_json.get('artists') or []],
+ 'thumbnail': format_field(album_json, 'image', 'https://www.musicdex.org/%s'),
+ 'album': album_json.get('name'),
+ 'release_year': try_get(album_json, lambda x: date_from_str(unified_strdate(x['release_date'])).year),
+ 'extractor_key': MusicdexSongIE.ie_key(),
+ 'extractor': 'MusicdexSong',
+ }
+
+
+class MusicdexSongIE(MusicdexBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?musicdex\.org/track/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://www.musicdex.org/track/306/dual-existence',
+ 'info_dict': {
+ 'id': '306',
+ 'ext': 'mp3',
+ 'title': 'dual existence',
+ 'description': '#NIPPONSEI @ IRC.RIZON.NET',
+ 'track': 'dual existence',
+ 'track_number': 1,
+ 'duration': 266000,
+ 'genre': ['Anime'],
+ 'like_count': int,
+ 'view_count': int,
+ 'artist': ['fripSide'],
+ 'album_artist': ['fripSide'],
+ 'thumbnail': 'https://www.musicdex.org/storage/album/9iDIam1DHTVqUG4UclFIEq1WAFGXfPW4y0TtZa91.png',
+ 'album': 'To Aru Kagaku no Railgun T OP2 Single - dual existence',
+ 'release_year': 2020
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ data_json = self._download_json(f'https://www.musicdex.org/secure/tracks/{id}?defaultRelations=true', id)['track']
+ return self._return_info(data_json, data_json.get('album') or {}, id)
+
+
+class MusicdexAlbumIE(MusicdexBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?musicdex\.org/album/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://www.musicdex.org/album/56/tenmon-and-eiichiro-yanagi-minori/ef-a-tale-of-memories-original-soundtrack-2-fortissimo',
+ 'playlist_mincount': 28,
+ 'info_dict': {
+ 'id': '56',
+ 'genre': ['OST'],
+ 'view_count': int,
+ 'artist': ['TENMON & Eiichiro Yanagi / minori'],
+ 'title': 'ef - a tale of memories Original Soundtrack 2 ~fortissimo~',
+ 'release_year': 2008,
+ 'thumbnail': 'https://www.musicdex.org/storage/album/2rSHkyYBYfB7sbvElpEyTMcUn6toY7AohOgJuDlE.jpg',
+ },
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ data_json = self._download_json(f'https://www.musicdex.org/secure/albums/{id}?defaultRelations=true', id)['album']
+ entries = [self._return_info(track, data_json, track['id']) for track in data_json.get('tracks') or [] if track.get('id')]
+
+ return {
+ '_type': 'playlist',
+ 'id': id,
+ 'title': data_json.get('name'),
+ 'description': data_json.get('description'),
+ 'genre': [genre.get('name') for genre in data_json.get('genres') or []],
+ 'view_count': data_json.get('plays'),
+ 'artist': [artist.get('name') for artist in data_json.get('artists') or []],
+ 'thumbnail': format_field(data_json, 'image', 'https://www.musicdex.org/%s'),
+ 'release_year': try_get(data_json, lambda x: date_from_str(unified_strdate(x['release_date'])).year),
+ 'entries': entries,
+ }
+
+
+class MusicdexPageIE(MusicdexBaseIE):
+ def _entries(self, id):
+ next_page_url = self._API_URL % id
+ while next_page_url:
+ data_json = self._download_json(next_page_url, id)['pagination']
+ for data in data_json.get('data') or []:
+ yield data
+ next_page_url = data_json.get('next_page_url')
+
+
+class MusicdexArtistIE(MusicdexPageIE):
+ _VALID_URL = r'https?://(?:www\.)?musicdex\.org/artist/(?P<id>\d+)'
+ _API_URL = 'https://www.musicdex.org/secure/artists/%s/albums?page=1'
+
+ _TESTS = [{
+ 'url': 'https://www.musicdex.org/artist/11/fripside',
+ 'playlist_mincount': 28,
+ 'info_dict': {
+ 'id': '11',
+ 'view_count': int,
+ 'title': 'fripSide',
+ 'thumbnail': 'https://www.musicdex.org/storage/artist/ZmOz0lN2vsweegB660em3xWffCjLPmTQHqJls5Xx.jpg',
+ },
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ data_json = self._download_json(f'https://www.musicdex.org/secure/artists/{id}', id)['artist']
+ entries = []
+ for album in self._entries(id):
+ entries.extend(self._return_info(track, album, track['id']) for track in album.get('tracks') or [] if track.get('id'))
+
+ return {
+ '_type': 'playlist',
+ 'id': id,
+ 'title': data_json.get('name'),
+ 'view_count': data_json.get('plays'),
+ 'thumbnail': format_field(data_json, 'image_small', 'https://www.musicdex.org/%s'),
+ 'entries': entries,
+ }
+
+
+class MusicdexPlaylistIE(MusicdexPageIE):
+ _VALID_URL = r'https?://(?:www\.)?musicdex\.org/playlist/(?P<id>\d+)'
+ _API_URL = 'https://www.musicdex.org/secure/playlists/%s/tracks?perPage=10000&page=1'
+
+ _TESTS = [{
+ 'url': 'https://www.musicdex.org/playlist/9/test',
+ 'playlist_mincount': 73,
+ 'info_dict': {
+ 'id': '9',
+ 'view_count': int,
+ 'title': 'Test',
+ 'thumbnail': 'https://www.musicdex.org/storage/album/jXATI79f0IbQ2sgsKYOYRCW3zRwF3XsfHhzITCuJ.jpg',
+ 'description': 'Test 123 123 21312 32121321321321312',
+ },
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ data_json = self._download_json(f'https://www.musicdex.org/secure/playlists/{id}', id)['playlist']
+ entries = [self._return_info(track, track.get('album') or {}, track['id'])
+ for track in self._entries(id) or [] if track.get('id')]
+
+ return {
+ '_type': 'playlist',
+ 'id': id,
+ 'title': data_json.get('name'),
+ 'description': data_json.get('description'),
+ 'view_count': data_json.get('plays'),
+ 'thumbnail': format_field(data_json, 'image', 'https://www.musicdex.org/%s'),
+ 'entries': entries,
+ }
diff --git a/hypervideo_dl/extractor/mxplayer.py b/hypervideo_dl/extractor/mxplayer.py
index 5874556..3c2afd8 100644
--- a/hypervideo_dl/extractor/mxplayer.py
+++ b/hypervideo_dl/extractor/mxplayer.py
@@ -180,7 +180,7 @@ class MxplayerIE(InfoExtractor):
class MxplayerShowIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)(?:www\.)?mxplayer\.in/show/(?P<display_id>[-\w]+)-(?P<id>\w+)/?(?:$|[#?])'
+ _VALID_URL = r'https?://(?:www\.)?mxplayer\.in/show/(?P<display_id>[-\w]+)-(?P<id>\w+)/?(?:$|[#?])'
_TESTS = [{
'url': 'https://www.mxplayer.in/show/watch-chakravartin-ashoka-samrat-series-online-a8f44e3cc0814b5601d17772cedf5417',
'playlist_mincount': 440,
diff --git a/hypervideo_dl/extractor/myspass.py b/hypervideo_dl/extractor/myspass.py
index db7ebc9..1775d5f 100644
--- a/hypervideo_dl/extractor/myspass.py
+++ b/hypervideo_dl/extractor/myspass.py
@@ -1,8 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
@@ -13,33 +11,74 @@ from ..utils import (
class MySpassIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?myspass\.de/([^/]+/)*(?P<id>\d+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?myspass\.de/(?:[^/]+/)*(?P<id>\d+)/?[^/]*$'
+ _TESTS = [{
'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/',
'md5': '0b49f4844a068f8b33f4b7c88405862b',
'info_dict': {
'id': '11741',
'ext': 'mp4',
- 'description': 'Wer kann in die Fußstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?',
+ 'description': 'md5:9f0db5044c8fe73f528a390498f7ce9b',
'title': '17.02.2013 - Die Highlights, Teil 2',
+ 'thumbnail': r're:.*\.jpg',
+ 'duration': 323.0,
+ 'episode': '17.02.2013 - Die Highlights, Teil 2',
+ 'season_id': '544',
+ 'episode_number': 1,
+ 'series': 'Absolute Mehrheit',
+ 'season_number': 2,
+ 'season': 'Season 2',
+ },
+ },
+ {
+ 'url': 'https://www.myspass.de/shows/tvshows/tv-total/Novak-Puffovic-bei-bester-Laune--/44996/',
+ 'md5': 'eb28b7c5e254192046e86ebaf7deac8f',
+ 'info_dict': {
+ 'id': '44996',
+ 'ext': 'mp4',
+ 'description': 'md5:74c7f886e00834417f1e427ab0da6121',
+ 'title': 'Novak Puffovic bei bester Laune',
+ 'thumbnail': r're:.*\.jpg',
+ 'episode_number': 8,
+ 'episode': 'Novak Puffovic bei bester Laune',
+ 'series': 'TV total',
+ 'season': 'Season 19',
+ 'season_id': '987',
+ 'duration': 2941.0,
+ 'season_number': 19,
+ },
+ },
+ {
+ 'url': 'https://www.myspass.de/channels/tv-total-raabigramm/17033/20831/',
+ 'md5': '7b293a6b9f3a7acdd29304c8d0dbb7cc',
+ 'info_dict': {
+ 'id': '20831',
+ 'ext': 'mp4',
+ 'description': 'Gefühle pur: Schaut euch die ungeschnittene Version von Stefans Liebesbeweis an die Moderationsgrazie von Welt, Verona Feldbusch, an.',
+ 'title': 'Raabigramm Verona Feldbusch',
+ 'thumbnail': r're:.*\.jpg',
+ 'episode_number': 6,
+ 'episode': 'Raabigramm Verona Feldbusch',
+ 'series': 'TV total',
+ 'season': 'Season 1',
+ 'season_id': '34',
+ 'duration': 105.0,
+ 'season_number': 1,
},
- }
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
- metadata = self._download_xml(
- 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=' + video_id,
- video_id)
+ metadata = self._download_xml('http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=' + video_id, video_id)
title = xpath_text(metadata, 'title', fatal=True)
video_url = xpath_text(metadata, 'url_flv', 'download url', True)
video_id_int = int(video_id)
- for group in re.search(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url).groups():
+ for group in self._search_regex(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url, 'myspass', group=(1, 2, 3), default=[]):
group_int = int(group)
if group_int > video_id_int:
- video_url = video_url.replace(
- group, compat_str(group_int // video_id_int))
+ video_url = video_url.replace(group, compat_str(group_int // video_id_int))
return {
'id': video_id,
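
A minimal standalone sketch of the URL fix-up loop above: any numeric path component in the /myspass2009/ CDN path that is larger than the video id gets replaced by its integer quotient with the video id. The sample URL below is invented to show the arithmetic.

    import re

    def fix_video_url(video_url, video_id):
        video_id_int = int(video_id)
        m = re.search(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url)
        for group in m.groups() if m else []:
            group_int = int(group)
            if group_int > video_id_int:
                # e.g. 117410000 // 11741 == 10000, so the oversized component shrinks
                video_url = video_url.replace(group, str(group_int // video_id_int))
        return video_url

    # fix_video_url('http://cdn.example/myspass2009/544/117410000/2/11741/video.mp4', '11741')
    # -> 'http://cdn.example/myspass2009/544/10000/2/11741/video.mp4'
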
diff --git a/hypervideo_dl/extractor/n1.py b/hypervideo_dl/extractor/n1.py
index 7a09c67..fdb7f32 100644
--- a/hypervideo_dl/extractor/n1.py
+++ b/hypervideo_dl/extractor/n1.py
@@ -3,8 +3,6 @@ from __future__ import unicode_literals
import re
-from .youtube import YoutubeIE
-from .reddit import RedditRIE
from .common import InfoExtractor
from ..utils import (
unified_timestamp,
@@ -40,7 +38,7 @@ class N1InfoAssetIE(InfoExtractor):
class N1InfoIIE(InfoExtractor):
IE_NAME = 'N1Info:article'
- _VALID_URL = r'https?://(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)/(?:[^/]+/){1,2}(?P<id>[^/]+)'
+ _VALID_URL = r'https?://(?:(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)|nova\.rs)/(?:[^/]+/){1,2}(?P<id>[^/]+)'
_TESTS = [{
# Youtube embedded
'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/',
@@ -90,10 +88,18 @@ class N1InfoIIE(InfoExtractor):
'uploader': 'YouLotWhatDontStop',
},
'params': {
- 'format': 'bestvideo',
'skip_download': True,
},
}, {
+ 'url': 'https://nova.rs/vesti/politika/zaklina-tatalovic-ani-brnabic-pricate-lazi-video/',
+ 'info_dict': {
+ 'id': 'tnjganabrnabicizaklinatatalovic100danavladegp-novas-worldwide',
+ 'ext': 'mp4',
+ 'title': 'Žaklina Tatalović Ani Brnabić: Pričate laži (VIDEO)',
+ 'upload_date': '20211102',
+ 'timestamp': 1635861677,
+ },
+ }, {
'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/',
'only_matching': True,
}]
@@ -116,16 +122,16 @@ class N1InfoIIE(InfoExtractor):
'title': title,
'thumbnail': video_data.get('data-thumbnail'),
'timestamp': timestamp,
- 'ie_key': N1InfoAssetIE.ie_key()})
+ 'ie_key': 'N1InfoAsset'})
embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage)
for embedded_video in embedded_videos:
video_data = extract_attributes(embedded_video)
- url = video_data.get('src')
+ url = video_data.get('src') or ''
if url.startswith('https://www.youtube.com'):
- entries.append(self.url_result(url, ie=YoutubeIE.ie_key()))
+ entries.append(self.url_result(url, ie='Youtube'))
elif url.startswith('https://www.redditmedia.com'):
- entries.append(self.url_result(url, ie=RedditRIE.ie_key()))
+ entries.append(self.url_result(url, ie='RedditR'))
return {
'_type': 'playlist',
diff --git a/hypervideo_dl/extractor/nate.py b/hypervideo_dl/extractor/nate.py
new file mode 100644
index 0000000..072faf6
--- /dev/null
+++ b/hypervideo_dl/extractor/nate.py
@@ -0,0 +1,124 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ str_or_none,
+ traverse_obj,
+ unified_strdate,
+)
+
+
+class NateIE(InfoExtractor):
+ _VALID_URL = r'https?://tv\.nate\.com/clip/(?P<id>[0-9]+)'
+
+ _TESTS = [{
+ 'url': 'https://tv.nate.com/clip/1848976',
+ 'info_dict': {
+ 'id': '1848976',
+ 'ext': 'mp4',
+ 'title': '[결승 오프닝 타이틀] 2018 LCK 서머 스플릿 결승전 kt Rolster VS Griffin',
+ 'description': 'md5:e1b79a7dcf0d8d586443f11366f50e6f',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'upload_date': '20180908',
+ 'age_limit': 15,
+ 'duration': 73,
+ 'uploader': '2018 LCK 서머 스플릿(롤챔스)',
+ 'channel': '2018 LCK 서머 스플릿(롤챔스)',
+ 'channel_id': '3606',
+ 'uploader_id': '3606',
+ 'tags': 'count:59',
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://tv.nate.com/clip/4300566',
+ 'info_dict': {
+ 'id': '4300566',
+ 'ext': 'mp4',
+ 'title': '[심쿵엔딩] 이준호x이세영, 서로를 기억하며 끌어안는 두 사람!💕, MBC 211204 방송',
+ 'description': 'md5:be1653502d9c13ce344ddf7828e089fa',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'upload_date': '20211204',
+ 'age_limit': 15,
+ 'duration': 201,
+ 'uploader': '옷소매 붉은 끝동',
+ 'channel': '옷소매 붉은 끝동',
+ 'channel_id': '27987',
+ 'uploader_id': '27987',
+ 'tags': 'count:20',
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ _QUALITY = {
+ '36': 2160,
+ '35': 1080,
+ '34': 720,
+ '33': 480,
+ '32': 360,
+ '31': 270,
+ }
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ video_data = self._download_json(f'https://tv.nate.com/api/v1/clip/{id}', id)
+ formats = [{
+ 'format_id': f_url[-2:],
+ 'url': f_url,
+ 'height': self._QUALITY.get(f_url[-2:]),
+ 'quality': int_or_none(f_url[-2:]),
+ } for f_url in video_data.get('smcUriList') or []]
+ self._sort_formats(formats)
+ return {
+ 'id': id,
+ 'title': video_data.get('clipTitle'),
+ 'description': video_data.get('synopsis'),
+ 'thumbnail': video_data.get('contentImg'),
+ 'upload_date': unified_strdate(traverse_obj(video_data, 'broadDate', 'regDate')),
+ 'age_limit': video_data.get('targetAge'),
+ 'duration': video_data.get('playTime'),
+ 'formats': formats,
+ 'uploader': video_data.get('programTitle'),
+ 'channel': video_data.get('programTitle'),
+ 'channel_id': str_or_none(video_data.get('programSeq')),
+ 'uploader_id': str_or_none(video_data.get('programSeq')),
+ 'tags': video_data['hashTag'].split(',') if video_data.get('hashTag') else None,
+ }
+
+
+class NateProgramIE(InfoExtractor):
+ _VALID_URL = r'https?://tv\.nate\.com/program/clips/(?P<id>[0-9]+)'
+
+ _TESTS = [{
+ 'url': 'https://tv.nate.com/program/clips/27987',
+ 'playlist_mincount': 191,
+ 'info_dict': {
+ 'id': '27987',
+ },
+ }, {
+ 'url': 'https://tv.nate.com/program/clips/3606',
+ 'playlist_mincount': 15,
+ 'info_dict': {
+ 'id': '3606',
+ },
+ }]
+
+ def _entries(self, id):
+ for page_num in itertools.count(1):
+ program_data = self._download_json(f'https://tv.nate.com/api/v1/program/{id}/clip/ranking?size=20&page={page_num}',
+ id, note=f'Downloading page {page_num}')
+ for clip in program_data.get('content') or []:
+ clip_id = clip.get('clipSeq')
+ if clip_id:
+ yield self.url_result(
+ 'https://tv.nate.com/clip/%s' % clip_id,
+ ie=NateIE.ie_key(), video_id=clip_id)
+ if program_data.get('last'):
+ break
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ return self.playlist_result(self._entries(id), playlist_id=id)
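
For reference, the `_QUALITY` table in NateIE above keys off the last two characters of each stream URL. A toy reconstruction of the format-building step (the URL below is hypothetical):

    _QUALITY = {'36': 2160, '35': 1080, '34': 720, '33': 480, '32': 360, '31': 270}

    def build_formats(smc_uri_list):
        return [{
            'format_id': url[-2:],
            'url': url,
            'height': _QUALITY.get(url[-2:]),  # None for unknown quality codes
            'quality': int(url[-2:]) if url[-2:].isdigit() else None,
        } for url in smc_uri_list or []]

    # build_formats(['https://smc.example.invalid/1848976_35'])
    # -> [{'format_id': '35', 'url': ..., 'height': 1080, 'quality': 35}]
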
diff --git a/hypervideo_dl/extractor/naver.py b/hypervideo_dl/extractor/naver.py
index acf53c1..a6821ba 100644
--- a/hypervideo_dl/extractor/naver.py
+++ b/hypervideo_dl/extractor/naver.py
@@ -40,6 +40,7 @@ class NaverBaseIE(InfoExtractor):
formats.append({
'format_id': '%s_%s' % (stream.get('type') or stream_type, dict_get(encoding_option, ('name', 'id'))),
'url': stream_url,
+ 'ext': 'mp4',
'width': int_or_none(encoding_option.get('width')),
'height': int_or_none(encoding_option.get('height')),
'vbr': int_or_none(bitrate.get('video')),
@@ -174,7 +175,7 @@ class NaverLiveIE(InfoExtractor):
'url': 'https://tv.naver.com/l/52010',
'info_dict': {
'id': '52010',
- 'ext': 'm3u8',
+ 'ext': 'mp4',
'title': '[LIVE] 뉴스특보 : "수도권 거리두기, 2주간 2단계로 조정"',
'description': 'md5:df7f0c237a5ed5e786ce5c91efbeaab3',
'channel_id': 'NTV-ytnnews24-0',
@@ -184,7 +185,7 @@ class NaverLiveIE(InfoExtractor):
'url': 'https://tv.naver.com/l/51549',
'info_dict': {
'id': '51549',
- 'ext': 'm3u8',
+ 'ext': 'mp4',
'title': '연합뉴스TV - 코로나19 뉴스특보',
'description': 'md5:c655e82091bc21e413f549c0eaccc481',
'channel_id': 'NTV-yonhapnewstv-0',
@@ -233,7 +234,7 @@ class NaverLiveIE(InfoExtractor):
continue
formats.extend(self._extract_m3u8_formats(
- quality.get('url'), video_id, 'm3u8',
+ quality.get('url'), video_id, 'mp4',
m3u8_id=quality.get('qualityId'), live=True
))
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/nba.py b/hypervideo_dl/extractor/nba.py
index 7390ef8..359cc52 100644
--- a/hypervideo_dl/extractor/nba.py
+++ b/hypervideo_dl/extractor/nba.py
@@ -165,14 +165,10 @@ class NBAWatchIE(NBAWatchBaseIE):
def _real_extract(self, url):
display_id = self._match_id(url)
collection_id = parse_qs(url).get('collection', [None])[0]
- if collection_id:
- if self.get_param('noplaylist'):
- self.to_screen('Downloading just video %s because of --no-playlist' % display_id)
- else:
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % collection_id)
- return self.url_result(
- 'https://www.nba.com/watch/list/collection/' + collection_id,
- NBAWatchCollectionIE.ie_key(), collection_id)
+ if self._yes_playlist(collection_id, display_id):
+ return self.url_result(
+ 'https://www.nba.com/watch/list/collection/' + collection_id,
+ NBAWatchCollectionIE.ie_key(), collection_id)
return self._extract_video('seoName', display_id)
diff --git a/hypervideo_dl/extractor/nbc.py b/hypervideo_dl/extractor/nbc.py
index f304f19..1094034 100644
--- a/hypervideo_dl/extractor/nbc.py
+++ b/hypervideo_dl/extractor/nbc.py
@@ -197,9 +197,12 @@ class NBCSportsVPlayerIE(InfoExtractor):
'timestamp': 1426270238,
'upload_date': '20150313',
'uploader': 'NBCU-SPORTS',
+ 'duration': 72.818,
+ 'chapters': [],
+ 'thumbnail': r're:^https?://.*\.jpg$'
}
}, {
- 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z',
+ 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/PEgOtlNcC_y2',
'only_matching': True,
}, {
'url': 'https://www.nbcsports.com/vplayer/p/BxmELC/nbcsports/select/PHJSaFWbrTY9?form=html&autoPlay=true',
@@ -208,16 +211,15 @@ class NBCSportsVPlayerIE(InfoExtractor):
@staticmethod
def _extract_url(webpage):
- iframe_m = re.search(
- r'<(?:iframe[^>]+|div[^>]+data-(?:mpx-)?)src="(?P<url>%s[^"]+)"' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage)
- if iframe_m:
- return iframe_m.group('url')
+ video_urls = re.search(
+ r'(?:iframe[^>]+|var video|div[^>]+data-(?:mpx-)?)[sS]rc\s?=\s?"(?P<url>%s[^\"]+)' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage)
+ if video_urls:
+ return video_urls.group('url')
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- theplatform_url = self._og_search_video_url(webpage).replace(
- 'vplayer.nbcsports.com', 'player.theplatform.com')
+ theplatform_url = self._html_search_regex(r'tp:releaseUrl="(.+?)"', webpage, 'url')
return self.url_result(theplatform_url, 'ThePlatform')
@@ -235,6 +237,9 @@ class NBCSportsIE(InfoExtractor):
'uploader': 'NBCU-SPORTS',
'upload_date': '20150330',
'timestamp': 1427726529,
+ 'chapters': [],
+ 'thumbnail': 'https://hdliveextra-a.akamaihd.net/HD/image_sports/NBCU_Sports_Group_-_nbcsports/253/303/izzodps.jpg',
+ 'duration': 528.395,
}
}, {
# data-mpx-src
@@ -305,7 +310,7 @@ class NBCSportsStreamIE(AdobePassIE):
self._sort_formats(formats)
return {
'id': video_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'description': live_source.get('description'),
'formats': formats,
'is_live': is_live,
@@ -403,9 +408,7 @@ class NBCNewsIE(ThePlatformIE):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- data = self._parse_json(self._search_regex(
- r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
- webpage, 'bootstrap json'), video_id)['props']['initialState']
+ data = self._search_nextjs_data(webpage, video_id)['props']['initialState']
video_data = try_get(data, lambda x: x['video']['current'], dict)
if not video_data:
video_data = data['article']['content'][0]['primaryMedia']['video']
@@ -545,8 +548,6 @@ class NBCOlympicsStreamIE(AdobePassIE):
title = event_config['eventTitle']
is_live = {'live': True, 'replay': False}.get(event_config.get('eventStatus'))
- if is_live:
- title = self._live_title(title)
source_url = self._download_json(
f'https://api-leap.nbcsports.com/feeds/assets/{pid}?application=NBCOlympics&platform=desktop&format=nbc-player&env=staging',
diff --git a/hypervideo_dl/extractor/ndr.py b/hypervideo_dl/extractor/ndr.py
index f2bae2c..1917254 100644
--- a/hypervideo_dl/extractor/ndr.py
+++ b/hypervideo_dl/extractor/ndr.py
@@ -245,8 +245,6 @@ class NDREmbedBaseIE(InfoExtractor):
live = playlist.get('config', {}).get('streamType') in ['httpVideoLive', 'httpAudioLive']
title = config['title']
- if live:
- title = self._live_title(title)
uploader = ppjson.get('config', {}).get('branding')
upload_date = ppjson.get('config', {}).get('publicationDate')
duration = int_or_none(config.get('duration'))
diff --git a/hypervideo_dl/extractor/nebula.py b/hypervideo_dl/extractor/nebula.py
index 9698a35..77f2535 100644
--- a/hypervideo_dl/extractor/nebula.py
+++ b/hypervideo_dl/extractor/nebula.py
@@ -1,22 +1,161 @@
# coding: utf-8
from __future__ import unicode_literals
+import itertools
import json
import time
+import urllib
-from urllib.error import HTTPError
-from .common import InfoExtractor
-from ..compat import compat_str, compat_urllib_parse_unquote, compat_urllib_parse_quote
from ..utils import (
ExtractorError,
parse_iso8601,
try_get,
- urljoin,
)
+from .common import InfoExtractor
+
+
+class NebulaBaseIE(InfoExtractor):
+ _NETRC_MACHINE = 'watchnebula'
+
+ _nebula_api_token = None
+ _nebula_bearer_token = None
+ _zype_access_token = None
+
+ def _perform_nebula_auth(self):
+ username, password = self._get_login_info()
+ if not (username and password):
+ self.raise_login_required()
+
+ data = json.dumps({'email': username, 'password': password}).encode('utf8')
+ response = self._download_json(
+ 'https://api.watchnebula.com/api/v1/auth/login/',
+ data=data, fatal=False, video_id=None,
+ headers={
+ 'content-type': 'application/json',
+ # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint
+ 'cookie': ''
+ },
+ note='Logging in to Nebula with supplied credentials',
+ errnote='Authentication failed or rejected')
+ if not response or not response.get('key'):
+ self.raise_login_required()
+
+ # save nebula token as cookie
+ self._set_cookie(
+ 'nebula.app', 'nebula-auth',
+ urllib.parse.quote(
+ json.dumps({
+ "apiToken": response["key"],
+ "isLoggingIn": False,
+ "isLoggingOut": False,
+ }, separators=(",", ":"))),
+ expire_time=int(time.time()) + 86400 * 365,
+ )
+
+ return response['key']
+
+ def _retrieve_nebula_api_token(self):
+ """
+ Check cookie jar for valid token. Try to authenticate using credentials if no valid token
+ can be found in the cookie jar.
+ """
+ nebula_cookies = self._get_cookies('https://nebula.app')
+ nebula_cookie = nebula_cookies.get('nebula-auth')
+ if nebula_cookie:
+ self.to_screen('Authenticating to Nebula with token from cookie jar')
+ nebula_cookie_value = urllib.parse.unquote(nebula_cookie.value)
+ nebula_api_token = self._parse_json(nebula_cookie_value, None).get('apiToken')
+ if nebula_api_token:
+ return nebula_api_token
+
+ return self._perform_nebula_auth()
+
+ def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''):
+ assert method in ('GET', 'POST',)
+ assert auth_type in ('api', 'bearer',)
+
+ def inner_call():
+ authorization = f'Token {self._nebula_api_token}' if auth_type == 'api' else f'Bearer {self._nebula_bearer_token}'
+ return self._download_json(
+ url, video_id, note=note, headers={'Authorization': authorization},
+ data=b'' if method == 'POST' else None)
+
+ try:
+ return inner_call()
+ except ExtractorError as exc:
+ # if 401 or 403, attempt credential re-auth and retry
+ if exc.cause and isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403):
+ self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}')
+ self._perform_login()
+ return inner_call()
+ else:
+ raise
+
+ def _fetch_nebula_bearer_token(self):
+ """
+        Get a Bearer token for the Nebula API. This will be required to fetch video metadata.
+ """
+ response = self._call_nebula_api('https://api.watchnebula.com/api/v1/authorization/',
+ method='POST',
+ note='Authorizing to Nebula')
+ return response['token']
+
+ def _fetch_zype_access_token(self):
+ """
+ Get a Zype access token, which is required to access video streams -- in our case: to
+ generate video URLs.
+ """
+ user_object = self._call_nebula_api('https://api.watchnebula.com/api/v1/auth/user/', note='Retrieving Zype access token')
+
+ access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], str)
+ if not access_token:
+ if try_get(user_object, lambda x: x['is_subscribed'], bool):
+ # TODO: Reimplement the same Zype token polling the Nebula frontend implements
+ # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532
+ raise ExtractorError(
+ 'Unable to extract Zype access token from Nebula API authentication endpoint. '
+ 'Open an arbitrary video in a browser with this account to generate a token',
+ expected=True)
+ raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint')
+ return access_token
+
+ def _build_video_info(self, episode):
+ zype_id = episode['zype_id']
+ zype_video_url = f'https://player.zype.com/embed/{zype_id}.html?access_token={self._zype_access_token}'
+ channel_slug = episode['channel_slug']
+ return {
+ 'id': episode['zype_id'],
+ 'display_id': episode['slug'],
+ '_type': 'url_transparent',
+ 'ie_key': 'Zype',
+ 'url': zype_video_url,
+ 'title': episode['title'],
+ 'description': episode['description'],
+ 'timestamp': parse_iso8601(episode['published_at']),
+ 'thumbnails': [{
+ # 'id': tn.get('name'), # this appears to be null
+ 'url': tn['original'],
+ 'height': key,
+ } for key, tn in episode['assets']['thumbnail'].items()],
+ 'duration': episode['duration'],
+ 'channel': episode['channel_title'],
+ 'channel_id': channel_slug,
+ 'channel_url': f'https://nebula.app/{channel_slug}',
+ 'uploader': episode['channel_title'],
+ 'uploader_id': channel_slug,
+ 'uploader_url': f'https://nebula.app/{channel_slug}',
+ 'series': episode['channel_title'],
+ 'creator': episode['channel_title'],
+ }
+ def _perform_login(self, username=None, password=None):
+ # FIXME: username should be passed from here to inner functions
+ self._nebula_api_token = self._retrieve_nebula_api_token()
+ self._nebula_bearer_token = self._fetch_nebula_bearer_token()
+ self._zype_access_token = self._fetch_zype_access_token()
-class NebulaIE(InfoExtractor):
+class NebulaIE(NebulaBaseIE):
_VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/videos/(?P<id>[-\w]+)'
_TESTS = [
{
@@ -30,12 +169,13 @@ class NebulaIE(InfoExtractor):
'upload_date': '20180731',
'timestamp': 1533009600,
'channel': 'Lindsay Ellis',
+ 'channel_id': 'lindsayellis',
'uploader': 'Lindsay Ellis',
+ 'uploader_id': 'lindsayellis',
},
'params': {
'usenetrc': True,
},
- 'skip': 'All Nebula content requires authentication',
},
{
'url': 'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
@@ -47,13 +187,14 @@ class NebulaIE(InfoExtractor):
'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
'upload_date': '20200327',
'timestamp': 1585348140,
- 'channel': 'The Logistics of D-Day',
- 'uploader': 'The Logistics of D-Day',
+ 'channel': 'Real Engineering',
+ 'channel_id': 'realengineering',
+ 'uploader': 'Real Engineering',
+ 'uploader_id': 'realengineering',
},
'params': {
'usenetrc': True,
},
- 'skip': 'All Nebula content requires authentication',
},
{
'url': 'https://nebula.app/videos/money-episode-1-the-draw',
@@ -66,173 +207,82 @@ class NebulaIE(InfoExtractor):
'upload_date': '20200323',
'timestamp': 1584980400,
'channel': 'Tom Scott Presents: Money',
+ 'channel_id': 'tom-scott-presents-money',
'uploader': 'Tom Scott Presents: Money',
+ 'uploader_id': 'tom-scott-presents-money',
},
'params': {
'usenetrc': True,
},
- 'skip': 'All Nebula content requires authentication',
},
{
'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
'only_matching': True,
},
]
- _NETRC_MACHINE = 'watchnebula'
- _nebula_token = None
+ def _fetch_video_metadata(self, slug):
+ return self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/',
+ video_id=slug,
+ auth_type='bearer',
+                                     note='Fetching video metadata')
- def _retrieve_nebula_auth(self):
- """
- Log in to Nebula, and returns a Nebula API token
- """
+ def _real_extract(self, url):
+ slug = self._match_id(url)
+ video = self._fetch_video_metadata(slug)
+ return self._build_video_info(video)
- username, password = self._get_login_info()
- if not (username and password):
- self.raise_login_required()
- self.report_login()
- data = json.dumps({'email': username, 'password': password}).encode('utf8')
- response = self._download_json(
- 'https://api.watchnebula.com/api/v1/auth/login/',
- data=data, fatal=False, video_id=None,
- headers={
- 'content-type': 'application/json',
- # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint
- 'cookie': ''
+class NebulaCollectionIE(NebulaBaseIE):
+ IE_NAME = 'nebula:collection'
+ _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/(?!videos/)(?P<id>[-\w]+)'
+ _TESTS = [
+ {
+ 'url': 'https://nebula.app/tom-scott-presents-money',
+ 'info_dict': {
+ 'id': 'tom-scott-presents-money',
+ 'title': 'Tom Scott Presents: Money',
+ 'description': 'Tom Scott hosts a series all about trust, negotiation and money.',
},
- note='Authenticating to Nebula with supplied credentials',
- errnote='Authentication failed or rejected')
- if not response or not response.get('key'):
- self.raise_login_required()
-
- # save nebula token as cookie
- self._set_cookie(
- 'nebula.app', 'nebula-auth',
- compat_urllib_parse_quote(
- json.dumps({
- "apiToken": response["key"],
- "isLoggingIn": False,
- "isLoggingOut": False,
- }, separators=(",", ":"))),
- expire_time=int(time.time()) + 86400 * 365,
- )
-
- return response['key']
-
- def _retrieve_zype_api_key(self, page_url, display_id):
- """
- Retrieves the Zype API key
- """
-
- # Find the js that has the API key from the webpage and download it
- webpage = self._download_webpage(page_url, video_id=display_id)
- main_script_relpath = self._search_regex(
- r'<script[^>]*src="(?P<script_relpath>[^"]*main.[0-9a-f]*.chunk.js)"[^>]*>', webpage,
- group='script_relpath', name='script relative path', fatal=True)
- main_script_abspath = urljoin(page_url, main_script_relpath)
- main_script = self._download_webpage(main_script_abspath, video_id=display_id,
- note='Retrieving Zype API key')
-
- api_key = self._search_regex(
- r'REACT_APP_ZYPE_API_KEY\s*:\s*"(?P<api_key>[\w-]*)"', main_script,
- group='api_key', name='API key', fatal=True)
-
- return api_key
-
- def _call_zype_api(self, path, params, video_id, api_key, note):
- """
- A helper for making calls to the Zype API.
- """
- query = {'api_key': api_key, 'per_page': 1}
- query.update(params)
- return self._download_json('https://api.zype.com' + path, video_id, query=query, note=note)
-
- def _call_nebula_api(self, path, video_id, access_token, note):
- """
- A helper for making calls to the Nebula API.
- """
- return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={
- 'Authorization': 'Token {access_token}'.format(access_token=access_token)
- }, note=note)
-
- def _fetch_zype_access_token(self, video_id):
- try:
- user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token')
- except ExtractorError as exc:
- # if 401, attempt credential auth and retry
- if exc.cause and isinstance(exc.cause, HTTPError) and exc.cause.code == 401:
- self._nebula_token = self._retrieve_nebula_auth()
- user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token')
- else:
- raise
-
- access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str)
- if not access_token:
- if try_get(user_object, lambda x: x['is_subscribed'], bool):
- # TODO: Reimplement the same Zype token polling the Nebula frontend implements
- # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532
- raise ExtractorError(
- 'Unable to extract Zype access token from Nebula API authentication endpoint. '
- 'Open an arbitrary video in a browser with this account to generate a token',
- expected=True)
- raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint')
- return access_token
-
- def _extract_channel_title(self, video_meta):
- # TODO: Implement the API calls giving us the channel list,
- # so that we can do the title lookup and then figure out the channel URL
- categories = video_meta.get('categories', []) if video_meta else []
- # the channel name is the value of the first category
- for category in categories:
- if category.get('value'):
- return category['value'][0]
-
- def _real_initialize(self):
- # check cookie jar for valid token
- nebula_cookies = self._get_cookies('https://nebula.app')
- nebula_cookie = nebula_cookies.get('nebula-auth')
- if nebula_cookie:
- self.to_screen('Authenticating to Nebula with token from cookie jar')
- nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value)
- self._nebula_token = self._parse_json(nebula_cookie_value, None).get('apiToken')
+ 'playlist_count': 5,
+ 'params': {
+ 'usenetrc': True,
+ },
+ }, {
+ 'url': 'https://nebula.app/lindsayellis',
+ 'info_dict': {
+ 'id': 'lindsayellis',
+ 'title': 'Lindsay Ellis',
+ 'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.',
+ },
+ 'playlist_mincount': 100,
+ 'params': {
+ 'usenetrc': True,
+ },
+ },
+ ]
- # try to authenticate using credentials if no valid token has been found
- if not self._nebula_token:
- self._nebula_token = self._retrieve_nebula_auth()
+ def _generate_playlist_entries(self, collection_id, channel):
+ episodes = channel['episodes']['results']
+ for page_num in itertools.count(2):
+ for episode in episodes:
+ yield self._build_video_info(episode)
+ next_url = channel['episodes']['next']
+ if not next_url:
+ break
+ channel = self._call_nebula_api(next_url, collection_id, auth_type='bearer',
+ note=f'Retrieving channel page {page_num}')
+ episodes = channel['episodes']['results']
def _real_extract(self, url):
- display_id = self._match_id(url)
- api_key = self._retrieve_zype_api_key(url, display_id)
-
- response = self._call_zype_api('/videos', {'friendly_title': display_id},
- display_id, api_key, note='Retrieving metadata from Zype')
- if len(response.get('response') or []) != 1:
- raise ExtractorError('Unable to find video on Zype API')
- video_meta = response['response'][0]
-
- video_id = video_meta['_id']
- zype_access_token = self._fetch_zype_access_token(display_id)
+ collection_id = self._match_id(url)
+ channel_url = f'https://content.watchnebula.com/video/channels/{collection_id}/'
+ channel = self._call_nebula_api(channel_url, collection_id, auth_type='bearer', note='Retrieving channel')
+ channel_details = channel['details']
- channel_title = self._extract_channel_title(video_meta)
-
- return {
- 'id': video_id,
- 'display_id': display_id,
- '_type': 'url_transparent',
- 'ie_key': 'Zype',
- 'url': 'https://player.zype.com/embed/%s.html?access_token=%s' % (video_id, zype_access_token),
- 'title': video_meta.get('title'),
- 'description': video_meta.get('description'),
- 'timestamp': parse_iso8601(video_meta.get('published_at')),
- 'thumbnails': [{
- 'id': tn.get('name'), # this appears to be null
- 'url': tn['url'],
- 'width': tn.get('width'),
- 'height': tn.get('height'),
- } for tn in video_meta.get('thumbnails', [])],
- 'duration': video_meta.get('duration'),
- 'channel': channel_title,
- 'uploader': channel_title, # we chose uploader = channel name
- # TODO: uploader_url, channel_id, channel_url
- }
+ return self.playlist_result(
+ entries=self._generate_playlist_entries(collection_id, channel),
+ playlist_id=collection_id,
+ playlist_title=channel_details['title'],
+ playlist_description=channel_details['description']
+ )
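
The rewritten extractor chains three tokens. A condensed sketch of that chain, using the endpoints from the code above but with the third-party requests library standing in for the extractor's _download_json, and with the cookie caching and 401/403 retry logic omitted:

    import requests

    def nebula_tokens(email, password):
        # 1. long-lived API token from credentials
        api_token = requests.post(
            'https://api.watchnebula.com/api/v1/auth/login/',
            json={'email': email, 'password': password}).json()['key']
        auth = {'Authorization': f'Token {api_token}'}
        # 2. short-lived Bearer token for video metadata calls
        bearer = requests.post(
            'https://api.watchnebula.com/api/v1/authorization/',
            headers=auth).json()['token']
        # 3. Zype access token, needed to build playable video URLs
        zype = requests.get(
            'https://api.watchnebula.com/api/v1/auth/user/',
            headers=auth).json()['zype_auth_info']['access_token']
        return api_token, bearer, zype
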
diff --git a/hypervideo_dl/extractor/neteasemusic.py b/hypervideo_dl/extractor/neteasemusic.py
index 7652371..57b4774 100644
--- a/hypervideo_dl/extractor/neteasemusic.py
+++ b/hypervideo_dl/extractor/neteasemusic.py
@@ -405,17 +405,12 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE):
name = info['name']
description = info['description']
- if not info['songs'] or self.get_param('noplaylist'):
- if info['songs']:
- self.to_screen(
- 'Downloading just the main audio %s because of --no-playlist'
- % info['mainSong']['id'])
-
+ if not self._yes_playlist(info['songs'] and program_id, info['mainSong']['id']):
formats = self.extract_formats(info['mainSong'])
self._sort_formats(formats)
return {
- 'id': program_id,
+ 'id': info['mainSong']['id'],
'title': name,
'description': description,
'creator': info['dj']['brand'],
@@ -425,10 +420,6 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE):
'formats': formats,
}
- self.to_screen(
- 'Downloading playlist %s - add --no-playlist to just download the main audio %s'
- % (program_id, info['mainSong']['id']))
-
song_ids = [info['mainSong']['id']]
song_ids.extend([song['id'] for song in info['songs']])
entries = [
diff --git a/hypervideo_dl/extractor/newgrounds.py b/hypervideo_dl/extractor/newgrounds.py
index bbbd9e8..6525a6d 100644
--- a/hypervideo_dl/extractor/newgrounds.py
+++ b/hypervideo_dl/extractor/newgrounds.py
@@ -6,7 +6,9 @@ import re
from .common import InfoExtractor
from ..utils import (
+ clean_html,
extract_attributes,
+ get_element_by_id,
int_or_none,
parse_count,
parse_duration,
@@ -29,7 +31,8 @@ class NewgroundsIE(InfoExtractor):
'timestamp': 1378878540,
'upload_date': '20130911',
'duration': 143,
- 'description': 'md5:6d885138814015dfd656c2ddb00dacfc',
+ 'view_count': int,
+ 'description': 'md5:b8b3c2958875189f07d8e313462e8c4f',
},
}, {
'url': 'https://www.newgrounds.com/portal/view/1',
@@ -41,6 +44,7 @@ class NewgroundsIE(InfoExtractor):
'uploader': 'Brian-Beaton',
'timestamp': 955064100,
'upload_date': '20000406',
+ 'view_count': int,
'description': 'Scrotum plays "catch."',
'age_limit': 17,
},
@@ -54,7 +58,8 @@ class NewgroundsIE(InfoExtractor):
'uploader': 'ZONE-SAMA',
'timestamp': 1487965140,
'upload_date': '20170224',
- 'description': 'ZTV News Episode 8 (February 2017)',
+ 'view_count': int,
+ 'description': 'md5:aff9b330ec2e78ed93b1ad6d017accc6',
'age_limit': 17,
},
'params': {
@@ -70,7 +75,8 @@ class NewgroundsIE(InfoExtractor):
'uploader': 'Egoraptor',
'timestamp': 1140663240,
'upload_date': '20060223',
- 'description': 'Metal Gear is awesome is so is this movie.',
+ 'view_count': int,
+ 'description': 'md5:9246c181614e23754571995104da92e0',
'age_limit': 13,
}
}, {
@@ -80,7 +86,7 @@ class NewgroundsIE(InfoExtractor):
'id': '297383',
'ext': 'swf',
'title': 'Metal Gear Awesome',
- 'description': 'Metal Gear is awesome is so is this movie.',
+ 'description': 'Metal Gear Awesome',
'uploader': 'Egoraptor',
'upload_date': '20060223',
'timestamp': 1140663240,
@@ -100,8 +106,7 @@ class NewgroundsIE(InfoExtractor):
uploader = None
webpage = self._download_webpage(url, media_id)
- title = self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
media_url_string = self._search_regex(
r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None)
@@ -145,10 +150,13 @@ class NewgroundsIE(InfoExtractor):
(r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+</dd>\s*<dd>[^<]+)',
r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+)'), webpage, 'timestamp',
default=None))
+
duration = parse_duration(self._html_search_regex(
r'"duration"\s*:\s*["\']?(\d+)["\']?', webpage,
'duration', default=None))
+ description = clean_html(get_element_by_id('author_comments', webpage)) or self._og_search_description(webpage)
+
view_count = parse_count(self._html_search_regex(
r'(?s)<dt>\s*(?:Views|Listens)\s*</dt>\s*<dd>([\d\.,]+)</dd>', webpage,
'view count', default=None))
@@ -177,7 +185,7 @@ class NewgroundsIE(InfoExtractor):
'duration': duration,
'formats': formats,
'thumbnail': self._og_search_thumbnail(webpage),
- 'description': self._og_search_description(webpage),
+ 'description': description,
'age_limit': age_limit,
'view_count': view_count,
}
@@ -210,8 +218,7 @@ class NewgroundsPlaylistIE(InfoExtractor):
webpage = self._download_webpage(url, playlist_id)
- title = self._search_regex(
- r'<title>([^>]+)</title>', webpage, 'title', default=None)
+ title = self._html_extract_title(webpage, default=None)
# cut left menu
webpage = self._search_regex(
diff --git a/hypervideo_dl/extractor/newstube.py b/hypervideo_dl/extractor/newstube.py
index dab4aec..479141a 100644
--- a/hypervideo_dl/extractor/newstube.py
+++ b/hypervideo_dl/extractor/newstube.py
@@ -5,11 +5,9 @@ import base64
import hashlib
from .common import InfoExtractor
-from ..aes import aes_cbc_decrypt
+from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
from ..utils import (
- bytes_to_intlist,
int_or_none,
- intlist_to_bytes,
parse_codecs,
parse_duration,
)
@@ -47,10 +45,8 @@ class NewstubeIE(InfoExtractor):
}))
key = hashlib.pbkdf2_hmac(
'sha1', video_guid.replace('-', '').encode(), enc_data[:16], 1)[:16]
- dec_data = aes_cbc_decrypt(
- bytes_to_intlist(enc_data[32:]), bytes_to_intlist(key),
- bytes_to_intlist(enc_data[16:32]))
- sources = self._parse_json(intlist_to_bytes(dec_data[:-dec_data[-1]]), video_guid)
+ dec_data = unpad_pkcs7(aes_cbc_decrypt_bytes(enc_data[32:], key, enc_data[16:32]))
+ sources = self._parse_json(dec_data, video_guid)
formats = []
for source in sources:
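
The switch from the intlist-based helpers to aes_cbc_decrypt_bytes/unpad_pkcs7 does not change the scheme: enc_data is salt (16 bytes), IV (16 bytes), then ciphertext, keyed by one PBKDF2-SHA1 round over the dash-stripped GUID. An equivalent sketch using the third-party cryptography package in place of hypervideo's AES helpers:

    import hashlib
    from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes

    def decrypt_sources(video_guid: str, enc_data: bytes) -> bytes:
        # AES-128 key: 1 round of PBKDF2-SHA1, salt = first 16 bytes
        key = hashlib.pbkdf2_hmac(
            'sha1', video_guid.replace('-', '').encode(), enc_data[:16], 1)[:16]
        decryptor = Cipher(algorithms.AES(key), modes.CBC(enc_data[16:32])).decryptor()
        padded = decryptor.update(enc_data[32:]) + decryptor.finalize()
        return padded[:-padded[-1]]  # strip PKCS#7 padding
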
diff --git a/hypervideo_dl/extractor/newsy.py b/hypervideo_dl/extractor/newsy.py
new file mode 100644
index 0000000..cf31641
--- /dev/null
+++ b/hypervideo_dl/extractor/newsy.py
@@ -0,0 +1,51 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ js_to_json,
+ merge_dicts,
+)
+
+
+class NewsyIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?newsy\.com/stories/(?P<id>[^/?#$&]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.newsy.com/stories/nft-trend-leads-to-fraudulent-art-auctions/',
+ 'info_dict': {
+ 'id': '609d65125b086c24fb529312',
+ 'ext': 'mp4',
+ 'title': 'NFT Art Auctions Have A Piracy Problem',
+ 'description': 'md5:971e52ab8bc97e50305475cde8284c83',
+ 'display_id': 'nft-trend-leads-to-fraudulent-art-auctions',
+ 'timestamp': 1621339200,
+ 'duration': 339630,
+ 'thumbnail': 'https://cdn.newsy.com/images/videos/x/1620927824_xyrrP4.jpg',
+ 'upload_date': '20210518'
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ data_json = self._parse_json(self._html_search_regex(
+ r'data-video-player\s?=\s?"({[^"]+})">', webpage, 'data'), display_id, js_to_json)
+ ld_json = self._search_json_ld(webpage, display_id, fatal=False)
+
+ formats, subtitles = [], {}
+ if data_json.get('stream'):
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(data_json['stream'], display_id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ self._sort_formats(formats)
+ return merge_dicts(ld_json, {
+ 'id': data_json['id'],
+ 'display_id': display_id,
+ 'title': data_json.get('headline'),
+ 'duration': data_json.get('duration'),
+ 'thumbnail': data_json.get('image'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ })
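
js_to_json, used above, is hypervideo's utility for normalizing the relaxed-JavaScript object in the data-video-player attribute into strict JSON before parsing. Roughly:

    from hypervideo_dl.utils import js_to_json

    # unquoted keys get quoted, single quotes become double quotes,
    # and trailing commas are dropped:
    print(js_to_json("{id: '609d65125b086c24fb529312', duration: 339630,}"))
    # -> {"id": "609d65125b086c24fb529312", "duration": 339630}
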
diff --git a/hypervideo_dl/extractor/nexx.py b/hypervideo_dl/extractor/nexx.py
index 860d636..a521bb6 100644
--- a/hypervideo_dl/extractor/nexx.py
+++ b/hypervideo_dl/extractor/nexx.py
@@ -12,6 +12,8 @@ from ..utils import (
ExtractorError,
int_or_none,
parse_duration,
+ srt_subtitles_timecode,
+ traverse_obj,
try_get,
urlencode_postdata,
)
@@ -20,7 +22,7 @@ from ..utils import (
class NexxIE(InfoExtractor):
_VALID_URL = r'''(?x)
(?:
- https?://api\.nexx(?:\.cloud|cdn\.com)/v3/(?P<domain_id>\d+)/videos/byid/|
+ https?://api\.nexx(?:\.cloud|cdn\.com)/v3(?:\.\d)?/(?P<domain_id>\d+)/videos/byid/|
nexx:(?:(?P<domain_id_s>\d+):)?|
https?://arc\.nexx\.cloud/api/video/
)
@@ -42,35 +44,37 @@ class NexxIE(InfoExtractor):
'timestamp': 1384264416,
'upload_date': '20131112',
},
+ 'skip': 'Spiegel nexx CDNs are now disabled'
}, {
- # episode
- 'url': 'https://api.nexx.cloud/v3/741/videos/byid/247858',
+ # episode with captions
+ 'url': 'https://api.nexx.cloud/v3.1/741/videos/byid/1701834',
'info_dict': {
- 'id': '247858',
+ 'id': '1701834',
'ext': 'mp4',
- 'title': 'Return of the Golden Child (OV)',
- 'description': 'md5:5d969537509a92b733de21bae249dc63',
- 'release_year': 2017,
+ 'title': 'Mein Leben mit \'nem TikTok E-Boy 😤',
+ 'alt_title': 'Mein Leben mit \'nem TikTok E-Boy 😤',
+ 'description': 'md5:f84f395a881fd143f952c892deab528d',
'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 1397,
- 'timestamp': 1495033267,
- 'upload_date': '20170517',
+ 'duration': 770,
+ 'timestamp': 1595600027,
+ 'upload_date': '20200724',
'episode_number': 2,
'season_number': 2,
+ 'episode': 'Episode 2',
+ 'season': 'Season 2',
},
'params': {
'skip_download': True,
},
- 'skip': 'HTTP Error 404: Not Found',
}, {
- # does not work via arc
'url': 'nexx:741:1269984',
- 'md5': 'c714b5b238b2958dc8d5642addba6886',
+ 'md5': 'd5f14e14b592501e51addd5abef95a7f',
'info_dict': {
'id': '1269984',
'ext': 'mp4',
- 'title': '1 TAG ohne KLO... wortwörtlich! 😑',
- 'alt_title': '1 TAG ohne KLO... wortwörtlich! 😑',
+ 'title': '1 TAG ohne KLO... wortwörtlich! ?',
+ 'alt_title': '1 TAG ohne KLO... wortwörtlich! ?',
+ 'description': 'md5:2016393a31991a900946432ccdd09a6f',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 607,
'timestamp': 1518614955,
@@ -91,6 +95,7 @@ class NexxIE(InfoExtractor):
'timestamp': 1527874460,
'upload_date': '20180601',
},
+ 'skip': 'Spiegel nexx CDNs are now disabled'
}, {
'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907',
'only_matching': True,
@@ -138,6 +143,8 @@ class NexxIE(InfoExtractor):
return NexxIE._extract_urls(webpage)[0]
def _handle_error(self, response):
+ if traverse_obj(response, ('metadata', 'notice'), expected_type=str):
+ self.report_warning('%s said: %s' % (self.IE_NAME, response['metadata']['notice']))
status = int_or_none(try_get(
response, lambda x: x['metadata']['status']) or 200)
if 200 <= status < 300:
@@ -220,6 +227,65 @@ class NexxIE(InfoExtractor):
return formats
+ def _extract_3q_formats(self, video, video_id):
+ stream_data = video['streamdata']
+ cdn = stream_data['cdnType']
+ assert cdn == '3q'
+
+ q_acc, q_prefix, q_locator, q_hash = stream_data['qAccount'], stream_data['qPrefix'], stream_data['qLocator'], stream_data['qHash']
+ protection_key = traverse_obj(
+ video, ('protectiondata', 'key'), expected_type=str)
+
+ def get_cdn_shield_base(shield_type=''):
+ for secure in ('', 's'):
+ cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper()))
+ if cdn_shield:
+ return 'http%s://%s' % (secure, cdn_shield)
+ return f'http://sdn-global-{"prog" if shield_type.lower() == "prog" else "streaming"}-cache.3qsdn.com/' + (f's/{protection_key}/' if protection_key else '')
+
+ stream_base = get_cdn_shield_base()
+
+ formats = []
+ formats.extend(self._extract_m3u8_formats(
+ f'{stream_base}{q_acc}/files/{q_prefix}/{q_locator}/{q_acc}-{stream_data.get("qHEVCHash") or q_hash}.ism/manifest.m3u8',
+ video_id, 'mp4', m3u8_id=f'{cdn}-hls', fatal=False))
+ formats.extend(self._extract_mpd_formats(
+ f'{stream_base}{q_acc}/files/{q_prefix}/{q_locator}/{q_acc}-{q_hash}.ism/manifest.mpd',
+ video_id, mpd_id=f'{cdn}-dash', fatal=False))
+
+ progressive_base = get_cdn_shield_base('Prog')
+ q_references = stream_data.get('qReferences') or ''
+ fds = q_references.split(',')
+ for fd in fds:
+ ss = fd.split(':')
+ if len(ss) != 3:
+ continue
+ tbr = int_or_none(ss[1], scale=1000)
+ formats.append({
+ 'url': f'{progressive_base}{q_acc}/uploads/{q_acc}-{ss[2]}.webm',
+ 'format_id': f'{cdn}-{ss[0]}{"-%s" % tbr if tbr else ""}',
+ 'tbr': tbr,
+ })
+
+ azure_file_distribution = stream_data.get('azureFileDistribution') or ''
+ fds = azure_file_distribution.split(',')
+ for fd in fds:
+ ss = fd.split(':')
+ if len(ss) != 3:
+ continue
+ tbr = int_or_none(ss[0])
+ width, height = ss[1].split('x') if len(ss[1].split('x')) == 2 else (None, None)
+ f = {
+ 'url': f'{progressive_base}{q_acc}/files/{q_prefix}/{q_locator}/{ss[2]}.mp4',
+                'format_id': f'{cdn}-http{"-%s" % tbr if tbr else ""}',
+ 'tbr': tbr,
+ 'width': int_or_none(width),
+ 'height': int_or_none(height),
+ }
+ formats.append(f)
+
+ return formats
+
def _extract_azure_formats(self, video, video_id):
stream_data = video['streamdata']
cdn = stream_data['cdnType']
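
The azureFileDistribution string parsed in _extract_3q_formats above packs one "TBR:WIDTHxHEIGHT:FILE_ID" triple per comma-separated entry. The same parsing in isolation (sample input invented):

    def parse_azure_distribution(value):
        entries = []
        for fd in (value or '').split(','):
            ss = fd.split(':')
            if len(ss) != 3:
                continue
            dims = ss[1].split('x')
            width, height = dims if len(dims) == 2 else (None, None)
            entries.append({
                'tbr': int(ss[0]) if ss[0].isdigit() else None,
                'width': int(width) if width and width.isdigit() else None,
                'height': int(height) if height and height.isdigit() else None,
                'file_id': ss[2],
            })
        return entries

    # parse_azure_distribution('1500:1280x720:abc123,600:640x360:def456')
    # -> [{'tbr': 1500, 'width': 1280, 'height': 720, 'file_id': 'abc123'}, ...]
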
@@ -345,10 +411,11 @@ class NexxIE(InfoExtractor):
# md5( operation + domain_id + domain_secret )
# where domain_secret is a static value that will be given by nexx.tv
# as per [1]. Here is how this "secret" is generated (reversed
- # from _play.api.init function, search for clienttoken). So it's
- # actually not static and not that much of a secret.
+ # from _play._factory.data.getDomainData function, search for
+ # domaintoken or enableAPIAccess). So it's actually not static
+ # and not that much of a secret.
# 1. https://nexxtvstorage.blob.core.windows.net/files/201610/27.pdf
- secret = result['device']['clienttoken'][int(device_id[0]):]
+ secret = result['device']['domaintoken'][int(device_id[0]):]
secret = secret[0:len(secret) - int(device_id[-1])]
op = 'byid'
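
A worked example of the "secret" derivation the comment above describes, with a hypothetical device id and domaintoken: the first digit of the device id trims the token from the left, the last digit trims it from the right:

    def derive_secret(device_id: str, domaintoken: str) -> str:
        secret = domaintoken[int(device_id[0]):]
        return secret[:len(secret) - int(device_id[-1])]

    # derive_secret('37', 'abcdefghijklmnop')
    # drops 3 chars from the left and 7 from the right -> 'defghi'
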
@@ -360,15 +427,18 @@ class NexxIE(InfoExtractor):
result = self._call_api(
domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={
- 'additionalfields': 'language,channel,actors,studio,licenseby,slug,subtitle,teaser,description',
+ 'additionalfields': 'language,channel,format,licenseby,slug,fileversion,episode,season',
'addInteractionOptions': '1',
'addStatusDetails': '1',
'addStreamDetails': '1',
- 'addCaptions': '1',
+ 'addFeatures': '1',
+ # Caption format selection doesn't seem to be enforced?
+ 'addCaptions': 'vtt',
'addScenes': '1',
+ 'addChapters': '1',
'addHotSpots': '1',
+ 'addConnectedMedia': 'persons',
'addBumpers': '1',
- 'captionFormat': 'data',
}, headers={
'X-Request-CID': cid,
'X-Request-Token': request_token,
@@ -384,28 +454,48 @@ class NexxIE(InfoExtractor):
formats = self._extract_azure_formats(video, video_id)
elif cdn == 'free':
formats = self._extract_free_formats(video, video_id)
+ elif cdn == '3q':
+ formats = self._extract_3q_formats(video, video_id)
else:
- # TODO: reverse more cdns
- assert False
+ self.raise_no_formats(f'{cdn} formats are currently not supported', video_id)
self._sort_formats(formats)
+ subtitles = {}
+ for sub in video.get('captiondata') or []:
+ if sub.get('data'):
+ subtitles.setdefault(sub.get('language', 'en'), []).append({
+ 'ext': 'srt',
+ 'data': '\n\n'.join(
+ f'{i + 1}\n{srt_subtitles_timecode(line["fromms"] / 1000)} --> {srt_subtitles_timecode(line["toms"] / 1000)}\n{line["caption"]}'
+ for i, line in enumerate(sub['data'])),
+ 'name': sub.get('language_long') or sub.get('title')
+ })
+ elif sub.get('url'):
+ subtitles.setdefault(sub.get('language', 'en'), []).append({
+ 'url': sub['url'],
+ 'ext': sub.get('format'),
+ 'name': sub.get('language_long') or sub.get('title')
+ })
+
return {
'id': video_id,
'title': title,
'alt_title': general.get('subtitle'),
'description': general.get('description'),
'release_year': int_or_none(general.get('year')),
- 'creator': general.get('studio') or general.get('studio_adref'),
+ 'creator': general.get('studio') or general.get('studio_adref') or None,
'thumbnail': try_get(
video, lambda x: x['imagedata']['thumb'], compat_str),
'duration': parse_duration(general.get('runtime')),
'timestamp': int_or_none(general.get('uploaded')),
- 'episode_number': int_or_none(try_get(
- video, lambda x: x['episodedata']['episode'])),
- 'season_number': int_or_none(try_get(
- video, lambda x: x['episodedata']['season'])),
+ 'episode_number': traverse_obj(
+ video, (('episodedata', 'general'), 'episode'), expected_type=int, get_all=False),
+ 'season_number': traverse_obj(
+ video, (('episodedata', 'general'), 'season'), expected_type=int, get_all=False),
+ 'cast': traverse_obj(video, ('connectedmedia', ..., 'title'), expected_type=str),
'formats': formats,
+ 'subtitles': subtitles,
}
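
The inline-caption branch above builds an SRT document by hand. A standalone sketch, with srt_subtitles_timecode re-implemented locally (matching the HH:MM:SS,mmm SRT stamp format) so the example runs on its own:

    def srt_timecode(seconds):
        msec = int(round(seconds * 1000))
        h, msec = divmod(msec, 3600000)
        m, msec = divmod(msec, 60000)
        s, msec = divmod(msec, 1000)
        return '%02d:%02d:%02d,%03d' % (h, m, s, msec)

    def captions_to_srt(lines):
        return '\n\n'.join(
            f'{i + 1}\n{srt_timecode(line["fromms"] / 1000)} --> {srt_timecode(line["toms"] / 1000)}\n{line["caption"]}'
            for i, line in enumerate(lines))

    # captions_to_srt([{'fromms': 0, 'toms': 1500, 'caption': 'Hallo'}])
    # -> '1\n00:00:00,000 --> 00:00:01,500\nHallo'
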
@@ -427,7 +517,6 @@ class NexxEmbedIE(InfoExtractor):
'upload_date': '20140305',
},
'params': {
- 'format': 'bestvideo',
'skip_download': True,
},
}, {
diff --git a/hypervideo_dl/extractor/nfb.py b/hypervideo_dl/extractor/nfb.py
new file mode 100644
index 0000000..a12e503
--- /dev/null
+++ b/hypervideo_dl/extractor/nfb.py
@@ -0,0 +1,62 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class NFBIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?nfb\.ca/film/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://www.nfb.ca/film/trafficopter/',
+ 'info_dict': {
+ 'id': 'trafficopter',
+ 'ext': 'mp4',
+ 'title': 'Trafficopter',
+ 'description': 'md5:060228455eb85cf88785c41656776bc0',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Barrie Howells',
+ 'release_year': 1972,
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage('https://www.nfb.ca/film/%s/' % video_id, video_id)
+
+ iframe = self._html_search_regex(
+ r'<[^>]+\bid=["\']player-iframe["\'][^>]*src=["\']([^"\']+)',
+            webpage, 'iframe', fatal=True)
+ if iframe.startswith('/'):
+ iframe = f'https://www.nfb.ca{iframe}'
+
+ player = self._download_webpage(iframe, video_id)
+
+ source = self._html_search_regex(
+ r'source:\s*\'([^\']+)',
+            player, 'source', fatal=True)
+
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(source, video_id, ext='mp4')
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': self._html_search_regex(
+ r'<[^>]+\bid=["\']titleHeader["\'][^>]*>\s*<h1[^>]*>\s*([^<]+?)\s*</h1>',
+ webpage, 'title', default=None),
+ 'description': self._html_search_regex(
+ r'<[^>]+\bid=["\']tabSynopsis["\'][^>]*>\s*<p[^>]*>\s*([^<]+)',
+ webpage, 'description', default=None),
+ 'thumbnail': self._html_search_regex(
+ r'poster:\s*\'([^\']+)',
+ player, 'thumbnail', default=None),
+ 'uploader': self._html_search_regex(
+ r'<[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)',
+ webpage, 'uploader', default=None),
+ 'release_year': int_or_none(self._html_search_regex(
+ r'<[^>]+\bitemprop=["\']datePublished["\'][^>]*>([^<]+)',
+ webpage, 'release_year', default=None)),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
diff --git a/hypervideo_dl/extractor/nfl.py b/hypervideo_dl/extractor/nfl.py
index 871923e..821276a 100644
--- a/hypervideo_dl/extractor/nfl.py
+++ b/hypervideo_dl/extractor/nfl.py
@@ -89,7 +89,7 @@ class NFLBaseIE(InfoExtractor):
'ext': determine_ext(image_url, 'jpg'),
}]
info.update({
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'is_live': is_live,
'description': clean_html(item.get('description')),
'thumbnails': thumbnails,
diff --git a/hypervideo_dl/extractor/nhk.py b/hypervideo_dl/extractor/nhk.py
index 950a3d0..3b8efc3 100644
--- a/hypervideo_dl/extractor/nhk.py
+++ b/hypervideo_dl/extractor/nhk.py
@@ -1,8 +1,15 @@
from __future__ import unicode_literals
+import re
from .common import InfoExtractor
-from ..utils import urljoin
+from ..utils import (
+ parse_duration,
+ traverse_obj,
+ unescapeHTML,
+ unified_timestamp,
+ urljoin
+)
class NhkBaseIE(InfoExtractor):
@@ -73,6 +80,7 @@ class NhkBaseIE(InfoExtractor):
m3u8_id='hls', fatal=False)
for f in info['formats']:
f['language'] = lang
+ self._sort_formats(info['formats'])
else:
info.update({
'_type': 'url_transparent',
@@ -175,3 +183,145 @@ class NhkVodProgramIE(NhkBaseIE):
program_title = entries[0].get('series')
return self.playlist_result(entries, program_id, program_title)
+
+
+class NhkForSchoolBangumiIE(InfoExtractor):
+ _VALID_URL = r'https?://www2\.nhk\.or\.jp/school/movie/(?P<type>bangumi|clip)\.cgi\?das_id=(?P<id>[a-zA-Z0-9_-]+)'
+ _TESTS = [{
+ 'url': 'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id=D0005150191_00000',
+ 'info_dict': {
+ 'id': 'D0005150191_00003',
+ 'title': 'にている かな',
+ 'duration': 599.999,
+ 'timestamp': 1396414800,
+ 'upload_date': '20140402',
+ 'ext': 'mp4',
+ 'chapters': 'count:12'
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ program_type, video_id = self._match_valid_url(url).groups()
+
+ webpage = self._download_webpage(
+ f'https://www2.nhk.or.jp/school/movie/{program_type}.cgi?das_id={video_id}', video_id)
+
+ # searches all variables
+ base_values = {g.group(1): g.group(2) for g in re.finditer(r'var\s+([a-zA-Z_]+)\s*=\s*"([^"]+?)";', webpage)}
+ # and programObj values too
+ program_values = {g.group(1): g.group(3) for g in re.finditer(r'(?:program|clip)Obj\.([a-zA-Z_]+)\s*=\s*(["\'])([^"]+?)\2;', webpage)}
+ # extract all chapters
+ chapter_durations = [parse_duration(g.group(1)) for g in re.finditer(r'chapterTime\.push\(\'([0-9:]+?)\'\);', webpage)]
+ chapter_titles = [' '.join([g.group(1) or '', unescapeHTML(g.group(2))]).strip() for g in re.finditer(r'<div class="cpTitle"><span>(scene\s*\d+)?</span>([^<]+?)</div>', webpage)]
+
+        # this is what player_core.js actually does (!)
+ version = base_values.get('r_version') or program_values.get('version')
+ if version:
+ video_id = f'{video_id.split("_")[0]}_{version}'
+
+ formats = self._extract_m3u8_formats(
+ f'https://nhks-vh.akamaihd.net/i/das/{video_id[0:8]}/{video_id}_V_000.f4v/master.m3u8',
+ video_id, ext='mp4', m3u8_id='hls')
+ self._sort_formats(formats)
+
+ duration = parse_duration(base_values.get('r_duration'))
+
+ chapters = None
+ if chapter_durations and chapter_titles and len(chapter_durations) == len(chapter_titles):
+ start_time = chapter_durations
+ end_time = chapter_durations[1:] + [duration]
+ chapters = [{
+ 'start_time': s,
+ 'end_time': e,
+ 'title': t,
+ } for s, e, t in zip(start_time, end_time, chapter_titles)]
+
+ return {
+ 'id': video_id,
+ 'title': program_values.get('name'),
+            'duration': duration,
+ 'timestamp': unified_timestamp(base_values['r_upload']),
+ 'formats': formats,
+ 'chapters': chapters,
+ }
+
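
The chapter assembly in NhkForSchoolBangumiIE above, in isolation: start times come straight from the page's chapterTime entries, and each chapter ends where the next begins, with the final chapter ending at the full duration:

    def build_chapters(starts, titles, duration):
        ends = starts[1:] + [duration]
        return [{'start_time': s, 'end_time': e, 'title': t}
                for s, e, t in zip(starts, ends, titles)]

    # build_chapters([0, 60, 120], ['intro', 'scene 1', 'scene 2'], 599.999)
    # -> three chapters, the last running from 120 to 599.999
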
+
+class NhkForSchoolSubjectIE(InfoExtractor):
+    IE_DESC = 'Portal page for each school subject, like Japanese (kokugo, 国語) or math (sansuu/suugaku or 算数・数学)'
+ KNOWN_SUBJECTS = (
+ 'rika', 'syakai', 'kokugo',
+ 'sansuu', 'seikatsu', 'doutoku',
+ 'ongaku', 'taiiku', 'zukou',
+ 'gijutsu', 'katei', 'sougou',
+ 'eigo', 'tokkatsu',
+ 'tokushi', 'sonota',
+ )
+ _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>%s)/?(?:[\?#].*)?$' % '|'.join(re.escape(s) for s in KNOWN_SUBJECTS)
+
+ _TESTS = [{
+ 'url': 'https://www.nhk.or.jp/school/sougou/',
+ 'info_dict': {
+ 'id': 'sougou',
+ 'title': '総合的な学習の時間',
+ },
+ 'playlist_mincount': 16,
+ }, {
+ 'url': 'https://www.nhk.or.jp/school/rika/',
+ 'info_dict': {
+ 'id': 'rika',
+ 'title': '理科',
+ },
+ 'playlist_mincount': 15,
+ }]
+
+ def _real_extract(self, url):
+ subject_id = self._match_id(url)
+ webpage = self._download_webpage(url, subject_id)
+
+ return self.playlist_from_matches(
+ re.finditer(rf'href="((?:https?://www\.nhk\.or\.jp)?/school/{re.escape(subject_id)}/[^/]+/)"', webpage),
+ subject_id,
+ self._html_search_regex(r'(?s)<span\s+class="subjectName">\s*<img\s*[^<]+>\s*([^<]+?)</span>', webpage, 'title', fatal=False),
+ lambda g: urljoin(url, g.group(1)))
+
+
+class NhkForSchoolProgramListIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>(?:%s)/[a-zA-Z0-9_-]+)' % (
+ '|'.join(re.escape(s) for s in NhkForSchoolSubjectIE.KNOWN_SUBJECTS)
+ )
+ _TESTS = [{
+ 'url': 'https://www.nhk.or.jp/school/sougou/q/',
+ 'info_dict': {
+ 'id': 'sougou/q',
+ 'title': 'Q~こどものための哲学',
+ },
+ 'playlist_mincount': 20,
+ }]
+
+ def _real_extract(self, url):
+ program_id = self._match_id(url)
+
+ webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id)
+
+ title = (self._og_search_title(webpage)
+ or self._html_extract_title(webpage)
+ or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False))
+ title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None
+ description = self._html_search_regex(
+ r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',
+ webpage, 'description', fatal=False, group=0)
+
+ bangumi_list = self._download_json(
+ f'https://www.nhk.or.jp/school/{program_id}/meta/program.json', program_id)
+ # they're always bangumi
+ bangumis = [
+ self.url_result(f'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id={x}')
+ for x in traverse_obj(bangumi_list, ('part', ..., 'part-video-dasid')) or []]
+
+ return self.playlist_result(bangumis, program_id, title, description)
diff --git a/hypervideo_dl/extractor/niconico.py b/hypervideo_dl/extractor/niconico.py
index 76f0870..4eb6ed0 100644
--- a/hypervideo_dl/extractor/niconico.py
+++ b/hypervideo_dl/extractor/niconico.py
@@ -2,32 +2,39 @@
from __future__ import unicode_literals
import datetime
+import functools
import itertools
import json
import re
+import time
from .common import InfoExtractor, SearchInfoExtractor
-from ..postprocessor.ffmpeg import FFmpegPostProcessor
from ..compat import (
- compat_str,
compat_parse_qs,
compat_urllib_parse_urlparse,
+ compat_HTTPError,
)
from ..utils import (
ExtractorError,
- dict_get,
+ OnDemandPagedList,
+ bug_reports_message,
+ clean_html,
float_or_none,
int_or_none,
- OnDemandPagedList,
+ join_nonempty,
parse_duration,
+ parse_filesize,
parse_iso8601,
- PostProcessingError,
+ parse_resolution,
+ qualities,
remove_start,
str_or_none,
+ traverse_obj,
try_get,
- unified_timestamp,
+ unescapeHTML,
+ update_url_query,
+ url_or_none,
urlencode_postdata,
- xpath_text,
)
@@ -37,7 +44,7 @@ class NiconicoIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.nicovideo.jp/watch/sm22312215',
- 'md5': 'a5bad06f1347452102953f323c69da34s',
+ 'md5': 'd1a75c0823e2f629128c43e1212760f9',
'info_dict': {
'id': 'sm22312215',
'ext': 'mp4',
@@ -160,35 +167,42 @@ class NiconicoIE(InfoExtractor):
}, {
'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg',
'only_matching': True,
+ }, {
+        'note': 'a video that is only served as encrypted HLS',
+ 'url': 'https://www.nicovideo.jp/watch/so38016254',
+ 'only_matching': True,
}]
- _VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'
+ _VALID_URL = r'https?://(?:(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch|nico\.ms)/(?P<id>(?:[a-z]{2})?[0-9]+)'
_NETRC_MACHINE = 'niconico'
-
+ _COMMENT_API_ENDPOINTS = (
+ 'https://nvcomment.nicovideo.jp/legacy/api.json',
+ 'https://nmsg.nicovideo.jp/api.json',)
_API_HEADERS = {
'X-Frontend-ID': '6',
- 'X-Frontend-Version': '0'
+ 'X-Frontend-Version': '0',
+ 'X-Niconico-Language': 'en-us',
+ 'Referer': 'https://www.nicovideo.jp/',
+ 'Origin': 'https://www.nicovideo.jp',
}
- def _real_initialize(self):
- self._login()
-
- def _login(self):
- username, password = self._get_login_info()
- # No authentication to be performed
- if not username:
- return True
-
- # Log in
+ def _perform_login(self, username, password):
login_ok = True
login_form_strs = {
'mail_tel': username,
'password': password,
}
+ self._request_webpage(
+ 'https://account.nicovideo.jp/login', None,
+            note='Acquiring login session')
urlh = self._request_webpage(
- 'https://account.nicovideo.jp/api/v1/login', None,
+ 'https://account.nicovideo.jp/login/redirector?show_button_twitter=1&site=niconico&show_button_facebook=1', None,
note='Logging in', errnote='Unable to log in',
- data=urlencode_postdata(login_form_strs))
+ data=urlencode_postdata(login_form_strs),
+ headers={
+ 'Referer': 'https://account.nicovideo.jp/login',
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ })
if urlh is False:
login_ok = False
else:
@@ -200,8 +214,8 @@ class NiconicoIE(InfoExtractor):
return login_ok
def _get_heartbeat_info(self, info_dict):
-
video_id, video_src_id, audio_src_id = info_dict['url'].split(':')[1].split('/')
+ dmc_protocol = info_dict['_expected_protocol']
api_data = (
info_dict.get('_api_data')
@@ -216,49 +230,50 @@ class NiconicoIE(InfoExtractor):
session_api_endpoint = try_get(session_api_data, lambda x: x['urls'][0])
def ping():
- status = try_get(
- self._download_json(
- 'https://nvapi.nicovideo.jp/v1/2ab0cbaa/watch', video_id,
- query={'t': try_get(api_data, lambda x: x['media']['delivery']['trackingId'])},
- note='Acquiring permission for downloading video',
- headers=self._API_HEADERS),
- lambda x: x['meta']['status'])
- if status != 200:
- self.report_warning('Failed to acquire permission for playing video. The video may not download.')
+ tracking_id = traverse_obj(api_data, ('media', 'delivery', 'trackingId'))
+ if tracking_id:
+ tracking_url = update_url_query('https://nvapi.nicovideo.jp/v1/2ab0cbaa/watch', {'t': tracking_id})
+ watch_request_response = self._download_json(
+ tracking_url, video_id,
+ note='Acquiring permission for downloading video', fatal=False,
+ headers=self._API_HEADERS)
+ if traverse_obj(watch_request_response, ('meta', 'status')) != 200:
+ self.report_warning('Failed to acquire permission for playing video. Video download may fail.')
yesno = lambda x: 'yes' if x else 'no'
- # m3u8 (encryption)
- if try_get(api_data, lambda x: x['media']['delivery']['encryption']) is not None:
+ if dmc_protocol == 'http':
+ protocol = 'http'
+ protocol_parameters = {
+ 'http_output_download_parameters': {
+ 'use_ssl': yesno(session_api_data['urls'][0]['isSsl']),
+ 'use_well_known_port': yesno(session_api_data['urls'][0]['isWellKnownPort']),
+ }
+ }
+ elif dmc_protocol == 'hls':
protocol = 'm3u8'
- encryption = self._parse_json(session_api_data['token'], video_id)['hls_encryption']
- session_api_http_parameters = {
- 'parameters': {
- 'hls_parameters': {
- 'encryption': {
- encryption: {
- 'encrypted_key': try_get(api_data, lambda x: x['media']['delivery']['encryption']['encryptedKey']),
- 'key_uri': try_get(api_data, lambda x: x['media']['delivery']['encryption']['keyUri'])
- }
- },
- 'transfer_preset': '',
- 'use_ssl': yesno(session_api_endpoint['isSsl']),
- 'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']),
- 'segment_duration': 6000,
- }
+ segment_duration = try_get(self._configuration_arg('segment_duration'), lambda x: int(x[0])) or 6000
+ parsed_token = self._parse_json(session_api_data['token'], video_id)
+ encryption = traverse_obj(api_data, ('media', 'delivery', 'encryption'))
+ protocol_parameters = {
+ 'hls_parameters': {
+ 'segment_duration': segment_duration,
+ 'transfer_preset': '',
+ 'use_ssl': yesno(session_api_data['urls'][0]['isSsl']),
+ 'use_well_known_port': yesno(session_api_data['urls'][0]['isWellKnownPort']),
}
}
- # http
- else:
- protocol = 'http'
- session_api_http_parameters = {
- 'parameters': {
- 'http_output_download_parameters': {
- 'use_ssl': yesno(session_api_endpoint['isSsl']),
- 'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']),
+ if 'hls_encryption' in parsed_token and encryption:
+ protocol_parameters['hls_parameters']['encryption'] = {
+ parsed_token['hls_encryption']: {
+ 'encrypted_key': encryption['encryptedKey'],
+ 'key_uri': encryption['keyUri'],
}
}
- }
+ else:
+ protocol = 'm3u8_native'
+ else:
+ raise ExtractorError(f'Unsupported DMC protocol: {dmc_protocol}')
session_response = self._download_json(
session_api_endpoint['url'], video_id,
@@ -292,11 +307,13 @@ class NiconicoIE(InfoExtractor):
'lifetime': session_api_data.get('heartbeatLifetime')
}
},
- 'priority': session_api_data.get('priority'),
+ 'priority': session_api_data['priority'],
'protocol': {
'name': 'http',
'parameters': {
- 'http_parameters': session_api_http_parameters
+ 'http_parameters': {
+ 'parameters': protocol_parameters
+ }
}
},
'recipe_id': session_api_data.get('recipeId'),
@@ -324,36 +341,35 @@ class NiconicoIE(InfoExtractor):
return info_dict, heartbeat_info_dict
- def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality):
- def parse_format_id(id_code):
- mobj = re.match(r'''(?x)
- (?:archive_)?
- (?:(?P<codec>[^_]+)_)?
- (?:(?P<br>[\d]+)kbps_)?
- (?:(?P<res>[\d+]+)p_)?
- ''', '%s_' % id_code)
- return mobj.groupdict() if mobj else {}
-
- protocol = 'niconico_dmc'
- format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality]))
- vdict = parse_format_id(video_quality['id'])
- adict = parse_format_id(audio_quality['id'])
- resolution = try_get(video_quality, lambda x: x['metadata']['resolution'], dict) or {'height': vdict.get('res')}
- vbr = try_get(video_quality, lambda x: x['metadata']['bitrate'], float)
+ def _extract_format_for_quality(self, video_id, audio_quality, video_quality, dmc_protocol):
+
+ if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'):
+ return None
+
+ def extract_video_quality(video_quality):
+ return parse_filesize('%sB' % self._search_regex(
+ r'\| ([0-9]*\.?[0-9]*[MK])', video_quality, 'vbr', default=''))
+
+ format_id = '-'.join(
+ [remove_start(s['id'], 'archive_') for s in (video_quality, audio_quality)] + [dmc_protocol])
+
+ vid_qual_label = traverse_obj(video_quality, ('metadata', 'label'))
+ vid_quality = traverse_obj(video_quality, ('metadata', 'bitrate'))
return {
- 'url': '%s:%s/%s/%s' % (protocol, video_id, video_quality['id'], audio_quality['id']),
+ 'url': 'niconico_dmc:%s/%s/%s' % (video_id, video_quality['id'], audio_quality['id']),
'format_id': format_id,
- 'format_note': 'DMC %s' % try_get(video_quality, lambda x: x['metadata']['label'], compat_str),
+ 'format_note': join_nonempty('DMC', vid_qual_label, dmc_protocol.upper(), delim=' '),
            'ext': 'mp4',  # Session API is used in HTML5, which always serves mp4
- 'vcodec': vdict.get('codec'),
- 'acodec': adict.get('codec'),
- 'vbr': float_or_none(vbr, 1000) or float_or_none(vdict.get('br')),
- 'abr': float_or_none(audio_quality.get('bitrate'), 1000) or float_or_none(adict.get('br')),
- 'height': int_or_none(resolution.get('height', vdict.get('res'))),
- 'width': int_or_none(resolution.get('width')),
- 'quality': -2 if 'low' in format_id else -1, # Default quality value is -1
- 'protocol': protocol,
+ 'acodec': 'aac',
+ 'vcodec': 'h264',
+ 'abr': float_or_none(traverse_obj(audio_quality, ('metadata', 'bitrate')), 1000),
+ 'vbr': float_or_none(vid_quality if vid_quality > 0 else extract_video_quality(vid_qual_label), 1000),
+ 'height': traverse_obj(video_quality, ('metadata', 'resolution', 'height')),
+ 'width': traverse_obj(video_quality, ('metadata', 'resolution', 'width')),
+ 'quality': -2 if 'low' in video_quality['id'] else None,
+ 'protocol': 'niconico_dmc',
+ '_expected_protocol': dmc_protocol,
'http_headers': {
'Origin': 'https://www.nicovideo.jp',
'Referer': 'https://www.nicovideo.jp/watch/' + video_id,
@@ -363,251 +379,220 @@ class NiconicoIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- # Get video webpage for API data.
- webpage, handle = self._download_webpage_handle(
- 'http://www.nicovideo.jp/watch/' + video_id, video_id)
- if video_id.startswith('so'):
- video_id = self._match_id(handle.geturl())
-
- api_data = self._parse_json(self._html_search_regex(
- 'data-api-data="([^"]+)"', webpage,
- 'API data', default='{}'), video_id)
-
- def get_video_info_web(items):
- return dict_get(api_data['video'], items)
-
- # Get video info
- video_info_xml = self._download_xml(
- 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id,
- video_id, note='Downloading video info page')
-
- def get_video_info_xml(items):
- if not isinstance(items, list):
- items = [items]
- for item in items:
- ret = xpath_text(video_info_xml, './/' + item)
- if ret:
- return ret
-
- if get_video_info_xml('error'):
- error_code = get_video_info_xml('code')
-
- if error_code == 'DELETED':
- raise ExtractorError('The video has been deleted.',
- expected=True)
- elif error_code == 'NOT_FOUND':
- raise ExtractorError('The video is not found.',
- expected=True)
- elif error_code == 'COMMUNITY':
- self.to_screen('%s: The video is community members only.' % video_id)
- else:
- raise ExtractorError('%s reports error: %s' % (self.IE_NAME, error_code))
+ try:
+ webpage, handle = self._download_webpage_handle(
+ 'http://www.nicovideo.jp/watch/' + video_id, video_id)
+ if video_id.startswith('so'):
+ video_id = self._match_id(handle.geturl())
- # Start extracting video formats
- formats = []
-
- # Get HTML5 videos info
- quality_info = try_get(api_data, lambda x: x['media']['delivery']['movie'])
- if not quality_info:
- raise ExtractorError('The video can\'t be downloaded', expected=True)
-
- for audio_quality in quality_info.get('audios') or {}:
- for video_quality in quality_info.get('videos') or {}:
- if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'):
- continue
- formats.append(self._extract_format_for_quality(
- api_data, video_id, audio_quality, video_quality))
-
- # Get flv/swf info
- timestamp = None
- video_real_url = try_get(api_data, lambda x: x['video']['smileInfo']['url'])
- if video_real_url:
- is_economy = video_real_url.endswith('low')
-
- if is_economy:
- self.report_warning('Site is currently in economy mode! You will only have access to lower quality streams')
+ api_data = self._parse_json(self._html_search_regex(
+ 'data-api-data="([^"]+)"', webpage,
+ 'API data', default='{}'), video_id)
+ except ExtractorError as e:
+ try:
+ api_data = self._download_json(
+ 'https://www.nicovideo.jp/api/watch/v3/%s?_frontendId=6&_frontendVersion=0&actionTrackId=AAAAAAAAAA_%d' % (video_id, round(time.time() * 1000)), video_id,
+ note='Downloading API JSON', errnote='Unable to fetch data')['data']
+ except ExtractorError:
+ if not isinstance(e.cause, compat_HTTPError):
+ raise
+ webpage = e.cause.read().decode('utf-8', 'replace')
+ error_msg = self._html_search_regex(
+ r'(?s)<section\s+class="(?:(?:ErrorMessage|WatchExceptionPage-message)\s*)+">(.+?)</section>',
+ webpage, 'error reason', default=None)
+ if not error_msg:
+ raise
+ raise ExtractorError(re.sub(r'\s+', ' ', error_msg), expected=True)
- # Invoking ffprobe to determine resolution
- pp = FFmpegPostProcessor(self._downloader)
- cookies = self._get_cookies('https://nicovideo.jp').output(header='', sep='; path=/; domain=nicovideo.jp;\n')
+ formats = []
- self.to_screen('%s: %s' % (video_id, 'Checking smile format with ffprobe'))
+ def get_video_info(*items, get_first=True, **kwargs):
+ return traverse_obj(api_data, ('video', *items), get_all=not get_first, **kwargs)
- try:
- metadata = pp.get_metadata_object(video_real_url, ['-cookies', cookies])
- except PostProcessingError as err:
- raise ExtractorError(err.msg, expected=True)
-
- v_stream = a_stream = {}
-
- # Some complex swf files doesn't have video stream (e.g. nm4809023)
- for stream in metadata['streams']:
- if stream['codec_type'] == 'video':
- v_stream = stream
- elif stream['codec_type'] == 'audio':
- a_stream = stream
-
- # Community restricted videos seem to have issues with the thumb API not returning anything at all
- filesize = int(
- (get_video_info_xml('size_high') if not is_economy else get_video_info_xml('size_low'))
- or metadata['format']['size']
- )
- extension = (
- get_video_info_xml('movie_type')
- or 'mp4' if 'mp4' in metadata['format']['format_name'] else metadata['format']['format_name']
- )
-
- # 'creation_time' tag on video stream of re-encoded SMILEVIDEO mp4 files are '1970-01-01T00:00:00.000000Z'.
- timestamp = (
- parse_iso8601(get_video_info_web('first_retrieve'))
- or unified_timestamp(get_video_info_web('postedDateTime'))
- )
- metadata_timestamp = (
- parse_iso8601(try_get(v_stream, lambda x: x['tags']['creation_time']))
- or timestamp if extension != 'mp4' else 0
- )
-
- # According to compconf, smile videos from pre-2017 are always better quality than their DMC counterparts
- smile_threshold_timestamp = parse_iso8601('2016-12-08T00:00:00+09:00')
-
- is_source = timestamp < smile_threshold_timestamp or metadata_timestamp > 0
-
- # If movie file size is unstable, old server movie is not source movie.
- if filesize > 1:
- formats.append({
- 'url': video_real_url,
- 'format_id': 'smile' if not is_economy else 'smile_low',
- 'format_note': 'SMILEVIDEO source' if not is_economy else 'SMILEVIDEO low quality',
- 'ext': extension,
- 'container': extension,
- 'vcodec': v_stream.get('codec_name'),
- 'acodec': a_stream.get('codec_name'),
- # Some complex swf files doesn't have total bit rate metadata (e.g. nm6049209)
- 'tbr': int_or_none(metadata['format'].get('bit_rate'), scale=1000),
- 'vbr': int_or_none(v_stream.get('bit_rate'), scale=1000),
- 'abr': int_or_none(a_stream.get('bit_rate'), scale=1000),
- 'height': int_or_none(v_stream.get('height')),
- 'width': int_or_none(v_stream.get('width')),
- 'source_preference': 5 if not is_economy else -2,
- 'quality': 5 if is_source and not is_economy else None,
- 'filesize': filesize
- })
+ quality_info = api_data['media']['delivery']['movie']
+ session_api_data = quality_info['session']
+ for (audio_quality, video_quality, protocol) in itertools.product(quality_info['audios'], quality_info['videos'], session_api_data['protocols']):
+ fmt = self._extract_format_for_quality(video_id, audio_quality, video_quality, protocol)
+ if fmt:
+ formats.append(fmt)
self._sort_formats(formats)
# Start extracting information
- title = (
- get_video_info_xml('title') # prefer to get the untranslated original title
- or get_video_info_web(['originalTitle', 'title'])
- or self._og_search_title(webpage, default=None)
- or self._html_search_regex(
- r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>',
- webpage, 'video title'))
-
- watch_api_data_string = self._html_search_regex(
- r'<div[^>]+id="watchAPIDataContainer"[^>]+>([^<]+)</div>',
- webpage, 'watch api data', default=None)
- watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {}
- video_detail = watch_api_data.get('videoDetail', {})
-
- thumbnail = (
- self._html_search_regex(r'<meta property="og:image" content="([^"]+)">', webpage, 'thumbnail data', default=None)
- or dict_get( # choose highest from 720p to 240p
- get_video_info_web('thumbnail'),
- ['ogp', 'player', 'largeUrl', 'middleUrl', 'url'])
- or self._html_search_meta('image', webpage, 'thumbnail', default=None)
- or video_detail.get('thumbnail'))
-
- description = get_video_info_web('description')
-
- if not timestamp:
- match = self._html_search_meta('datePublished', webpage, 'date published', default=None)
- if match:
- timestamp = parse_iso8601(match.replace('+', ':00+'))
- if not timestamp and video_detail.get('postedAt'):
- timestamp = parse_iso8601(
- video_detail['postedAt'].replace('/', '-'),
- delimiter=' ', timezone=datetime.timedelta(hours=9))
- timestamp = timestamp or try_get(api_data, lambda x: parse_iso8601(x['video']['registeredAt']))
-
- view_count = int_or_none(get_video_info_web(['view_counter', 'viewCount']))
- if not view_count:
- match = self._html_search_regex(
- r'>Views: <strong[^>]*>([^<]+)</strong>',
- webpage, 'view count', default=None)
- if match:
- view_count = int_or_none(match.replace(',', ''))
- view_count = (
- view_count
- or video_detail.get('viewCount')
- or try_get(api_data, lambda x: x['video']['count']['view']))
-
- comment_count = (
- int_or_none(get_video_info_web('comment_num'))
- or video_detail.get('commentCount')
- or try_get(api_data, lambda x: x['video']['count']['comment']))
-
- if not comment_count:
- match = self._html_search_regex(
- r'>Comments: <strong[^>]*>([^<]+)</strong>',
- webpage, 'comment count', default=None)
- if match:
- comment_count = int_or_none(match.replace(',', ''))
-
- duration = (parse_duration(
- get_video_info_web('length')
- or self._html_search_meta(
- 'video:duration', webpage, 'video duration', default=None))
- or video_detail.get('length')
- or get_video_info_web('duration'))
-
- webpage_url = get_video_info_web('watch_url') or url
-
- # for channel movie and community movie
- channel_id = try_get(
- api_data,
- (lambda x: x['channel']['globalId'],
- lambda x: x['community']['globalId']))
- channel = try_get(
- api_data,
- (lambda x: x['channel']['name'],
- lambda x: x['community']['name']))
-
- # Note: cannot use api_data.get('owner', {}) because owner may be set to "null"
- # in the JSON, which will cause None to be returned instead of {}.
- owner = try_get(api_data, lambda x: x.get('owner'), dict) or {}
- uploader_id = str_or_none(
- get_video_info_web(['ch_id', 'user_id'])
- or owner.get('id')
- or channel_id
- )
- uploader = (
- get_video_info_web(['ch_name', 'user_nickname'])
- or owner.get('nickname')
- or channel
- )
+ tags = None
+ if webpage:
+ # use og:video:tag (not logged in)
+ og_video_tags = re.finditer(r'<meta\s+property="og:video:tag"\s*content="(.*?)">', webpage)
+ tags = list(filter(None, (clean_html(x.group(1)) for x in og_video_tags)))
+ if not tags:
+ # use keywords and split with comma (not logged in)
+ kwds = self._html_search_meta('keywords', webpage, default=None)
+ if kwds:
+ tags = [x for x in kwds.split(',') if x]
+ if not tags:
+ # find in json (logged in)
+ tags = traverse_obj(api_data, ('tag', 'items', ..., 'name'))
+
+ thumb_prefs = qualities(['url', 'middleUrl', 'largeUrl', 'player', 'ogp'])
return {
'id': video_id,
'_api_data': api_data,
- 'title': title,
+ 'title': get_video_info(('originalTitle', 'title')) or self._og_search_title(webpage, default=None),
'formats': formats,
- 'thumbnail': thumbnail,
- 'description': description,
- 'uploader': uploader,
- 'timestamp': timestamp,
- 'uploader_id': uploader_id,
- 'channel': channel,
- 'channel_id': channel_id,
- 'view_count': view_count,
- 'comment_count': comment_count,
- 'duration': duration,
- 'webpage_url': webpage_url,
+ 'thumbnails': [{
+ 'id': key,
+ 'url': url,
+ 'ext': 'jpg',
+ 'preference': thumb_prefs(key),
+ **parse_resolution(url, lenient=True),
+ } for key, url in (get_video_info('thumbnail') or {}).items() if url],
+ 'description': clean_html(get_video_info('description')),
+ 'uploader': traverse_obj(api_data, ('owner', 'nickname'), ('channel', 'name'), ('community', 'name')),
+ 'uploader_id': str_or_none(traverse_obj(api_data, ('owner', 'id'), ('channel', 'id'), ('community', 'id'))),
+ 'timestamp': parse_iso8601(get_video_info('registeredAt')) or parse_iso8601(
+ self._html_search_meta('video:release_date', webpage, 'date published', default=None)),
+ 'channel': traverse_obj(api_data, ('channel', 'name'), ('community', 'name')),
+ 'channel_id': traverse_obj(api_data, ('channel', 'id'), ('community', 'id')),
+ 'view_count': int_or_none(get_video_info('count', 'view')),
+ 'tags': tags,
+ 'genre': traverse_obj(api_data, ('genre', 'label'), ('genre', 'key')),
+ 'comment_count': get_video_info('count', 'comment', expected_type=int),
+ 'duration': (
+ parse_duration(self._html_search_meta('video:duration', webpage, 'video duration', default=None))
+ or get_video_info('duration')),
+ 'webpage_url': url_or_none(url) or f'https://www.nicovideo.jp/watch/{video_id}',
+ 'subtitles': self.extract_subtitles(video_id, api_data, session_api_data),
+ }
+
+ def _get_subtitles(self, video_id, api_data, session_api_data):
+ comment_user_key = traverse_obj(api_data, ('comment', 'keys', 'userKey'))
+ user_id_str = session_api_data.get('serviceUserId')
+
+ thread_ids = traverse_obj(api_data, ('comment', 'threads', lambda _, v: v['isActive']))
+ raw_danmaku = self._extract_all_comments(video_id, thread_ids, user_id_str, comment_user_key)
+ if not raw_danmaku:
+ self.report_warning(f'Failed to get comments. {bug_reports_message()}')
+ return
+ return {
+ 'comments': [{
+ 'ext': 'json',
+ 'data': json.dumps(raw_danmaku),
+ }],
}
+ def _extract_all_comments(self, video_id, threads, user_id, user_key):
+ auth_data = {
+ 'user_id': user_id,
+ 'userkey': user_key,
+ } if user_id and user_key else {'user_id': ''}
+
+ # Request Start
+ post_data = [{'ping': {'content': 'rs:0'}}]
+ for i, thread in enumerate(threads):
+ thread_id = thread['id']
+ thread_fork = thread['fork']
+ # Post Start (2N)
+ post_data.append({'ping': {'content': f'ps:{i * 2}'}})
+ post_data.append({'thread': {
+ 'fork': thread_fork,
+ 'language': 0,
+ 'nicoru': 3,
+ 'scores': 1,
+ 'thread': thread_id,
+ 'version': '20090904',
+ 'with_global': 1,
+ **auth_data,
+ }})
+ # Post Final (2N)
+ post_data.append({'ping': {'content': f'pf:{i * 2}'}})
+
+ # Post Start (2N+1)
+ post_data.append({'ping': {'content': f'ps:{i * 2 + 1}'}})
+ post_data.append({'thread_leaves': {
+                # format is '<bottom of minute range>-<top of minute range>:<comments per minute>,<total last comments>'
+ # unfortunately NND limits (deletes?) comment returns this way, so you're only able to grab the last 1000 per language
+ 'content': '0-999999:999999,999999,nicoru:999999',
+ 'fork': thread_fork,
+ 'language': 0,
+ 'nicoru': 3,
+ 'scores': 1,
+ 'thread': thread_id,
+ **auth_data,
+ }})
+ # Post Final (2N+1)
+ post_data.append({'ping': {'content': f'pf:{i * 2 + 1}'}})
+ # Request Final
+ post_data.append({'ping': {'content': 'rf:0'}})
+
+ for api_url in self._COMMENT_API_ENDPOINTS:
+ comments = self._download_json(
+ api_url, video_id, data=json.dumps(post_data).encode(), fatal=False,
+ headers={
+ 'Referer': 'https://www.nicovideo.jp/watch/%s' % video_id,
+ 'Origin': 'https://www.nicovideo.jp',
+ 'Content-Type': 'text/plain;charset=UTF-8',
+ },
+ note='Downloading comments', errnote=f'Failed to access endpoint {api_url}')
+ if comments:
+ return comments
+
+
+class NiconicoPlaylistBaseIE(InfoExtractor):
+ _PAGE_SIZE = 100
+
+ _API_HEADERS = {
+ 'X-Frontend-ID': '6',
+ 'X-Frontend-Version': '0',
+ 'X-Niconico-Language': 'en-us'
+ }
+
+ def _call_api(self, list_id, resource, query):
+        "Implement this in a child class"
+ pass
+
+ @staticmethod
+ def _parse_owner(item):
+ return {
+ 'uploader': traverse_obj(item, ('owner', 'name')),
+ 'uploader_id': traverse_obj(item, ('owner', 'id')),
+ }
+
+ def _fetch_page(self, list_id, page):
+ page += 1
+ resp = self._call_api(list_id, 'page %d' % page, {
+ 'page': page,
+ 'pageSize': self._PAGE_SIZE,
+ })
+ # this is needed to support both mylist and user
+ for video in traverse_obj(resp, ('items', ..., ('video', None))) or []:
+ video_id = video.get('id')
+ if not video_id:
+                # skip wrapper entries like {"video": {"id": "blablabla", ...}}, which have no top-level "id"
+ continue
+ count = video.get('count') or {}
+ get_count = lambda x: int_or_none(count.get(x))
+ yield {
+ '_type': 'url',
+ 'id': video_id,
+ 'title': video.get('title'),
+ 'url': f'https://www.nicovideo.jp/watch/{video_id}',
+ 'description': video.get('shortDescription'),
+ 'duration': int_or_none(video.get('duration')),
+ 'view_count': get_count('view'),
+ 'comment_count': get_count('comment'),
+ 'thumbnail': traverse_obj(video, ('thumbnail', ('nHdUrl', 'largeUrl', 'listingUrl', 'url'))),
+ 'ie_key': NiconicoIE.ie_key(),
+ **self._parse_owner(video),
+ }
+
+ def _entries(self, list_id):
+ return OnDemandPagedList(functools.partial(self._fetch_page, list_id), self._PAGE_SIZE)
+
-class NiconicoPlaylistIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/|my/)?mylist/(?P<id>\d+)'
+class NiconicoPlaylistIE(NiconicoPlaylistBaseIE):
+ IE_NAME = 'niconico:playlist'
+ _VALID_URL = r'https?://(?:(?:www\.|sp\.)?nicovideo\.jp|nico\.ms)/(?:user/\d+/)?(?:my/)?mylist/(?:#/)?(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.nicovideo.jp/mylist/27411728',
@@ -618,73 +603,115 @@ class NiconicoPlaylistIE(InfoExtractor):
'uploader': 'のっく',
'uploader_id': '805442',
},
- 'playlist_mincount': 225,
+ 'playlist_mincount': 291,
}, {
'url': 'https://www.nicovideo.jp/user/805442/mylist/27411728',
'only_matching': True,
+ }, {
+ 'url': 'https://www.nicovideo.jp/my/mylist/#/68048635',
+ 'only_matching': True,
}]
- _API_HEADERS = {
- 'X-Frontend-ID': '6',
- 'X-Frontend-Version': '0'
- }
+ def _call_api(self, list_id, resource, query):
+ return self._download_json(
+ f'https://nvapi.nicovideo.jp/v2/mylists/{list_id}', list_id,
+ f'Downloading {resource}', query=query,
+ headers=self._API_HEADERS)['data']['mylist']
def _real_extract(self, url):
list_id = self._match_id(url)
+ mylist = self._call_api(list_id, 'list', {
+ 'pageSize': 1,
+ })
+ return self.playlist_result(
+ self._entries(list_id), list_id,
+ mylist.get('name'), mylist.get('description'), **self._parse_owner(mylist))
- def get_page_data(pagenum, pagesize):
- return self._download_json(
- 'http://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id,
- query={'page': 1 + pagenum, 'pageSize': pagesize},
- headers=self._API_HEADERS).get('data').get('mylist')
-
- data = get_page_data(0, 1)
- title = data.get('name')
- description = data.get('description')
- uploader = data.get('owner').get('name')
- uploader_id = data.get('owner').get('id')
-
- def pagefunc(pagenum):
- data = get_page_data(pagenum, 25)
- return ({
- '_type': 'url',
- 'url': 'http://www.nicovideo.jp/watch/' + item.get('watchId'),
- } for item in data.get('items'))
-
- return {
- '_type': 'playlist',
- 'id': list_id,
- 'title': title,
- 'description': description,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'entries': OnDemandPagedList(pagefunc, 25),
- }
-
-
-NicovideoSearchIE_NAME = 'nicovideo:search'
+class NiconicoSeriesIE(InfoExtractor):
+ IE_NAME = 'niconico:series'
+ _VALID_URL = r'https?://(?:(?:www\.|sp\.)?nicovideo\.jp|nico\.ms)/series/(?P<id>\d+)'
-class NicovideoSearchURLIE(InfoExtractor):
- IE_NAME = f'{NicovideoSearchIE_NAME}_url'
- IE_DESC = 'Nico video search URLs'
- _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/search/(?P<id>[^?#&]+)?'
_TESTS = [{
- 'url': 'http://www.nicovideo.jp/search/sm9',
+ 'url': 'https://www.nicovideo.jp/series/110226',
'info_dict': {
- 'id': 'sm9',
- 'title': 'sm9'
+ 'id': '110226',
+ 'title': 'ご立派ァ!のシリーズ',
},
- 'playlist_mincount': 40,
+ 'playlist_mincount': 10, # as of 2021/03/17
}, {
- 'url': 'https://www.nicovideo.jp/search/sm9?sort=h&order=d&end=2020-12-31&start=2020-01-01',
+ 'url': 'https://www.nicovideo.jp/series/12312/',
'info_dict': {
- 'id': 'sm9',
- 'title': 'sm9'
+ 'id': '12312',
+ 'title': 'バトルスピリッツ お勧めカード紹介(調整中)',
},
- 'playlist_count': 31,
+ 'playlist_mincount': 97, # as of 2021/03/17
+ }, {
+ 'url': 'https://nico.ms/series/203559',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ list_id = self._match_id(url)
+ webpage = self._download_webpage(f'https://www.nicovideo.jp/series/{list_id}', list_id)
+
+ title = self._search_regex(
+ (r'<title>「(.+)(全',
+ r'<div class="TwitterShareButton"\s+data-text="(.+)\s+https:'),
+ webpage, 'title', fatal=False)
+ if title:
+ title = unescapeHTML(title)
+ playlist = [
+ self.url_result(f'https://www.nicovideo.jp/watch/{v_id}', video_id=v_id)
+ for v_id in re.findall(r'href="/watch/([a-z0-9]+)" data-href="/watch/\1', webpage)]
+ return self.playlist_result(playlist, list_id, title)
+
+
+class NiconicoHistoryIE(NiconicoPlaylistBaseIE):
+ IE_NAME = 'niconico:history'
+ IE_DESC = 'NicoNico user history. Requires cookies.'
+ _VALID_URL = r'https?://(?:www\.|sp\.)?nicovideo\.jp/my/history'
+
+ _TESTS = [{
+ 'note': 'PC page, with /video',
+ 'url': 'https://www.nicovideo.jp/my/history/video',
+ 'only_matching': True,
+ }, {
+ 'note': 'PC page, without /video',
+ 'url': 'https://www.nicovideo.jp/my/history',
+ 'only_matching': True,
+ }, {
+ 'note': 'mobile page, with /video',
+ 'url': 'https://sp.nicovideo.jp/my/history/video',
+ 'only_matching': True,
+ }, {
+ 'note': 'mobile page, without /video',
+ 'url': 'https://sp.nicovideo.jp/my/history',
+ 'only_matching': True,
}]
+ def _call_api(self, list_id, resource, query):
+ return self._download_json(
+ 'https://nvapi.nicovideo.jp/v1/users/me/watch/history', 'history',
+ f'Downloading {resource}', query=query,
+ headers=self._API_HEADERS)['data']
+
+ def _real_extract(self, url):
+ list_id = 'history'
+ try:
+ mylist = self._call_api(list_id, 'list', {
+ 'pageSize': 1,
+ })
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ self.raise_login_required('You have to be logged in to get your watch history')
+ raise
+ return self.playlist_result(self._entries(list_id), list_id, **self._parse_owner(mylist))
+
+
+class NicovideoSearchBaseIE(InfoExtractor):
+ _SEARCH_TYPE = 'search'
+
def _entries(self, url, item_id, query=None, note='Downloading page %(page)s'):
query = query or {}
pages = [query['page']] if 'page' in query else itertools.count(1)
@@ -697,26 +724,45 @@ class NicovideoSearchURLIE(InfoExtractor):
if not results:
break
- def _real_extract(self, url):
- query = self._match_id(url)
- return self.playlist_result(self._entries(url, query), query, query)
+ def _search_results(self, query):
+ return self._entries(
+ self._proto_relative_url(f'//www.nicovideo.jp/{self._SEARCH_TYPE}/{query}'), query)
-class NicovideoSearchIE(SearchInfoExtractor, NicovideoSearchURLIE):
- IE_DESC = 'Nico video searches'
- _MAX_RESULTS = float('inf')
- IE_NAME = NicovideoSearchIE_NAME
+class NicovideoSearchIE(NicovideoSearchBaseIE, SearchInfoExtractor):
+ IE_DESC = 'Nico video search'
+ IE_NAME = 'nicovideo:search'
_SEARCH_KEY = 'nicosearch'
- _TESTS = []
- def _search_results(self, query):
- return self._entries(
- self._proto_relative_url(f'//www.nicovideo.jp/search/{query}'), query)
+
+class NicovideoSearchURLIE(NicovideoSearchBaseIE):
+ IE_NAME = f'{NicovideoSearchIE.IE_NAME}_url'
+ IE_DESC = 'Nico video search URLs'
+ _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/search/(?P<id>[^?#&]+)?'
+ _TESTS = [{
+ 'url': 'http://www.nicovideo.jp/search/sm9',
+ 'info_dict': {
+ 'id': 'sm9',
+ 'title': 'sm9'
+ },
+ 'playlist_mincount': 40,
+ }, {
+ 'url': 'https://www.nicovideo.jp/search/sm9?sort=h&order=d&end=2020-12-31&start=2020-01-01',
+ 'info_dict': {
+ 'id': 'sm9',
+ 'title': 'sm9'
+ },
+ 'playlist_count': 31,
+ }]
+
+ def _real_extract(self, url):
+ query = self._match_id(url)
+ return self.playlist_result(self._entries(url, query), query, query)
-class NicovideoSearchDateIE(NicovideoSearchIE):
- IE_DESC = 'Nico video searches, newest first'
- IE_NAME = f'{NicovideoSearchIE_NAME}:date'
+class NicovideoSearchDateIE(NicovideoSearchBaseIE, SearchInfoExtractor):
+ IE_DESC = 'Nico video search, newest first'
+ IE_NAME = f'{NicovideoSearchIE.IE_NAME}:date'
_SEARCH_KEY = 'nicosearchdate'
_TESTS = [{
'url': 'nicosearchdateall:a',
@@ -757,7 +803,26 @@ class NicovideoSearchDateIE(NicovideoSearchIE):
if page_num:
query['page'] = str(page_num)
- yield from NicovideoSearchURLIE._entries(self, url, item_id, query=query, note=note)
+ yield from super()._entries(url, item_id, query=query, note=note)
+
+
+class NicovideoTagURLIE(NicovideoSearchBaseIE):
+ IE_NAME = 'niconico:tag'
+ IE_DESC = 'NicoNico video tag URLs'
+ _SEARCH_TYPE = 'tag'
+ _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/tag/(?P<id>[^?#&]+)?'
+ _TESTS = [{
+ 'url': 'https://www.nicovideo.jp/tag/ドキュメンタリー淫夢',
+ 'info_dict': {
+ 'id': 'ドキュメンタリー淫夢',
+ 'title': 'ドキュメンタリー淫夢'
+ },
+ 'playlist_mincount': 400,
+ }]
+
+ def _real_extract(self, url):
+ query = self._match_id(url)
+ return self.playlist_result(self._entries(url, query), query, query)
class NiconicoUserIE(InfoExtractor):
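
For reference, the format enumeration in NiconicoIE._real_extract above builds formats from the Cartesian product of the advertised audio qualities, video qualities and DMC protocols, with _extract_format_for_quality returning None for pairs the server marks unavailable. A minimal standalone sketch of that pattern, with invented sample data and a simplified format_id (the real code also strips the 'archive_' prefix):

import itertools

# Invented sample data mirroring the shape of api_data['media']['delivery']['movie']
audios = [{'id': 'archive_aac_192kbps', 'isAvailable': True}]
videos = [
    {'id': 'archive_h264_1080p', 'isAvailable': True},
    {'id': 'archive_h264_360p_low', 'isAvailable': False},
]
protocols = ['http', 'hls']

def make_format(audio, video, protocol):
    # Unavailable audio/video pairs yield no format, mirroring the extractor
    if not (audio['isAvailable'] and video['isAvailable']):
        return None
    return {'format_id': '-'.join((video['id'], audio['id'], protocol))}

formats = []
for audio, video, protocol in itertools.product(audios, videos, protocols):
    fmt = make_format(audio, video, protocol)
    if fmt:
        formats.append(fmt)

print([f['format_id'] for f in formats])
# ['archive_h264_1080p-archive_aac_192kbps-http', 'archive_h264_1080p-archive_aac_192kbps-hls']
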
diff --git a/hypervideo_dl/extractor/ninecninemedia.py b/hypervideo_dl/extractor/ninecninemedia.py
index 4aaf21a..7818427 100644
--- a/hypervideo_dl/extractor/ninecninemedia.py
+++ b/hypervideo_dl/extractor/ninecninemedia.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-
from .common import InfoExtractor
from ..utils import (
float_or_none,
@@ -99,3 +98,37 @@ class NineCNineMediaIE(InfoExtractor):
}
return info
+
+
+class CPTwentyFourIE(InfoExtractor):
+ IE_NAME = 'cp24'
+ _GEO_COUNTRIES = ['CA']
+ _VALID_URL = r'https?://(?:www\.)?cp24\.com/news/(?P<id>[^?#]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.cp24.com/news/video-shows-atm-being-ripped-out-of-business-by-pickup-truck-driver-in-mississauga-1.5676877',
+ 'info_dict': {
+ 'id': '2328005',
+ 'ext': 'mp4',
+ 'title': 'WATCH: Truck rips ATM from Mississauga business',
+ 'description': 'md5:cf7498480885f080a754389a2b2f7073',
+ 'timestamp': 1637618377,
+ 'episode_number': None,
+ 'season': 'Season 0',
+ 'season_number': 0,
+ 'season_id': 57974,
+ 'series': 'CTV News Toronto',
+ 'duration': 26.86,
+ 'thumbnail': 'http://images2.9c9media.com/image_asset/2014_11_5_2eb609a0-475b-0132-fbd6-34b52f6f1279_jpg_2000x1125.jpg',
+ 'upload_date': '20211122',
+ },
+ 'params': {'skip_download': True, 'format': 'bv'}
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ id, destination = self._search_regex(
+ r'getAuthStates\("(?P<id>[^"]+)",\s?"(?P<destination>[^"]+)"\);',
+ webpage, 'video id and destination', group=('id', 'destination'))
+ return self.url_result(f'9c9media:{destination}:{id}', ie=NineCNineMediaIE.ie_key(), video_id=id)
diff --git a/hypervideo_dl/extractor/nitter.py b/hypervideo_dl/extractor/nitter.py
index a0546cd..8bb709c 100644
--- a/hypervideo_dl/extractor/nitter.py
+++ b/hypervideo_dl/extractor/nitter.py
@@ -5,7 +5,6 @@ from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
parse_count,
- unified_strdate,
unified_timestamp,
remove_end,
determine_ext,
@@ -25,6 +24,16 @@ class NitterIE(InfoExtractor):
'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
'26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
+ 'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
+ 'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
+ 'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
+ 'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
+ 'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
+ 'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
+ 'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
+ 'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
+ 'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
+ 'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',
'nitter.i2p',
'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',
@@ -36,28 +45,55 @@ class NitterIE(InfoExtractor):
'nitter.42l.fr',
'nitter.pussthecat.org',
'nitter.nixnet.services',
- 'nitter.mastodont.cat',
- 'nitter.tedomum.net',
'nitter.fdn.fr',
'nitter.1d4.us',
'nitter.kavin.rocks',
- 'tweet.lambda.dance',
- 'nitter.cc',
- 'nitter.vxempire.xyz',
'nitter.unixfox.eu',
'nitter.domain.glass',
- 'nitter.himiko.cloud',
'nitter.eu',
'nitter.namazso.eu',
- 'nitter.mailstation.de',
'nitter.actionsack.com',
- 'nitter.cattube.org',
- 'nitter.dark.fail',
'birdsite.xanny.family',
- 'nitter.40two.app',
- 'nitter.skrep.in',
+ 'nitter.hu',
+ 'twitr.gq',
+ 'nitter.moomoo.me',
+ 'nittereu.moomoo.me',
+ 'bird.from.tf',
+ 'nitter.it',
+ 'twitter.censors.us',
+ 'twitter.grimneko.de',
+ 'nitter.alefvanoon.xyz',
+ 'n.hyperborea.cloud',
+ 'nitter.ca',
+ 'twitter.076.ne.jp',
+ 'twitter.mstdn.social',
+ 'nitter.fly.dev',
+ 'notabird.site',
+ 'nitter.weiler.rocks',
+ 'nitter.silkky.cloud',
+ 'nitter.sethforprivacy.com',
+ 'nttr.stream',
+ 'nitter.cutelab.space',
+ 'nitter.nl',
+ 'nitter.mint.lgbt',
+ 'nitter.bus-hit.me',
+ 'fuckthesacklers.network',
+ 'nitter.govt.land',
+ 'nitter.datatunnel.xyz',
+ 'nitter.esmailelbob.xyz',
+ 'tw.artemislena.eu',
+ 'de.nttr.stream',
+ 'nitter.winscloud.net',
+ 'nitter.tiekoetter.com',
+ 'nitter.spaceint.fr',
+ 'twtr.bch.bar',
+ 'nitter.exonip.de',
+ 'nitter.mastodon.pro',
+ 'nitter.notraxx.ch',
+
# not in the list anymore
+ 'nitter.skrep.in',
'nitter.snopyta.org',
)
@@ -68,96 +104,121 @@ class NitterIE(InfoExtractor):
# official, rate limited
'nitter.net',
# offline
+ 'is-nitter.resolv.ee',
+ 'lu-nitter.resolv.ee',
'nitter.13ad.de',
+ 'nitter.40two.app',
+ 'nitter.cattube.org',
+ 'nitter.cc',
+ 'nitter.dark.fail',
+ 'nitter.himiko.cloud',
+ 'nitter.koyu.space',
+ 'nitter.mailstation.de',
+ 'nitter.mastodont.cat',
+ 'nitter.tedomum.net',
+ 'nitter.tokhmi.xyz',
'nitter.weaponizedhumiliation.com',
+ 'nitter.vxempire.xyz',
+ 'tweet.lambda.dance',
)
INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES
- _INSTANCES_RE = '(?:' + '|'.join([re.escape(instance) for instance in INSTANCES]) + ')'
- _VALID_URL = r'https?://%(instance)s/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?' % {'instance': _INSTANCES_RE}
+ _INSTANCES_RE = f'(?:{"|".join(map(re.escape, INSTANCES))})'
+ _VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
current_instance = random.choice(HTTP_INSTANCES)
_TESTS = [
{
# GIF (wrapped in mp4)
- 'url': 'https://%s/firefox/status/1314279897502629888#m' % current_instance,
+ 'url': f'https://{current_instance}/firefox/status/1314279897502629888#m',
'info_dict': {
'id': '1314279897502629888',
'ext': 'mp4',
- 'title': 'Firefox 🔥 - You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. \n\nReport harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg\n\n#UnfckTheInternet',
- 'description': 'You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. \n\nReport harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg\n\n#UnfckTheInternet',
+ 'title': 'md5:7890a9277da4639ab624dd899424c5d8',
+ 'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Firefox 🔥',
'uploader_id': 'firefox',
- 'uploader_url': 'https://%s/firefox' % current_instance,
+ 'uploader_url': f'https://{current_instance}/firefox',
'upload_date': '20201008',
'timestamp': 1602183720,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
},
}, { # normal video
- 'url': 'https://%s/Le___Doc/status/1299715685392756737#m' % current_instance,
+ 'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
'info_dict': {
'id': '1299715685392756737',
'ext': 'mp4',
- 'title': 'Le Doc - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
+ 'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
'thumbnail': r're:^https?://.*\.jpg$',
- 'uploader': 'Le Doc',
+ 'uploader': 're:^Le *Doc',
'uploader_id': 'Le___Doc',
- 'uploader_url': 'https://%s/Le___Doc' % current_instance,
+ 'uploader_url': f'https://{current_instance}/Le___Doc',
'upload_date': '20200829',
- 'timestamp': 1598711341,
+ 'timestamp': 1598711340,
'view_count': int,
'like_count': int,
'repost_count': int,
'comment_count': int,
},
}, { # video embed in a "Streaming Political Ads" box
- 'url': 'https://%s/mozilla/status/1321147074491092994#m' % current_instance,
+ 'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m',
'info_dict': {
'id': '1321147074491092994',
'ext': 'mp4',
- 'title': "Mozilla - Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows?\n\nThis isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. \n\nLearn more ➡️ https://mzl.la/StreamingAds",
- 'description': "Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows?\n\nThis isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. \n\nLearn more ➡️ https://mzl.la/StreamingAds",
+ 'title': 'md5:8290664aabb43b9189145c008386bf12',
+ 'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Mozilla',
'uploader_id': 'mozilla',
- 'uploader_url': 'https://%s/mozilla' % current_instance,
+ 'uploader_url': f'https://{current_instance}/mozilla',
'upload_date': '20201027',
- 'timestamp': 1603820982
+ 'timestamp': 1603820940,
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
},
+ 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
}, { # not the first tweet but main-tweet
- 'url': 'https://%s/TheNaturalNu/status/1379050895539724290#m' % current_instance,
+ 'url': f'https://{current_instance}/firefox/status/1354848277481414657#m',
'info_dict': {
- 'id': '1379050895539724290',
+ 'id': '1354848277481414657',
'ext': 'mp4',
- 'title': 'Dorothy Zbornak - This had me hollering!!',
- 'description': 'This had me hollering!!',
+ 'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
+ 'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
'thumbnail': r're:^https?://.*\.jpg$',
- 'uploader': 'Dorothy Zbornak',
- 'uploader_id': 'TheNaturalNu',
- 'uploader_url': 'https://%s/TheNaturalNu' % current_instance,
- 'timestamp': 1617626329,
- 'upload_date': '20210405'
+ 'uploader': 'Firefox 🔥',
+ 'uploader_id': 'firefox',
+ 'uploader_url': f'https://{current_instance}/firefox',
+ 'upload_date': '20210128',
+ 'timestamp': 1611855960,
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
}
}
]
def _real_extract(self, url):
- video_id = self._match_id(url)
+ video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
parsed_url = compat_urlparse.urlparse(url)
- base_url = '%s://%s' % (parsed_url.scheme, parsed_url.netloc)
+ base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'
self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
- full_webpage = self._download_webpage(url, video_id)
+ full_webpage = webpage = self._download_webpage(url, video_id)
main_tweet_start = full_webpage.find('class="main-tweet"')
if main_tweet_start > 0:
webpage = full_webpage[main_tweet_start:]
- if not webpage:
- webpage = full_webpage
- video_url = '%s%s' % (base_url, self._html_search_regex(r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url'))
+ video_url = '%s%s' % (base_url, self._html_search_regex(
+ r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url'))
ext = determine_ext(video_url)
if ext == 'unknown_video':
@@ -168,61 +229,49 @@ class NitterIE(InfoExtractor):
'ext': ext
}]
- title = self._og_search_description(full_webpage)
- if not title:
- title = self._html_search_regex(r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title')
- description = title
+ title = description = self._og_search_description(full_webpage) or self._html_search_regex(
+ r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)
- mobj = self._match_valid_url(url)
- uploader_id = (
- mobj.group('uploader_id')
- or self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
- )
+ uploader_id = self._html_search_regex(
+ r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id
- if uploader_id:
- uploader_url = '%s/%s' % (base_url, uploader_id)
+ uploader = self._html_search_regex(
+ r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
+ if uploader:
+ title = f'{uploader} - {title}'
- uploader = self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
+ counts = {
+ f'{x[0]}_count': self._html_search_regex(
+ fr'<span[^>]+class="icon-{x[1]}[^>]*></span>([^<]*)</div>',
+ webpage, f'{x[0]} count', fatal=False)
+ for x in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment'))
+ }
+ counts = {field: 0 if count == '' else parse_count(count) for field, count in counts.items()}
- if uploader:
- title = '%s - %s' % (uploader, title)
-
- view_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-play[^>]*></span>\s([^<]+)</div>', webpage, 'view count', fatal=False))
- like_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-heart[^>]*></span>\s([^<]+)</div>', webpage, 'like count', fatal=False))
- repost_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-retweet[^>]*></span>\s([^<]+)</div>', webpage, 'repost count', fatal=False))
- comment_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-comment[^>]*></span>\s([^<]+)</div>', webpage, 'repost count', fatal=False))
-
- thumbnail = self._html_search_meta('og:image', full_webpage, 'thumbnail url')
- if not thumbnail:
- thumbnail = '%s%s' % (base_url, self._html_search_regex(r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False))
- thumbnail = remove_end(thumbnail, '%3Asmall')
-
- thumbnails = []
- thumbnail_ids = ('thumb', 'small', 'large', 'medium', 'orig')
- for id in thumbnail_ids:
- thumbnails.append({
- 'id': id,
- 'url': thumbnail + '%3A' + id,
- })
-
- date = self._html_search_regex(r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"', webpage, 'upload date', fatal=False)
- upload_date = unified_strdate(date)
- timestamp = unified_timestamp(date)
+ thumbnail = (
+ self._html_search_meta('og:image', full_webpage, 'thumbnail url')
+ or remove_end('%s%s' % (base_url, self._html_search_regex(
+ r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)), '%3Asmall'))
+
+ thumbnails = [
+ {'id': id, 'url': f'{thumbnail}%3A{id}'}
+ for id in ('thumb', 'small', 'large', 'medium', 'orig')
+ ]
+
+ date = self._html_search_regex(
+ r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
+ webpage, 'upload date', default='').replace('·', '')
return {
'id': video_id,
'title': title,
'description': description,
'uploader': uploader,
- 'timestamp': timestamp,
+ 'timestamp': unified_timestamp(date),
'uploader_id': uploader_id,
- 'uploader_url': uploader_url,
- 'view_count': view_count,
- 'like_count': like_count,
- 'repost_count': repost_count,
- 'comment_count': comment_count,
+ 'uploader_url': f'{base_url}/{uploader_id}',
'formats': formats,
'thumbnails': thumbnails,
'thumbnail': thumbnail,
- 'upload_date': upload_date,
+ **counts,
}
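
For reference, NitterIE above now derives all engagement counters from one dict comprehension, treating an empty capture as zero since Nitter renders a zero count as an empty string. A self-contained sketch of that pattern against an invented HTML fragment (plain int() stands in for parse_count):

import re

# Invented HTML shaped like a Nitter tweet's stat bar
html = ('<div class="icon-container"><span class="icon-play"></span> 1,202</div>'
        '<div class="icon-container"><span class="icon-heart"></span></div>')

def count(icon):
    m = re.search(fr'<span[^>]+class="icon-{icon}[^>]*></span>([^<]*)</div>', html)
    raw = (m.group(1) if m else '').strip().replace(',', '')
    return int(raw) if raw else 0  # an empty capture means the counter showed zero

counts = {f'{field}_count': count(icon) for field, icon in (('view', 'play'), ('like', 'heart'))}
print(counts)  # {'view_count': 1202, 'like_count': 0}
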
diff --git a/hypervideo_dl/extractor/njpwworld.py b/hypervideo_dl/extractor/njpwworld.py
index 3639d14..68c8c8e 100644
--- a/hypervideo_dl/extractor/njpwworld.py
+++ b/hypervideo_dl/extractor/njpwworld.py
@@ -43,15 +43,7 @@ class NJPWWorldIE(InfoExtractor):
_LOGIN_URL = 'https://front.njpwworld.com/auth/login'
- def _real_initialize(self):
- self._login()
-
- def _login(self):
- username, password = self._get_login_info()
- # No authentication to be performed
- if not username:
- return True
-
+ def _perform_login(self, username, password):
# Setup session (will set necessary cookies)
self._request_webpage(
'https://njpwworld.com/', None, note='Setting up session')
@@ -77,13 +69,8 @@ class NJPWWorldIE(InfoExtractor):
for kind, vid in re.findall(r'if\s+\(\s*imageQualityType\s*==\s*\'([^\']+)\'\s*\)\s*{\s*video_id\s*=\s*"(\d+)"', webpage):
player_path = '/intent?id=%s&type=url' % vid
player_url = compat_urlparse.urljoin(url, player_path)
- formats.append({
- 'url': player_url,
- 'format_id': kind,
- 'ext': 'mp4',
- 'protocol': 'm3u8',
- 'quality': 2 if kind == 'high' else 1,
- })
+ formats += self._extract_m3u8_formats(
+ player_url, video_id, 'mp4', 'm3u8_native', m3u8_id=kind, fatal=False, quality=int(kind == 'high'))
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/noco.py b/hypervideo_dl/extractor/noco.py
index 78c4952..28af909 100644
--- a/hypervideo_dl/extractor/noco.py
+++ b/hypervideo_dl/extractor/noco.py
@@ -61,14 +61,7 @@ class NocoIE(InfoExtractor):
}
]
- def _real_initialize(self):
- self._login()
-
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
login = self._download_json(
self._LOGIN_URL, None, 'Logging in',
data=urlencode_postdata({
diff --git a/hypervideo_dl/extractor/noodlemagazine.py b/hypervideo_dl/extractor/noodlemagazine.py
new file mode 100644
index 0000000..2f170bb
--- /dev/null
+++ b/hypervideo_dl/extractor/noodlemagazine.py
@@ -0,0 +1,67 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ parse_count,
+ unified_strdate
+)
+
+
+class NoodleMagazineIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www|adult\.)?noodlemagazine\.com/watch/(?P<id>[0-9-_]+)'
+ _TEST = {
+ 'url': 'https://adult.noodlemagazine.com/watch/-67421364_456239604',
+ 'md5': '9e02aa763612929d0b4b850591a9248b',
+ 'info_dict': {
+ 'id': '-67421364_456239604',
+ 'title': 'Aria alexander manojob',
+ 'thumbnail': r're:^https://.*\.jpg',
+ 'ext': 'mp4',
+ 'duration': 903,
+ 'view_count': int,
+ 'like_count': int,
+ 'description': 'Aria alexander manojob',
+ 'tags': ['aria', 'alexander', 'manojob'],
+ 'upload_date': '20190218',
+ 'age_limit': 18
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ title = self._og_search_title(webpage)
+ duration = parse_duration(self._html_search_meta('video:duration', webpage, 'duration', default=None))
+ description = self._og_search_property('description', webpage, default='').replace(' watch online hight quality video', '')
+ tags = self._html_search_meta('video:tag', webpage, default='').split(', ')
+ view_count = parse_count(self._html_search_meta('ya:ovs:views_total', webpage, default=None))
+ like_count = parse_count(self._html_search_meta('ya:ovs:likes', webpage, default=None))
+ upload_date = unified_strdate(self._html_search_meta('ya:ovs:upload_date', webpage, default=''))
+
+ key = self._html_search_regex(rf'/{video_id}\?(?:.*&)?m=([^&"\'\s,]+)', webpage, 'key')
+ playlist_info = self._download_json(f'https://adult.noodlemagazine.com/playlist/{video_id}?m={key}', video_id)
+ thumbnail = self._og_search_property('image', webpage, default=None) or playlist_info.get('image')
+
+ formats = [{
+ 'url': source.get('file'),
+ 'quality': source.get('label'),
+ 'ext': source.get('type'),
+        } for source in (playlist_info.get('sources') or [])]
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'description': description,
+ 'tags': tags,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'upload_date': upload_date,
+ 'age_limit': 18
+ }
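
For reference, NoodleMagazineIE above locates a per-video key in an m= query parameter on the watch page and then requests /playlist/<id>?m=<key> as JSON. A tiny sketch of the key extraction; the iframe snippet below is invented, and the real page markup may differ:

import re

video_id = '-67421364_456239604'
# Invented fragment of a watch page embedding the per-video playlist key
webpage = '<iframe src="/player/-67421364_456239604?m=abc123&h=720">'

key = re.search(rf'/{video_id}\?(?:.*&)?m=([^&"\'\s,]+)', webpage).group(1)
playlist_url = f'https://adult.noodlemagazine.com/playlist/{video_id}?m={key}'
print(key, playlist_url)
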
diff --git a/hypervideo_dl/extractor/nova.py b/hypervideo_dl/extractor/nova.py
index 3acb881..00a64f8 100644
--- a/hypervideo_dl/extractor/nova.py
+++ b/hypervideo_dl/extractor/nova.py
@@ -10,6 +10,7 @@ from ..utils import (
int_or_none,
js_to_json,
qualities,
+ traverse_obj,
unified_strdate,
url_or_none,
)
@@ -17,30 +18,45 @@ from ..utils import (
class NovaEmbedIE(InfoExtractor):
_VALID_URL = r'https?://media\.cms\.nova\.cz/embed/(?P<id>[^/?#&]+)'
- _TEST = {
+ _TESTS = [{
'url': 'https://media.cms.nova.cz/embed/8o0n0r?autoplay=1',
- 'md5': 'ee009bafcc794541570edd44b71cbea3',
'info_dict': {
'id': '8o0n0r',
- 'ext': 'mp4',
'title': '2180. díl',
'thumbnail': r're:^https?://.*\.jpg',
'duration': 2578,
},
- }
+ 'params': {
+ 'skip_download': True,
+ 'ignore_no_formats_error': True,
+ },
+ 'expected_warnings': ['DRM protected', 'Requested format is not available'],
+ }, {
+ 'url': 'https://media.cms.nova.cz/embed/KybpWYvcgOa',
+ 'info_dict': {
+ 'id': 'KybpWYvcgOa',
+ 'ext': 'mp4',
+ 'title': 'Borhyová oslavila 60? Soutěžící z pořadu odboural moderátora Ondřeje Sokola',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 114,
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+ has_drm = False
duration = None
formats = []
player = self._parse_json(
self._search_regex(
- r'Player\.init\s*\([^,]+,\s*(?:\w+\s*\?\s*{.+?}\s*:\s*)?({.+})\s*,\s*{.+?}\s*\)\s*;',
- webpage, 'player', default='{}'), video_id, fatal=False)
+ (r'(?:(?:replacePlaceholders|processAdTagModifier).*?:\s*)?(?:replacePlaceholders|processAdTagModifier)\s*\(\s*(?P<json>{.*?})\s*\)(?:\s*\))?\s*,',
+ r'Player\.init\s*\([^,]+,(?P<cndn>\s*\w+\s*\?)?\s*(?P<json>{(?(cndn).+?|.+)})\s*(?(cndn):|,\s*{.+?}\s*\)\s*;)'),
+ webpage, 'player', default='{}', group='json'), video_id, fatal=False)
if player:
for format_id, format_list in player['tracks'].items():
if not isinstance(format_list, list):
@@ -48,6 +64,10 @@ class NovaEmbedIE(InfoExtractor):
for format_dict in format_list:
if not isinstance(format_dict, dict):
continue
+ if (not self.get_param('allow_unplayable_formats')
+ and traverse_obj(format_dict, ('drm', 'keySystem'))):
+ has_drm = True
+ continue
format_url = url_or_none(format_dict.get('src'))
format_type = format_dict.get('type')
ext = determine_ext(format_url)
@@ -104,6 +124,8 @@ class NovaEmbedIE(InfoExtractor):
f['format_id'] = f_id
formats.append(f)
+ if not formats and has_drm:
+ self.report_drm(video_id)
self._sort_formats(formats)
title = self._og_search_title(
diff --git a/hypervideo_dl/extractor/novaplay.py b/hypervideo_dl/extractor/novaplay.py
index 724986a..bfb2c87 100644
--- a/hypervideo_dl/extractor/novaplay.py
+++ b/hypervideo_dl/extractor/novaplay.py
@@ -41,9 +41,7 @@ class NovaPlayIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- video_props = self._parse_json(self._search_regex(
- r'<script\s?id=\"__NEXT_DATA__\"\s?type=\"application/json\">({.+})</script>',
- webpage, 'video_props'), video_id)['props']['pageProps']['video']
+ video_props = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['video']
m3u8_url = self._download_json(
f'https://nbg-api.fite.tv/api/v2/videos/{video_id}/streams',
video_id, headers={'x-flipps-user-agent': 'Flipps/75/9.7'})[0]['url']
diff --git a/hypervideo_dl/extractor/npo.py b/hypervideo_dl/extractor/npo.py
index ed547d0..a8aaef6 100644
--- a/hypervideo_dl/extractor/npo.py
+++ b/hypervideo_dl/extractor/npo.py
@@ -467,7 +467,7 @@ class NPOIE(NPOBaseIE):
return {
'id': video_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'description': metadata.get('info'),
'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'],
'upload_date': unified_strdate(metadata.get('gidsdatum')),
@@ -561,7 +561,7 @@ class NPORadioIE(InfoExtractor):
return {
'id': video_id,
'url': stream['url'],
- 'title': self._live_title(title),
+ 'title': title,
'acodec': codec,
'ext': codec,
'is_live': True,
diff --git a/hypervideo_dl/extractor/npr.py b/hypervideo_dl/extractor/npr.py
index 9d1122f..49f062d 100644
--- a/hypervideo_dl/extractor/npr.py
+++ b/hypervideo_dl/extractor/npr.py
@@ -91,7 +91,8 @@ class NprIE(InfoExtractor):
elif format_id == 'smil':
smil_formats = self._extract_smil_formats(
format_url, media_id, transform_source=lambda s: s.replace(
- 'rtmp://flash.npr.org/ondemand/', 'https://ondemand.npr.org/'))
+ 'rtmp://flash.npr.org/ondemand/', 'https://ondemand.npr.org/'),
+ fatal=False)
self._check_formats(smil_formats, media_id)
formats.extend(smil_formats)
else:
diff --git a/hypervideo_dl/extractor/nrk.py b/hypervideo_dl/extractor/nrk.py
index b556bc6..4d723e8 100644
--- a/hypervideo_dl/extractor/nrk.py
+++ b/hypervideo_dl/extractor/nrk.py
@@ -8,6 +8,7 @@ import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
+ compat_HTTPError,
determine_ext,
ExtractorError,
int_or_none,
@@ -147,10 +148,14 @@ class NRKIE(NRKBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url).split('/')[-1]
- path_templ = 'playback/%s/' + video_id
-
def call_playback_api(item, query=None):
- return self._call_api(path_templ % item, video_id, item, query=query)
+ try:
+ return self._call_api(f'playback/{item}/program/{video_id}', video_id, item, query=query)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+ return self._call_api(f'playback/{item}/{video_id}', video_id, item, query=query)
+ raise
+
# known values for preferredCdn: akamai, iponly, minicdn and telenor
manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'})
@@ -188,7 +193,7 @@ class NRKIE(NRKBaseIE):
title = titles['title']
alt_title = titles.get('subtitle')
- description = preplay.get('description')
+ description = try_get(preplay, lambda x: x['description'].replace('\r', '\n'))
duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration'))
thumbnails = []
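
For reference, the reworked call_playback_api in NRKIE above prefers the newer playback/<item>/program/<id> path and falls back to the flat path only on HTTP 400. A rough stdlib-only equivalent; the psapi.nrk.no host is an assumption (the extractor builds its API URLs elsewhere) and the programme id below is invented:

import urllib.error
import urllib.request

API_BASE = 'https://psapi.nrk.no'  # assumed API host

def call_playback_api(item, video_id):
    # Try the /program/ path first; on HTTP 400, retry the flat path
    try:
        with urllib.request.urlopen(f'{API_BASE}/playback/{item}/program/{video_id}') as resp:
            return resp.read()
    except urllib.error.HTTPError as e:
        if e.code != 400:
            raise
    with urllib.request.urlopen(f'{API_BASE}/playback/{item}/{video_id}') as resp:
        return resp.read()

# manifest = call_playback_api('manifest', 'DVFJ64001010')  # invented programme id
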
diff --git a/hypervideo_dl/extractor/nrl.py b/hypervideo_dl/extractor/nrl.py
index 22a2df8..0bd5086 100644
--- a/hypervideo_dl/extractor/nrl.py
+++ b/hypervideo_dl/extractor/nrl.py
@@ -16,7 +16,6 @@ class NRLTVIE(InfoExtractor):
'params': {
# m3u8 download
'skip_download': True,
- 'format': 'bestvideo',
},
}
diff --git a/hypervideo_dl/extractor/ntvcojp.py b/hypervideo_dl/extractor/ntvcojp.py
index 0c8221b..c9af911 100644
--- a/hypervideo_dl/extractor/ntvcojp.py
+++ b/hypervideo_dl/extractor/ntvcojp.py
@@ -3,8 +3,9 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
- js_to_json,
+ ExtractorError,
smuggle_url,
+ traverse_obj,
)
@@ -19,7 +20,7 @@ class NTVCoJpCUIE(InfoExtractor):
'ext': 'mp4',
'title': '桜エビと炒り卵がポイント! 「中華風 エビチリおにぎり」──『美虎』五十嵐美幸',
'upload_date': '20181213',
- 'description': 'md5:211b52f4fd60f3e0e72b68b0c6ba52a9',
+ 'description': 'md5:1985b51a9abc285df0104d982a325f2a',
'uploader_id': '3855502814001',
'timestamp': 1544669941,
},
@@ -28,22 +29,30 @@ class NTVCoJpCUIE(InfoExtractor):
'skip_download': True,
},
}
+
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- player_config = self._parse_json(self._search_regex(
- r'(?s)PLAYER_CONFIG\s*=\s*({.+?})',
- webpage, 'player config'), display_id, js_to_json)
- video_id = player_config['videoId']
- account_id = player_config.get('account') or '3855502814001'
+ player_config = self._search_nuxt_data(webpage, display_id)
+ video_id = traverse_obj(player_config, ('movie', 'video_id'))
+ if not video_id:
+ raise ExtractorError('Failed to extract video ID for Brightcove')
+ account_id = traverse_obj(player_config, ('player', 'account')) or '3855502814001'
+ title = traverse_obj(player_config, ('movie', 'name'))
+ if not title:
+ og_title = self._og_search_title(webpage, fatal=False) or traverse_obj(player_config, ('player', 'title'))
+ if og_title:
+ title = og_title.split('(', 1)[0].strip()
+ description = (traverse_obj(player_config, ('movie', 'description'))
+ or self._html_search_meta(['description', 'og:description'], webpage))
return {
'_type': 'url_transparent',
'id': video_id,
'display_id': display_id,
- 'title': self._search_regex(r'<h1[^>]+class="title"[^>]*>([^<]+)', webpage, 'title').strip(),
- 'description': self._html_search_meta(['description', 'og:description'], webpage),
+ 'title': title,
+ 'description': description,
'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account_id, video_id), {'geo_countries': ['JP']}),
'ie_key': 'BrightcoveNew',
}
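
The rewritten ntvcojp.py reads the Nuxt state with traverse_obj instead of chained .get() calls. Roughly, traverse_obj(player_config, ('movie', 'video_id')) behaves like this simplified stand-in (yt-dlp's real helper is far more general):

    def traverse(obj, path, default=None):
        # Walk a nested dict by a tuple of keys; return `default` on any miss.
        for key in path:
            if not isinstance(obj, dict) or key not in obj:
                return default
            obj = obj[key]
        return obj

    player_config = {'movie': {'video_id': '5978891207001', 'name': 'Sample'}}
    assert traverse(player_config, ('movie', 'video_id')) == '5978891207001'
    assert traverse(player_config, ('player', 'account')) is None
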
diff --git a/hypervideo_dl/extractor/nuvid.py b/hypervideo_dl/extractor/nuvid.py
index 7487824..84fb97d 100644
--- a/hypervideo_dl/extractor/nuvid.py
+++ b/hypervideo_dl/extractor/nuvid.py
@@ -1,11 +1,14 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
from .common import InfoExtractor
from ..utils import (
parse_duration,
int_or_none,
- try_get,
+ strip_or_none,
+ traverse_obj,
+ url_or_none,
)
@@ -20,14 +23,30 @@ class NuvidIE(InfoExtractor):
'title': 'italian babe',
'duration': 321.0,
'age_limit': 18,
+ 'thumbnail': r're:https?://.+\.jpg',
}
}, {
'url': 'https://m.nuvid.com/video/6523263',
+ 'md5': 'ebd22ce8e47e1d9a4d0756a15c67da52',
'info_dict': {
'id': '6523263',
'ext': 'mp4',
- 'age_limit': 18,
'title': 'Slut brunette college student anal dorm',
+ 'duration': 421.0,
+ 'age_limit': 18,
+ 'thumbnail': r're:https?://.+\.jpg',
+ 'thumbnails': list,
+ }
+ }, {
+ 'url': 'http://m.nuvid.com/video/6415801/',
+ 'md5': '638d5ececb138d5753593f751ae3f697',
+ 'info_dict': {
+ 'id': '6415801',
+ 'ext': 'mp4',
+ 'title': 'My best friend wanted to fuck my wife for a long time',
+ 'duration': 1882,
+ 'age_limit': 18,
+ 'thumbnail': r're:https?://.+\.jpg',
}
}]
@@ -46,6 +65,16 @@ class NuvidIE(InfoExtractor):
'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
})
+ webpage = self._download_webpage(
+ 'http://m.nuvid.com/video/%s' % (video_id, ),
+ video_id, 'Downloading video page', fatal=False) or ''
+
+ title = strip_or_none(video_data.get('title') or self._html_search_regex(
+ (r'''<span\s[^>]*?\btitle\s*=\s*(?P<q>"|'|\b)(?P<title>[^"]+)(?P=q)\s*>''',
+ r'''<div\s[^>]*?\bclass\s*=\s*(?P<q>"|'|\b)thumb-holder video(?P=q)>\s*<h5\b[^>]*>(?P<title>[^<]+)</h5''',
+ r'''<span\s[^>]*?\bclass\s*=\s*(?P<q>"|'|\b)title_thumb(?P=q)>(?P<title>[^<]+)</span'''),
+ webpage, 'title', group='title'))
+
formats = [{
'url': source,
'format_id': qualities.get(quality),
@@ -55,19 +84,19 @@ class NuvidIE(InfoExtractor):
self._check_formats(formats, video_id)
self._sort_formats(formats)
- title = video_data.get('title')
- thumbnail_base_url = try_get(video_data, lambda x: x['thumbs']['url'])
- thumbnail_extension = try_get(video_data, lambda x: x['thumbs']['extension'])
- thumbnail_id = self._search_regex(
- r'/media/videos/tmb/6523263/preview/(/d+)' + thumbnail_extension, video_data.get('poster', ''), 'thumbnail id', default=19)
- thumbnail = f'{thumbnail_base_url}player/{thumbnail_id}{thumbnail_extension}'
- duration = parse_duration(video_data.get('duration') or video_data.get('duration_format'))
+ duration = parse_duration(traverse_obj(video_data, 'duration', 'duration_format'))
+ thumbnails = [
+ {'url': thumb_url} for thumb_url in re.findall(
+ r'<div\s+class\s*=\s*"video-tmb-wrap"\s*>\s*<img\s+src\s*=\s*"([^"]+)"\s*/>', webpage)
+ if url_or_none(thumb_url)]
+ if url_or_none(video_data.get('poster')):
+ thumbnails.append({'url': video_data['poster'], 'preference': 1})
return {
'id': video_id,
'formats': formats,
'title': title,
- 'thumbnail': thumbnail,
+ 'thumbnails': thumbnails,
'duration': duration,
'age_limit': 18,
}
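
The nuvid.py rewrite scrapes page thumbnails with re.findall and then appends the API poster with a higher preference so it sorts first. The same idea on canned input (markup and URLs are made up for the example):

    import re

    webpage = '''
    <div class="video-tmb-wrap"><img src="https://example.invalid/t/01.jpg" /></div>
    <div class="video-tmb-wrap"><img src="https://example.invalid/t/02.jpg" /></div>
    '''
    thumbnails = [{'url': u} for u in re.findall(
        r'<div\s+class\s*=\s*"video-tmb-wrap"\s*>\s*<img\s+src\s*=\s*"([^"]+)"\s*/>', webpage)]
    poster = 'https://example.invalid/poster.jpg'
    if poster:
        # the API poster wins over scraped page thumbnails
        thumbnails.append({'url': poster, 'preference': 1})
    assert len(thumbnails) == 3 and thumbnails[-1]['preference'] == 1
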
diff --git a/hypervideo_dl/extractor/odnoklassniki.py b/hypervideo_dl/extractor/odnoklassniki.py
index 9cacd38..293f1aa 100644
--- a/hypervideo_dl/extractor/odnoklassniki.py
+++ b/hypervideo_dl/extractor/odnoklassniki.py
@@ -12,6 +12,7 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ float_or_none,
unified_strdate,
int_or_none,
qualities,
@@ -34,6 +35,38 @@ class OdnoklassnikiIE(InfoExtractor):
(?P<id>[\d-]+)
'''
_TESTS = [{
+ 'note': 'Coub embedded',
+ 'url': 'http://ok.ru/video/1484130554189',
+ 'info_dict': {
+ 'id': '1keok9',
+ 'ext': 'mp4',
+ 'timestamp': 1545580896,
+ 'view_count': int,
+ 'thumbnail': 'https://coub-anubis-a.akamaized.net/coub_storage/coub/simple/cw_image/c5ac87553bd/608e806a1239c210ab692/1545580913_00026.jpg',
+ 'title': 'Народная забава',
+ 'uploader': 'Nevata',
+ 'upload_date': '20181223',
+ 'age_limit': 0,
+ 'uploader_id': 'nevata.s',
+ 'like_count': int,
+ 'duration': 8.08,
+ 'repost_count': int,
+ },
+ }, {
+ 'note': 'vk.com embedded',
+ 'url': 'https://ok.ru/video/3568183087575',
+ 'info_dict': {
+ 'id': '-165101755_456243749',
+ 'ext': 'mp4',
+ 'uploader_id': '-165101755',
+ 'duration': 132,
+ 'timestamp': 1642869935,
+ 'upload_date': '20220122',
+ 'thumbnail': str,
+ 'title': str,
+ 'uploader': str,
+ },
+ }, {
# metadata in JSON
'url': 'http://ok.ru/video/20079905452',
'md5': '0b62089b479e06681abaaca9d204f152',
@@ -97,6 +130,14 @@ class OdnoklassnikiIE(InfoExtractor):
},
'skip': 'Video has not been found',
}, {
+ 'note': 'Only available in mobile webpage',
+ 'url': 'https://m.ok.ru/video/2361249957145',
+ 'info_dict': {
+ 'id': '2361249957145',
+ 'title': 'Быковское крещение',
+ 'duration': 3038.181,
+ },
+ }, {
'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
'only_matching': True,
}, {
@@ -131,13 +172,24 @@ class OdnoklassnikiIE(InfoExtractor):
return mobj.group('url')
def _real_extract(self, url):
+ try:
+ return self._extract_desktop(url)
+ except ExtractorError as e:
+ try:
+ return self._extract_mobile(url)
+ except ExtractorError:
+ # the desktop page's error message is in English, so prefer it
+ raise e
+
+ def _extract_desktop(self, url):
start_time = int_or_none(compat_parse_qs(
compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0])
video_id = self._match_id(url)
webpage = self._download_webpage(
- 'http://ok.ru/video/%s' % video_id, video_id)
+ 'http://ok.ru/video/%s' % video_id, video_id,
+ note='Downloading desktop webpage')
error = self._search_regex(
r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<',
@@ -151,6 +203,10 @@ class OdnoklassnikiIE(InfoExtractor):
webpage, 'player', group='player')),
video_id)
+ # embedded external player
+ if player.get('isExternalPlayer') and player.get('url'):
+ return self.url_result(player['url'])
+
flashvars = player['flashvars']
metadata = flashvars.get('metadata')
@@ -206,6 +262,14 @@ class OdnoklassnikiIE(InfoExtractor):
'start_time': start_time,
}
+ # pladform
+ if provider == 'OPEN_GRAPH':
+ info.update({
+ '_type': 'url_transparent',
+ 'url': movie['contentId'],
+ })
+ return info
+
if provider == 'USER_YOUTUBE':
info.update({
'_type': 'url_transparent',
@@ -215,7 +279,7 @@ class OdnoklassnikiIE(InfoExtractor):
assert title
if provider == 'LIVE_TV_APP':
- info['title'] = self._live_title(title)
+ info['title'] = title
quality = qualities(('4', '0', '1', '2', '3', '5'))
@@ -265,3 +329,32 @@ class OdnoklassnikiIE(InfoExtractor):
info['formats'] = formats
return info
+
+ def _extract_mobile(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://m.ok.ru/video/%s' % video_id, video_id,
+ note='Downloading mobile webpage')
+
+ error = self._search_regex(
+ r'видео</a>\s*<div\s+class="empty">(.+?)</div>',
+ webpage, 'error', default=None)
+ if error:
+ raise ExtractorError(error, expected=True)
+
+ json_data = self._search_regex(
+ r'data-video="(.+?)"', webpage, 'json data')
+ json_data = self._parse_json(unescapeHTML(json_data), video_id) or {}
+
+ return {
+ 'id': video_id,
+ 'title': json_data.get('videoName'),
+ 'duration': float_or_none(json_data.get('videoDuration'), scale=1000),
+ 'thumbnail': json_data.get('videoPosterSrc'),
+ 'formats': [{
+ 'format_id': 'mobile',
+ 'url': json_data.get('videoSrc'),
+ 'ext': 'mp4',
+ }]
+ }
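
ok.ru videos are now tried on the desktop page first and on the mobile page only if that fails; when both fail, the original desktop error is re-raised because its message is in English. A skeleton of that keep-the-first-error pattern (error messages invented for illustration):

    class ExtractorError(Exception):
        pass

    def extract_desktop(url):
        raise ExtractorError('Video has not been found')  # English message

    def extract_mobile(url):
        raise ExtractorError('видео не найдено')  # localized message

    def real_extract(url):
        try:
            return extract_desktop(url)
        except ExtractorError as desktop_error:
            try:
                return extract_mobile(url)
            except ExtractorError:
                raise desktop_error  # surface the English desktop error
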
diff --git a/hypervideo_dl/extractor/oktoberfesttv.py b/hypervideo_dl/extractor/oktoberfesttv.py
index a914068..2765674 100644
--- a/hypervideo_dl/extractor/oktoberfesttv.py
+++ b/hypervideo_dl/extractor/oktoberfesttv.py
@@ -25,8 +25,8 @@ class OktoberfestTVIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._live_title(self._html_search_regex(
- r'<h1><strong>.*?</strong>(.*?)</h1>', webpage, 'title'))
+ title = self._html_search_regex(
+ r'<h1><strong>.*?</strong>(.*?)</h1>', webpage, 'title')
clip = self._search_regex(
r"clip:\s*\{\s*url:\s*'([^']+)'", webpage, 'clip')
diff --git a/hypervideo_dl/extractor/olympics.py b/hypervideo_dl/extractor/olympics.py
index 0bc9206..784f282 100644
--- a/hypervideo_dl/extractor/olympics.py
+++ b/hypervideo_dl/extractor/olympics.py
@@ -2,22 +2,27 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import unified_strdate
+from ..utils import (
+ int_or_none,
+ try_get
+)
class OlympicsReplayIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)(?:www\.)?olympics\.com/tokyo-2020/(?:[a-z]{2}/)?replay/(?P<id>[^/#&?]+)'
+ _VALID_URL = r'https?://(?:www\.)?olympics\.com(?:/tokyo-2020)?/[a-z]{2}/(?:replay|video)/(?P<id>[^/#&?]+)'
_TESTS = [{
- 'url': 'https://olympics.com/tokyo-2020/en/replay/300622eb-abc0-43ea-b03b-c5f2d429ec7b/jumping-team-qualifier',
+ 'url': 'https://olympics.com/fr/video/men-s-109kg-group-a-weightlifting-tokyo-2020-replays',
'info_dict': {
- 'id': '300622eb-abc0-43ea-b03b-c5f2d429ec7b',
+ 'id': 'f6a0753c-8e6f-4b7d-a435-027054a4f8e9',
'ext': 'mp4',
- 'title': 'Jumping Team Qualifier',
- 'release_date': '20210806',
- 'upload_date': '20210713',
+ 'title': '+109kg (H) Groupe A - Haltérophilie | Replay de Tokyo 2020',
+ 'upload_date': '20210801',
+ 'timestamp': 1627783200,
+ 'description': 'md5:c66af4a5bc7429dbcc43d15845ff03b3',
+ 'uploader': 'International Olympic Committee',
},
'params': {
- 'format': 'bv',
+ 'skip_download': True,
},
}, {
'url': 'https://olympics.com/tokyo-2020/en/replay/bd242924-4b22-49a5-a846-f1d4c809250d/mens-bronze-medal-match-hun-esp',
@@ -26,31 +31,39 @@ class OlympicsReplayIE(InfoExtractor):
def _real_extract(self, url):
id = self._match_id(url)
- # The parameters are hardcoded in the webpage, it's not necessary to download the webpage just for these parameters.
- # If in downloading webpage serves other functions aswell, then extract these parameters from it.
- token_url = 'https://appovptok.ovpobs.tv/api/identity/app/token?api_key=OTk5NDcxOjpvY3N3LWFwaXVzZXI%3D&api_secret=ODY4ODM2MjE3ODMwYmVjNTAxMWZlMDJiMTYxZmY0MjFiMjMwMjllMjJmNDA1YWRiYzA5ODcxYTZjZTljZDkxOTo6NTM2NWIzNjRlMTM1ZmI2YWNjNmYzMGMzOGM3NzZhZTY%3D'
- token = self._download_webpage(token_url, id)
- headers = {'x-obs-app-token': token}
- data_json = self._download_json(f'https://appocswtok.ovpobs.tv/api/schedule-sessions/{id}?include=stream',
- id, headers=headers)
- meta_data = data_json['data']['attributes']
- for t_dict in data_json['included']:
- if t_dict.get('type') == 'Stream':
- stream_data = t_dict['attributes']
+
+ webpage = self._download_webpage(url, id)
+ title = self._html_search_meta(('title', 'og:title', 'twitter:title'), webpage)
+ uuid = self._html_search_meta('episode_uid', webpage)
+ m3u8_url = self._html_search_meta('video_url', webpage)
+ json_ld = self._search_json_ld(webpage, uuid)
+ thumbnails_list = json_ld.get('image')
+ if not thumbnails_list:
+ thumbnails_list = self._html_search_regex(
+ r'["\']image["\']:\s*["\']([^"\']+)["\']', webpage, 'images', default='')
+ thumbnails_list = thumbnails_list.replace('[', '').replace(']', '').split(',')
+ thumbnails_list = [thumbnail.strip() for thumbnail in thumbnails_list]
+ thumbnails = []
+ for thumbnail in thumbnails_list:
+ width_a, height_a, width = self._search_regex(
+ r'/images/image/private/t_(?P<width_a>\d+)-(?P<height_a>\d+)_(?P<width>\d+)/primary/[\W\w\d]+',
+ thumbnail, 'thumb', group=(1, 2, 3), default=(None, None, None))
+ width_a, height_a, width = int_or_none(width_a), int_or_none(height_a), int_or_none(width)
+ thumbnails.append({
+ 'url': thumbnail,
+ 'width': width,
+ 'height': int_or_none(try_get(width, lambda x: x * height_a / width_a))
+ })
m3u8_url = self._download_json(
- 'https://meteringtok.ovpobs.tv/api/playback-sessions', id, headers=headers, query={
- 'alias': stream_data['alias'],
- 'stream': stream_data['stream'],
- 'type': 'vod'
- })['data']['attributes']['url']
- formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, id)
+ f'https://olympics.com/tokenGenerator?url={m3u8_url}', uuid, note='Downloading m3u8 url')
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, uuid, m3u8_id='hls')
self._sort_formats(formats)
return {
- 'id': id,
- 'title': meta_data['title'],
- 'release_date': unified_strdate(meta_data.get('start') or meta_data.get('broadcastPublished')),
- 'upload_date': unified_strdate(meta_data.get('publishedAt')),
+ 'id': uuid,
+ 'title': title,
+ 'thumbnails': thumbnails,
'formats': formats,
'subtitles': subtitles,
+ **json_ld
}
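
The Olympics thumbnail URLs encode an aspect ratio and a target width as /t_{ratio_w}-{ratio_h}_{width}/; the hunk above recovers the height as width * ratio_h / ratio_w. A worked example of that arithmetic (URL shape simplified):

    import re

    thumb = 'https://img.olympics.com/images/image/private/t_16-9_640/primary/SAMPLE'
    ratio_w, ratio_h, width = map(int, re.search(r'/t_(\d+)-(\d+)_(\d+)/', thumb).groups())
    height = width * ratio_h // ratio_w  # 640 * 9 / 16 == 360
    assert (width, height) == (640, 360)
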
diff --git a/hypervideo_dl/extractor/ondemandkorea.py b/hypervideo_dl/extractor/ondemandkorea.py
index cc3c587..e933ea2 100644
--- a/hypervideo_dl/extractor/ondemandkorea.py
+++ b/hypervideo_dl/extractor/ondemandkorea.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
ExtractorError,
@@ -71,8 +73,8 @@ class OnDemandKoreaIE(InfoExtractor):
jw_config = self._parse_json(
self._search_regex(
- r'(?s)odkPlayer\.init.*?(?P<options>{[^;]+}).*?;',
- webpage, 'jw config', group='options'),
+ r'playlist\s*=\s*\[(?P<options>.+)];?$',
+ webpage, 'jw config', flags=re.MULTILINE, group='options'),
video_id, transform_source=js_to_json)
info = self._parse_jwplayer_data(
jw_config, video_id, require_title=False, m3u8_id='hls',
diff --git a/hypervideo_dl/extractor/onefootball.py b/hypervideo_dl/extractor/onefootball.py
new file mode 100644
index 0000000..826faad
--- /dev/null
+++ b/hypervideo_dl/extractor/onefootball.py
@@ -0,0 +1,51 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class OneFootballIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?onefootball\.com/[a-z]{2}/video/[^/&?#]+-(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://onefootball.com/en/video/highlights-fc-zuerich-3-3-fc-basel-34012334',
+ 'info_dict': {
+ 'id': '34012334',
+ 'ext': 'mp4',
+ 'title': 'Highlights: FC Zürich 3-3 FC Basel',
+ 'description': 'md5:33d9855cb790702c4fe42a513700aba8',
+ 'thumbnail': 'https://photobooth-api.onefootball.com/api/screenshot/https:%2F%2Fperegrine-api.onefootball.com%2Fv2%2Fphotobooth%2Fcms%2Fen%2F34012334',
+ 'timestamp': 1635874604,
+ 'upload_date': '20211102'
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://onefootball.com/en/video/klopp-fumes-at-var-decisions-in-west-ham-defeat-34041020',
+ 'info_dict': {
+ 'id': '34041020',
+ 'ext': 'mp4',
+ 'title': 'Klopp fumes at VAR decisions in West Ham defeat',
+ 'description': 'md5:9c50371095a01ad3f63311c73d8f51a5',
+ 'thumbnail': 'https://photobooth-api.onefootball.com/api/screenshot/https:%2F%2Fperegrine-api.onefootball.com%2Fv2%2Fphotobooth%2Fcms%2Fen%2F34041020',
+ 'timestamp': 1636314103,
+ 'upload_date': '20211107'
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ data_json = self._search_json_ld(webpage, id)
+ m3u8_url = self._html_search_regex(r'(https://cdn\.jwplayer\.com/manifests/.+\.m3u8)', webpage, 'm3u8_url')
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, id)
+ self._sort_formats(formats)
+ return {
+ 'id': id,
+ 'title': data_json.get('title'),
+ 'description': data_json.get('description'),
+ 'thumbnail': data_json.get('thumbnail'),
+ 'timestamp': data_json.get('timestamp'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
diff --git a/hypervideo_dl/extractor/onet.py b/hypervideo_dl/extractor/onet.py
index bf53ea0..95177a2 100644
--- a/hypervideo_dl/extractor/onet.py
+++ b/hypervideo_dl/extractor/onet.py
@@ -182,14 +182,9 @@ class OnetChannelIE(OnetBaseIE):
video_id = remove_start(current_clip_info['ckmId'], 'mvp:')
video_name = url_basename(current_clip_info['url'])
- if self.get_param('noplaylist'):
- self.to_screen(
- 'Downloading just video %s because of --no-playlist' % video_name)
+ if not self._yes_playlist(channel_id, video_name, playlist_label='channel'):
return self._extract_from_id(video_id, webpage)
- self.to_screen(
- 'Downloading channel %s - add --no-playlist to just download video %s' % (
- channel_id, video_name))
matches = re.findall(
r'<a[^>]+href=[\'"](%s[a-z]+/[0-9a-z-]+/[0-9a-z]+)' % self._URL_BASE_RE,
webpage)
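
onet.py swaps the hand-written --no-playlist branching for the shared _yes_playlist helper, which both prints the usual message and returns whether the full playlist should be processed. Approximately (a simplification of the real helper, not its actual code):

    def yes_playlist(playlist_id, video_id, noplaylist=False, playlist_label='playlist'):
        # Returns True when the whole playlist should be downloaded.
        if noplaylist and video_id:
            print(f'Downloading just video {video_id} because of --no-playlist')
            return False
        if playlist_id:
            print(f'Downloading {playlist_label} {playlist_id}; '
                  f'add --no-playlist to download just video {video_id}')
        return True

    if not yes_playlist('1211', 'some-video', noplaylist=True):
        print('-> extracting the single video instead')
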
diff --git a/hypervideo_dl/extractor/opencast.py b/hypervideo_dl/extractor/opencast.py
new file mode 100644
index 0000000..cf8d917
--- /dev/null
+++ b/hypervideo_dl/extractor/opencast.py
@@ -0,0 +1,177 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ ExtractorError,
+ int_or_none,
+ parse_iso8601,
+ traverse_obj,
+ variadic,
+)
+
+
+class OpencastBaseIE(InfoExtractor):
+ _INSTANCES_RE = r'''(?:
+ opencast\.informatik\.kit\.edu|
+ electures\.uni-muenster\.de|
+ oc-presentation\.ltcc\.tuwien\.ac\.at|
+ medien\.ph-noe\.ac\.at|
+ oc-video\.ruhr-uni-bochum\.de|
+ oc-video1\.ruhr-uni-bochum\.de|
+ opencast\.informatik\.uni-goettingen\.de|
+ heicast\.uni-heidelberg\.de|
+ opencast\.hawk\.de:8080|
+ opencast\.hs-osnabrueck\.de|
+ video[0-9]+\.virtuos\.uni-osnabrueck\.de|
+ opencast\.uni-koeln\.de|
+ media\.opencast\.hochschule-rhein-waal\.de|
+ matterhorn\.dce\.harvard\.edu|
+ hs-harz\.opencast\.uni-halle\.de|
+ videocampus\.urz\.uni-leipzig\.de|
+ media\.uct\.ac\.za|
+ vid\.igb\.illinois\.edu|
+ cursosabertos\.c3sl\.ufpr\.br|
+ mcmedia\.missioncollege\.org|
+ clases\.odon\.edu\.uy
+ )'''
+ _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}'
+
+ def _call_api(self, host, video_id, **kwargs):
+ return self._download_json(self._API_BASE % (host, video_id), video_id, **kwargs)
+
+ def _parse_mediapackage(self, video):
+ video_id = video.get('id')
+ if video_id is None:
+ raise ExtractorError('Video id was not found')
+
+ formats = []
+ for track in variadic(traverse_obj(video, ('media', 'track')) or []):
+ href = track.get('url')
+ if href is None:
+ continue
+ ext = determine_ext(href, None)
+
+ transport = track.get('transport')
+
+ if transport == 'DASH' or ext == 'mpd':
+ formats.extend(self._extract_mpd_formats_and_subtitles(href, video_id, mpd_id='dash', fatal=False))
+ elif transport == 'HLS' or ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats_and_subtitles(
+ href, video_id, m3u8_id='hls', entry_protocol='m3u8_native', fatal=False))
+ elif transport == 'HDS' or ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(href, video_id, f4m_id='hds', fatal=False))
+ elif transport == 'SMOOTH':
+ formats.extend(self._extract_ism_formats(href, video_id, ism_id='smooth', fatal=False))
+ elif ext == 'smil':
+ formats.extend(self._extract_smil_formats(href, video_id, fatal=False))
+ else:
+ track_obj = {
+ 'url': href,
+ 'ext': ext,
+ 'format_note': track.get('transport'),
+ 'resolution': traverse_obj(track, ('video', 'resolution')),
+ 'fps': int_or_none(traverse_obj(track, ('video', 'framerate'))),
+ 'vbr': int_or_none(traverse_obj(track, ('video', 'bitrate')), scale=1000),
+ 'vcodec': traverse_obj(track, ('video', 'encoder', 'type')) if track.get('video') else 'none',
+ 'abr': int_or_none(traverse_obj(track, ('audio', 'bitrate')), scale=1000),
+ 'asr': int_or_none(traverse_obj(track, ('audio', 'samplingrate'))),
+ 'acodec': traverse_obj(track, ('audio', 'encoder', 'type')) if track.get('audio') else 'none',
+ }
+
+ if transport == 'RTMP':
+ m_obj = re.search(r'(?:rtmp://[^/]+/(?P<app>[^/]+))/(?P<ext>.+):(?P<playpath>.+)', href)
+ if not m_obj:
+ continue
+ track_obj.update({
+ 'app': m_obj.group('app'),
+ 'ext': m_obj.group('ext'),
+ 'play_path': m_obj.group('ext') + ':' + m_obj.group('playpath'),
+ 'rtmp_live': True,
+ 'preference': -2,
+ })
+ formats.append(track_obj)
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': video.get('title'),
+ 'series': video.get('seriestitle'),
+ 'season_id': video.get('series'),
+ 'creator': traverse_obj(video, ('creators', 'creator')),
+ 'timestamp': parse_iso8601(video.get('start')),
+ 'thumbnail': traverse_obj(video, ('attachments', 'attachment', ..., 'url'), get_all=False),
+ }
+
+
+class OpencastIE(OpencastBaseIE):
+ _VALID_URL = r'''(?x)
+ https?://(?P<host>%s)/paella/ui/watch.html\?.*?
+ id=(?P<id>%s)
+ ''' % (OpencastBaseIE._INSTANCES_RE, OpencastBaseIE._UUID_RE)
+
+ _API_BASE = 'https://%s/search/episode.json?id=%s'
+
+ _TESTS = [
+ {
+ 'url': 'https://oc-video1.ruhr-uni-bochum.de/paella/ui/watch.html?id=ed063cd5-72c8-46b5-a60a-569243edcea8',
+ 'md5': '554c8e99a90f7be7e874619fcf2a3bc9',
+ 'info_dict': {
+ 'id': 'ed063cd5-72c8-46b5-a60a-569243edcea8',
+ 'ext': 'mp4',
+ 'title': '11 - Kryptographie - 24.11.2015',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'timestamp': 1606208400,
+ 'upload_date': '20201124',
+ },
+ }
+ ]
+
+ def _real_extract(self, url):
+ host, video_id = self._match_valid_url(url).group('host', 'id')
+ return self._parse_mediapackage(
+ self._call_api(host, video_id)['search-results']['result']['mediapackage'])
+
+
+class OpencastPlaylistIE(OpencastBaseIE):
+ _VALID_URL = r'''(?x)
+ https?://(?P<host>%s)/engage/ui/index.html\?.*?
+ epFrom=(?P<id>%s)
+ ''' % (OpencastBaseIE._INSTANCES_RE, OpencastBaseIE._UUID_RE)
+
+ _API_BASE = 'https://%s/search/episode.json?sid=%s'
+
+ _TESTS = [
+ {
+ 'url': 'https://oc-video1.ruhr-uni-bochum.de/engage/ui/index.html?epFrom=cf68a4a1-36b1-4a53-a6ba-61af5705a0d0',
+ 'info_dict': {
+ 'id': 'cf68a4a1-36b1-4a53-a6ba-61af5705a0d0',
+ 'title': 'Kryptographie - WiSe 15/16',
+ },
+ 'playlist_mincount': 28,
+ },
+ {
+ 'url': 'https://oc-video.ruhr-uni-bochum.de/engage/ui/index.html?e=1&p=1&epFrom=b1a54262-3684-403f-9731-8e77c3766f9a',
+ 'info_dict': {
+ 'id': 'b1a54262-3684-403f-9731-8e77c3766f9a',
+ 'title': 'inSTUDIES-Social movements and prefigurative politics in a global perspective',
+ },
+ 'playlist_mincount': 6,
+ },
+ ]
+
+ def _real_extract(self, url):
+ host, video_id = self._match_valid_url(url).group('host', 'id')
+
+ entries = [
+ self._parse_mediapackage(episode['mediapackage'])
+ for episode in variadic(self._call_api(host, video_id)['search-results']['result'])
+ if episode.get('mediapackage')
+ ]
+
+ return self.playlist_result(entries, video_id, traverse_obj(entries, (0, 'series')))
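
_parse_mediapackage above dispatches each track to the right manifest parser by transport tag or file extension (DASH/mpd, HLS/m3u8, HDS/f4m, Smooth, SMIL), falling back to a plain progressive format. A stripped-down sketch of that dispatch, with a simplified determine_ext:

    import os.path
    import urllib.parse

    def determine_ext(url):
        # Best-effort file extension from a URL path (simplified).
        return os.path.splitext(urllib.parse.urlparse(url).path)[1].lstrip('.')

    def classify_track(track):
        href, transport = track['url'], track.get('transport')
        ext = determine_ext(href)
        if transport == 'DASH' or ext == 'mpd':
            return 'dash'
        if transport == 'HLS' or ext == 'm3u8':
            return 'hls'
        return 'progressive'

    assert classify_track({'url': 'https://host/x/manifest.mpd'}) == 'dash'
    assert classify_track({'url': 'https://host/x/master.m3u8', 'transport': 'HLS'}) == 'hls'
    assert classify_track({'url': 'https://host/x/video.mp4'}) == 'progressive'
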
diff --git a/hypervideo_dl/extractor/openload.py b/hypervideo_dl/extractor/openload.py
index dfdd0e5..fe4740a 100644
--- a/hypervideo_dl/extractor/openload.py
+++ b/hypervideo_dl/extractor/openload.py
@@ -16,8 +16,7 @@ from ..utils import (
ExtractorError,
get_exe_version,
is_outdated_version,
- std_headers,
- process_communicate_or_kill,
+ Popen,
)
@@ -208,7 +207,7 @@ class PhantomJSwrapper(object):
replaces = self.options
replaces['url'] = url
- user_agent = headers.get('User-Agent') or std_headers['User-Agent']
+ user_agent = headers.get('User-Agent') or self.extractor.get_param('http_headers')['User-Agent']
replaces['ua'] = user_agent.replace('"', '\\"')
replaces['jscode'] = jscode
@@ -223,11 +222,10 @@ class PhantomJSwrapper(object):
else:
self.extractor.to_screen('%s: %s' % (video_id, note2))
- p = subprocess.Popen([
- self.exe, '--ssl-protocol=any',
- self._TMP_FILES['script'].name
- ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- out, err = process_communicate_or_kill(p)
+ p = Popen(
+ [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name],
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ out, err = p.communicate_or_kill()
if p.returncode != 0:
raise ExtractorError(
'Executing JS failed\n:' + encodeArgument(err))
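
The openload.py hunk replaces the free function process_communicate_or_kill(p) with a communicate_or_kill() method on hypervideo's Popen subclass. A minimal sketch of such a wrapper, under the assumption that it simply kills the child when communicate() is interrupted:

    import subprocess
    import sys

    class Popen(subprocess.Popen):
        def communicate_or_kill(self, *args, **kwargs):
            # communicate(), but never leak a still-running child on error
            try:
                return self.communicate(*args, **kwargs)
            except BaseException:  # e.g. KeyboardInterrupt, TimeoutExpired
                self.kill()
                self.wait()
                raise

    p = Popen([sys.executable, '-c', 'print("ok")'],
              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate_or_kill()
    assert out.strip() == b'ok'
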
diff --git a/hypervideo_dl/extractor/openrec.py b/hypervideo_dl/extractor/openrec.py
index d7073ab..5eb1cdb 100644
--- a/hypervideo_dl/extractor/openrec.py
+++ b/hypervideo_dl/extractor/openrec.py
@@ -4,14 +4,71 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
ExtractorError,
+ get_first,
+ int_or_none,
traverse_obj,
try_get,
- unified_strdate
+ unified_strdate,
+ unified_timestamp,
)
from ..compat import compat_str
-class OpenRecIE(InfoExtractor):
+class OpenRecBaseIE(InfoExtractor):
+ def _extract_pagestore(self, webpage, video_id):
+ return self._parse_json(
+ self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id)
+
+ def _expand_media(self, video_id, media):
+ for name, m3u8_url in (media or {}).items():
+ if not m3u8_url:
+ continue
+ yield from self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4', m3u8_id=name)
+
+ def _extract_movie(self, webpage, video_id, name, is_live):
+ window_stores = self._extract_pagestore(webpage, video_id)
+ movie_stores = [
+ # extract all three important data sources (most of the data is duplicated between them, but slightly different!)
+ traverse_obj(window_stores, ('v8', 'state', 'movie'), expected_type=dict),
+ traverse_obj(window_stores, ('v8', 'movie'), expected_type=dict),
+ traverse_obj(window_stores, 'movieStore', expected_type=dict),
+ ]
+ if not any(movie_stores):
+ raise ExtractorError(f'Failed to extract {name} info')
+
+ formats = list(self._expand_media(video_id, get_first(movie_stores, 'media')))
+ if not formats and is_live:
+ # archived livestreams
+ cookies = self._get_cookies('https://www.openrec.tv/')
+ detail = self._download_json(
+ f'https://apiv5.openrec.tv/api/v5/movies/{video_id}/detail', video_id,
+ headers={
+ 'Origin': 'https://www.openrec.tv',
+ 'Referer': 'https://www.openrec.tv/',
+ 'access-token': try_get(cookies, lambda x: x.get('access_token').value),
+ 'uuid': try_get(cookies, lambda x: x.get('uuid').value),
+ })
+ new_media = traverse_obj(detail, ('data', 'items', ..., 'media'), get_all=False)
+ formats = list(self._expand_media(video_id, new_media))
+ is_live = False
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': get_first(movie_stores, 'title'),
+ 'description': get_first(movie_stores, 'introduction'),
+ 'thumbnail': get_first(movie_stores, 'thumbnailUrl'),
+ 'formats': formats,
+ 'uploader': get_first(movie_stores, ('channel', 'user', 'name')),
+ 'uploader_id': get_first(movie_stores, ('channel', 'user', 'id')),
+ 'timestamp': int_or_none(get_first(movie_stores, ['publishedAt', 'time']), scale=1000) or unified_timestamp(get_first(movie_stores, 'publishedAt')),
+ 'is_live': is_live,
+ }
+
+
+class OpenRecIE(OpenRecBaseIE):
IE_NAME = 'openrec'
_VALID_URL = r'https?://(?:www\.)?openrec\.tv/live/(?P<id>[^/]+)'
_TESTS = [{
@@ -24,53 +81,12 @@ class OpenRecIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage('https://www.openrec.tv/live/%s' % video_id, video_id)
-
- window_stores = self._parse_json(
- self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id)
- movie_store = traverse_obj(
- window_stores,
- ('v8', 'state', 'movie'),
- ('v8', 'movie'),
- expected_type=dict)
- if not movie_store:
- raise ExtractorError('Failed to extract live info')
-
- title = movie_store.get('title')
- description = movie_store.get('introduction')
- thumbnail = movie_store.get('thumbnailUrl')
-
- channel_user = movie_store.get('channel', {}).get('user')
- uploader = try_get(channel_user, lambda x: x['name'], compat_str)
- uploader_id = try_get(channel_user, lambda x: x['id'], compat_str)
-
- timestamp = traverse_obj(movie_store, ('startedAt', 'time'), expected_type=int)
-
- m3u8_playlists = movie_store.get('media')
- formats = []
- for (name, m3u8_url) in m3u8_playlists.items():
- if not m3u8_url:
- continue
- formats.extend(self._extract_m3u8_formats(
- m3u8_url, video_id, ext='mp4', entry_protocol='m3u8',
- m3u8_id='hls-%s' % name, live=True))
-
- self._sort_formats(formats)
+ webpage = self._download_webpage(f'https://www.openrec.tv/live/{video_id}', video_id)
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'formats': formats,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'timestamp': timestamp,
- 'is_live': True,
- }
+ return self._extract_movie(webpage, video_id, 'live', True)
-class OpenRecCaptureIE(InfoExtractor):
+class OpenRecCaptureIE(OpenRecBaseIE):
IE_NAME = 'openrec:capture'
_VALID_URL = r'https?://(?:www\.)?openrec\.tv/capture/(?P<id>[^/]+)'
_TESTS = [{
@@ -89,38 +105,49 @@ class OpenRecCaptureIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage('https://www.openrec.tv/capture/%s' % video_id, video_id)
+ webpage = self._download_webpage(f'https://www.openrec.tv/capture/{video_id}', video_id)
- window_stores = self._parse_json(
- self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id)
+ window_stores = self._extract_pagestore(webpage, video_id)
movie_store = window_stores.get('movie')
capture_data = window_stores.get('capture')
if not capture_data:
raise ExtractorError('Cannot extract title')
- title = capture_data.get('title')
- thumbnail = capture_data.get('thumbnailUrl')
- upload_date = unified_strdate(capture_data.get('createdAt'))
-
- channel_info = movie_store.get('channel') or {}
- uploader = channel_info.get('name')
- uploader_id = channel_info.get('id')
- m3u8_url = capture_data.get('source')
- if not m3u8_url:
- raise ExtractorError('Cannot extract m3u8 url')
formats = self._extract_m3u8_formats(
- m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
- m3u8_id='hls')
-
+ capture_data.get('source'), video_id, ext='mp4')
self._sort_formats(formats)
return {
'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
+ 'title': capture_data.get('title'),
+ 'thumbnail': capture_data.get('thumbnailUrl'),
'formats': formats,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'upload_date': upload_date,
+ 'timestamp': unified_timestamp(traverse_obj(movie_store, 'createdAt', expected_type=compat_str)),
+ 'uploader': traverse_obj(movie_store, ('channel', 'name'), expected_type=compat_str),
+ 'uploader_id': traverse_obj(movie_store, ('channel', 'id'), expected_type=compat_str),
+ 'upload_date': unified_strdate(capture_data.get('createdAt')),
}
+
+
+class OpenRecMovieIE(OpenRecBaseIE):
+ IE_NAME = 'openrec:movie'
+ _VALID_URL = r'https?://(?:www\.)?openrec\.tv/movie/(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'https://www.openrec.tv/movie/nqz5xl5km8v',
+ 'info_dict': {
+ 'id': 'nqz5xl5km8v',
+ 'title': '限定コミュニティ(Discord)参加方法ご説明動画',
+ 'description': 'md5:ebd563e5f5b060cda2f02bf26b14d87f',
+ 'thumbnail': r're:https://.+',
+ 'uploader': 'タイキとカズヒロ',
+ 'uploader_id': 'taiki_to_kazuhiro',
+ 'timestamp': 1638856800,
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(f'https://www.openrec.tv/movie/{video_id}', video_id)
+
+ return self._extract_movie(webpage, video_id, 'movie', False)
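
The openrec.py refactor collects three partly-overlapping copies of the movie metadata and uses get_first to take the first usable value across them, by plain key or by key path. A simplified stand-in for that lookup (the real get_first is built on traverse_obj):

    def get_first(dicts, path, default=None):
        # First truthy value found by walking `path` in each candidate dict.
        for d in dicts:
            value = d
            for key in (path if isinstance(path, tuple) else (path,)):
                value = value.get(key) if isinstance(value, dict) else None
            if value:
                return value
        return default

    movie_stores = [None, {'title': ''},
                    {'title': 'Live!', 'channel': {'user': {'name': 'abc'}}}]
    assert get_first(movie_stores, 'title') == 'Live!'
    assert get_first(movie_stores, ('channel', 'user', 'name')) == 'abc'
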
diff --git a/hypervideo_dl/extractor/orf.py b/hypervideo_dl/extractor/orf.py
index 428ec97..0628977 100644
--- a/hypervideo_dl/extractor/orf.py
+++ b/hypervideo_dl/extractor/orf.py
@@ -1,22 +1,26 @@
# coding: utf-8
from __future__ import unicode_literals
+import functools
import re
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
clean_html,
determine_ext,
float_or_none,
HEADRequest,
+ InAdvancePagedList,
int_or_none,
+ join_nonempty,
orderedSet,
remove_end,
+ smuggle_url,
str_or_none,
strip_jsonp,
unescapeHTML,
unified_strdate,
+ unsmuggle_url,
url_or_none,
)
@@ -24,9 +28,40 @@ from ..utils import (
class ORFTVthekIE(InfoExtractor):
IE_NAME = 'orf:tvthek'
IE_DESC = 'ORF TVthek'
- _VALID_URL = r'https?://tvthek\.orf\.at/(?:[^/]+/)+(?P<id>\d+)'
+ _VALID_URL = r'(?P<url>https?://tvthek\.orf\.at/(?:(?:[^/]+/){2}){1,2}(?P<id>\d+))(/[^/]+/(?P<vid>\d+))?(?:$|[?#])'
_TESTS = [{
+ 'url': 'https://tvthek.orf.at/profile/ZIB-2/1211/ZIB-2/14121079',
+ 'info_dict': {
+ 'id': '14121079',
+ },
+ 'playlist_count': 11,
+ 'params': {'noplaylist': True}
+ }, {
+ 'url': 'https://tvthek.orf.at/profile/ZIB-2/1211/ZIB-2/14121079/Umfrage-Welches-Tier-ist-Sebastian-Kurz/15083150',
+ 'info_dict': {
+ 'id': '14121079',
+ },
+ 'playlist_count': 1,
+ 'params': {'playlist_items': '5'}
+ }, {
+ 'url': 'https://tvthek.orf.at/profile/ZIB-2/1211/ZIB-2/14121079/Umfrage-Welches-Tier-ist-Sebastian-Kurz/15083150',
+ 'info_dict': {
+ 'id': '14121079',
+ 'playlist_count': 1
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '15083150',
+ 'ext': 'mp4',
+ 'description': 'md5:7be1c485425f5f255a5e4e4815e77d04',
+ 'thumbnail': 'https://api-tvthek.orf.at/uploads/media/segments/0130/59/824271ea35cd8931a0fb08ab316a5b0a1562342c.jpeg',
+ 'title': 'Umfrage: Welches Tier ist Sebastian Kurz?',
+ }
+ }],
+ 'playlist_count': 1,
+ 'params': {'noplaylist': True, 'skip_download': 'm3u8'}
+ }, {
'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389',
'playlist': [{
'md5': '2942210346ed779588f428a92db88712',
@@ -61,8 +96,90 @@ class ORFTVthekIE(InfoExtractor):
'only_matching': True,
}]
+ def _pagefunc(self, url, data_jsb, n, *, image=None):
+ sd = data_jsb[n]
+ video_id, title = str(sd['id']), sd['title']
+ formats = []
+ for fd in sd['sources']:
+ src = url_or_none(fd.get('src'))
+ if not src:
+ continue
+ format_id = join_nonempty('delivery', 'quality', 'quality_string', from_dict=fd)
+ ext = determine_ext(src)
+ if ext == 'm3u8':
+ m3u8_formats = self._extract_m3u8_formats(
+ src, video_id, 'mp4', m3u8_id=format_id, fatal=False, note=f'Downloading {format_id} m3u8 manifest')
+ if any('/geoprotection' in f['url'] for f in m3u8_formats):
+ self.raise_geo_restricted()
+ formats.extend(m3u8_formats)
+ elif ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ src, video_id, f4m_id=format_id, fatal=False))
+ elif ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ src, video_id, mpd_id=format_id, fatal=False, note=f'Downloading {format_id} mpd manifest'))
+ else:
+ formats.append({
+ 'format_id': format_id,
+ 'url': src,
+ 'protocol': fd.get('protocol'),
+ })
+
+ # Check for geoblocking.
+ # There is a property is_geoprotection, but that's always false
+ geo_str = sd.get('geoprotection_string')
+ http_url = next(
+ (f['url'] for f in formats if re.match(r'^https?://.*\.mp4$', f['url'])),
+ None) if geo_str else None
+ if http_url:
+ self._request_webpage(
+ HEADRequest(http_url), video_id, fatal=False, note='Testing for geoblocking',
+ errnote=f'This video seems to be blocked outside of {geo_str}. You may want to try the streaming-* formats')
+
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for sub in sd.get('subtitles', []):
+ sub_src = sub.get('src')
+ if not sub_src:
+ continue
+ subtitles.setdefault(sub.get('lang', 'de-AT'), []).append({
+ 'url': sub_src,
+ })
+
+ upload_date = unified_strdate(sd.get('created_date'))
+
+ thumbnails = []
+ preview = sd.get('preview_image_url')
+ if preview:
+ thumbnails.append({
+ 'id': 'preview',
+ 'url': preview,
+ 'preference': 0,
+ })
+ image = sd.get('image_full_url') or image
+ if image:
+ thumbnails.append({
+ 'id': 'full',
+ 'url': image,
+ 'preference': 1,
+ })
+
+ yield {
+ 'id': video_id,
+ 'title': title,
+ 'webpage_url': smuggle_url(f'{url}/part/{video_id}', {'force_noplaylist': True}),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'description': sd.get('description'),
+ 'duration': int_or_none(sd.get('duration_in_seconds')),
+ 'upload_date': upload_date,
+ 'thumbnails': thumbnails,
+ }
+
def _real_extract(self, url):
- playlist_id = self._match_id(url)
+ url, smuggled_data = unsmuggle_url(url)
+ playlist_id, video_id, base_url = self._match_valid_url(url).group('id', 'vid', 'url')
webpage = self._download_webpage(url, playlist_id)
data_jsb = self._parse_json(
@@ -71,112 +188,16 @@ class ORFTVthekIE(InfoExtractor):
webpage, 'playlist', group='json'),
playlist_id, transform_source=unescapeHTML)['playlist']['videos']
- entries = []
- for sd in data_jsb:
- video_id, title = sd.get('id'), sd.get('title')
- if not video_id or not title:
- continue
- video_id = compat_str(video_id)
- formats = []
- for fd in sd['sources']:
- src = url_or_none(fd.get('src'))
- if not src:
- continue
- format_id_list = []
- for key in ('delivery', 'quality', 'quality_string'):
- value = fd.get(key)
- if value:
- format_id_list.append(value)
- format_id = '-'.join(format_id_list)
- ext = determine_ext(src)
- if ext == 'm3u8':
- m3u8_formats = self._extract_m3u8_formats(
- src, video_id, 'mp4', m3u8_id=format_id, fatal=False)
- if any('/geoprotection' in f['url'] for f in m3u8_formats):
- self.raise_geo_restricted()
- formats.extend(m3u8_formats)
- elif ext == 'f4m':
- formats.extend(self._extract_f4m_formats(
- src, video_id, f4m_id=format_id, fatal=False))
- elif ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- src, video_id, mpd_id=format_id, fatal=False))
- else:
- formats.append({
- 'format_id': format_id,
- 'url': src,
- 'protocol': fd.get('protocol'),
- })
+ if not self._yes_playlist(playlist_id, video_id, smuggled_data):
+ data_jsb = [sd for sd in data_jsb if str(sd.get('id')) == video_id]
- # Check for geoblocking.
- # There is a property is_geoprotection, but that's always false
- geo_str = sd.get('geoprotection_string')
- if geo_str:
- try:
- http_url = next(
- f['url']
- for f in formats
- if re.match(r'^https?://.*\.mp4$', f['url']))
- except StopIteration:
- pass
- else:
- req = HEADRequest(http_url)
- self._request_webpage(
- req, video_id,
- note='Testing for geoblocking',
- errnote=((
- 'This video seems to be blocked outside of %s. '
- 'You may want to try the streaming-* formats.')
- % geo_str),
- fatal=False)
-
- self._check_formats(formats, video_id)
- self._sort_formats(formats)
-
- subtitles = {}
- for sub in sd.get('subtitles', []):
- sub_src = sub.get('src')
- if not sub_src:
- continue
- subtitles.setdefault(sub.get('lang', 'de-AT'), []).append({
- 'url': sub_src,
- })
-
- upload_date = unified_strdate(sd.get('created_date'))
-
- thumbnails = []
- preview = sd.get('preview_image_url')
- if preview:
- thumbnails.append({
- 'id': 'preview',
- 'url': preview,
- 'preference': 0,
- })
- image = sd.get('image_full_url')
- if not image and len(data_jsb) == 1:
- image = self._og_search_thumbnail(webpage)
- if image:
- thumbnails.append({
- 'id': 'full',
- 'url': image,
- 'preference': 1,
- })
-
- entries.append({
- '_type': 'video',
- 'id': video_id,
- 'title': title,
- 'formats': formats,
- 'subtitles': subtitles,
- 'description': sd.get('description'),
- 'duration': int_or_none(sd.get('duration_in_seconds')),
- 'upload_date': upload_date,
- 'thumbnails': thumbnails,
- })
+ playlist_count = len(data_jsb)
+ image = self._og_search_thumbnail(webpage) if playlist_count == 1 else None
+ page_func = functools.partial(self._pagefunc, base_url, data_jsb, image=image)
return {
'_type': 'playlist',
- 'entries': entries,
+ 'entries': InAdvancePagedList(page_func, playlist_count, 1),
'id': playlist_id,
}
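
Instead of materializing every entry up front, orf.py now wraps _pagefunc in an InAdvancePagedList with page size 1, so each video's formats are only resolved when that entry is actually requested. A toy illustration of the lazy-pages idea (not yt-dlp's implementation):

    import functools

    class InAdvancePagedList:
        # Toy stand-in: knows the page count up front, builds pages on demand.
        def __init__(self, pagefunc, pagecount, pagesize):
            self._pagefunc, self._pagecount, self._pagesize = pagefunc, pagecount, pagesize

        def __iter__(self):
            for n in range(self._pagecount):
                yield from self._pagefunc(n)

    data = [{'id': 1}, {'id': 2}, {'id': 3}]

    def pagefunc(data, n):
        print(f'resolving entry {n}')  # only runs when the entry is iterated
        yield data[n]

    entries = InAdvancePagedList(functools.partial(pagefunc, data), len(data), 1)
    for entry in entries:
        pass  # formats would be extracted here, one entry at a time
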
diff --git a/hypervideo_dl/extractor/packtpub.py b/hypervideo_dl/extractor/packtpub.py
index c06fca7..62c52cd 100644
--- a/hypervideo_dl/extractor/packtpub.py
+++ b/hypervideo_dl/extractor/packtpub.py
@@ -47,10 +47,7 @@ class PacktPubIE(PacktPubBaseIE):
_NETRC_MACHINE = 'packtpub'
_TOKEN = None
- def _real_initialize(self):
- username, password = self._get_login_info()
- if username is None:
- return
+ def _perform_login(self, username, password):
try:
self._TOKEN = self._download_json(
'https://services.packtpub.com/auth-v1/users/tokens', None,
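
packtpub.py illustrates the new login hook: the boilerplate _real_initialize that checked _get_login_info() is gone, and the extractor only implements _perform_login(username, password), which the base class calls when credentials were actually supplied. A sketch of how such a dispatcher might look (simplified; not the actual base-class code):

    class InfoExtractor:
        def _get_login_info(self):
            return None, None  # e.g. from --username/--password or .netrc

        def _real_initialize(self):
            username, password = self._get_login_info()
            if username is not None:
                self._perform_login(username, password)  # subclass hook

        def _perform_login(self, username, password):
            pass  # overridden by extractors that support login
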
diff --git a/hypervideo_dl/extractor/panopto.py b/hypervideo_dl/extractor/panopto.py
new file mode 100644
index 0000000..3388f7f
--- /dev/null
+++ b/hypervideo_dl/extractor/panopto.py
@@ -0,0 +1,607 @@
+import re
+import calendar
+import json
+import functools
+from datetime import datetime
+from random import random
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse_urlparse,
+ compat_urlparse
+)
+
+from ..utils import (
+ bug_reports_message,
+ ExtractorError,
+ get_first,
+ int_or_none,
+ OnDemandPagedList,
+ parse_qs,
+ srt_subtitles_timecode,
+ traverse_obj,
+)
+
+
+class PanoptoBaseIE(InfoExtractor):
+ BASE_URL_RE = r'(?P<base_url>https?://[\w.-]+\.panopto.(?:com|eu)/Panopto)'
+
+ # see panopto core.js
+ _SUB_LANG_MAPPING = {
+ 0: 'en-US',
+ 1: 'en-GB',
+ 2: 'es-MX',
+ 3: 'es-ES',
+ 4: 'de-DE',
+ 5: 'fr-FR',
+ 6: 'nl-NL',
+ 7: 'th-TH',
+ 8: 'zh-CN',
+ 9: 'zh-TW',
+ 10: 'ko-KR',
+ 11: 'ja-JP',
+ 12: 'ru-RU',
+ 13: 'pt-PT',
+ 14: 'pl-PL',
+ 15: 'en-AU',
+ 16: 'da-DK',
+ 17: 'fi-FI',
+ 18: 'hu-HU',
+ 19: 'nb-NO',
+ 20: 'sv-SE',
+ 21: 'it-IT'
+ }
+
+ def _call_api(self, base_url, path, video_id, data=None, fatal=True, **kwargs):
+ response = self._download_json(
+ base_url + path, video_id, data=json.dumps(data).encode('utf8') if data else None,
+ fatal=fatal, headers={'accept': 'application/json', 'content-type': 'application/json'}, **kwargs)
+ if not response:
+ return
+ error_code = traverse_obj(response, 'ErrorCode')
+ if error_code == 2:
+ self.raise_login_required(method='cookies')
+ elif error_code is not None:
+ msg = f'Panopto said: {response.get("ErrorMessage")}'
+ if fatal:
+ raise ExtractorError(msg, video_id=video_id, expected=True)
+ else:
+ self.report_warning(msg, video_id=video_id)
+ return response
+
+ @staticmethod
+ def _parse_fragment(url):
+ return {k: json.loads(v[0]) for k, v in compat_urlparse.parse_qs(compat_urllib_parse_urlparse(url).fragment).items()}
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return [m.group('url') for m in re.finditer(
+ r'<iframe[^>]+src=["\'](?P<url>%s/Pages/(Viewer|Embed|Sessions/List)\.aspx[^"\']+)' % PanoptoIE.BASE_URL_RE,
+ webpage)]
+
+
+class PanoptoIE(PanoptoBaseIE):
+ _VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/(Viewer|Embed)\.aspx.*(?:\?|&)id=(?P<id>[a-f0-9-]+)'
+ _TESTS = [
+ {
+ 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=26b3ae9e-4a48-4dcc-96ba-0befba08a0fb',
+ 'info_dict': {
+ 'id': '26b3ae9e-4a48-4dcc-96ba-0befba08a0fb',
+ 'title': 'Panopto for Business - Use Cases',
+ 'timestamp': 1459184200,
+ 'thumbnail': r're:https://demo\.hosted\.panopto\.com/.+',
+ 'upload_date': '20160328',
+ 'ext': 'mp4',
+ 'cast': [],
+ 'chapters': [],
+ 'duration': 88.17099999999999,
+ 'average_rating': int,
+ 'uploader_id': '2db6b718-47a0-4b0b-9e17-ab0b00f42b1e',
+ 'channel_id': 'e4c6a2fc-1214-4ca0-8fb7-aef2e29ff63a',
+ 'channel': 'Showcase Videos'
+ },
+ },
+ {
+ 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=ed01b077-c9e5-4c7b-b8ff-15fa306d7a59',
+ 'info_dict': {
+ 'id': 'ed01b077-c9e5-4c7b-b8ff-15fa306d7a59',
+ 'title': 'Overcoming Top 4 Challenges of Enterprise Video',
+ 'uploader': 'Panopto Support',
+ 'timestamp': 1449409251,
+ 'thumbnail': r're:https://demo\.hosted\.panopto\.com/.+',
+ 'upload_date': '20151206',
+ 'ext': 'mp4',
+ 'chapters': 'count:12',
+ 'cast': ['Panopto Support'],
+ 'uploader_id': 'a96d1a31-b4de-489b-9eee-b4a5b414372c',
+ 'average_rating': int,
+ 'description': 'md5:4391837802b3fc856dadf630c4b375d1',
+ 'duration': 1088.2659999999998,
+ 'channel_id': '9f3c1921-43bb-4bda-8b3a-b8d2f05a8546',
+ 'channel': 'Webcasts',
+ },
+ },
+ {
+ # Extra params in URL
+ 'url': 'https://howtovideos.hosted.panopto.com/Panopto/Pages/Viewer.aspx?randomparam=thisisnotreal&id=5fa74e93-3d87-4694-b60e-aaa4012214ed&advance=true',
+ 'info_dict': {
+ 'id': '5fa74e93-3d87-4694-b60e-aaa4012214ed',
+ 'ext': 'mp4',
+ 'duration': 129.513,
+ 'cast': ['Kathryn Kelly'],
+ 'uploader_id': '316a0a58-7fa2-4cd9-be1c-64270d284a56',
+ 'timestamp': 1569845768,
+ 'tags': ['Viewer', 'Enterprise'],
+ 'chapters': [],
+ 'upload_date': '20190930',
+ 'thumbnail': r're:https://howtovideos\.hosted\.panopto\.com/.+',
+ 'description': 'md5:2d844aaa1b1a14ad0e2601a0993b431f',
+ 'title': 'Getting Started: View a Video',
+ 'average_rating': int,
+ 'uploader': 'Kathryn Kelly',
+ 'channel_id': 'fb93bc3c-6750-4b80-a05b-a921013735d3',
+ 'channel': 'Getting Started',
+ }
+ },
+ {
+ # Does not allow the normal Viewer.aspx. The AUDIO livestream has no URL, so it is skipped, leaving only one stream.
+ 'url': 'https://unisa.au.panopto.com/Panopto/Pages/Embed.aspx?id=9d9a0fa3-e99a-4ebd-a281-aac2017f4da4',
+ 'info_dict': {
+ 'id': '9d9a0fa3-e99a-4ebd-a281-aac2017f4da4',
+ 'ext': 'mp4',
+ 'cast': ['LTS CLI Script'],
+ 'chapters': [],
+ 'duration': 2178.45,
+ 'description': 'md5:ee5cf653919f55b72bce2dbcf829c9fa',
+ 'channel_id': 'b23e673f-c287-4cb1-8344-aae9005a69f8',
+ 'average_rating': int,
+ 'uploader_id': '38377323-6a23-41e2-9ff6-a8e8004bf6f7',
+ 'uploader': 'LTS CLI Script',
+ 'timestamp': 1572458134,
+ 'title': 'WW2 Vets Interview 3 Ronald Stanley George',
+ 'thumbnail': r're:https://unisa\.au\.panopto\.com/.+',
+ 'channel': 'World War II Veteran Interviews',
+ 'upload_date': '20191030',
+ },
+ },
+ {
+ # Slides/storyboard
+ 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=a7f12f1d-3872-4310-84b0-f8d8ab15326b',
+ 'info_dict': {
+ 'id': 'a7f12f1d-3872-4310-84b0-f8d8ab15326b',
+ 'ext': 'mhtml',
+ 'timestamp': 1448798857,
+ 'duration': 4712.681,
+ 'title': 'Cache Memory - CompSci 15-213, Lecture 12',
+ 'channel_id': 'e4c6a2fc-1214-4ca0-8fb7-aef2e29ff63a',
+ 'uploader_id': 'a96d1a31-b4de-489b-9eee-b4a5b414372c',
+ 'upload_date': '20151129',
+ 'average_rating': 0,
+ 'uploader': 'Panopto Support',
+ 'channel': 'Showcase Videos',
+ 'description': 'md5:55e51d54233ddb0e6c2ed388ca73822c',
+ 'cast': ['ISR Videographer', 'Panopto Support'],
+ 'chapters': 'count:28',
+ 'thumbnail': r're:https://demo\.hosted\.panopto\.com/.+',
+ },
+ 'params': {'format': 'mhtml', 'skip_download': True}
+ },
+ {
+ 'url': 'https://na-training-1.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=8285224a-9a2b-4957-84f2-acb0000c4ea9',
+ 'info_dict': {
+ 'id': '8285224a-9a2b-4957-84f2-acb0000c4ea9',
+ 'ext': 'mp4',
+ 'chapters': [],
+ 'title': 'Company Policy',
+ 'average_rating': 0,
+ 'timestamp': 1615058901,
+ 'channel': 'Human Resources',
+ 'tags': ['HumanResources'],
+ 'duration': 1604.243,
+ 'thumbnail': r're:https://na-training-1\.hosted\.panopto\.com/.+',
+ 'uploader_id': '8e8ba0a3-424f-40df-a4f1-ab3a01375103',
+ 'uploader': 'Cait M.',
+ 'upload_date': '20210306',
+ 'cast': ['Cait M.'],
+ 'subtitles': {'en-US': [{'ext': 'srt', 'data': 'md5:a3f4d25963fdeace838f327097c13265'}],
+ 'es-ES': [{'ext': 'srt', 'data': 'md5:57e9dad365fd0fbaf0468eac4949f189'}]},
+ },
+ 'params': {'writesubtitles': True, 'skip_download': True}
+ }, {
+ # On Panopto there are two subs: "Default" and en-US. en-US is blank and should be skipped.
+ 'url': 'https://na-training-1.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=940cbd41-f616-4a45-b13e-aaf1000c915b',
+ 'info_dict': {
+ 'id': '940cbd41-f616-4a45-b13e-aaf1000c915b',
+ 'ext': 'mp4',
+ 'subtitles': 'count:1',
+ 'title': 'HR Benefits Review Meeting*',
+ 'cast': ['Panopto Support'],
+ 'chapters': [],
+ 'timestamp': 1575024251,
+ 'thumbnail': r're:https://na-training-1\.hosted\.panopto\.com/.+',
+ 'channel': 'Zoom',
+ 'description': 'md5:04f90a9c2c68b7828144abfb170f0106',
+ 'uploader': 'Panopto Support',
+ 'average_rating': 0,
+ 'duration': 409.34499999999997,
+ 'uploader_id': 'b6ac04ad-38b8-4724-a004-a851004ea3df',
+ 'upload_date': '20191129',
+
+ },
+ 'params': {'writesubtitles': True, 'skip_download': True}
+ },
+ {
+ 'url': 'https://ucc.cloud.panopto.eu/Panopto/Pages/Viewer.aspx?id=0e8484a4-4ceb-4d98-a63f-ac0200b455cb',
+ 'only_matching': True
+ },
+ {
+ 'url': 'https://brown.hosted.panopto.com/Panopto/Pages/Embed.aspx?id=0b3ff73b-36a0-46c5-8455-aadf010a3638',
+ 'only_matching': True
+ },
+ ]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if PanoptoPlaylistIE.suitable(url) else super().suitable(url)
+
+ def _mark_watched(self, base_url, video_id, delivery_info):
+ duration = traverse_obj(delivery_info, ('Delivery', 'Duration'), expected_type=float)
+ invocation_id = delivery_info.get('InvocationId')
+ stream_id = traverse_obj(delivery_info, ('Delivery', 'Streams', ..., 'PublicID'), get_all=False, expected_type=str)
+ if invocation_id and stream_id and duration:
+ timestamp_str = f'/Date({calendar.timegm(datetime.utcnow().timetuple())}000)/'
+ data = {
+ 'streamRequests': [
+ {
+ 'ClientTimeStamp': timestamp_str,
+ 'ID': 0,
+ 'InvocationID': invocation_id,
+ 'PlaybackSpeed': 1,
+ 'SecondsListened': duration - 1,
+ 'SecondsRejected': 0,
+ 'StartPosition': 0,
+ 'StartReason': 2,
+ 'StopReason': None,
+ 'StreamID': stream_id,
+ 'TimeStamp': timestamp_str,
+ 'UpdatesRejected': 0
+ },
+ ]}
+
+ self._download_webpage(
+ base_url + '/Services/Analytics.svc/AddStreamRequests', video_id,
+ fatal=False, data=json.dumps(data).encode('utf8'), headers={'content-type': 'application/json'},
+ note='Marking watched', errnote='Unable to mark watched')
+
+ @staticmethod
+ def _extract_chapters(timestamps):
+ chapters = []
+ for timestamp in timestamps or []:
+ caption = timestamp.get('Caption')
+ start, duration = int_or_none(timestamp.get('Time')), int_or_none(timestamp.get('Duration'))
+ if not caption or start is None or duration is None:
+ continue
+ chapters.append({
+ 'start_time': start,
+ 'end_time': start + duration,
+ 'title': caption
+ })
+ return chapters
+
+ @staticmethod
+ def _extract_mhtml_formats(base_url, timestamps):
+ image_frags = {}
+ for timestamp in timestamps or []:
+ duration = timestamp.get('Duration')
+ obj_id, obj_sn = timestamp.get('ObjectIdentifier'), timestamp.get('ObjectSequenceNumber'),
+ if timestamp.get('EventTargetType') == 'PowerPoint' and obj_id is not None and obj_sn is not None:
+ image_frags.setdefault('slides', []).append({
+ 'url': base_url + f'/Pages/Viewer/Image.aspx?id={obj_id}&number={obj_sn}',
+ 'duration': duration
+ })
+
+ obj_pid, session_id, abs_time = timestamp.get('ObjectPublicIdentifier'), timestamp.get('SessionID'), timestamp.get('AbsoluteTime')
+ if None not in (obj_pid, session_id, abs_time):
+ image_frags.setdefault('chapter', []).append({
+ 'url': base_url + f'/Pages/Viewer/Thumb.aspx?eventTargetPID={obj_pid}&sessionPID={session_id}&number={obj_sn}&isPrimary=false&absoluteTime={abs_time}',
+ 'duration': duration,
+ })
+ for name, fragments in image_frags.items():
+ yield {
+ 'format_id': name,
+ 'ext': 'mhtml',
+ 'protocol': 'mhtml',
+ 'acodec': 'none',
+ 'vcodec': 'none',
+ 'url': 'about:invalid',
+ 'fragments': fragments
+ }
+
+ @staticmethod
+ def _json2srt(data, delivery):
+ def _gen_lines():
+ for i, line in enumerate(data):
+ start_time = line['Time']
+ duration = line.get('Duration')
+ if duration:
+ end_time = start_time + duration
+ else:
+ end_time = traverse_obj(data, (i + 1, 'Time')) or delivery['Duration']
+ yield f'{i + 1}\n{srt_subtitles_timecode(start_time)} --> {srt_subtitles_timecode(end_time)}\n{line["Caption"]}'
+ return '\n\n'.join(_gen_lines())
+
+ def _get_subtitles(self, base_url, video_id, delivery):
+ subtitles = {}
+ for lang in delivery.get('AvailableLanguages') or []:
+ response = self._call_api(
+ base_url, '/Pages/Viewer/DeliveryInfo.aspx', video_id, fatal=False,
+ note='Downloading captions JSON metadata', query={
+ 'deliveryId': video_id,
+ 'getCaptions': True,
+ 'language': str(lang),
+ 'responseType': 'json'
+ }
+ )
+ if not isinstance(response, list):
+ continue
+ subtitles.setdefault(self._SUB_LANG_MAPPING.get(lang) or 'default', []).append({
+ 'ext': 'srt',
+ 'data': self._json2srt(response, delivery),
+ })
+ return subtitles
+
+ def _extract_streams_formats_and_subtitles(self, video_id, streams, **fmt_kwargs):
+ formats = []
+ subtitles = {}
+ for stream in streams or []:
+ stream_formats = []
+ http_stream_url = stream.get('StreamHttpUrl')
+ stream_url = stream.get('StreamUrl')
+
+ if http_stream_url:
+ stream_formats.append({'url': http_stream_url})
+
+ if stream_url:
+ media_type = stream.get('ViewerMediaFileTypeName')
+ if media_type in ('hls', ):
+ m3u8_formats, stream_subtitles = self._extract_m3u8_formats_and_subtitles(stream_url, video_id)
+ stream_formats.extend(m3u8_formats)
+ subtitles = self._merge_subtitles(subtitles, stream_subtitles)
+ else:
+ stream_formats.append({
+ 'url': stream_url
+ })
+ for fmt in stream_formats:
+ fmt.update({
+ 'format_note': stream.get('Tag'),
+ **fmt_kwargs
+ })
+ formats.extend(stream_formats)
+
+ return formats, subtitles
+
+ def _real_extract(self, url):
+ base_url, video_id = self._match_valid_url(url).group('base_url', 'id')
+ delivery_info = self._call_api(
+ base_url, '/Pages/Viewer/DeliveryInfo.aspx', video_id,
+ query={
+ 'deliveryId': video_id,
+ 'invocationId': '',
+ 'isLiveNotes': 'false',
+ 'refreshAuthCookie': 'true',
+ 'isActiveBroadcast': 'false',
+ 'isEditing': 'false',
+ 'isKollectiveAgentInstalled': 'false',
+ 'isEmbed': 'false',
+ 'responseType': 'json',
+ }
+ )
+
+ delivery = delivery_info['Delivery']
+ session_start_time = int_or_none(delivery.get('SessionStartTime'))
+ timestamps = delivery.get('Timestamps')
+
+ # The podcast stream is usually the combination of the separate streams, so prefer it by default.
+ podcast_formats, podcast_subtitles = self._extract_streams_formats_and_subtitles(
+ video_id, delivery.get('PodcastStreams'), format_note='PODCAST')
+
+ streams_formats, streams_subtitles = self._extract_streams_formats_and_subtitles(
+ video_id, delivery.get('Streams'), preference=-10)
+
+ formats = podcast_formats + streams_formats
+ formats.extend(self._extract_mhtml_formats(base_url, timestamps))
+ subtitles = self._merge_subtitles(
+ podcast_subtitles, streams_subtitles, self.extract_subtitles(base_url, video_id, delivery))
+
+ self._sort_formats(formats)
+ self.mark_watched(base_url, video_id, delivery_info)
+
+ return {
+ 'id': video_id,
+ 'title': delivery.get('SessionName'),
+ 'cast': traverse_obj(delivery, ('Contributors', ..., 'DisplayName'), default=[], expected_type=lambda x: x or None),
+ 'timestamp': session_start_time - 11640000000 if session_start_time else None,
+ 'duration': delivery.get('Duration'),
+ 'thumbnail': base_url + f'/Services/FrameGrabber.svc/FrameRedirect?objectId={video_id}&mode=Delivery&random={random()}',
+ 'average_rating': delivery.get('AverageRating'),
+ 'chapters': self._extract_chapters(timestamps),
+ 'uploader': delivery.get('OwnerDisplayName') or None,
+ 'uploader_id': delivery.get('OwnerId'),
+ 'description': delivery.get('SessionAbstract'),
+ 'tags': traverse_obj(delivery, ('Tags', ..., 'Content')),
+ 'channel_id': delivery.get('SessionGroupPublicID'),
+ 'channel': traverse_obj(delivery, 'SessionGroupLongName', 'SessionGroupShortName', get_all=False),
+ 'formats': formats,
+ 'subtitles': subtitles
+ }
+
+
+class PanoptoPlaylistIE(PanoptoBaseIE):
+ _VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/(Viewer|Embed)\.aspx.*(?:\?|&)pid=(?P<id>[a-f0-9-]+)'
+ _TESTS = [
+ {
+ 'url': 'https://howtovideos.hosted.panopto.com/Panopto/Pages/Viewer.aspx?pid=f3b39fcf-882f-4849-93d6-a9f401236d36&id=5fa74e93-3d87-4694-b60e-aaa4012214ed&advance=true',
+ 'info_dict': {
+ 'title': 'Featured Video Tutorials',
+ 'id': 'f3b39fcf-882f-4849-93d6-a9f401236d36',
+ 'description': '',
+ },
+ 'playlist_mincount': 36
+ },
+ {
+ 'url': 'https://utsa.hosted.panopto.com/Panopto/Pages/Viewer.aspx?pid=e2900555-3ad4-4bdb-854d-ad2401686190',
+ 'info_dict': {
+ 'title': 'Library Website Introduction Playlist',
+ 'id': 'e2900555-3ad4-4bdb-854d-ad2401686190',
+ 'description': 'md5:f958bca50a1cbda15fdc1e20d32b3ecb',
+ },
+ 'playlist_mincount': 4
+ },
+
+ ]
+
+ def _entries(self, base_url, playlist_id, session_list_id):
+ session_list_info = self._call_api(
+ base_url, f'/Api/SessionLists/{session_list_id}?collections[0].maxCount=500&collections[0].name=items', playlist_id)
+
+ items = session_list_info['Items']
+ for item in items:
+ if item.get('TypeName') != 'Session':
+ self.report_warning('Got an item in the playlist that is not a Session' + bug_reports_message(), only_once=True)
+ continue
+ yield {
+ '_type': 'url',
+ 'id': item.get('Id'),
+ 'url': item.get('ViewerUri'),
+ 'title': item.get('Name'),
+ 'description': item.get('Description'),
+ 'duration': item.get('Duration'),
+ 'channel': traverse_obj(item, ('Parent', 'Name')),
+ 'channel_id': traverse_obj(item, ('Parent', 'Id'))
+ }
+
+ def _real_extract(self, url):
+ base_url, playlist_id = self._match_valid_url(url).group('base_url', 'id')
+
+ video_id = get_first(parse_qs(url), 'id')
+ if video_id:
+ if self.get_param('noplaylist'):
+ self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+ return self.url_result(base_url + f'/Pages/Viewer.aspx?id={video_id}', ie_key=PanoptoIE.ie_key(), video_id=video_id)
+ else:
+ self.to_screen(f'Downloading playlist {playlist_id}; add --no-playlist to just download video {video_id}')
+
+ playlist_info = self._call_api(base_url, f'/Api/Playlists/{playlist_id}', playlist_id)
+ return self.playlist_result(
+ self._entries(base_url, playlist_id, playlist_info['SessionListId']),
+ playlist_id=playlist_id, playlist_title=playlist_info.get('Name'),
+ playlist_description=playlist_info.get('Description'))
+
+
+class PanoptoListIE(PanoptoBaseIE):
+ _VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/Sessions/List\.aspx'
+ _PAGE_SIZE = 250
+ _TESTS = [
+ {
+ 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Sessions/List.aspx#folderID=%22e4c6a2fc-1214-4ca0-8fb7-aef2e29ff63a%22',
+ 'info_dict': {
+ 'id': 'e4c6a2fc-1214-4ca0-8fb7-aef2e29ff63a',
+ 'title': 'Showcase Videos'
+ },
+ 'playlist_mincount': 140
+
+ },
+ {
+ 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Sessions/List.aspx#view=2&maxResults=250',
+ 'info_dict': {
+ 'id': 'panopto_list',
+ 'title': 'panopto_list'
+ },
+ 'playlist_mincount': 300
+ },
+ {
+ # Folder that contains 8 folders and a playlist
+ 'url': 'https://howtovideos.hosted.panopto.com/Panopto/Pages/Sessions/List.aspx?noredirect=true#folderID=%224b9de7ae-0080-4158-8496-a9ba01692c2e%22',
+ 'info_dict': {
+ 'id': '4b9de7ae-0080-4158-8496-a9ba01692c2e',
+ 'title': 'Video Tutorials'
+ },
+ 'playlist_mincount': 9
+ }
+
+ ]
+
+ def _fetch_page(self, base_url, query_params, display_id, page):
+
+ params = {
+ 'sortColumn': 1,
+ 'getFolderData': True,
+ 'includePlaylists': True,
+ **query_params,
+ 'page': page,
+ 'maxResults': self._PAGE_SIZE,
+ }
+
+ response = self._call_api(
+ base_url, '/Services/Data.svc/GetSessions', f'{display_id} page {page+1}',
+ data={'queryParameters': params}, fatal=False)
+
+ for result in get_first(response, 'Results', default=[]):
+ # This could be a video or a playlist (or possibly something else)
+ item_id = result.get('DeliveryID')
+ yield {
+ '_type': 'url',
+ 'id': item_id,
+ 'title': result.get('SessionName'),
+ 'url': traverse_obj(result, 'ViewerUrl', 'EmbedUrl', get_all=False) or (base_url + f'/Pages/Viewer.aspx?id={item_id}'),
+ 'duration': result.get('Duration'),
+ 'channel': result.get('FolderName'),
+ 'channel_id': result.get('FolderID'),
+ }
+
+ for folder in get_first(response, 'Subfolders', default=[]):
+ folder_id = folder.get('ID')
+ yield self.url_result(
+ base_url + f'/Pages/Sessions/List.aspx#folderID="{folder_id}"',
+ ie_key=PanoptoListIE.ie_key(), video_id=folder_id, title=folder.get('Name'))
+
+ def _extract_folder_metadata(self, base_url, folder_id):
+ response = self._call_api(
+ base_url, '/Services/Data.svc/GetFolderInfo', folder_id,
+ data={'folderID': folder_id}, fatal=False)
+ return {
+ 'title': get_first(response, 'Name', default=None)
+ }
+
+ def _real_extract(self, url):
+ mobj = self._match_valid_url(url)
+ base_url = mobj.group('base_url')
+
+ query_params = self._parse_fragment(url)
+ folder_id, display_id = query_params.get('folderID'), 'panopto_list'
+
+ if query_params.get('isSubscriptionsPage'):
+ display_id = 'subscriptions'
+ if not query_params.get('subscribableTypes'):
+ query_params['subscribableTypes'] = [0, 1, 2]
+ elif query_params.get('isSharedWithMe'):
+ display_id = 'sharedwithme'
+ elif folder_id:
+ display_id = folder_id
+
+ query = query_params.get('query')
+ if query:
+ display_id += f': query "{query}"'
+
+ info = {
+ '_type': 'playlist',
+ 'id': display_id,
+ 'title': display_id,
+ }
+ if folder_id:
+ info.update(self._extract_folder_metadata(base_url, folder_id))
+
+ info['entries'] = OnDemandPagedList(
+ functools.partial(self._fetch_page, base_url, query_params, display_id), self._PAGE_SIZE)
+
+ return info
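`OnDemandPagedList` keeps the folder listing lazy: `_fetch_page` is only invoked for pages the consumer actually reaches, so selecting a few playlist items from a huge folder costs a single API call. A rough sketch of the wrapper's behaviour, with a stub standing in for the `GetSessions` request:

import functools

from hypervideo_dl.utils import OnDemandPagedList

PAGE_SIZE = 250

def fetch_page(display_id, page):
    # stand-in for the /Services/Data.svc/GetSessions call
    print(f'fetching {display_id} page {page + 1}')
    yield from range(page * PAGE_SIZE, (page + 1) * PAGE_SIZE)

entries = OnDemandPagedList(functools.partial(fetch_page, 'demo-folder'), PAGE_SIZE)
print(entries.getslice(0, 3))  # only page 0 is actually fetched
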
diff --git a/hypervideo_dl/extractor/paramountplus.py b/hypervideo_dl/extractor/paramountplus.py
index 338b84d..94a9319 100644
--- a/hypervideo_dl/extractor/paramountplus.py
+++ b/hypervideo_dl/extractor/paramountplus.py
@@ -1,4 +1,5 @@
from __future__ import unicode_literals
+import itertools
from .common import InfoExtractor
from .cbs import CBSBaseIE
@@ -13,12 +14,12 @@ class ParamountPlusIE(CBSBaseIE):
(?:
paramountplus:|
https?://(?:www\.)?(?:
- paramountplus\.com/(?:shows/[^/]+/video|movies/[^/]+)/
+ paramountplus\.com/(?:shows|movies)/(?:video|[^/]+/video|[^/]+)/
)(?P<id>[\w-]+))'''
# All tests are blocked outside US
_TESTS = [{
- 'url': 'https://www.paramountplus.com/shows/catdog/video/Oe44g5_NrlgiZE3aQVONleD6vXc8kP0k/catdog-climb-every-catdog-the-canine-mutiny/',
+ 'url': 'https://www.paramountplus.com/shows/video/Oe44g5_NrlgiZE3aQVONleD6vXc8kP0k/',
'info_dict': {
'id': 'Oe44g5_NrlgiZE3aQVONleD6vXc8kP0k',
'ext': 'mp4',
@@ -33,7 +34,7 @@ class ParamountPlusIE(CBSBaseIE):
'skip_download': 'm3u8',
},
}, {
- 'url': 'https://www.paramountplus.com/shows/tooning-out-the-news/video/6hSWYWRrR9EUTz7IEe5fJKBhYvSUfexd/7-23-21-week-in-review-rep-jahana-hayes-howard-fineman-sen-michael-bennet-sheera-frenkel-cecilia-kang-/',
+ 'url': 'https://www.paramountplus.com/shows/video/6hSWYWRrR9EUTz7IEe5fJKBhYvSUfexd/',
'info_dict': {
'id': '6hSWYWRrR9EUTz7IEe5fJKBhYvSUfexd',
'ext': 'mp4',
@@ -48,7 +49,7 @@ class ParamountPlusIE(CBSBaseIE):
'skip_download': 'm3u8',
},
}, {
- 'url': 'https://www.paramountplus.com/movies/daddys-home/vM2vm0kE6vsS2U41VhMRKTOVHyQAr6pC',
+ 'url': 'https://www.paramountplus.com/movies/video/vM2vm0kE6vsS2U41VhMRKTOVHyQAr6pC/',
'info_dict': {
'id': 'vM2vm0kE6vsS2U41VhMRKTOVHyQAr6pC',
'ext': 'mp4',
@@ -60,11 +61,10 @@ class ParamountPlusIE(CBSBaseIE):
},
'params': {
'skip_download': 'm3u8',
- 'format': 'bestvideo',
},
'expected_warnings': ['Ignoring subtitle tracks'], # TODO: Investigate this
}, {
- 'url': 'https://www.paramountplus.com/movies/sonic-the-hedgehog/5EKDXPOzdVf9voUqW6oRuocyAEeJGbEc',
+ 'url': 'https://www.paramountplus.com/movies/video/5EKDXPOzdVf9voUqW6oRuocyAEeJGbEc/',
'info_dict': {
'id': '5EKDXPOzdVf9voUqW6oRuocyAEeJGbEc',
'ext': 'mp4',
@@ -76,14 +76,19 @@ class ParamountPlusIE(CBSBaseIE):
},
'params': {
'skip_download': 'm3u8',
- 'format': 'bestvideo',
},
'expected_warnings': ['Ignoring subtitle tracks'],
}, {
- 'url': 'https://www.paramountplus.com/shows/all-rise/video/QmR1WhNkh1a_IrdHZrbcRklm176X_rVc/all-rise-space/',
+ 'url': 'https://www.paramountplus.com/shows/the-real-world/video/mOVeHeL9ub9yWdyzSZFYz8Uj4ZBkVzQg/the-real-world-reunion/',
'only_matching': True,
}, {
- 'url': 'https://www.paramountplus.com/movies/million-dollar-american-princesses-meghan-and-harry/C0LpgNwXYeB8txxycdWdR9TjxpJOsdCq',
+ 'url': 'https://www.paramountplus.com/shows/video/mOVeHeL9ub9yWdyzSZFYz8Uj4ZBkVzQg/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.paramountplus.com/movies/video/W0VyStQqUnqKzJkrpSAIARuCc9YuYGNy/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.paramountplus.com/movies/paw-patrol-the-movie/W0VyStQqUnqKzJkrpSAIARuCc9YuYGNy/',
'only_matching': True,
}]
@@ -130,11 +135,13 @@ class ParamountPlusSeriesIE(InfoExtractor):
'id': 'spongebob-squarepants',
}
}]
- _API_URL = 'https://www.paramountplus.com/shows/{}/xhr/episodes/page/0/size/100000/xs/0/season/0/'
def _entries(self, show_name):
- show_json = self._download_json(self._API_URL.format(show_name), video_id=show_name)
- if show_json.get('success'):
+ for page in itertools.count():
+ show_json = self._download_json(
+ f'https://www.paramountplus.com/shows/{show_name}/xhr/episodes/page/{page}/size/50/xs/0/season/0', show_name)
+ if not show_json.get('success'):
+ return
for episode in show_json['result']['data']:
yield self.url_result(
'https://www.paramountplus.com%s' % episode['url'],
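The rewritten `_entries` trades the old single 100000-item request for incremental pages: `itertools.count()` keeps asking for the next page of 50 until the API stops answering `success`. The same control flow in isolation, against a fabricated two-page API:

import itertools

def fake_episodes_api(page):
    # fabricated stand-in for /shows/{name}/xhr/episodes/page/{page}/size/50/...
    pages = [
        {'success': True, 'result': {'data': [{'url': '/ep1'}, {'url': '/ep2'}]}},
        {'success': False},
    ]
    return pages[page] if page < len(pages) else {'success': False}

def entries():
    for page in itertools.count():
        show_json = fake_episodes_api(page)
        if not show_json.get('success'):
            return  # ran past the last page
        for episode in show_json['result']['data']:
            yield 'https://www.paramountplus.com%s' % episode['url']

print(list(entries()))  # two episode URLs, then clean termination
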
diff --git a/hypervideo_dl/extractor/parliamentliveuk.py b/hypervideo_dl/extractor/parliamentliveuk.py
index 869ebd8..974d654 100644
--- a/hypervideo_dl/extractor/parliamentliveuk.py
+++ b/hypervideo_dl/extractor/parliamentliveuk.py
@@ -25,9 +25,6 @@ class ParliamentLiveUKIE(InfoExtractor):
'timestamp': 1395153872,
'upload_date': '20140318',
},
- 'params': {
- 'format': 'bestvideo',
- },
}, {
'url': 'http://parliamentlive.tv/event/index/3f24936f-130f-40bf-9a5d-b3d6479da6a4',
'only_matching': True,
diff --git a/hypervideo_dl/extractor/patreon.py b/hypervideo_dl/extractor/patreon.py
index a189c02..963a0d6 100644
--- a/hypervideo_dl/extractor/patreon.py
+++ b/hypervideo_dl/extractor/patreon.py
@@ -88,11 +88,7 @@ class PatreonIE(InfoExtractor):
# Currently Patreon exposes download URL via hidden CSS, so login is not
# needed. Keeping this commented for when this inevitably changes.
'''
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
login_form = {
'redirectUrl': 'http://www.patreon.com/',
'email': username,
@@ -108,8 +104,6 @@ class PatreonIE(InfoExtractor):
if re.search(r'onLoginFailed', login_page):
raise ExtractorError('Unable to login, incorrect username and/or password', expected=True)
- def _real_initialize(self):
- self._login()
'''
def _real_extract(self, url):
@@ -161,7 +155,7 @@ class PatreonIE(InfoExtractor):
if try_get(attributes, lambda x: x['embed']['provider']) == 'Vimeo':
embed_html = try_get(attributes, lambda x: x['embed']['html'])
v_url = url_or_none(compat_urllib_parse_unquote(
- self._search_regex(r'src=(https%3A%2F%2Fplayer\.vimeo\.com.+)%3F', embed_html, 'vimeo url', fatal=False)))
+ self._search_regex(r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', embed_html, 'vimeo url', fatal=False)))
if v_url:
info.update({
'_type': 'url_transparent',
@@ -191,7 +185,7 @@ class PatreonIE(InfoExtractor):
class PatreonUserIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?P<id>[-_\w\d]+)/?(?:posts/?)?'
+ _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?!rss)(?P<id>[-\w]+)'
_TESTS = [{
'url': 'https://www.patreon.com/dissonancepod/',
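The widened Vimeo pattern matters because Patreon embeds carry the player URL either percent-encoded inside the embed HTML or in the clear, and the old pattern also stripped the query string the player needs. A quick check of both shapes (the sample embed strings are invented):

import re
from urllib.parse import unquote

PATTERN = r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)'
samples = [
    'src="https%3A%2F%2Fplayer.vimeo.com%2Fvideo%2F123%3Fapp_id%3D122963"',
    'src="https://player.vimeo.com/video/123?app_id=122963"',
]
for html in samples:
    m = re.search(PATTERN, html)
    print(unquote(m.group(1)) if m else None)  # both decode to the same player URL
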
diff --git a/hypervideo_dl/extractor/pbs.py b/hypervideo_dl/extractor/pbs.py
index 0eabf9b..e48a2b8 100644
--- a/hypervideo_dl/extractor/pbs.py
+++ b/hypervideo_dl/extractor/pbs.py
@@ -193,7 +193,7 @@ class PBSIE(InfoExtractor):
# Article with embedded player (or direct video)
(?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
# Player
- (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
+ (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)
)
''' % '|'.join(list(zip(*_STATIONS))[0])
@@ -545,7 +545,7 @@ class PBSIE(InfoExtractor):
for vid_id in video_id]
return self.playlist_result(entries, display_id)
- info = None
+ info = {}
redirects = []
redirect_urls = set()
@@ -660,6 +660,9 @@ class PBSIE(InfoExtractor):
'protocol': 'http',
})
formats.append(f)
+ for f in formats:
+ if (f.get('format_note') or '').endswith(' AD'): # Audio description
+ f['language_preference'] = -10
self._sort_formats(formats)
rating_str = info.get('rating')
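Demoting audio-described formats via `language_preference` keeps them selectable while ranking ordinary audio first; formats without the key keep their usual standing. A simplified stand-in for that ordering (format dicts invented, and the sort is only a crude approximation of `_sort_formats`):

formats = [
    {'format_id': 'hls-2000-AD', 'format_note': '720p AD', 'tbr': 2000},
    {'format_id': 'hls-2000', 'format_note': '720p', 'tbr': 2000},
]
for f in formats:
    if (f.get('format_note') or '').endswith(' AD'):  # Audio description
        f['language_preference'] = -10

# crude approximation of the relevant part of _sort_formats
formats.sort(key=lambda f: (f.get('language_preference', -1), f.get('tbr') or 0), reverse=True)
print([f['format_id'] for f in formats])  # the plain stream now outranks the AD one
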
diff --git a/hypervideo_dl/extractor/peekvids.py b/hypervideo_dl/extractor/peekvids.py
new file mode 100644
index 0000000..4bf6855
--- /dev/null
+++ b/hypervideo_dl/extractor/peekvids.py
@@ -0,0 +1,81 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class PeekVidsIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ https?://(?:www\.)?peekvids\.com/
+ (?:(?:[^/?#]+/){2}|embed/?\?(?:[^#]*&)?v=)
+ (?P<id>[^/?&#]*)
+ '''
+ _TESTS = [{
+ 'url': 'https://peekvids.com/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp/BSyLMbN0YCd',
+ 'md5': 'a00940646c428e232407e3e62f0e8ef5',
+ 'info_dict': {
+ 'id': 'BSyLMbN0YCd',
+ 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp, SEXYhub',
+ 'ext': 'mp4',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'description': 'Watch Dane Jones - Cute redhead with perfect tits with Mini Vamp (7 min), uploaded by SEXYhub.com',
+ 'timestamp': 1642579329,
+ 'upload_date': '20220119',
+ 'duration': 416,
+ 'view_count': int,
+ 'age_limit': 18,
+ },
+ }]
+ _DOMAIN = 'www.peekvids.com'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ short_video_id = self._html_search_regex(r'<video [^>]*data-id="(.+?)"', webpage, 'short video ID')
+ srcs = self._download_json(
+ f'https://{self._DOMAIN}/v-alt/{short_video_id}', video_id,
+ note='Downloading list of source files')
+ formats = [{
+ 'url': url,
+ 'ext': 'mp4',
+ 'format_id': name[8:],
+ } for name, url in srcs.items() if len(name) > 8 and name.startswith('data-src')]
+ if not formats:
+ formats = [{'url': url} for url in srcs.values()]
+ self._sort_formats(formats)
+
+ info = self._search_json_ld(webpage, video_id, expected_type='VideoObject')
+ info.update({
+ 'id': video_id,
+ 'age_limit': 18,
+ 'formats': formats,
+ })
+ return info
+
+
+class PlayVidsIE(PeekVidsIE):
+ _VALID_URL = r'https?://(?:www\.)?playvids\.com/(?:embed/|[^/]{2}/)?(?P<id>[^/?#]*)'
+ _TESTS = [{
+ 'url': 'https://www.playvids.com/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp',
+ 'md5': 'cd7dfd8a2e815a45402369c76e3c1825',
+ 'info_dict': {
+ 'id': 'U3pBrYhsjXM',
+ 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp, SEXYhub',
+ 'ext': 'mp4',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'description': 'Watch Dane Jones - Cute redhead with perfect tits with Mini Vamp video in HD, uploaded by SEXYhub.com',
+ 'timestamp': 1640435839,
+ 'upload_date': '20211225',
+ 'duration': 416,
+ 'view_count': int,
+ 'age_limit': 18,
+ },
+ }, {
+ 'url': 'https://www.playvids.com/es/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.playvids.com/embed/U3pBrYhsjXM',
+ 'only_matching': True,
+ }]
+ _DOMAIN = 'www.playvids.com'
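PeekVids' `/v-alt/` endpoint answers with a flat JSON object whose `data-src*` keys map quality labels to direct MP4 URLs, which is why the comprehension above slices `name[8:]` into a `format_id`. Roughly, given a response shaped like this invented sample:

srcs = {  # invented sample of the /v-alt/{short_video_id} response
    'data-src480': 'https://example.com/v480.mp4',
    'data-src720': 'https://example.com/v720.mp4',
    'poster': 'https://example.com/poster.jpg',  # ignored by the filter
}
formats = [{
    'url': url,
    'ext': 'mp4',
    'format_id': name[8:],  # strip the 'data-src' prefix
} for name, url in srcs.items() if len(name) > 8 and name.startswith('data-src')]
print([f['format_id'] for f in formats])  # ['480', '720']
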
diff --git a/hypervideo_dl/extractor/peertube.py b/hypervideo_dl/extractor/peertube.py
index 1e22f24..9d6b821 100644
--- a/hypervideo_dl/extractor/peertube.py
+++ b/hypervideo_dl/extractor/peertube.py
@@ -7,6 +7,7 @@ import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
+ format_field,
int_or_none,
parse_resolution,
str_or_none,
@@ -86,6 +87,7 @@ class PeerTubeIE(InfoExtractor):
maindreieck-tv\.de|
mani\.tube|
manicphase\.me|
+ media\.fsfe\.org|
media\.gzevd\.de|
media\.inno3\.cricket|
media\.kaitaia\.life|
@@ -1386,8 +1388,7 @@ class PeerTubePlaylistIE(InfoExtractor):
playlist_timestamp = unified_timestamp(info.get('createdAt'))
channel = try_get(info, lambda x: x['ownerAccount']['name']) or info.get('displayName')
channel_id = try_get(info, lambda x: x['ownerAccount']['id']) or info.get('id')
- thumbnail = info.get('thumbnailPath')
- thumbnail = f'https://{host}{thumbnail}' if thumbnail else None
+ thumbnail = format_field(info, 'thumbnailPath', f'https://{host}%s')
entries = OnDemandPagedList(functools.partial(
self.fetch_page, host, id, type), self._PAGE_SIZE)
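`format_field` collapses the old two-step "fetch, then template" pattern: when the field is present the template is applied, otherwise a falsy default (the empty string) is returned, so truthiness checks downstream behave as before. For example:

from hypervideo_dl.utils import format_field

host = 'peertube.example'  # illustrative instance host
info = {'thumbnailPath': '/static/thumbnails/abc.jpg'}
print(format_field(info, 'thumbnailPath', f'https://{host}%s'))
# -> https://peertube.example/static/thumbnails/abc.jpg
print(repr(format_field({}, 'thumbnailPath', f'https://{host}%s')))
# -> '' (falsy default when the field is absent)
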
diff --git a/hypervideo_dl/extractor/peertv.py b/hypervideo_dl/extractor/peertv.py
new file mode 100644
index 0000000..002d33a
--- /dev/null
+++ b/hypervideo_dl/extractor/peertv.py
@@ -0,0 +1,57 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import js_to_json
+
+
+class PeerTVIE(InfoExtractor):
+ IE_NAME = 'peer.tv'
+ _VALID_URL = r'https?://(?:www\.)?peer\.tv/(?:de|it|en)/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.peer.tv/de/841',
+ 'info_dict': {
+ 'id': '841',
+ 'ext': 'mp4',
+ 'title': 'Die Brunnenburg',
+ 'description': 'md5:4395f6142b090338340ab88a3aae24ed',
+ },
+ }, {
+ 'url': 'https://www.peer.tv/it/404',
+ 'info_dict': {
+ 'id': '404',
+ 'ext': 'mp4',
+ 'title': 'Cascate di ghiaccio in Val Gardena',
+ 'description': 'md5:e8e5907f236171842674e8090e3577b8',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ video_key = self._html_search_regex(r'player\.peer\.tv/js/([a-zA-Z0-9]+)', webpage, 'video key')
+
+ js = self._download_webpage(f'https://player.peer.tv/js/{video_key}/', video_id,
+ headers={'Referer': 'https://www.peer.tv/'}, note='Downloading session id')
+
+ session_id = self._search_regex(r'["\']session_id["\']:\s*["\']([a-zA-Z0-9]+)["\']', js, 'session id')
+
+ player_webpage = self._download_webpage(
+ f'https://player.peer.tv/jsc/{video_key}/{session_id}?jsr=aHR0cHM6Ly93d3cucGVlci50di9kZS84NDE=&cs=UTF-8&mq=2&ua=0&webm=p&mp4=p&hls=1',
+ video_id, note='Downloading player webpage')
+
+ m3u8_url = self._search_regex(r'["\']playlist_url["\']:\s*(["\'][^"\']+["\'])', player_webpage, 'm3u8 url')
+ m3u8_url = self._parse_json(m3u8_url, video_id, transform_source=js_to_json)
+
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls')
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': self._html_search_regex(r'<h1>(.+?)</h1>', webpage, 'title').replace('\xa0', ' '),
+ 'formats': formats,
+ 'description': self._html_search_meta(('og:description', 'description'), webpage),
+ 'thumbnail': self._html_search_meta(('og:image', 'image'), webpage)
+ }
diff --git a/hypervideo_dl/extractor/peloton.py b/hypervideo_dl/extractor/peloton.py
index 287d341..7d83225 100644
--- a/hypervideo_dl/extractor/peloton.py
+++ b/hypervideo_dl/extractor/peloton.py
@@ -203,7 +203,6 @@ class PelotonLiveIE(InfoExtractor):
'chapters': 'count:3'
},
'params': {
- 'format': 'bestvideo',
'skip_download': 'm3u8',
},
'_skip': 'Account needed'
diff --git a/hypervideo_dl/extractor/periscope.py b/hypervideo_dl/extractor/periscope.py
index b93a02b..1a292b8 100644
--- a/hypervideo_dl/extractor/periscope.py
+++ b/hypervideo_dl/extractor/periscope.py
@@ -33,7 +33,7 @@ class PeriscopeBaseIE(InfoExtractor):
return {
'id': broadcast.get('id') or video_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'timestamp': parse_iso8601(broadcast.get('created_at')),
'uploader': uploader,
'uploader_id': broadcast.get('user_id') or broadcast.get('username'),
diff --git a/hypervideo_dl/extractor/piapro.py b/hypervideo_dl/extractor/piapro.py
new file mode 100644
index 0000000..c4eb491
--- /dev/null
+++ b/hypervideo_dl/extractor/piapro.py
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ ExtractorError,
+ parse_duration,
+ parse_filesize,
+ str_to_int,
+ unified_timestamp,
+ urlencode_postdata,
+)
+
+
+class PiaproIE(InfoExtractor):
+ _NETRC_MACHINE = 'piapro'
+ _VALID_URL = r'https?://piapro\.jp/t/(?P<id>\w+)/?'
+ _TESTS = [{
+ 'url': 'https://piapro.jp/t/NXYR',
+ 'md5': 'a9d52f27d13bafab7ee34116a7dcfa77',
+ 'info_dict': {
+ 'id': 'NXYR',
+ 'ext': 'mp3',
+ 'uploader': 'wowaka',
+ 'uploader_id': 'wowaka',
+ 'title': '裏表ラバーズ',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ }
+ }]
+
+ _login_status = False
+
+ def _perform_login(self, username, password):
+ login_ok = True
+ login_form_strs = {
+ '_username': username,
+ '_password': password,
+ '_remember_me': 'on',
+ 'login': 'ログイン'
+ }
+ self._request_webpage('https://piapro.jp/login/', None)
+ urlh = self._request_webpage(
+ 'https://piapro.jp/login/exe', None,
+ note='Logging in', errnote='Unable to log in',
+ data=urlencode_postdata(login_form_strs))
+ if urlh is False:
+ login_ok = False
+ else:
+ parts = compat_urlparse.urlparse(urlh.geturl())
+ if parts.path != '/':
+ login_ok = False
+ if not login_ok:
+ self.report_warning(
+ 'unable to log in: bad username or password')
+ self._login_status = login_ok
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ category_id = self._search_regex(r'categoryId=(.+)">', webpage, 'category ID')
+ if category_id not in ('1', '2', '21', '22', '23', '24', '25'):
+ raise ExtractorError('The URL does not contain audio.', expected=True)
+
+ str_duration, str_filesize = self._search_regex(
+ r'サイズ:</span>(.+?)/\(([0-9,]+?[KMG]?B)\)', webpage, 'duration and size',
+ group=(1, 2), default=(None, None))
+ str_viewcount = self._search_regex(r'閲覧数:</span>([0-9,]+)\s+', webpage, 'view count', fatal=False)
+
+ uploader_id, uploader = self._search_regex(
+ r'<a\s+class="cd_user-name"\s+href="/(.*)">([^<]+)さん<', webpage, 'uploader',
+ group=(1, 2), default=(None, None))
+ content_id = self._search_regex(r'contentId\:\'(.+)\'', webpage, 'content ID')
+ create_date = self._search_regex(r'createDate\:\'(.+)\'', webpage, 'timestamp')
+
+ player_webpage = self._download_webpage(
+ f'https://piapro.jp/html5_player_popup/?id={content_id}&cdate={create_date}',
+ video_id, note='Downloading player webpage')
+
+ return {
+ 'id': video_id,
+ 'title': self._html_search_regex(r'<h1\s+class="cd_works-title">(.+?)</h1>', webpage, 'title', fatal=False),
+ 'description': self._html_search_regex(r'<p\s+class="cd_dtl_cap">(.+?)</p>\s*<div', webpage, 'description', fatal=False),
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'timestamp': unified_timestamp(create_date, False),
+ 'duration': parse_duration(str_duration),
+ 'view_count': str_to_int(str_viewcount),
+ 'thumbnail': self._html_search_meta('twitter:image', webpage),
+
+ 'filesize_approx': parse_filesize(str_filesize.replace(',', '')),
+ 'url': self._search_regex(r'mp3:\s*\'(.*?)\'\}', player_webpage, 'url'),
+ 'ext': 'mp3',
+ 'vcodec': 'none',
+ }
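The scraped duration/size pair is handed to two small utility parsers: `parse_duration` understands clock-style strings and `parse_filesize` the unit suffix, once the thousands separators are stripped. With invented values:

from hypervideo_dl.utils import parse_duration, parse_filesize

str_duration, str_filesize = '4:32', '4,360KB'  # as scraped from the page
print(parse_duration(str_duration))  # 272 seconds
print(parse_filesize(str_filesize.replace(',', '')))  # 4360000 ('KB' read as 10**3)
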
diff --git a/hypervideo_dl/extractor/picarto.py b/hypervideo_dl/extractor/picarto.py
index e6c51e1..adf21fd 100644
--- a/hypervideo_dl/extractor/picarto.py
+++ b/hypervideo_dl/extractor/picarto.py
@@ -77,7 +77,7 @@ class PicartoIE(InfoExtractor):
return {
'id': channel_id,
- 'title': self._live_title(title.strip()),
+ 'title': title.strip(),
'is_live': True,
'channel': channel_id,
'channel_id': metadata.get('id'),
@@ -111,7 +111,7 @@ class PicartoVodIE(InfoExtractor):
vod_info = self._parse_json(
self._search_regex(
r'(?s)#vod-player["\']\s*,\s*(\{.+?\})\s*\)', webpage,
- video_id),
+ 'vod player'),
video_id, transform_source=js_to_json)
formats = self._extract_m3u8_formats(
diff --git a/hypervideo_dl/extractor/piksel.py b/hypervideo_dl/extractor/piksel.py
index a362664..84c3de2 100644
--- a/hypervideo_dl/extractor/piksel.py
+++ b/hypervideo_dl/extractor/piksel.py
@@ -4,11 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
dict_get,
ExtractorError,
int_or_none,
+ join_nonempty,
parse_iso8601,
try_get,
unescapeHTML,
@@ -116,12 +116,8 @@ class PikselIE(InfoExtractor):
elif asset_type == 'audio':
tbr = abr
- format_id = ['http']
- if tbr:
- format_id.append(compat_str(tbr))
-
formats.append({
- 'format_id': '-'.join(format_id),
+ 'format_id': join_nonempty('http', tbr),
'url': unescapeHTML(http_url),
'vbr': vbr,
'abr': abr,
@@ -167,7 +163,7 @@ class PikselIE(InfoExtractor):
re.sub(r'/od/[^/]+/', '/od/http/', smil_url), video_id,
transform_source=transform_source, fatal=False))
- self._sort_formats(formats)
+ self._sort_formats(formats, ('tbr', )) # Incomplete resolution information
subtitles = {}
for caption in video_data.get('captions', []):
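`join_nonempty` reproduces the deleted list-building exactly: falsy parts are skipped and the remainder joined with '-'. For instance:

from hypervideo_dl.utils import join_nonempty

print(join_nonempty('http', 1756))  # 'http-1756' when tbr is known
print(join_nonempty('http', None))  # 'http' when tbr is missing
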
diff --git a/hypervideo_dl/extractor/pixivsketch.py b/hypervideo_dl/extractor/pixivsketch.py
new file mode 100644
index 0000000..f0ad0b2
--- /dev/null
+++ b/hypervideo_dl/extractor/pixivsketch.py
@@ -0,0 +1,122 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ traverse_obj,
+ unified_timestamp,
+)
+
+
+class PixivSketchBaseIE(InfoExtractor):
+ def _call_api(self, video_id, path, referer, note='Downloading JSON metadata'):
+ response = self._download_json(f'https://sketch.pixiv.net/api/{path}', video_id, note=note, headers={
+ 'Referer': referer,
+ 'X-Requested-With': referer,
+ })
+ errors = traverse_obj(response, ('errors', ..., 'message'))
+ if errors:
+ raise ExtractorError(' '.join(f'{e}.' for e in errors))
+ return response.get('data') or {}
+
+
+class PixivSketchIE(PixivSketchBaseIE):
+ IE_NAME = 'pixiv:sketch'
+ _VALID_URL = r'https?://sketch\.pixiv\.net/@(?P<uploader_id>[a-zA-Z0-9_-]+)/lives/(?P<id>\d+)/?'
+ _TESTS = [{
+ 'url': 'https://sketch.pixiv.net/@nuhutya/lives/3654620468641830507',
+ 'info_dict': {
+ 'id': '7370666691623196569',
+ 'title': 'まにあえクリスマス!',
+ 'uploader': 'ぬふちゃ',
+ 'uploader_id': 'nuhutya',
+ 'channel_id': '9844815',
+ 'age_limit': 0,
+ 'timestamp': 1640351536,
+ },
+ 'skip': True,
+ }, {
+ # these two (age_limit > 0) require you to log in on the website, but that's not actually required for downloading
+ 'url': 'https://sketch.pixiv.net/@namahyou/lives/4393103321546851377',
+ 'info_dict': {
+ 'id': '4907995960957946943',
+ 'title': 'クリスマスなんて知らん🖕',
+ 'uploader': 'すゃもり',
+ 'uploader_id': 'suya2mori2',
+ 'channel_id': '31169300',
+ 'age_limit': 15,
+ 'timestamp': 1640347640,
+ },
+ 'skip': True,
+ }, {
+ 'url': 'https://sketch.pixiv.net/@8aki/lives/3553803162487249670',
+ 'info_dict': {
+ 'id': '1593420639479156945',
+ 'title': 'おまけ本作業(リョナ有)',
+ 'uploader': 'おぶい / Obui',
+ 'uploader_id': 'oving',
+ 'channel_id': '17606',
+ 'age_limit': 18,
+ 'timestamp': 1640330263,
+ },
+ 'skip': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
+ data = self._call_api(video_id, f'lives/{video_id}.json', url)
+
+ if not traverse_obj(data, 'is_broadcasting'):
+ raise ExtractorError(f'This live is offline. Use https://sketch.pixiv.net/@{uploader_id} to find their current live stream.', expected=True)
+
+ m3u8_url = traverse_obj(data, ('owner', 'hls_movie', 'url'))
+ formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls')
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': data.get('name'),
+ 'formats': formats,
+ 'uploader': traverse_obj(data, ('user', 'name'), ('owner', 'user', 'name')),
+ 'uploader_id': traverse_obj(data, ('user', 'unique_name'), ('owner', 'user', 'unique_name')),
+ 'channel_id': str(traverse_obj(data, ('user', 'pixiv_user_id'), ('owner', 'user', 'pixiv_user_id'))),
+ 'age_limit': 18 if data.get('is_r18') else 15 if data.get('is_r15') else 0,
+ 'timestamp': unified_timestamp(data.get('created_at')),
+ 'is_live': True
+ }
+
+
+class PixivSketchUserIE(PixivSketchBaseIE):
+ IE_NAME = 'pixiv:sketch:user'
+ _VALID_URL = r'https?://sketch\.pixiv\.net/@(?P<id>[a-zA-Z0-9_-]+)/?'
+ _TESTS = [{
+ 'url': 'https://sketch.pixiv.net/@nuhutya',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://sketch.pixiv.net/@namahyou',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://sketch.pixiv.net/@8aki',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return super(PixivSketchUserIE, cls).suitable(url) and not PixivSketchIE.suitable(url)
+
+ def _real_extract(self, url):
+ user_id = self._match_id(url)
+ data = self._call_api(user_id, f'lives/users/@{user_id}.json', url)
+
+ if not traverse_obj(data, 'is_broadcasting'):
+ try:
+ self._call_api(user_id, 'users/current.json', url, 'Investigating reason for request failure')
+ except ExtractorError as ex:
+ if ex.cause and ex.cause.code == 401:
+ self.raise_login_required(f'Please log in, or use a direct link like https://sketch.pixiv.net/@{user_id}/1234567890', method='cookies')
+ raise ExtractorError('This user is offline', expected=True)
+
+ return self.url_result(f'https://sketch.pixiv.net/@{user_id}/lives/{data["id"]}')
diff --git a/hypervideo_dl/extractor/pladform.py b/hypervideo_dl/extractor/pladform.py
index dc20300..99ade85 100644
--- a/hypervideo_dl/extractor/pladform.py
+++ b/hypervideo_dl/extractor/pladform.py
@@ -28,6 +28,24 @@ class PladformIE(InfoExtractor):
(?P<id>\d+)
'''
_TESTS = [{
+ 'url': 'http://out.pladform.ru/player?pl=18079&type=html5&videoid=100231282',
+ 'info_dict': {
+ 'id': '6216d548e755edae6e8280667d774791',
+ 'ext': 'mp4',
+ 'timestamp': 1406117012,
+ 'title': 'Гарик Мартиросян и Гарик Харламов - Кастинг на концерт ко Дню милиции',
+ 'age_limit': 0,
+ 'upload_date': '20140723',
+ 'thumbnail': str,
+ 'view_count': int,
+ 'description': str,
+ 'category': list,
+ 'uploader_id': '12082',
+ 'uploader': 'Comedy Club',
+ 'duration': 367,
+ },
+ 'expected_warnings': ['HTTP Error 404: Not Found']
+ }, {
'url': 'https://out.pladform.ru/player?pl=64471&videoid=3777899&vk_puid15=0&vk_puid34=0',
'md5': '53362fac3a27352da20fa2803cc5cd6f',
'info_dict': {
@@ -63,13 +81,19 @@ class PladformIE(InfoExtractor):
'http://out.pladform.ru/getVideo', video_id, query={
'pl': pl,
'videoid': video_id,
- })
+ }, fatal=False)
def fail(text):
raise ExtractorError(
'%s returned error: %s' % (self.IE_NAME, text),
expected=True)
+ if not video:
+ target_url = self._request_webpage(url, video_id, note='Resolving final URL').geturl()
+ if target_url == url:
+ raise ExtractorError("Can't parse page")
+ return self.url_result(target_url)
+
if video.tag == 'error':
fail(video.text)
diff --git a/hypervideo_dl/extractor/planetmarathi.py b/hypervideo_dl/extractor/planetmarathi.py
new file mode 100644
index 0000000..07ac15b
--- /dev/null
+++ b/hypervideo_dl/extractor/planetmarathi.py
@@ -0,0 +1,76 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ try_get,
+ unified_strdate,
+)
+
+
+class PlanetMarathiIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?planetmarathi\.com/titles/(?P<id>[^/#&?$]+)'
+ _TESTS = [{
+ 'url': 'https://www.planetmarathi.com/titles/ek-unad-divas',
+ 'playlist_mincount': 2,
+ 'info_dict': {
+ 'id': 'ek-unad-divas',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'ASSETS-MOVIE-ASSET-01_ek-unad-divas',
+ 'ext': 'mp4',
+ 'title': 'ek unad divas',
+ 'alt_title': 'चित्रपट',
+ 'description': 'md5:41c7ed6b041c2fea9820a3f3125bd881',
+ 'season_number': None,
+ 'episode_number': 1,
+ 'duration': 5539,
+ 'upload_date': '20210829',
+ },
+ }] # Trailer skipped
+ }, {
+ 'url': 'https://www.planetmarathi.com/titles/baap-beep-baap-season-1',
+ 'playlist_mincount': 10,
+ 'info_dict': {
+ 'id': 'baap-beep-baap-season-1',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'ASSETS-CHARACTER-PROFILE-SEASON-01-ASSET-01_baap-beep-baap-season-1',
+ 'ext': 'mp4',
+ 'title': 'Manohar Kanhere',
+ 'alt_title': 'मनोहर कान्हेरे',
+ 'description': 'md5:285ed45d5c0ab5522cac9a043354ebc6',
+ 'season_number': 1,
+ 'episode_number': 1,
+ 'duration': 29,
+ 'upload_date': '20210829',
+ },
+ }] # Trailers, Episodes, other Character profiles skipped
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ entries = []
+ json_data = self._download_json(f'https://www.planetmarathi.com/api/v1/titles/{id}/assets', id)['assets']
+ for asset in json_data:
+ asset_title = asset['mediaAssetName']['en']
+ if asset_title == 'Movie':
+ asset_title = id.replace('-', ' ')
+ asset_id = f'{asset["sk"]}_{id}'.replace('#', '-')
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(asset['mediaAssetURL'], asset_id)
+ self._sort_formats(formats)
+ entries.append({
+ 'id': asset_id,
+ 'title': asset_title,
+ 'alt_title': try_get(asset, lambda x: x['mediaAssetName']['mr']),
+ 'description': try_get(asset, lambda x: x['mediaAssetDescription']['en']),
+ 'season_number': asset.get('mediaAssetSeason'),
+ 'episode_number': asset.get('mediaAssetIndexForAssetType'),
+ 'duration': asset.get('mediaAssetDurationInSeconds'),
+ 'upload_date': unified_strdate(asset.get('created')),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ })
+ return self.playlist_result(entries, playlist_id=id)
diff --git a/hypervideo_dl/extractor/platzi.py b/hypervideo_dl/extractor/platzi.py
index 23c8256..17f52e7 100644
--- a/hypervideo_dl/extractor/platzi.py
+++ b/hypervideo_dl/extractor/platzi.py
@@ -22,14 +22,7 @@ class PlatziBaseIE(InfoExtractor):
_LOGIN_URL = 'https://platzi.com/login/'
_NETRC_MACHINE = 'platzi'
- def _real_initialize(self):
- self._login()
-
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
diff --git a/hypervideo_dl/extractor/playplustv.py b/hypervideo_dl/extractor/playplustv.py
index fd72a37..cad2c3a 100644
--- a/hypervideo_dl/extractor/playplustv.py
+++ b/hypervideo_dl/extractor/playplustv.py
@@ -38,14 +38,10 @@ class PlayPlusTVIE(InfoExtractor):
'Authorization': 'Bearer ' + self._token,
}, query=query)
- def _real_initialize(self):
- email, password = self._get_login_info()
- if email is None:
- self.raise_login_required()
-
+ def _perform_login(self, username, password):
req = PUTRequest(
'https://api.playplus.tv/api/web/login', json.dumps({
- 'email': email,
+ 'email': username,
'password': password,
}).encode(), {
'Content-Type': 'application/json; charset=utf-8',
@@ -61,6 +57,10 @@ class PlayPlusTVIE(InfoExtractor):
self._profile = self._call_api('Profiles')['list'][0]['_id']
+ def _real_initialize(self):
+ if not self._token:
+ self.raise_login_required(method='password')
+
def _real_extract(self, url):
project_id, media_id = self._match_valid_url(url).groups()
media = self._call_api(
diff --git a/hypervideo_dl/extractor/playtvak.py b/hypervideo_dl/extractor/playtvak.py
index 84e92dd..30c8a59 100644
--- a/hypervideo_dl/extractor/playtvak.py
+++ b/hypervideo_dl/extractor/playtvak.py
@@ -167,8 +167,6 @@ class PlaytvakIE(InfoExtractor):
title = item['title']
is_live = item['type'] == 'stream'
- if is_live:
- title = self._live_title(title)
description = self._og_search_description(webpage, default=None) or self._html_search_meta(
'description', webpage, 'description', default=None)
timestamp = None
diff --git a/hypervideo_dl/extractor/playvid.py b/hypervideo_dl/extractor/playvid.py
index 4aef186..e1c406b 100644
--- a/hypervideo_dl/extractor/playvid.py
+++ b/hypervideo_dl/extractor/playvid.py
@@ -85,8 +85,7 @@ class PlayvidIE(InfoExtractor):
# Extract title - should be in the flashvars; if not, look elsewhere
if video_title is None:
- video_title = self._html_search_regex(
- r'<title>(.*?)</title', webpage, 'title')
+ video_title = self._html_extract_title(webpage)
return {
'id': video_id,
diff --git a/hypervideo_dl/extractor/pluralsight.py b/hypervideo_dl/extractor/pluralsight.py
index 801057e..2a5e0e4 100644
--- a/hypervideo_dl/extractor/pluralsight.py
+++ b/hypervideo_dl/extractor/pluralsight.py
@@ -162,14 +162,7 @@ query viewClip {
}
}'''
- def _real_initialize(self):
- self._login()
-
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
diff --git a/hypervideo_dl/extractor/plutotv.py b/hypervideo_dl/extractor/plutotv.py
index 0cf8246..26aff1a 100644
--- a/hypervideo_dl/extractor/plutotv.py
+++ b/hypervideo_dl/extractor/plutotv.py
@@ -20,11 +20,11 @@ from ..utils import (
class PlutoTVIE(InfoExtractor):
_VALID_URL = r'''(?x)
- https?://(?:www\.)?pluto\.tv(?:/en)?/on-demand
+ https?://(?:www\.)?pluto\.tv(?:/[^/]+)?/on-demand
/(?P<video_type>movies|series)
/(?P<series_or_movie_slug>[^/]+)
(?:
- /seasons?/(?P<season_no>\d+)
+ (?:/seasons?/(?P<season_no>\d+))?
(?:/episode/(?P<episode_slug>[^/]+))?
)?
/?(?:$|[#?])'''
@@ -84,6 +84,9 @@ class PlutoTVIE(InfoExtractor):
}, {
'url': 'https://pluto.tv/en/on-demand/series/manhunters-fugitive-task-force/seasons/1/episode/third-times-the-charm-1-1',
'only_matching': True,
+ }, {
+ 'url': 'https://pluto.tv/it/on-demand/series/csi-vegas/episode/legacy-2021-1-1',
+ 'only_matching': True,
}
]
diff --git a/hypervideo_dl/extractor/pokemon.py b/hypervideo_dl/extractor/pokemon.py
index 402b574..b411390 100644
--- a/hypervideo_dl/extractor/pokemon.py
+++ b/hypervideo_dl/extractor/pokemon.py
@@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
from .common import InfoExtractor
from ..utils import (
@@ -138,3 +139,42 @@ class PokemonWatchIE(InfoExtractor):
'episode': video_data.get('title'),
'episode_number': int_or_none(video_data.get('episode')),
})
+
+
+class PokemonSoundLibraryIE(InfoExtractor):
+ _VALID_URL = r'https?://soundlibrary\.pokemon\.co\.jp'
+
+ _TESTS = [{
+ 'url': 'https://soundlibrary.pokemon.co.jp/',
+ 'info_dict': {
+ 'title': 'Pokémon Diamond and Pearl Sound Tracks',
+ },
+ 'playlist_mincount': 149,
+ }]
+
+ def _real_extract(self, url):
+ musicbox_webpage = self._download_webpage(
+ 'https://soundlibrary.pokemon.co.jp/musicbox', None,
+ 'Downloading list of songs')
+ song_titles = [x.group(1) for x in re.finditer(r'<span>([^>]+?)</span><br/>をてもち曲に加えます。', musicbox_webpage)]
+ song_titles = song_titles[4::2]
+
+ # individual songs don't have permalinks; instead we return all songs at once
+ song_entries = [{
+ 'id': f'pokemon-soundlibrary-{song_id}',
+ 'url': f'https://soundlibrary.pokemon.co.jp/api/assets/signing/sounds/wav/{song_id}.wav',
+ # note: the server always serves MP3 files, despite the .wav extension in the URL above
+ 'ext': 'mp3',
+ 'acodec': 'mp3',
+ 'vcodec': 'none',
+ 'title': song_title,
+ 'track': song_title,
+ 'artist': 'Nintendo / Creatures Inc. / GAME FREAK inc.',
+ 'uploader': 'Pokémon',
+ 'release_year': 2006,
+ 'release_date': '20060928',
+ 'track_number': song_id,
+ 'album': 'Pokémon Diamond and Pearl',
+ } for song_id, song_title in enumerate(song_titles, 1)]
+
+ return self.playlist_result(song_entries, playlist_title='Pokémon Diamond and Pearl Sound Tracks')
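The `[4::2]` slice encodes an assumption about the music-box markup: the regex apparently also matches a handful of leading non-track spans plus one interleaved UI string per track, so the extractor keeps every second match starting at index 4. A toy illustration (the surrounding strings are invented; only the slice mirrors the extractor):

scraped = ['hdr1', 'hdr2', 'hdr3', 'hdr4',
           'Track 01', 'ui', 'Track 02', 'ui', 'Track 03', 'ui']
song_titles = scraped[4::2]
print(list(enumerate(song_titles, 1)))
# [(1, 'Track 01'), (2, 'Track 02'), (3, 'Track 03')] -> (song_id, song_title) pairs
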
diff --git a/hypervideo_dl/extractor/pokergo.py b/hypervideo_dl/extractor/pokergo.py
new file mode 100644
index 0000000..c9e2fed
--- /dev/null
+++ b/hypervideo_dl/extractor/pokergo.py
@@ -0,0 +1,109 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ try_get,
+)
+
+
+class PokerGoBaseIE(InfoExtractor):
+ _NETRC_MACHINE = 'pokergo'
+ _AUTH_TOKEN = None
+ _PROPERTY_ID = '1dfb3940-7d53-4980-b0b0-f28b369a000d'
+
+ def _perform_login(self, username, password):
+ if self._AUTH_TOKEN:
+ return
+ self.report_login()
+ PokerGoBaseIE._AUTH_TOKEN = self._download_json(
+ f'https://subscription.pokergo.com/properties/{self._PROPERTY_ID}/sign-in', None,
+ headers={'authorization': f'Basic {base64.b64encode(f"{username}:{password}".encode()).decode()}'},
+ data=b'')['meta']['token']
+ if not self._AUTH_TOKEN:
+ raise ExtractorError('Unable to get Auth Token.', expected=True)
+
+ def _real_initialize(self):
+ if not self._AUTH_TOKEN:
+ self.raise_login_required(method='password')
+
+
+class PokerGoIE(PokerGoBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?pokergo\.com/videos/(?P<id>[^&$#/?]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.pokergo.com/videos/2a70ec4e-4a80-414b-97ec-725d9b72a7dc',
+ 'info_dict': {
+ 'id': 'aVLOxDzY',
+ 'ext': 'mp4',
+ 'title': 'Poker After Dark | Season 12 (2020) | Cry Me a River | Episode 2',
+ 'description': 'md5:c7a8c29556cbfb6eb3c0d5d622251b71',
+ 'thumbnail': 'https://cdn.jwplayer.com/v2/media/aVLOxDzY/poster.jpg?width=720',
+ 'timestamp': 1608085715,
+ 'duration': 2700.12,
+ 'season_number': 12,
+ 'episode_number': 2,
+ 'series': 'poker after dark',
+ 'upload_date': '20201216',
+ 'season': 'Season 12',
+ 'episode': 'Episode 2',
+ 'display_id': '2a70ec4e-4a80-414b-97ec-725d9b72a7dc',
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ data_json = self._download_json(f'https://api.pokergo.com/v2/properties/{self._PROPERTY_ID}/videos/{id}', id,
+ headers={'authorization': f'Bearer {self._AUTH_TOKEN}'})['data']
+ v_id = data_json['source']
+
+ thumbnails = [{
+ 'url': image['url'],
+ 'id': image.get('label'),
+ 'width': image.get('width'),
+ 'height': image.get('height')
+ } for image in data_json.get('images') or [] if image.get('url')]
+ series_json = next((dct for dct in data_json.get('show_tags') or [] if dct.get('video_id') == id), None) or {}
+
+ return {
+ '_type': 'url_transparent',
+ 'display_id': id,
+ 'title': data_json.get('title'),
+ 'description': data_json.get('description'),
+ 'duration': data_json.get('duration'),
+ 'thumbnails': thumbnails,
+ 'season_number': series_json.get('season'),
+ 'episode_number': series_json.get('episode_number'),
+ 'series': try_get(series_json, lambda x: x['tag']['name']),
+ 'url': f'https://cdn.jwplayer.com/v2/media/{v_id}'
+ }
+
+
+class PokerGoCollectionIE(PokerGoBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?pokergo\.com/collections/(?P<id>[^&$#/?]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.pokergo.com/collections/19ffe481-5dae-481a-8869-75cc0e3c4700',
+ 'playlist_mincount': 13,
+ 'info_dict': {
+ 'id': '19ffe481-5dae-481a-8869-75cc0e3c4700',
+ },
+ }]
+
+ def _entries(self, id):
+ data_json = self._download_json(f'https://api.pokergo.com/v2/properties/{self._PROPERTY_ID}/collections/{id}?include=entities',
+ id, headers={'authorization': f'Bearer {self._AUTH_TOKEN}'})['data']
+ for video in data_json.get('collection_video') or []:
+ video_id = video.get('id')
+ if video_id:
+ yield self.url_result(
+ f'https://www.pokergo.com/videos/{video_id}',
+ ie=PokerGoIE.ie_key(), video_id=video_id)
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ return self.playlist_result(self._entries(id), playlist_id=id)
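The sign-in request authenticates with plain HTTP Basic: the header value is just base64 of 'username:password'. Building it in isolation (placeholder credentials):

import base64

username, password = 'user@example.com', 'hunter2'  # placeholders
token = base64.b64encode(f'{username}:{password}'.encode()).decode()
print({'authorization': f'Basic {token}'})
# {'authorization': 'Basic dXNlckBleGFtcGxlLmNvbTpodW50ZXIy'}
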
diff --git a/hypervideo_dl/extractor/polsatgo.py b/hypervideo_dl/extractor/polsatgo.py
new file mode 100644
index 0000000..1e3f46c
--- /dev/null
+++ b/hypervideo_dl/extractor/polsatgo.py
@@ -0,0 +1,90 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from uuid import uuid4
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ try_get,
+ url_or_none,
+ ExtractorError,
+)
+
+
+class PolsatGoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?polsat(?:box)?go\.pl/.+/(?P<id>[0-9a-fA-F]+)(?:[/#?]|$)'
+ _TESTS = [{
+ 'url': 'https://polsatgo.pl/wideo/seriale/swiat-wedlug-kiepskich/5024045/sezon-1/5028300/swiat-wedlug-kiepskich-odcinek-88/4121',
+ 'info_dict': {
+ 'id': '4121',
+ 'ext': 'mp4',
+ 'title': 'Świat według Kiepskich - Odcinek 88',
+ 'age_limit': 12,
+ },
+ }]
+
+ def _extract_formats(self, sources, video_id):
+ for source in sources or []:
+ if not source.get('id'):
+ continue
+ url = url_or_none(self._call_api(
+ 'drm', video_id, 'getPseudoLicense',
+ {'mediaId': video_id, 'sourceId': source['id']}).get('url'))
+ if not url:
+ continue
+ yield {
+ 'url': url,
+ 'height': int_or_none(try_get(source, lambda x: x['quality'][:-1]))
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ media = self._call_api('navigation', video_id, 'prePlayData', {'mediaId': video_id})['mediaItem']
+
+ formats = list(self._extract_formats(
+ try_get(media, lambda x: x['playback']['mediaSources']), video_id))
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': media['displayInfo']['title'],
+ 'formats': formats,
+ 'age_limit': int_or_none(media['displayInfo']['ageGroup'])
+ }
+
+ def _call_api(self, endpoint, media_id, method, params):
+ rand_uuid = str(uuid4())
+ res = self._download_json(
+ f'https://b2c-mobile.redefine.pl/rpc/{endpoint}/', media_id,
+ note=f'Downloading {method} JSON metadata',
+ data=json.dumps({
+ 'method': method,
+ 'id': '2137',
+ 'jsonrpc': '2.0',
+ 'params': {
+ **params,
+ 'userAgentData': {
+ 'deviceType': 'mobile',
+ 'application': 'native',
+ 'os': 'android',
+ 'build': 10003,
+ 'widevine': False,
+ 'portal': 'pg',
+ 'player': 'cpplayer',
+ },
+ 'deviceId': {
+ 'type': 'other',
+ 'value': rand_uuid,
+ },
+ 'clientId': rand_uuid,
+ 'cpid': 1,
+ },
+ }).encode('utf-8'),
+ headers={'Content-type': 'application/json'})
+ if not res.get('result'):
+ if res['error']['code'] == 13404:
+ raise ExtractorError('This video is either unavailable in your region or is DRM protected', expected=True)
+ raise ExtractorError(f'Solorz said: {res["error"]["message"]} - {res["error"]["data"]["userMessage"]}')
+ return res['result']
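One detail worth noting in `_extract_formats` above: the height is recovered by chopping the last character off the API's quality label, on the assumption that it looks like '1080p'; `try_get` and `int_or_none` turn any other shape into None instead of crashing:

from hypervideo_dl.utils import int_or_none, try_get

source = {'quality': '1080p'}  # assumed shape of a mediaSources entry
print(int_or_none(try_get(source, lambda x: x['quality'][:-1])))  # 1080
print(int_or_none(try_get({}, lambda x: x['quality'][:-1])))  # None
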
diff --git a/hypervideo_dl/extractor/polskieradio.py b/hypervideo_dl/extractor/polskieradio.py
index 53fe034..b2b3eb2 100644
--- a/hypervideo_dl/extractor/polskieradio.py
+++ b/hypervideo_dl/extractor/polskieradio.py
@@ -2,6 +2,8 @@
from __future__ import unicode_literals
import itertools
+import json
+import math
import re
from .common import InfoExtractor
@@ -12,15 +14,45 @@ from ..compat import (
)
from ..utils import (
extract_attributes,
+ ExtractorError,
+ InAdvancePagedList,
int_or_none,
+ js_to_json,
+ parse_iso8601,
strip_or_none,
unified_timestamp,
unescapeHTML,
+ url_or_none,
)
-class PolskieRadioIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
+class PolskieRadioBaseExtractor(InfoExtractor):
+ def _extract_webpage_player_entries(self, webpage, playlist_id, base_data):
+ media_urls = set()
+
+ for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', webpage):
+ media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False)
+ if not media or not media.get('file') or not media.get('desc'):
+ continue
+ media_url = self._proto_relative_url(media['file'])
+ if media_url in media_urls:
+ continue
+ media_urls.add(media_url)
+ entry = base_data.copy()
+ entry.update({
+ 'id': compat_str(media['id']),
+ 'url': media_url,
+ 'duration': int_or_none(media.get('length')),
+ 'vcodec': 'none' if media.get('provider') == 'audio' else None,
+ })
+ entry_title = compat_urllib_parse_unquote(media['desc'])
+ if entry_title:
+ entry['title'] = entry_title
+ yield entry
+
+
+class PolskieRadioIE(PolskieRadioBaseExtractor):
+ _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
_TESTS = [{ # Old-style single broadcast.
'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie',
'info_dict': {
@@ -59,22 +91,14 @@ class PolskieRadioIE(InfoExtractor):
'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$'
},
}],
- }, { # Old-style multiple broadcast playlist.
- 'url': 'https://www.polskieradio.pl/8/4346/Artykul/2487823,Marek-Kondrat-czyta-Mistrza-i-Malgorzate',
- 'info_dict': {
- 'id': '2487823',
- 'title': 'Marek Kondrat czyta "Mistrza i Małgorzatę"',
- 'description': 'md5:8422a95cc83834f2aaeff9d82e9c8f39',
- },
- 'playlist_mincount': 50,
- }, { # New-style multiple broadcast playlist.
- 'url': 'https://www.polskieradio.pl/8/4346/Artykul/2541317,Czytamy-Kalendarz-i-klepsydre-Tadeusza-Konwickiego',
+ }, {
+ # PR4 audition - other frontend
+ 'url': 'https://www.polskieradio.pl/10/6071/Artykul/2610977,Poglos-29-pazdziernika-godz-2301',
'info_dict': {
- 'id': '2541317',
- 'title': 'Czytamy "Kalendarz i klepsydrę" Tadeusza Konwickiego',
- 'description': 'md5:0baeaa46d877f1351fb2eeed3e871f9f',
+ 'id': '2610977',
+ 'ext': 'mp3',
+ 'title': 'Pogłos 29 października godz. 23:01',
},
- 'playlist_mincount': 15,
}, {
'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis',
'only_matching': True,
@@ -85,6 +109,9 @@ class PolskieRadioIE(InfoExtractor):
# with mp4 video
'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej',
'only_matching': True,
+ }, {
+ 'url': 'https://polskieradio24.pl/130/4503/Artykul/2621876,Narusza-nasza-suwerennosc-Publicysci-o-uzaleznieniu-funduszy-UE-od-praworzadnosci',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -94,39 +121,37 @@ class PolskieRadioIE(InfoExtractor):
content = self._search_regex(
r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>',
- webpage, 'content')
+ webpage, 'content', default=None)
timestamp = unified_timestamp(self._html_search_regex(
r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
- webpage, 'timestamp', fatal=False))
+ webpage, 'timestamp', default=None))
- thumbnail_url = self._og_search_thumbnail(webpage)
+ thumbnail_url = self._og_search_thumbnail(webpage, default=None)
- entries = []
+ title = self._og_search_title(webpage).strip()
- media_urls = set()
+ description = strip_or_none(self._og_search_description(webpage, default=None))
+ description = description.replace('\xa0', ' ') if description is not None else None
- for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', content):
- media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False)
- if not media.get('file') or not media.get('desc'):
- continue
- media_url = self._proto_relative_url(media['file'], 'http:')
- if media_url in media_urls:
- continue
- media_urls.add(media_url)
- entries.append({
- 'id': compat_str(media['id']),
- 'url': media_url,
- 'title': compat_urllib_parse_unquote(media['desc']),
- 'duration': int_or_none(media.get('length')),
- 'vcodec': 'none' if media.get('provider') == 'audio' else None,
+ if not content:
+ return {
+ 'id': playlist_id,
+ 'url': self._proto_relative_url(
+ self._search_regex(
+ r"source:\s*'(//static\.prsa\.pl/[^']+)'",
+ webpage, 'audition record url')),
+ 'title': title,
+ 'description': description,
'timestamp': timestamp,
- 'thumbnail': thumbnail_url
- })
+ 'thumbnail': thumbnail_url,
+ }
- title = self._og_search_title(webpage).strip()
- description = strip_or_none(self._og_search_description(webpage))
- description = description.replace('\xa0', ' ') if description is not None else None
+ entries = self._extract_webpage_player_entries(content, playlist_id, {
+ 'title': title,
+ 'timestamp': timestamp,
+ 'thumbnail': thumbnail_url,
+ })
return self.playlist_result(entries, playlist_id, title, description)
@@ -207,3 +232,201 @@ class PolskieRadioCategoryIE(InfoExtractor):
return self.playlist_result(
self._entries(url, webpage, category_id),
category_id, title)
+
+
+class PolskieRadioPlayerIE(InfoExtractor):
+ IE_NAME = 'polskieradio:player'
+ _VALID_URL = r'https?://player\.polskieradio\.pl/anteny/(?P<id>[^/]+)'
+
+ _BASE_URL = 'https://player.polskieradio.pl'
+ _PLAYER_URL = 'https://player.polskieradio.pl/main.bundle.js'
+ _STATIONS_API_URL = 'https://apipr.polskieradio.pl/api/stacje'
+
+ _TESTS = [{
+ 'url': 'https://player.polskieradio.pl/anteny/trojka',
+ 'info_dict': {
+ 'id': '3',
+ 'ext': 'm4a',
+ 'title': 'Trójka',
+ },
+ 'params': {
+ 'format': 'bestaudio',
+ 'skip_download': 'endless stream',
+ },
+ }]
+
+ def _get_channel_list(self, channel_url='no_channel'):
+ player_code = self._download_webpage(
+ self._PLAYER_URL, channel_url,
+ note='Downloading js player')
+ channel_list = js_to_json(self._search_regex(
+ r';var r="anteny",a=(\[.+?\])},', player_code, 'channel list'))
+ return self._parse_json(channel_list, channel_url)
+
+ def _real_extract(self, url):
+ channel_url = self._match_id(url)
+ channel_list = self._get_channel_list(channel_url)
+
+ channel = next((c for c in channel_list if c.get('url') == channel_url), None)
+
+ if not channel:
+ raise ExtractorError('Channel not found')
+
+ station_list = self._download_json(self._STATIONS_API_URL, channel_url,
+ note='Downloading stream url list',
+ headers={
+ 'Accept': 'application/json',
+ 'Referer': url,
+ 'Origin': self._BASE_URL,
+ })
+ station = next((s for s in station_list
+ if s.get('Name') == (channel.get('streamName') or channel.get('name'))), None)
+ if not station:
+ raise ExtractorError('Station not found even though the channel was extracted')
+
+ formats = []
+ for stream_url in station['Streams']:
+ stream_url = self._proto_relative_url(stream_url)
+ if stream_url.endswith('/playlist.m3u8'):
+ formats.extend(self._extract_m3u8_formats(stream_url, channel_url, live=True))
+ elif stream_url.endswith('/manifest.f4m'):
+ formats.extend(self._extract_f4m_formats(stream_url, channel_url))
+ elif stream_url.endswith('/Manifest'):
+ formats.extend(self._extract_ism_formats(stream_url, channel_url))
+ else:
+ formats.append({
+ 'url': stream_url,
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': compat_str(channel['id']),
+ 'formats': formats,
+ 'title': channel.get('name') or channel.get('streamName'),
+ 'display_id': channel_url,
+ 'thumbnail': f'{self._BASE_URL}/images/{channel_url}-color-logo.png',
+ 'is_live': True,
+ }
+
+
+class PolskieRadioPodcastBaseExtractor(InfoExtractor):
+ _API_BASE = 'https://apipodcasts.polskieradio.pl/api'
+
+ def _parse_episode(self, data):
+ return {
+ 'id': data['guid'],
+ 'formats': [{
+ 'url': data['url'],
+ 'filesize': int_or_none(data.get('fileSize')),
+ }],
+ 'title': data['title'],
+ 'description': data.get('description'),
+ 'duration': int_or_none(data.get('length')),
+ 'timestamp': parse_iso8601(data.get('publishDate')),
+ 'thumbnail': url_or_none(data.get('image')),
+ 'series': data.get('podcastTitle'),
+ 'episode': data['title'],
+ }
+
+
+class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor):
+ IE_NAME = 'polskieradio:podcast:list'
+ _VALID_URL = r'https?://podcasty\.polskieradio\.pl/podcast/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://podcasty.polskieradio.pl/podcast/8/',
+ 'info_dict': {
+ 'id': '8',
+ 'title': 'Śniadanie w Trójce',
+ 'description': 'md5:57abcc27bc4c6a6b25baa3061975b9ef',
+ 'uploader': 'Beata Michniewicz',
+ },
+ 'playlist_mincount': 714,
+ }]
+ _PAGE_SIZE = 10
+
+ def _call_api(self, podcast_id, page):
+ return self._download_json(
+ f'{self._API_BASE}/Podcasts/{podcast_id}/?pageSize={self._PAGE_SIZE}&page={page}',
+ podcast_id, f'Downloading page {page}')
+
+ def _real_extract(self, url):
+ podcast_id = self._match_id(url)
+ data = self._call_api(podcast_id, 1)
+
+ def get_page(page_num):
+ page_data = self._call_api(podcast_id, page_num + 1) if page_num else data
+ yield from (self._parse_episode(ep) for ep in page_data['items'])
+
+ return {
+ '_type': 'playlist',
+ 'entries': InAdvancePagedList(
+ get_page, math.ceil(data['itemCount'] / self._PAGE_SIZE), self._PAGE_SIZE),
+ 'id': str(data['id']),
+ 'title': data['title'],
+ 'description': data.get('description'),
+ 'uploader': data.get('announcer'),
+ }
+
+
+class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor):
+ IE_NAME = 'polskieradio:podcast'
+ _VALID_URL = r'https?://podcasty\.polskieradio\.pl/track/(?P<id>[a-f\d]{8}(?:-[a-f\d]{4}){4}[a-f\d]{8})'
+ _TESTS = [{
+ 'url': 'https://podcasty.polskieradio.pl/track/6eafe403-cb8f-4756-b896-4455c3713c32',
+ 'info_dict': {
+ 'id': '6eafe403-cb8f-4756-b896-4455c3713c32',
+ 'ext': 'mp3',
+ 'title': 'Theresa May rezygnuje. Co dalej z brexitem?',
+ 'description': 'md5:e41c409a29d022b70ef0faa61dbded60',
+ },
+ }]
+
+ def _real_extract(self, url):
+ podcast_id = self._match_id(url)
+ data = self._download_json(
+ f'{self._API_BASE}/audio',
+ podcast_id, 'Downloading podcast metadata',
+ data=json.dumps({
+ 'guids': [podcast_id],
+ }).encode('utf-8'),
+ headers={
+ 'Content-Type': 'application/json',
+ })
+ return self._parse_episode(data[0])
+
+
+class PolskieRadioRadioKierowcowIE(PolskieRadioBaseExtractor):
+ _VALID_URL = r'https?://(?:www\.)?radiokierowcow\.pl/artykul/(?P<id>[0-9]+)'
+ IE_NAME = 'polskieradio:kierowcow'
+
+ _TESTS = [{
+ 'url': 'https://radiokierowcow.pl/artykul/2694529',
+ 'info_dict': {
+ 'id': '2694529',
+ 'title': 'Zielona fala reliktem przeszłości?',
+ 'description': 'md5:343950a8717c9818fdfd4bd2b8ca9ff2',
+ },
+ 'playlist_count': 3,
+ }]
+
+ def _real_extract(self, url):
+ media_id = self._match_id(url)
+ webpage = self._download_webpage(url, media_id)
+ nextjs_build = self._search_nextjs_data(webpage, media_id)['buildId']
+ article = self._download_json(
+ f'https://radiokierowcow.pl/_next/data/{nextjs_build}/artykul/{media_id}.json?articleId={media_id}',
+ media_id)
+ data = article['pageProps']['data']
+ title = data['title']
+ entries = self._extract_webpage_player_entries(data['content'], media_id, {
+ 'title': title,
+ })
+
+ return {
+ '_type': 'playlist',
+ 'id': media_id,
+ 'entries': entries,
+ 'title': title,
+ 'description': data.get('lead'),
+ }
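Unlike the `OnDemandPagedList` used elsewhere in this patch, `InAdvancePagedList` needs the page count up front; the first API response already carries `itemCount`, so the count is a single `math.ceil` and page 1 is reused instead of refetched. The arithmetic in isolation:

import math

PAGE_SIZE = 10
item_count = 714  # reported by the first /Podcasts/{id}/ response
total_pages = math.ceil(item_count / PAGE_SIZE)
print(total_pages)  # 72; get_page(0) then reuses the already-downloaded first page
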
diff --git a/hypervideo_dl/extractor/pornez.py b/hypervideo_dl/extractor/pornez.py
new file mode 100644
index 0000000..713dc00
--- /dev/null
+++ b/hypervideo_dl/extractor/pornez.py
@@ -0,0 +1,43 @@
+# coding: utf-8
+from __future__ import unicode_literals
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class PornezIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?pornez\.net/video(?P<id>[0-9]+)/'
+ _TEST = {
+ 'url': 'https://pornez.net/video344819/mistresst-funny_penis_names-wmv/',
+ 'md5': '2e19a0a1cff3a5dbea0ef1b9e80bcbbc',
+ 'info_dict': {
+ 'id': '344819',
+ 'ext': 'mp4',
+ 'title': r'mistresst funny_penis_names wmv',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ iframe_src = self._html_search_regex(
+ r'<iframe[^>]+src="(https?://pornez\.net/player/\?[^"]+)"', webpage, 'iframe', fatal=True)
+ title = self._html_search_meta(['name', 'twitter:title', 'og:title'], webpage, 'title', default=None)
+ if title is None:
+ title = self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title', fatal=True)
+ thumbnail = self._html_search_meta(['thumbnailUrl'], webpage, 'thumbnail', default=None)
+ webpage = self._download_webpage(iframe_src, video_id)
+ entries = self._parse_html5_media_entries(iframe_src, webpage, video_id)[0]
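+ # each HLS variant URL appears to end in '_<height>.m3u8'; derive height and format_id from it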
+ for fmt in entries['formats']:
+ height = self._search_regex(r'_(\d+)\.m3u8', fmt['url'], 'height')
+ fmt['format_id'] = '%sp' % height
+ fmt['height'] = int_or_none(height)
+
+ entries.update({
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'age_limit': 18
+ })
+ return entries
diff --git a/hypervideo_dl/extractor/pornflip.py b/hypervideo_dl/extractor/pornflip.py
index d0aefa2..accf452 100644
--- a/hypervideo_dl/extractor/pornflip.py
+++ b/hypervideo_dl/extractor/pornflip.py
@@ -29,7 +29,6 @@ class PornFlipIE(InfoExtractor):
'age_limit': 18,
},
'params': {
- 'format': 'bestvideo',
'skip_download': True,
},
},
diff --git a/hypervideo_dl/extractor/pornhub.py b/hypervideo_dl/extractor/pornhub.py
index 6d894af..17c8c91 100644
--- a/hypervideo_dl/extractor/pornhub.py
+++ b/hypervideo_dl/extractor/pornhub.py
@@ -18,6 +18,7 @@ from ..utils import (
clean_html,
determine_ext,
ExtractorError,
+ format_field,
int_or_none,
merge_dicts,
NO_DEFAULT,
@@ -32,7 +33,7 @@ from ..utils import (
class PornHubBaseIE(InfoExtractor):
_NETRC_MACHINE = 'pornhub'
- _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubthbh7ap3u\.onion)'
+ _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd\.onion)'
def _download_webpage_handle(self, *args, **kwargs):
def dl(*args, **kwargs):
@@ -247,7 +248,7 @@ class PornHubIE(PornHubBaseIE):
'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156',
'only_matching': True,
}, {
- 'url': 'http://pornhubthbh7ap3u.onion/view_video.php?viewkey=ph5a9813bfa7156',
+ 'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/view_video.php?viewkey=ph5a9813bfa7156',
'only_matching': True,
}]
@@ -258,8 +259,7 @@ class PornHubIE(PornHubBaseIE):
webpage)
def _extract_count(self, pattern, webpage, name):
- return str_to_int(self._search_regex(
- pattern, webpage, '%s count' % name, fatal=False))
+ return str_to_int(self._search_regex(pattern, webpage, '%s count' % name, default=None))
def _real_extract(self, url):
mobj = self._match_valid_url(url)
@@ -432,7 +432,7 @@ class PornHubIE(PornHubBaseIE):
default=None))
formats.append({
'url': format_url,
- 'format_id': '%dp' % height if height else None,
+ 'format_id': format_field(height, template='%dp'),
'height': height,
})
@@ -562,7 +562,7 @@ class PornHubUserIE(PornHubPlaylistBaseIE):
'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau',
'only_matching': True,
}, {
- 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph',
+ 'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph',
'only_matching': True,
}]
@@ -733,7 +733,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn',
'only_matching': True,
}, {
- 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph/videos',
+ 'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph/videos',
'only_matching': True,
}]
@@ -756,7 +756,7 @@ class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload',
'only_matching': True,
}, {
- 'url': 'http://pornhubthbh7ap3u.onion/pornstar/jenny-blighe/videos/upload',
+ 'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/pornstar/jenny-blighe/videos/upload',
'only_matching': True,
}]
diff --git a/hypervideo_dl/extractor/projectveritas.py b/hypervideo_dl/extractor/projectveritas.py
index 1d832a6..9e9867b 100644
--- a/hypervideo_dl/extractor/projectveritas.py
+++ b/hypervideo_dl/extractor/projectveritas.py
@@ -10,7 +10,7 @@ from ..utils import (
class ProjectVeritasIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)(?:www\.)?projectveritas\.com/(?P<type>news|video)/(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?projectveritas\.com/(?P<type>news|video)/(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.projectveritas.com/news/exclusive-inside-the-new-york-and-new-jersey-hospitals-battling-coronavirus/',
'info_dict': {
diff --git a/hypervideo_dl/extractor/prx.py b/hypervideo_dl/extractor/prx.py
new file mode 100644
index 0000000..80561b8
--- /dev/null
+++ b/hypervideo_dl/extractor/prx.py
@@ -0,0 +1,431 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+from .common import InfoExtractor, SearchInfoExtractor
+from ..utils import (
+ urljoin,
+ traverse_obj,
+ int_or_none,
+ mimetype2ext,
+ clean_html,
+ url_or_none,
+ unified_timestamp,
+ str_or_none,
+)
+
+
+class PRXBaseIE(InfoExtractor):
+ PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx\.org/%s'
+
+ def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'):
+ return self._download_json(
+ urljoin('https://cms.prx.org/api/v1/', path), item_id, query=query, fatal=fatal, note=note)
+
+ @staticmethod
+ def _get_prx_embed_response(response, section):
+ return traverse_obj(response, ('_embedded', f'prx:{section}'))
+
+ @staticmethod
+ def _extract_file_link(response):
+ return url_or_none(traverse_obj(
+ response, ('_links', 'enclosure', 'href'), expected_type=str))
+
+ @classmethod
+ def _extract_image(cls, image_response):
+ if not isinstance(image_response, dict):
+ return
+ return {
+ 'id': str_or_none(image_response.get('id')),
+ 'filesize': image_response.get('size'),
+ 'width': image_response.get('width'),
+ 'height': image_response.get('height'),
+ 'url': cls._extract_file_link(image_response)
+ }
+
+ @classmethod
+ def _extract_base_info(cls, response):
+ if not isinstance(response, dict):
+ return
+ item_id = str_or_none(response.get('id'))
+ if not item_id:
+ return
+ thumbnail_dict = cls._extract_image(cls._get_prx_embed_response(response, 'image'))
+ description = (
+ clean_html(response.get('description'))
+ or response.get('shortDescription'))
+ return {
+ 'id': item_id,
+ 'title': response.get('title') or item_id,
+ 'thumbnails': [thumbnail_dict] if thumbnail_dict else None,
+ 'description': description,
+ 'release_timestamp': unified_timestamp(response.get('releasedAt')),
+ 'timestamp': unified_timestamp(response.get('createdAt')),
+ 'modified_timestamp': unified_timestamp(response.get('updatedAt')),
+ 'duration': int_or_none(response.get('duration')),
+ 'tags': response.get('tags'),
+ 'episode_number': int_or_none(response.get('episodeIdentifier')),
+ 'season_number': int_or_none(response.get('seasonIdentifier'))
+ }
+
+ @classmethod
+ def _extract_series_info(cls, series_response):
+ base_info = cls._extract_base_info(series_response)
+ if not base_info:
+ return
+ account_info = cls._extract_account_info(
+ cls._get_prx_embed_response(series_response, 'account')) or {}
+ return {
+ **base_info,
+ 'channel_id': account_info.get('channel_id'),
+ 'channel_url': account_info.get('channel_url'),
+ 'channel': account_info.get('channel'),
+ 'series': base_info.get('title'),
+ 'series_id': base_info.get('id'),
+ }
+
+ @classmethod
+ def _extract_account_info(cls, account_response):
+ base_info = cls._extract_base_info(account_response)
+ if not base_info:
+ return
+ name = account_response.get('name')
+ return {
+ **base_info,
+ 'title': name,
+ 'channel_id': base_info.get('id'),
+ 'channel_url': 'https://beta.prx.org/accounts/%s' % base_info.get('id'),
+ 'channel': name,
+ }
+
+ @classmethod
+ def _extract_story_info(cls, story_response):
+ base_info = cls._extract_base_info(story_response)
+ if not base_info:
+ return
+ series = cls._extract_series_info(
+ cls._get_prx_embed_response(story_response, 'series')) or {}
+ account = cls._extract_account_info(
+ cls._get_prx_embed_response(story_response, 'account')) or {}
+ return {
+ **base_info,
+ 'series': series.get('series'),
+ 'series_id': series.get('series_id'),
+ 'channel_id': account.get('channel_id'),
+ 'channel_url': account.get('channel_url'),
+ 'channel': account.get('channel')
+ }
+
+ def _entries(self, item_id, endpoint, entry_func, query=None):
+ """
+ Extract entries from paginated list API
+ @param entry_func: Function to generate entry from response item
+ """
+ total = 0
+ for page in itertools.count(1):
+ response = self._call_api(f'{item_id}: page {page}', endpoint, query={
+ **(query or {}),
+ 'page': page,
+ 'per': 100
+ })
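+ # pagination ends on an empty page, or once 'total' items have been yielded (see below)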
+ items = self._get_prx_embed_response(response, 'items')
+ if not response or not items:
+ break
+
+ yield from filter(None, map(entry_func, items))
+
+ total += response['count']
+ if total >= response['total']:
+ break
+
+ def _story_playlist_entry(self, response):
+ story = self._extract_story_info(response)
+ if not story:
+ return
+ story.update({
+ '_type': 'url',
+ 'url': 'https://beta.prx.org/stories/%s' % story['id'],
+ 'ie_key': PRXStoryIE.ie_key()
+ })
+ return story
+
+ def _series_playlist_entry(self, response):
+ series = self._extract_series_info(response)
+ if not series:
+ return
+ series.update({
+ '_type': 'url',
+ 'url': 'https://beta.prx.org/series/%s' % series['id'],
+ 'ie_key': PRXSeriesIE.ie_key()
+ })
+ return series
+
+
+class PRXStoryIE(PRXBaseIE):
+ _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'stories/(?P<id>\d+)'
+
+ _TESTS = [
+ {
+ # Story with season and episode details
+ 'url': 'https://beta.prx.org/stories/399200',
+ 'info_dict': {
+ 'id': '399200',
+ 'title': 'Fly Me To The Moon',
+ 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
+ 'release_timestamp': 1640250000,
+ 'timestamp': 1640208972,
+ 'modified_timestamp': 1641318202,
+ 'duration': 1004,
+ 'tags': 'count:7',
+ 'episode_number': 8,
+ 'season_number': 5,
+ 'series': 'AirSpace',
+ 'series_id': '38057',
+ 'channel_id': '220986',
+ 'channel_url': 'https://beta.prx.org/accounts/220986',
+ 'channel': 'Air and Space Museum',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '399200_part1',
+ 'title': 'Fly Me To The Moon',
+ 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
+ 'release_timestamp': 1640250000,
+ 'timestamp': 1640208972,
+ 'modified_timestamp': 1641318202,
+ 'duration': 530,
+ 'tags': 'count:7',
+ 'episode_number': 8,
+ 'season_number': 5,
+ 'series': 'AirSpace',
+ 'series_id': '38057',
+ 'channel_id': '220986',
+ 'channel_url': 'https://beta.prx.org/accounts/220986',
+ 'channel': 'Air and Space Museum',
+ 'ext': 'mp3',
+ 'upload_date': '20211222',
+ 'episode': 'Episode 8',
+ 'release_date': '20211223',
+ 'season': 'Season 5',
+ 'modified_date': '20220104'
+ }
+ }, {
+ 'info_dict': {
+ 'id': '399200_part2',
+ 'title': 'Fly Me To The Moon',
+ 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
+ 'release_timestamp': 1640250000,
+ 'timestamp': 1640208972,
+ 'modified_timestamp': 1641318202,
+ 'duration': 474,
+ 'tags': 'count:7',
+ 'episode_number': 8,
+ 'season_number': 5,
+ 'series': 'AirSpace',
+ 'series_id': '38057',
+ 'channel_id': '220986',
+ 'channel_url': 'https://beta.prx.org/accounts/220986',
+ 'channel': 'Air and Space Museum',
+ 'ext': 'mp3',
+ 'upload_date': '20211222',
+ 'episode': 'Episode 8',
+ 'release_date': '20211223',
+ 'season': 'Season 5',
+ 'modified_date': '20220104'
+ }
+ }
+
+ ]
+ }, {
+ # Story with only split audio
+ 'url': 'https://beta.prx.org/stories/326414',
+ 'info_dict': {
+ 'id': '326414',
+ 'title': 'Massachusetts v EPA',
+ 'description': 'md5:744fffba08f19f4deab69fa8d49d5816',
+ 'timestamp': 1592509124,
+ 'modified_timestamp': 1592510457,
+ 'duration': 3088,
+ 'tags': 'count:0',
+ 'series': 'Outside/In',
+ 'series_id': '36252',
+ 'channel_id': '206',
+ 'channel_url': 'https://beta.prx.org/accounts/206',
+ 'channel': 'New Hampshire Public Radio',
+ },
+ 'playlist_count': 4
+ }, {
+ # Story with single combined audio
+ 'url': 'https://beta.prx.org/stories/400404',
+ 'info_dict': {
+ 'id': '400404',
+ 'title': 'Cafe Chill (Episode 2022-01)',
+ 'thumbnails': 'count:1',
+ 'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539',
+ 'timestamp': 1641233952,
+ 'modified_timestamp': 1641234248,
+ 'duration': 3540,
+ 'series': 'Café Chill',
+ 'series_id': '37762',
+ 'channel_id': '5767',
+ 'channel_url': 'https://beta.prx.org/accounts/5767',
+ 'channel': 'C89.5 - KNHC Seattle',
+ 'ext': 'mp3',
+ 'tags': 'count:0',
+ 'thumbnail': r're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg',
+ 'upload_date': '20220103',
+ 'modified_date': '20220103'
+ }
+ }, {
+ 'url': 'https://listen.prx.org/stories/399200',
+ 'only_matching': True
+ }
+ ]
+
+ def _extract_audio_pieces(self, audio_response):
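+ # pieces carry a 'position' index; sort on it so multi-part stories stay in order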
+ return [{
+ 'format_id': str_or_none(piece_response.get('id')),
+ 'format_note': str_or_none(piece_response.get('label')),
+ 'filesize': int_or_none(piece_response.get('size')),
+ 'duration': int_or_none(piece_response.get('duration')),
+ 'ext': mimetype2ext(piece_response.get('contentType')),
+ 'asr': int_or_none(piece_response.get('frequency'), scale=1000),
+ 'abr': int_or_none(piece_response.get('bitRate')),
+ 'url': self._extract_file_link(piece_response),
+ 'vcodec': 'none'
+ } for piece_response in sorted(
+ self._get_prx_embed_response(audio_response, 'items') or [],
+ key=lambda p: int_or_none(p.get('position')))]
+
+ def _extract_story(self, story_response):
+ info = self._extract_story_info(story_response)
+ if not info:
+ return
+ audio_pieces = self._extract_audio_pieces(
+ self._get_prx_embed_response(story_response, 'audio'))
+ if len(audio_pieces) == 1:
+ return {
+ 'formats': audio_pieces,
+ **info
+ }
+
+ entries = [{
+ **info,
+ 'id': '%s_part%d' % (info['id'], (idx + 1)),
+ 'formats': [fmt],
+ } for idx, fmt in enumerate(audio_pieces)]
+ return {
+ '_type': 'multi_video',
+ 'entries': entries,
+ **info
+ }
+
+ def _real_extract(self, url):
+ story_id = self._match_id(url)
+ response = self._call_api(story_id, f'stories/{story_id}')
+ return self._extract_story(response)
+
+
+class PRXSeriesIE(PRXBaseIE):
+ _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'series/(?P<id>\d+)'
+ _TESTS = [
+ {
+ 'url': 'https://beta.prx.org/series/36252',
+ 'info_dict': {
+ 'id': '36252',
+ 'title': 'Outside/In',
+ 'thumbnails': 'count:1',
+ 'description': 'md5:a6bedc5f810777bcb09ab30ff9059114',
+ 'timestamp': 1470684964,
+ 'modified_timestamp': 1582308830,
+ 'channel_id': '206',
+ 'channel_url': 'https://beta.prx.org/accounts/206',
+ 'channel': 'New Hampshire Public Radio',
+ 'series': 'Outside/In',
+ 'series_id': '36252'
+ },
+ 'playlist_mincount': 39
+ }, {
+ # Blank series
+ 'url': 'https://beta.prx.org/series/25038',
+ 'info_dict': {
+ 'id': '25038',
+ 'title': '25038',
+ 'timestamp': 1207612800,
+ 'modified_timestamp': 1207612800,
+ 'channel_id': '206',
+ 'channel_url': 'https://beta.prx.org/accounts/206',
+ 'channel': 'New Hampshire Public Radio',
+ 'series': '25038',
+ 'series_id': '25038'
+ },
+ 'playlist_count': 0
+ }
+ ]
+
+ def _extract_series(self, series_response):
+ info = self._extract_series_info(series_response)
+ return {
+ '_type': 'playlist',
+ 'entries': self._entries(info['id'], 'series/%s/stories' % info['id'], self._story_playlist_entry),
+ **info
+ }
+
+ def _real_extract(self, url):
+ series_id = self._match_id(url)
+ response = self._call_api(series_id, f'series/{series_id}')
+ return self._extract_series(response)
+
+
+class PRXAccountIE(PRXBaseIE):
+ _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'accounts/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://beta.prx.org/accounts/206',
+ 'info_dict': {
+ 'id': '206',
+ 'title': 'New Hampshire Public Radio',
+ 'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
+ 'channel_id': '206',
+ 'channel_url': 'https://beta.prx.org/accounts/206',
+ 'channel': 'New Hampshire Public Radio',
+ 'thumbnails': 'count:1'
+ },
+ 'playlist_mincount': 380
+ }]
+
+ def _extract_account(self, account_response):
+ info = self._extract_account_info(account_response)
+ series = self._entries(
+ info['id'], f'accounts/{info["id"]}/series', self._series_playlist_entry)
+ stories = self._entries(
+ info['id'], f'accounts/{info["id"]}/stories', self._story_playlist_entry)
+ return {
+ '_type': 'playlist',
+ 'entries': itertools.chain(series, stories),
+ **info
+ }
+
+ def _real_extract(self, url):
+ account_id = self._match_id(url)
+ response = self._call_api(account_id, f'accounts/{account_id}')
+ return self._extract_account(response)
+
+
+class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
+ IE_DESC = 'PRX Stories Search'
+ IE_NAME = 'prxstories:search'
+ _SEARCH_KEY = 'prxstories'
+
+ def _search_results(self, query):
+ yield from self._entries(
+ f'query {query}', 'stories/search', self._story_playlist_entry, query={'q': query})
+
+
+class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
+ IE_DESC = 'PRX Series Search'
+ IE_NAME = 'prxseries:search'
+ _SEARCH_KEY = 'prxseries'
+
+ def _search_results(self, query):
+ yield from self._entries(
+ f'query {query}', 'series/search', self._series_playlist_entry, query={'q': query})
diff --git a/hypervideo_dl/extractor/radiode.py b/hypervideo_dl/extractor/radiode.py
index 2c06c8b..0382873 100644
--- a/hypervideo_dl/extractor/radiode.py
+++ b/hypervideo_dl/extractor/radiode.py
@@ -29,7 +29,7 @@ class RadioDeIE(InfoExtractor):
webpage, 'broadcast')
broadcast = self._parse_json(jscode, radio_id)
- title = self._live_title(broadcast['name'])
+ title = broadcast['name']
description = broadcast.get('description') or broadcast.get('shortDescription')
thumbnail = broadcast.get('picture4Url') or broadcast.get('picture4TransUrl') or broadcast.get('logo100x100')
diff --git a/hypervideo_dl/extractor/radiokapital.py b/hypervideo_dl/extractor/radiokapital.py
new file mode 100644
index 0000000..2e93e03
--- /dev/null
+++ b/hypervideo_dl/extractor/radiokapital.py
@@ -0,0 +1,99 @@
+# coding: utf-8
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ traverse_obj,
+ unescapeHTML,
+)
+
+import itertools
+from urllib.parse import urlencode
+
+
+class RadioKapitalBaseIE(InfoExtractor):
+ def _call_api(self, resource, video_id, note='Downloading JSON metadata', qs=None):
+ return self._download_json(
+ f'https://www.radiokapital.pl/wp-json/kapital/v1/{resource}?{urlencode(qs or {})}',
+ video_id, note=note)
+
+ def _parse_episode(self, data):
+ release = '%s%s%s' % (data['published'][6:10], data['published'][3:5], data['published'][:2])
+ return {
+ '_type': 'url_transparent',
+ 'url': data['mixcloud_url'],
+ 'ie_key': 'Mixcloud',
+ 'title': unescapeHTML(data['title']),
+ 'description': clean_html(data.get('content')),
+ 'tags': traverse_obj(data, ('tags', ..., 'name')),
+ 'release_date': release,
+ 'series': traverse_obj(data, ('show', 'title')),
+ }
+
+
+class RadioKapitalIE(RadioKapitalBaseIE):
+ IE_NAME = 'radiokapital'
+ _VALID_URL = r'https?://(?:www\.)?radiokapital\.pl/shows/[a-z\d-]+/(?P<id>[a-z\d-]+)'
+
+ _TESTS = [{
+ 'url': 'https://radiokapital.pl/shows/tutaj-sa-smoki/5-its-okay-to-be-immaterial',
+ 'info_dict': {
+ 'id': 'radiokapital_radio-kapitał-tutaj-są-smoki-5-its-okay-to-be-immaterial-2021-05-20',
+ 'ext': 'm4a',
+ 'title': '#5: It’s okay to\xa0be\xa0immaterial',
+ 'description': 'md5:2499da5fbfb0e88333b7d37ec8e9e4c4',
+ 'uploader': 'Radio Kapitał',
+ 'uploader_id': 'radiokapital',
+ 'timestamp': 1621640164,
+ 'upload_date': '20210521',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ episode = self._call_api('episodes/%s' % video_id, video_id)
+ return self._parse_episode(episode)
+
+
+class RadioKapitalShowIE(RadioKapitalBaseIE):
+ IE_NAME = 'radiokapital:show'
+ _VALID_URL = r'https?://(?:www\.)?radiokapital\.pl/shows/(?P<id>[a-z\d-]+)/?(?:$|[?#])'
+
+ _TESTS = [{
+ 'url': 'https://radiokapital.pl/shows/wesz',
+ 'info_dict': {
+ 'id': '100',
+ 'title': 'WĘSZ',
+ 'description': 'md5:3a557a1e0f31af612b0dcc85b1e0ca5c',
+ },
+ 'playlist_mincount': 17,
+ }]
+
+ def _get_episode_list(self, series_id, page_no):
+ return self._call_api(
+ 'episodes', series_id,
+ f'Downloading episode list page #{page_no}', qs={
+ 'show': series_id,
+ 'page': page_no,
+ })
+
+ def _entries(self, series_id):
+ for page_no in itertools.count(1):
+ episode_list = self._get_episode_list(series_id, page_no)
+ yield from (self._parse_episode(ep) for ep in episode_list['items'])
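+ # the API signals the last page with a null 'next' field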
+ if episode_list['next'] is None:
+ break
+
+ def _real_extract(self, url):
+ series_id = self._match_id(url)
+
+ show = self._call_api(f'shows/{series_id}', series_id, 'Downloading show metadata')
+ entries = self._entries(series_id)
+ return {
+ '_type': 'playlist',
+ 'entries': entries,
+ 'id': str(show['id']),
+ 'title': show.get('title'),
+ 'description': clean_html(show.get('content')),
+ }
diff --git a/hypervideo_dl/extractor/radiozet.py b/hypervideo_dl/extractor/radiozet.py
new file mode 100644
index 0000000..2e1ff36
--- /dev/null
+++ b/hypervideo_dl/extractor/radiozet.py
@@ -0,0 +1,51 @@
+# coding: utf-8
+from .common import InfoExtractor
+from ..utils import (
+ traverse_obj,
+ strip_or_none,
+)
+
+
+class RadioZetPodcastIE(InfoExtractor):
+ _VALID_URL = r'https?://player\.radiozet\.pl/Podcasty/.*?/(?P<id>.+)'
+ _TEST = {
+ 'url': 'https://player.radiozet.pl/Podcasty/Nie-Ma-Za-Co/O-przedmiotach-szkolnych-ktore-przydaja-sie-w-zyciu',
+ 'md5': 'e03665c316b4fbc5f6a8f232948bbba3',
+ 'info_dict': {
+ 'id': '42154',
+ 'display_id': 'O-przedmiotach-szkolnych-ktore-przydaja-sie-w-zyciu',
+ 'title': 'O przedmiotach szkolnych, które przydają się w życiu',
+ 'description': 'md5:fa72bed49da334b09e5b2f79851f185c',
+ 'release_timestamp': 1592985480,
+ 'ext': 'mp3',
+ 'thumbnail': r're:^https?://.*\.png$',
+ 'duration': 83,
+ 'series': 'Nie Ma Za Co',
+ 'creator': 'Katarzyna Pakosińska',
+ }
+ }
+
+ def _call_api(self, podcast_id, display_id):
+ return self._download_json(
+ f'https://player.radiozet.pl/api/podcasts/getPodcast/(node)/{podcast_id}/(station)/radiozet',
+ display_id)
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ podcast_id = self._html_search_regex(r'<div.*?\sid="player".*?\sdata-id=[\'"]([^\'"]+)[\'"]',
+ webpage, 'podcast id')
+ data = self._call_api(podcast_id, display_id)['data'][0]
+
+ return {
+ 'id': podcast_id,
+ 'display_id': display_id,
+ 'title': strip_or_none(data.get('title')),
+ 'description': strip_or_none(traverse_obj(data, ('program', 'desc'))),
+ 'release_timestamp': data.get('published_date'),
+ 'url': traverse_obj(data, ('player', 'stream')),
+ 'thumbnail': traverse_obj(data, ('program', 'image', 'original')),
+ 'duration': traverse_obj(data, ('player', 'duration')),
+ 'series': strip_or_none(traverse_obj(data, ('program', 'title'))),
+ 'creator': strip_or_none(traverse_obj(data, ('presenter', 0, 'title'))),
+ }
diff --git a/hypervideo_dl/extractor/radlive.py b/hypervideo_dl/extractor/radlive.py
index 2de7ab0..dc98973 100644
--- a/hypervideo_dl/extractor/radlive.py
+++ b/hypervideo_dl/extractor/radlive.py
@@ -1,6 +1,12 @@
import json
-from ..utils import ExtractorError, traverse_obj, try_get, unified_timestamp
+from ..utils import (
+ ExtractorError,
+ format_field,
+ traverse_obj,
+ try_get,
+ unified_timestamp
+)
from .common import InfoExtractor
@@ -74,7 +80,7 @@ class RadLiveIE(InfoExtractor):
'release_timestamp': release_date,
'channel': channel.get('name'),
'channel_id': channel_id,
- 'channel_url': f'https://rad.live/content/channel/{channel_id}' if channel_id else None,
+ 'channel_url': format_field(channel_id, template='https://rad.live/content/channel/%s'),
}
if content_type == 'episode':
diff --git a/hypervideo_dl/extractor/rai.py b/hypervideo_dl/extractor/rai.py
index 27cd018..6864129 100644
--- a/hypervideo_dl/extractor/rai.py
+++ b/hypervideo_dl/extractor/rai.py
@@ -11,14 +11,17 @@ from ..compat import (
from ..utils import (
determine_ext,
ExtractorError,
+ filter_dict,
find_xpath_attr,
fix_xml_ampersands,
GeoRestrictedError,
HEADRequest,
int_or_none,
+ join_nonempty,
parse_duration,
remove_start,
strip_or_none,
+ traverse_obj,
try_get,
unified_strdate,
unified_timestamp,
@@ -33,7 +36,7 @@ class RaiBaseIE(InfoExtractor):
_GEO_COUNTRIES = ['IT']
_GEO_BYPASS = False
- def _extract_relinker_info(self, relinker_url, video_id):
+ def _extract_relinker_info(self, relinker_url, video_id, audio_only=False):
if not re.match(r'https?://', relinker_url):
return {'formats': [{'url': relinker_url}]}
@@ -76,7 +79,15 @@ class RaiBaseIE(InfoExtractor):
if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'):
continue
- if ext == 'm3u8' or 'format=m3u8' in media_url or platform == 'mon':
+ if ext == 'mp3':
+ formats.append({
+ 'url': media_url,
+ 'vcodec': 'none',
+ 'acodec': 'mp3',
+ 'format_id': 'http-mp3',
+ })
+ break
+ elif ext == 'm3u8' or 'format=m3u8' in media_url or platform == 'mon':
formats.extend(self._extract_m3u8_formats(
media_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
@@ -97,16 +108,17 @@ class RaiBaseIE(InfoExtractor):
if not formats and geoprotection is True:
self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
- formats.extend(self._create_http_urls(relinker_url, formats))
+ if not audio_only:
+ formats.extend(self._create_http_urls(relinker_url, formats))
- return dict((k, v) for k, v in {
+ return filter_dict({
'is_live': is_live,
'duration': duration,
'formats': formats,
- }.items() if v is not None)
+ })
def _create_http_urls(self, relinker_url, fmts):
- _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\d+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?'
+ _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?'
_MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s'
_QUALITY = {
# tbr: w, h
@@ -135,6 +147,9 @@ class RaiBaseIE(InfoExtractor):
return False if resp.url == url else resp.url
return None
+ # filter out audio-only formats
+ fmts = [f for f in fmts if f.get('vcodec') != 'none']
+
def get_format_info(tbr):
import math
br = int_or_none(tbr)
@@ -226,7 +241,7 @@ class RaiPlayIE(RaiBaseIE):
'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
'ext': 'mp4',
'title': 'Report del 07/04/2014',
- 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014',
+ 'alt_title': 'St 2013/14 - Report - Espresso nel caffè - 07/04/2014',
'description': 'md5:d730c168a58f4bb35600fc2f881ec04e',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Rai Gulp',
@@ -234,7 +249,7 @@ class RaiPlayIE(RaiBaseIE):
'series': 'Report',
'season': '2013/14',
'subtitles': {
- 'it': 'count:2',
+ 'it': 'count:4',
},
},
'params': {
@@ -242,18 +257,18 @@ class RaiPlayIE(RaiBaseIE):
},
}, {
# 1080p direct mp4 url
- 'url': 'https://www.raiplay.it/video/2021/03/Leonardo-S1E1-b5703b02-82ee-475a-85b6-c9e4a8adf642.html',
- 'md5': '2e501e8651d72f05ffe8f5d286ad560b',
+ 'url': 'https://www.raiplay.it/video/2021/11/Blanca-S1E1-Senza-occhi-b1255a4a-8e72-4a2f-b9f3-fc1308e00736.html',
+ 'md5': 'aeda7243115380b2dd5e881fd42d949a',
'info_dict': {
- 'id': 'b5703b02-82ee-475a-85b6-c9e4a8adf642',
+ 'id': 'b1255a4a-8e72-4a2f-b9f3-fc1308e00736',
'ext': 'mp4',
- 'title': 'Leonardo - S1E1',
- 'alt_title': 'St 1 Ep 1 - Episodio 1',
- 'description': 'md5:f5360cd267d2de146e4e3879a5a47d31',
+ 'title': 'Blanca - S1E1 - Senza occhi',
+ 'alt_title': 'St 1 Ep 1 - Blanca - Senza occhi',
+ 'description': 'md5:75f95d5c030ec8bac263b1212322e28c',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Rai 1',
- 'duration': 3229,
- 'series': 'Leonardo',
+ 'duration': 6493,
+ 'series': 'Blanca',
'season': 'Season 1',
},
}, {
@@ -306,12 +321,13 @@ class RaiPlayIE(RaiBaseIE):
program_info = media.get('program_info') or {}
season = media.get('season')
+ alt_title = join_nonempty(media.get('subtitle'), media.get('toptitle'), delim=' - ')
+
info = {
'id': remove_start(media.get('id'), 'ContentItem-') or video_id,
'display_id': video_id,
- 'title': self._live_title(title) if relinker_info.get(
- 'is_live') else title,
- 'alt_title': strip_or_none(media.get('subtitle')),
+ 'title': title,
+ 'alt_title': strip_or_none(alt_title),
'description': media.get('description'),
'uploader': strip_or_none(media.get('channel')),
'creator': strip_or_none(media.get('editor') or None),
@@ -351,26 +367,44 @@ class RaiPlayLiveIE(RaiPlayIE):
class RaiPlayPlaylistIE(InfoExtractor):
- _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))'
+ _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?'
_TESTS = [{
- 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/',
+ 'url': 'https://www.raiplay.it/programmi/nondirloalmiocapo/',
'info_dict': {
'id': 'nondirloalmiocapo',
'title': 'Non dirlo al mio capo',
'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b',
},
'playlist_mincount': 12,
+ }, {
+ 'url': 'https://www.raiplay.it/programmi/nondirloalmiocapo/episodi/stagione-2/',
+ 'info_dict': {
+ 'id': 'nondirloalmiocapo',
+ 'title': 'Non dirlo al mio capo - Stagione 2',
+ 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b',
+ },
+ 'playlist_mincount': 12,
}]
def _real_extract(self, url):
- base, playlist_id = self._match_valid_url(url).groups()
+ base, playlist_id, extra_id = self._match_valid_url(url).groups()
program = self._download_json(
base + '.json', playlist_id, 'Downloading program JSON')
+ if extra_id:
+ extra_id = extra_id.upper().rstrip('/')
+
+ playlist_title = program.get('name')
entries = []
for b in (program.get('blocks') or []):
for s in (b.get('sets') or []):
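+ # a season URL must match '<block name>/<set name>' with spaces dashed, compared case-insensitively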
+ if extra_id:
+ if extra_id != join_nonempty(
+ b.get('name'), s.get('name'), delim='/').replace(' ', '-').upper():
+ continue
+ playlist_title = join_nonempty(playlist_title, s.get('name'), delim=' - ')
+
s_id = s.get('id')
if not s_id:
continue
@@ -389,10 +423,128 @@ class RaiPlayPlaylistIE(InfoExtractor):
video_id=RaiPlayIE._match_id(video_url)))
return self.playlist_result(
- entries, playlist_id, program.get('name'),
+ entries, playlist_id, playlist_title,
try_get(program, lambda x: x['program_info']['description']))
+class RaiPlaySoundIE(RaiBaseIE):
+ _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/.+?-(?P<id>%s))\.(?:html|json)' % RaiBaseIE._UUID_RE
+ _TESTS = [{
+ 'url': 'https://www.raiplaysound.it/audio/2021/12/IL-RUGGITO-DEL-CONIGLIO-1ebae2a7-7cdb-42bb-842e-fe0d193e9707.html',
+ 'md5': '8970abf8caf8aef4696e7b1f2adfc696',
+ 'info_dict': {
+ 'id': '1ebae2a7-7cdb-42bb-842e-fe0d193e9707',
+ 'ext': 'mp3',
+ 'title': 'Il Ruggito del Coniglio del 10/12/2021',
+ 'description': 'md5:2a17d2107e59a4a8faa0e18334139ee2',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'rai radio 2',
+ 'duration': 5685,
+ 'series': 'Il Ruggito del Coniglio',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ base, audio_id = self._match_valid_url(url).group('base', 'id')
+ media = self._download_json(f'{base}.json', audio_id, 'Downloading audio JSON')
+ uid = try_get(media, lambda x: remove_start(remove_start(x['uniquename'], 'ContentItem-'), 'Page-'))
+
+ info = {}
+ formats = []
+ relinkers = set(traverse_obj(media, (('downloadable_audio', 'audio', ('live', 'cards', 0, 'audio')), 'url')))
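+ # resolve each relinker; audio_only=True skips synthesizing HTTP video URLs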
+ for r in relinkers:
+ info = self._extract_relinker_info(r, audio_id, True)
+ formats.extend(info.get('formats'))
+
+ date_published = try_get(media, (lambda x: f'{x["create_date"]} {x.get("create_time") or ""}',
+ lambda x: x['live']['create_date']))
+
+ podcast_info = traverse_obj(media, 'podcast_info', ('live', 'cards', 0)) or {}
+ thumbnails = [{
+ 'url': urljoin(url, thumb_url),
+ } for thumb_url in (podcast_info.get('images') or {}).values() if thumb_url]
+
+ return {
+ **info,
+ 'id': uid or audio_id,
+ 'display_id': audio_id,
+ 'title': traverse_obj(media, 'title', 'episode_title'),
+ 'alt_title': traverse_obj(media, ('track_info', 'media_name')),
+ 'description': media.get('description'),
+ 'uploader': traverse_obj(media, ('track_info', 'channel'), expected_type=strip_or_none),
+ 'creator': traverse_obj(media, ('track_info', 'editor'), expected_type=strip_or_none),
+ 'timestamp': unified_timestamp(date_published),
+ 'thumbnails': thumbnails,
+ 'series': podcast_info.get('title'),
+ 'season_number': int_or_none(media.get('season')),
+ 'episode': media.get('episode_title'),
+ 'episode_number': int_or_none(media.get('episode')),
+ 'formats': formats,
+ }
+
+
+class RaiPlaySoundLiveIE(RaiPlaySoundIE):
+ _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/(?P<id>[^/?#&]+)$)'
+ _TESTS = [{
+ 'url': 'https://www.raiplaysound.it/radio2',
+ 'info_dict': {
+ 'id': 'b00a50e6-f404-4af6-8f8c-ff3b9af73a44',
+ 'display_id': 'radio2',
+ 'ext': 'mp4',
+ 'title': 'Rai Radio 2',
+ 'uploader': 'rai radio 2',
+ 'creator': 'raiplaysound',
+ 'is_live': True,
+ },
+ 'params': {
+ 'skip_download': 'live',
+ },
+ }]
+
+
+class RaiPlaySoundPlaylistIE(InfoExtractor):
+ _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/(?:programmi|playlist|audiolibri)/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?'
+ _TESTS = [{
+ 'url': 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio',
+ 'info_dict': {
+ 'id': 'ilruggitodelconiglio',
+ 'title': 'Il Ruggito del Coniglio',
+ 'description': 'md5:1bbaf631245a7ab1ec4d9fbb3c7aa8f3',
+ },
+ 'playlist_mincount': 65,
+ }, {
+ 'url': 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio/puntate/prima-stagione-1995',
+ 'info_dict': {
+ 'id': 'ilruggitodelconiglio_puntate_prima-stagione-1995',
+ 'title': 'Prima Stagione 1995',
+ },
+ 'playlist_count': 1,
+ }]
+
+ def _real_extract(self, url):
+ base, playlist_id, extra_id = self._match_valid_url(url).group('base', 'id', 'extra_id')
+ url = f'{base}.json'
+ program = self._download_json(url, playlist_id, 'Downloading program JSON')
+
+ if extra_id:
+ extra_id = extra_id.rstrip('/')
+ playlist_id += '_' + extra_id.replace('/', '_')
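+ # look up the filter whose weblink matches the sub-listing and fetch its dedicated JSON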
+ path = next(c['path_id'] for c in program.get('filters') or [] if extra_id in (c.get('weblink') or ''))
+ program = self._download_json(
+ urljoin('https://www.raiplaysound.it', path), playlist_id, 'Downloading program secondary JSON')
+
+ entries = [
+ self.url_result(urljoin(base, c['path_id']), ie=RaiPlaySoundIE.ie_key())
+ for c in traverse_obj(program, 'cards', ('block', 'cards')) or []
+ if c.get('path_id')]
+
+ return self.playlist_result(entries, playlist_id, program.get('title'),
+ traverse_obj(program, ('podcast_info', 'description')))
+
+
class RaiIE(RaiBaseIE):
_VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE
_TESTS = [{
diff --git a/hypervideo_dl/extractor/rcti.py b/hypervideo_dl/extractor/rcti.py
index 31d9779..ac42e58 100644
--- a/hypervideo_dl/extractor/rcti.py
+++ b/hypervideo_dl/extractor/rcti.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import itertools
import json
import random
import time
@@ -12,6 +11,7 @@ from ..utils import (
dict_get,
ExtractorError,
strip_or_none,
+ traverse_obj,
try_get
)
@@ -26,7 +26,7 @@ class RCTIPlusBaseIE(InfoExtractor):
json = self._download_json(
url, video_id, note=note, headers={'Authorization': self._AUTH_KEY})
if json.get('status', {}).get('code', 0) != 0:
- raise ExtractorError('%s said: %s' % (self.IE_NAME, json["status"]["message_client"]), cause=json)
+ raise ExtractorError(f'{self.IE_NAME} said: {json["status"]["message_client"]}', cause=json)
return json.get('data'), json.get('meta')
@@ -85,9 +85,6 @@ class RCTIPlusIE(RCTIPlusBaseIE):
'series': 'iNews Malam',
'channel': 'INews',
},
- 'params': {
- 'format': 'bestvideo',
- },
}, { # Missed event/replay
'url': 'https://www.rctiplus.com/missed-event/2507/mou-signing-ceremony-27-juli-2021-1400-wib',
'md5': '649c5f27250faed1452ca8b91e06922d',
@@ -132,7 +129,6 @@ class RCTIPlusIE(RCTIPlusBaseIE):
},
'params': {
'skip_download': True,
- 'format': 'bestvideo',
},
}]
_CONVIVA_JSON_TEMPLATE = {
@@ -227,18 +223,30 @@ class RCTIPlusIE(RCTIPlusBaseIE):
class RCTIPlusSeriesIE(RCTIPlusBaseIE):
- _VALID_URL = r'https://www\.rctiplus\.com/programs/(?P<id>\d+)/(?P<display_id>[^/?#&]+)'
+ _VALID_URL = r'https://www\.rctiplus\.com/programs/(?P<id>\d+)/(?P<display_id>[^/?#&]+)(?:/(?P<type>episodes|extras|clips))?'
_TESTS = [{
- 'url': 'https://www.rctiplus.com/programs/540/upin-ipin',
- 'playlist_mincount': 417,
+ 'url': 'https://www.rctiplus.com/programs/829/putri-untuk-pangeran',
+ 'playlist_mincount': 1019,
'info_dict': {
- 'id': '540',
- 'title': 'Upin & Ipin',
- 'description': 'md5:22cc912381f389664416844e1ec4f86b',
+ 'id': '829',
+ 'title': 'Putri Untuk Pangeran',
+ 'description': 'md5:aca7b54d05bd95a67d4f4613cc1d622d',
+ 'age_limit': 2,
+ 'cast': ['Verrel Bramasta', 'Ranty Maria', 'Riza Syah', 'Ivan Fadilla', 'Nicole Parham', 'Dll', 'Aviv Elham'],
+ 'display_id': 'putri-untuk-pangeran',
+ 'tag': 'count:18',
},
- }, {
- 'url': 'https://www.rctiplus.com/programs/540/upin-ipin/episodes?utm_source=Rplusdweb&utm_medium=share_copy&utm_campaign=programsupin-ipin',
- 'only_matching': True,
+ }, { # No episodes
+ 'url': 'https://www.rctiplus.com/programs/615/inews-pagi',
+ 'playlist_mincount': 388,
+ 'info_dict': {
+ 'id': '615',
+ 'title': 'iNews Pagi',
+ 'description': 'md5:f18ee3d4643cfb41c358e5a9b693ee04',
+ 'age_limit': 2,
+ 'tag': 'count:11',
+ 'display_id': 'inews-pagi',
+ }
}]
_AGE_RATINGS = { # Based off https://id.wikipedia.org/wiki/Sistem_rating_konten_televisi with additional ratings
'S-SU': 2,
@@ -273,47 +281,63 @@ class RCTIPlusSeriesIE(RCTIPlusBaseIE):
display_id, '%s page %s' % (note, page_num))[0] or []
for video_json in episode_list:
- link = video_json['share_link']
- url_res = self.url_result(link, 'RCTIPlus', video_json.get('product_id'), video_json.get('title'))
- url_res.update(metadata)
- yield url_res
+ yield {
+ '_type': 'url',
+ 'url': video_json['share_link'],
+ 'ie_key': RCTIPlusIE.ie_key(),
+ 'id': video_json.get('product_id'),
+ 'title': video_json.get('title'),
+ 'display_id': (video_json.get('title_code') or '').replace('_', '-') or None,
+ 'description': video_json.get('summary'),
+ 'timestamp': video_json.get('release_date'),
+ 'duration': video_json.get('duration'),
+ 'season_number': video_json.get('season'),
+ 'episode_number': video_json.get('episode'),
+ **metadata
+ }
+
+ def _series_entries(self, series_id, display_id=None, video_type=None, metadata={}):
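+ # without an explicit type in the URL, walk episodes, extras and clips in turn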
+ if not video_type or video_type in 'episodes':
+ try:
+ seasons_list = self._call_api(
+ f'https://api.rctiplus.com/api/v1/program/{series_id}/season',
+ display_id, 'Downloading seasons list JSON')[0]
+ except ExtractorError as e:
+ if 'not found' not in str(e):
+ raise
+ seasons_list = []
+ for season in seasons_list:
+ yield from self._entries(
+ f'https://api.rctiplus.com/api/v2/program/{series_id}/episode?season={season["season"]}',
+ display_id, f'Downloading season {season["season"]} episode entries', metadata)
+ if not video_type or video_type in 'extras':
+ yield from self._entries(
+ f'https://api.rctiplus.com/api/v2/program/{series_id}/extra?content_id=0',
+ display_id, 'Downloading extra entries', metadata)
+ if not video_type or video_type in 'clips':
+ yield from self._entries(
+ f'https://api.rctiplus.com/api/v2/program/{series_id}/clip?content_id=0',
+ display_id, 'Downloading clip entries', metadata)
def _real_extract(self, url):
- series_id, display_id = self._match_valid_url(url).groups()
+ series_id, display_id, video_type = self._match_valid_url(url).group('id', 'display_id', 'type')
+ if video_type:
+ self.report_warning(
+ f'Only {video_type} will be downloaded. '
+ f'To download everything from the series, remove "/{video_type}" from the URL')
series_meta, meta_paths = self._call_api(
- 'https://api.rctiplus.com/api/v1/program/%s/detail' % series_id, display_id, 'Downloading series metadata')
+ f'https://api.rctiplus.com/api/v1/program/{series_id}/detail', display_id, 'Downloading series metadata')
metadata = {
- 'age_limit': try_get(series_meta, lambda x: self._AGE_RATINGS[x['age_restriction'][0]['code']])
+ 'age_limit': try_get(series_meta, lambda x: self._AGE_RATINGS[x['age_restriction'][0]['code']]),
+ 'cast': traverse_obj(series_meta, (('starring', 'creator', 'writer'), ..., 'name'),
+ expected_type=lambda x: strip_or_none(x) or None),
+ 'tag': traverse_obj(series_meta, ('tag', ..., 'name'),
+ expected_type=lambda x: strip_or_none(x) or None),
}
-
- cast = []
- for star in series_meta.get('starring', []):
- cast.append(strip_or_none(star.get('name')))
- for star in series_meta.get('creator', []):
- cast.append(strip_or_none(star.get('name')))
- for star in series_meta.get('writer', []):
- cast.append(strip_or_none(star.get('name')))
- metadata['cast'] = cast
-
- tags = []
- for tag in series_meta.get('tag', []):
- tags.append(strip_or_none(tag.get('name')))
- metadata['tag'] = tags
-
- entries = []
- seasons_list = self._call_api(
- 'https://api.rctiplus.com/api/v1/program/%s/season' % series_id, display_id, 'Downloading seasons list JSON')[0]
- for season in seasons_list:
- entries.append(self._entries('https://api.rctiplus.com/api/v2/program/%s/episode?season=%s' % (series_id, season['season']),
- display_id, 'Downloading season %s episode entries' % season['season'], metadata))
-
- entries.append(self._entries('https://api.rctiplus.com/api/v2/program/%s/clip?content_id=0' % series_id,
- display_id, 'Downloading clip entries', metadata))
- entries.append(self._entries('https://api.rctiplus.com/api/v2/program/%s/extra?content_id=0' % series_id,
- display_id, 'Downloading extra entries', metadata))
-
- return self.playlist_result(itertools.chain(*entries), series_id, series_meta.get('title'), series_meta.get('summary'), **metadata)
+ return self.playlist_result(
+ self._series_entries(series_id, display_id, video_type, metadata), series_id,
+ series_meta.get('title'), series_meta.get('summary'), display_id=display_id, **metadata)
class RCTIPlusTVIE(RCTIPlusBaseIE):
@@ -329,7 +353,6 @@ class RCTIPlusTVIE(RCTIPlusBaseIE):
},
'params': {
'skip_download': True,
- 'format': 'bestvideo',
}
}, {
# Returned video will always change
@@ -350,5 +373,6 @@ class RCTIPlusTVIE(RCTIPlusBaseIE):
tv_id = match.get('tvname') or match.get('eventname')
webpage = self._download_webpage(url, tv_id)
video_type, video_id = self._search_regex(
- r'url\s*:\s*["\']https://api\.rctiplus\.com/api/v./(?P<type>[^/]+)/(?P<id>\d+)/url', webpage, 'video link', group=('type', 'id'))
+ r'url\s*:\s*["\']https://api\.rctiplus\.com/api/v./(?P<type>[^/]+)/(?P<id>\d+)/url',
+ webpage, 'video link', group=('type', 'id'))
return self.url_result(f'https://www.rctiplus.com/{video_type}/{video_id}/{tv_id}', 'RCTIPlus')
diff --git a/hypervideo_dl/extractor/redbulltv.py b/hypervideo_dl/extractor/redbulltv.py
index e7fdcce..756a366 100644
--- a/hypervideo_dl/extractor/redbulltv.py
+++ b/hypervideo_dl/extractor/redbulltv.py
@@ -81,12 +81,11 @@ class RedBullTVIE(InfoExtractor):
title = video['title'].strip()
- formats = self._extract_m3u8_formats(
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
'https://dms.redbull.tv/v3/%s/%s/playlist.m3u8' % (video_id, token),
video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
self._sort_formats(formats)
- subtitles = {}
for resource in video.get('resources', []):
if resource.startswith('closed_caption_'):
splitted_resource = resource.split('_')
diff --git a/hypervideo_dl/extractor/reddit.py b/hypervideo_dl/extractor/reddit.py
index c75d95a..a042a59 100644
--- a/hypervideo_dl/extractor/reddit.py
+++ b/hypervideo_dl/extractor/reddit.py
@@ -8,46 +8,11 @@ from ..utils import (
try_get,
unescapeHTML,
url_or_none,
+ traverse_obj
)
class RedditIE(InfoExtractor):
- _VALID_URL = r'https?://v\.redd\.it/(?P<id>[^/?#&]+)'
- _TEST = {
- # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/
- 'url': 'https://v.redd.it/zv89llsvexdz',
- 'md5': '0a070c53eba7ec4534d95a5a1259e253',
- 'info_dict': {
- 'id': 'zv89llsvexdz',
- 'ext': 'mp4',
- 'title': 'zv89llsvexdz',
- },
- 'params': {
- 'format': 'bestvideo',
- },
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- formats = self._extract_m3u8_formats(
- 'https://v.redd.it/%s/HLSPlaylist.m3u8' % video_id, video_id,
- 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
-
- formats.extend(self._extract_mpd_formats(
- 'https://v.redd.it/%s/DASHPlaylist.mpd' % video_id, video_id,
- mpd_id='dash', fatal=False))
-
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': video_id,
- 'formats': formats,
- }
-
-
-class RedditRIE(InfoExtractor):
_VALID_URL = r'https?://(?P<subdomain>[^/]+\.)?reddit(?:media)?\.com/r/(?P<slug>[^/]+/comments/(?P<id>[^/?#&]+))'
_TESTS = [{
'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/',
@@ -67,7 +32,6 @@ class RedditRIE(InfoExtractor):
'age_limit': 0,
},
'params': {
- 'format': 'bestvideo',
'skip_download': True,
},
}, {
@@ -151,19 +115,53 @@ class RedditRIE(InfoExtractor):
for resolution in resolutions:
add_thumbnail(resolution)
- return {
- '_type': 'url_transparent',
- 'url': video_url,
+ info = {
'title': data.get('title'),
'thumbnails': thumbnails,
'timestamp': float_or_none(data.get('created_utc')),
'uploader': data.get('author'),
- 'duration': int_or_none(try_get(
- data,
- (lambda x: x['media']['reddit_video']['duration'],
- lambda x: x['secure_media']['reddit_video']['duration']))),
'like_count': int_or_none(data.get('ups')),
'dislike_count': int_or_none(data.get('downs')),
'comment_count': int_or_none(data.get('num_comments')),
'age_limit': age_limit,
}
+
+ # Check if media is hosted on reddit:
+ reddit_video = traverse_obj(data, (('media', 'secure_media'), 'reddit_video'), get_all=False)
+ if reddit_video:
+ playlist_urls = [
+ try_get(reddit_video, lambda x: unescapeHTML(x[y]))
+ for y in ('dash_url', 'hls_url')
+ ]
+
+ # Update video_id
+ display_id = video_id
+ video_id = self._search_regex(
+ r'https?://v\.redd\.it/(?P<id>[^/?#&]+)', reddit_video['fallback_url'],
+ 'video_id', default=display_id)
+
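+ # fall back to the canonical v.redd.it playlist URLs when the API response omits them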
+ dash_playlist_url = playlist_urls[0] or f'https://v.redd.it/{video_id}/DASHPlaylist.mpd'
+ hls_playlist_url = playlist_urls[1] or f'https://v.redd.it/{video_id}/HLSPlaylist.m3u8'
+
+ formats = self._extract_m3u8_formats(
+ hls_playlist_url, display_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+ formats.extend(self._extract_mpd_formats(
+ dash_playlist_url, display_id, mpd_id='dash', fatal=False))
+ self._sort_formats(formats)
+
+ return {
+ **info,
+ 'id': video_id,
+ 'display_id': display_id,
+ 'formats': formats,
+ 'duration': int_or_none(reddit_video.get('duration')),
+ }
+
+ # Not hosted on reddit, must continue extraction
+ return {
+ **info,
+ 'display_id': video_id,
+ '_type': 'url_transparent',
+ 'url': video_url,
+ }
diff --git a/hypervideo_dl/extractor/redgifs.py b/hypervideo_dl/extractor/redgifs.py
new file mode 100644
index 0000000..55196b7
--- /dev/null
+++ b/hypervideo_dl/extractor/redgifs.py
@@ -0,0 +1,232 @@
+# coding: utf-8
+import functools
+
+from .common import InfoExtractor
+from ..compat import compat_parse_qs
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ qualities,
+ try_get,
+ OnDemandPagedList,
+)
+
+
+class RedGifsBaseInfoExtractor(InfoExtractor):
+ _FORMATS = {
+ 'gif': 250,
+ 'sd': 480,
+ 'hd': None,
+ }
+
+ def _parse_gif_data(self, gif_data):
+ video_id = gif_data.get('id')
+ quality = qualities(tuple(self._FORMATS.keys()))
+
+ orig_height = int_or_none(gif_data.get('height'))
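+ # width/height ratio, used below to recover the width of the scaled-down variants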
+ aspect_ratio = try_get(gif_data, lambda x: x['width'] / orig_height)
+
+ formats = []
+ for format_id, height in self._FORMATS.items():
+ video_url = gif_data['urls'].get(format_id)
+ if not video_url:
+ continue
+ height = min(orig_height, height or orig_height)
+ formats.append({
+ 'url': video_url,
+ 'format_id': format_id,
+ 'width': height * aspect_ratio if aspect_ratio else None,
+ 'height': height,
+ 'quality': quality(format_id),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'webpage_url': f'https://redgifs.com/watch/{video_id}',
+ 'ie_key': RedGifsIE.ie_key(),
+ 'extractor': 'RedGifs',
+ 'title': ' '.join(gif_data.get('tags') or []) or 'RedGifs',
+ 'timestamp': int_or_none(gif_data.get('createDate')),
+ 'uploader': gif_data.get('userName'),
+ 'duration': int_or_none(gif_data.get('duration')),
+ 'view_count': int_or_none(gif_data.get('views')),
+ 'like_count': int_or_none(gif_data.get('likes')),
+ 'categories': gif_data.get('tags') or [],
+ 'tags': gif_data.get('tags'),
+ 'age_limit': 18,
+ 'formats': formats,
+ }
+
+ def _call_api(self, ep, video_id, *args, **kwargs):
+ data = self._download_json(
+ f'https://api.redgifs.com/v2/{ep}', video_id, *args, **kwargs)
+ if 'error' in data:
+ raise ExtractorError(f'RedGifs said: {data["error"]}', expected=True, video_id=video_id)
+ return data
+
+ def _fetch_page(self, ep, video_id, query, page):
+ query['page'] = page + 1
+ data = self._call_api(
+ ep, video_id, query=query, note=f'Downloading JSON metadata page {page + 1}')
+
+ for entry in data['gifs']:
+ yield self._parse_gif_data(entry)
+
+ def _prepare_api_query(self, query, fields):
+ api_query = [
+ (field_name, query.get(field_name, (default,))[0])
+ for field_name, default in fields.items()]
+
+ return {key: val for key, val in api_query if val is not None}
+
+ def _paged_entries(self, ep, item_id, query, fields):
+ page = int_or_none(query.get('page', (None,))[0])
+ page_fetcher = functools.partial(
+ self._fetch_page, ep, item_id, self._prepare_api_query(query, fields))
+ return page_fetcher(page) if page else OnDemandPagedList(page_fetcher, self._PAGE_SIZE)
+
+
+class RedGifsIE(RedGifsBaseInfoExtractor):
+ _VALID_URL = r'https?://(?:(?:www\.)?redgifs\.com/watch/|thumbs2\.redgifs\.com/)(?P<id>[^-/?#\.]+)'
+ _TESTS = [{
+ 'url': 'https://www.redgifs.com/watch/squeakyhelplesswisent',
+ 'info_dict': {
+ 'id': 'squeakyhelplesswisent',
+ 'ext': 'mp4',
+ 'title': 'Hotwife Legs Thick',
+ 'timestamp': 1636287915,
+ 'upload_date': '20211107',
+ 'uploader': 'ignored52',
+ 'duration': 16,
+ 'view_count': int,
+ 'like_count': int,
+ 'categories': list,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://thumbs2.redgifs.com/SqueakyHelplessWisent-mobile.mp4#t=0',
+ 'info_dict': {
+ 'id': 'squeakyhelplesswisent',
+ 'ext': 'mp4',
+ 'title': 'Hotwife Legs Thick',
+ 'timestamp': 1636287915,
+ 'upload_date': '20211107',
+ 'uploader': 'ignored52',
+ 'duration': 16,
+ 'view_count': int,
+ 'like_count': int,
+ 'categories': list,
+ 'age_limit': 18,
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url).lower()
+ video_info = self._call_api(
+ f'gifs/{video_id}', video_id, note='Downloading video info')
+ return self._parse_gif_data(video_info['gif'])
+
+
+class RedGifsSearchIE(RedGifsBaseInfoExtractor):
+ IE_DESC = 'Redgifs search'
+ _VALID_URL = r'https?://(?:www\.)?redgifs\.com/browse\?(?P<query>[^#]+)'
+ _PAGE_SIZE = 80
+ _TESTS = [
+ {
+ 'url': 'https://www.redgifs.com/browse?tags=Lesbian',
+ 'info_dict': {
+ 'id': 'tags=Lesbian',
+ 'title': 'Lesbian',
+ 'description': 'RedGifs search for Lesbian, ordered by trending'
+ },
+ 'playlist_mincount': 100,
+ },
+ {
+ 'url': 'https://www.redgifs.com/browse?type=g&order=latest&tags=Lesbian',
+ 'info_dict': {
+ 'id': 'type=g&order=latest&tags=Lesbian',
+ 'title': 'Lesbian',
+ 'description': 'RedGifs search for Lesbian, ordered by latest'
+ },
+ 'playlist_mincount': 100,
+ },
+ {
+ 'url': 'https://www.redgifs.com/browse?type=g&order=latest&tags=Lesbian&page=2',
+ 'info_dict': {
+ 'id': 'type=g&order=latest&tags=Lesbian&page=2',
+ 'title': 'Lesbian',
+ 'description': 'RedGifs search for Lesbian, ordered by latest'
+ },
+ 'playlist_count': 80,
+ }
+ ]
+
+ def _real_extract(self, url):
+ query_str = self._match_valid_url(url).group('query')
+ query = compat_parse_qs(query_str)
+ if not query.get('tags'):
+ raise ExtractorError('Invalid query tags', expected=True)
+
+ tags = query.get('tags')[0]
+ order = query.get('order', ('trending',))[0]
+
+ query['search_text'] = [tags]
+ entries = self._paged_entries('gifs/search', query_str, query, {
+ 'search_text': None,
+ 'order': 'trending',
+ 'type': None,
+ })
+
+ return self.playlist_result(
+ entries, query_str, tags, f'RedGifs search for {tags}, ordered by {order}')
+
+
+class RedGifsUserIE(RedGifsBaseInfoExtractor):
+ IE_DESC = 'Redgifs user'
+ _VALID_URL = r'https?://(?:www\.)?redgifs\.com/users/(?P<username>[^/?#]+)(?:\?(?P<query>[^#]+))?'
+ _PAGE_SIZE = 30
+ _TESTS = [
+ {
+ 'url': 'https://www.redgifs.com/users/lamsinka89',
+ 'info_dict': {
+ 'id': 'lamsinka89',
+ 'title': 'lamsinka89',
+ 'description': 'RedGifs user lamsinka89, ordered by recent'
+ },
+ 'playlist_mincount': 100,
+ },
+ {
+ 'url': 'https://www.redgifs.com/users/lamsinka89?page=3',
+ 'info_dict': {
+ 'id': 'lamsinka89?page=3',
+ 'title': 'lamsinka89',
+ 'description': 'RedGifs user lamsinka89, ordered by recent'
+ },
+ 'playlist_count': 30,
+ },
+ {
+ 'url': 'https://www.redgifs.com/users/lamsinka89?order=best&type=g',
+ 'info_dict': {
+ 'id': 'lamsinka89?order=best&type=g',
+ 'title': 'lamsinka89',
+ 'description': 'RedGifs user lamsinka89, ordered by best'
+ },
+ 'playlist_mincount': 100,
+ }
+ ]
+
+ def _real_extract(self, url):
+ username, query_str = self._match_valid_url(url).group('username', 'query')
+ playlist_id = f'{username}?{query_str}' if query_str else username
+
+ query = compat_parse_qs(query_str)
+ order = query.get('order', ('recent',))[0]
+
+ entries = self._paged_entries(f'users/{username}/search', playlist_id, query, {
+ 'order': 'recent',
+ 'type': None,
+ })
+
+ return self.playlist_result(
+ entries, playlist_id, username, f'RedGifs user {username}, ordered by {order}')
diff --git a/hypervideo_dl/extractor/redtube.py b/hypervideo_dl/extractor/redtube.py
index 747ce51..7fee54f 100644
--- a/hypervideo_dl/extractor/redtube.py
+++ b/hypervideo_dl/extractor/redtube.py
@@ -17,17 +17,20 @@ from ..utils import (
class RedTubeIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)'
_TESTS = [{
- 'url': 'http://www.redtube.com/66418',
- 'md5': 'fc08071233725f26b8f014dba9590005',
+ 'url': 'https://www.redtube.com/38864951',
+ 'md5': '4fba70cbca3aefd25767ab4b523c9878',
'info_dict': {
- 'id': '66418',
+ 'id': '38864951',
'ext': 'mp4',
- 'title': 'Sucked on a toilet',
- 'upload_date': '20110811',
- 'duration': 596,
+ 'title': 'Public Sex on the Balcony in Freezing Paris! Amateur Couple LeoLulu',
+ 'description': 'Watch video Public Sex on the Balcony in Freezing Paris! Amateur Couple LeoLulu on Redtube, home of free Blowjob porn videos and Blonde sex movies online. Video length: (10:46) - Uploaded by leolulu - Verified User - Starring Pornstar: Leolulu',
+ 'upload_date': '20210111',
+ 'timestamp': 1610343109,
+ 'duration': 646,
'view_count': int,
'age_limit': 18,
- }
+ 'thumbnail': r're:https://\wi-ph\.rdtcdn\.com/videos/.+/.+\.jpg',
+ },
}, {
'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286',
'only_matching': True,
@@ -84,15 +87,25 @@ class RedTubeIE(InfoExtractor):
r'mediaDefinition["\']?\s*:\s*(\[.+?}\s*\])', webpage,
'media definitions', default='{}'),
video_id, fatal=False)
- if medias and isinstance(medias, list):
- for media in medias:
+ for media in medias if isinstance(medias, list) else []:
+ format_url = url_or_none(media.get('videoUrl'))
+ if not format_url:
+ continue
+ format_id = media.get('format')
+ quality = media.get('quality')
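+ # 'hls' and quality-less 'mp4' entries appear to point at a JSON manifest listing the actual media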
+ if format_id == 'hls' or (format_id == 'mp4' and not quality):
+ more_media = self._download_json(format_url, video_id, fatal=False)
+ else:
+ more_media = [media]
+ for media in more_media if isinstance(more_media, list) else []:
format_url = url_or_none(media.get('videoUrl'))
if not format_url:
continue
- if media.get('format') == 'hls' or determine_ext(format_url) == 'm3u8':
+ format_id = media.get('format')
+ if format_id == 'hls' or determine_ext(format_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats(
format_url, video_id, 'mp4',
- entry_protocol='m3u8_native', m3u8_id='hls',
+ entry_protocol='m3u8_native', m3u8_id=format_id or 'hls',
fatal=False))
continue
format_id = media.get('quality')
diff --git a/hypervideo_dl/extractor/rmcdecouverte.py b/hypervideo_dl/extractor/rmcdecouverte.py
index 422d47a..8bfce34 100644
--- a/hypervideo_dl/extractor/rmcdecouverte.py
+++ b/hypervideo_dl/extractor/rmcdecouverte.py
@@ -26,7 +26,6 @@ class RMCDecouverteIE(InfoExtractor):
'upload_date': '20210428',
},
'params': {
- 'format': 'bestvideo',
'skip_download': True,
},
}, {
diff --git a/hypervideo_dl/extractor/rokfin.py b/hypervideo_dl/extractor/rokfin.py
new file mode 100644
index 0000000..0fd65db
--- /dev/null
+++ b/hypervideo_dl/extractor/rokfin.py
@@ -0,0 +1,256 @@
+# coding: utf-8
+import itertools
+from datetime import datetime
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ ExtractorError,
+ float_or_none,
+ format_field,
+ int_or_none,
+ str_or_none,
+ traverse_obj,
+ unified_timestamp,
+ url_or_none,
+)
+
+
+_API_BASE_URL = 'https://prod-api-v2.production.rokfin.com/api/v2/public/'
+
+
+class RokfinIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?rokfin\.com/(?P<id>(?P<type>post|stream)/\d+)'
+ _TESTS = [{
+ 'url': 'https://www.rokfin.com/post/57548/Mitt-Romneys-Crazy-Solution-To-Climate-Change',
+ 'info_dict': {
+ 'id': 'post/57548',
+ 'ext': 'mp4',
+ 'title': 'Mitt Romney\'s Crazy Solution To Climate Change',
+ 'thumbnail': r're:https://img\.production\.rokfin\.com/.+',
+ 'upload_date': '20211023',
+ 'timestamp': 1634998029,
+ 'channel': 'Jimmy Dore',
+ 'channel_id': 65429,
+ 'channel_url': 'https://rokfin.com/TheJimmyDoreShow',
+ 'duration': 213.0,
+ 'availability': 'public',
+ 'live_status': 'not_live',
+ 'dislike_count': int,
+ 'like_count': int,
+ }
+ }, {
+ 'url': 'https://rokfin.com/post/223/Julian-Assange-Arrested-Streaming-In-Real-Time',
+ 'info_dict': {
+ 'id': 'post/223',
+ 'ext': 'mp4',
+ 'title': 'Julian Assange Arrested: Streaming In Real Time',
+ 'thumbnail': r're:https://img\.production\.rokfin\.com/.+',
+ 'upload_date': '20190412',
+ 'timestamp': 1555052644,
+ 'channel': 'Ron Placone',
+ 'channel_id': 10,
+ 'channel_url': 'https://rokfin.com/RonPlacone',
+ 'availability': 'public',
+ 'live_status': 'not_live',
+ 'dislike_count': int,
+ 'like_count': int,
+ 'tags': ['FreeThinkingMedia^', 'RealProgressives^'],
+ }
+ }, {
+ 'url': 'https://www.rokfin.com/stream/10543/Its-A-Crazy-Mess-Regional-Director-Blows-Whistle-On-Pfizers-Vaccine-Trial-Data',
+ 'info_dict': {
+ 'id': 'stream/10543',
+ 'ext': 'mp4',
+ 'title': '"It\'s A Crazy Mess" Regional Director Blows Whistle On Pfizer\'s Vaccine Trial Data',
+ 'thumbnail': r're:https://img\.production\.rokfin\.com/.+',
+ 'description': 'md5:324ce2d3e3b62e659506409e458b9d8e',
+ 'channel': 'Ryan Cristián',
+ 'channel_id': 53856,
+ 'channel_url': 'https://rokfin.com/TLAVagabond',
+ 'availability': 'public',
+ 'is_live': False,
+ 'was_live': True,
+ 'live_status': 'was_live',
+ 'timestamp': 1635874720,
+ 'release_timestamp': 1635874720,
+ 'release_date': '20211102',
+ 'upload_date': '20211102',
+ 'dislike_count': int,
+ 'like_count': int,
+ 'tags': ['FreeThinkingMedia^'],
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id, video_type = self._match_valid_url(url).group('id', 'type')
+
+ metadata = self._download_json(f'{_API_BASE_URL}{video_id}', video_id)
+
+ scheduled = unified_timestamp(metadata.get('scheduledAt'))
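+        # Derive live status: a recorded stop time means the stream already aired,
+        # a scheduled time means it is upcoming, a bare 'stream' URL is live now,
+        # and anything else is an ordinary VOD post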
+ live_status = ('was_live' if metadata.get('stoppedAt')
+ else 'is_upcoming' if scheduled
+ else 'is_live' if video_type == 'stream'
+ else 'not_live')
+
+ video_url = traverse_obj(metadata, 'url', ('content', 'contentUrl'), expected_type=url_or_none)
+ formats, subtitles = [{'url': video_url}] if video_url else [], {}
+ if determine_ext(video_url) == 'm3u8':
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ video_url, video_id, fatal=False, live=live_status == 'is_live')
+
+ if not formats:
+ if traverse_obj(metadata, 'premiumPlan', 'premium'):
+ self.raise_login_required('This video is only available to premium users', True, method='cookies')
+ elif scheduled:
+ self.raise_no_formats(
+                f'Stream is offline; scheduled for {datetime.fromtimestamp(scheduled).strftime("%Y-%m-%d %H:%M:%S")}',
+ video_id=video_id, expected=True)
+ self._sort_formats(formats)
+
+ uploader = traverse_obj(metadata, ('createdBy', 'username'), ('creator', 'username'))
+ timestamp = (scheduled or float_or_none(metadata.get('postedAtMilli'), 1000)
+ or unified_timestamp(metadata.get('creationDateTime')))
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'title': str_or_none(traverse_obj(metadata, 'title', ('content', 'contentTitle'))),
+ 'duration': float_or_none(traverse_obj(metadata, ('content', 'duration'))),
+ 'thumbnail': url_or_none(traverse_obj(metadata, 'thumbnail', ('content', 'thumbnailUrl1'))),
+ 'description': str_or_none(traverse_obj(metadata, 'description', ('content', 'contentDescription'))),
+ 'like_count': int_or_none(metadata.get('likeCount')),
+ 'dislike_count': int_or_none(metadata.get('dislikeCount')),
+ 'channel': str_or_none(traverse_obj(metadata, ('createdBy', 'name'), ('creator', 'name'))),
+ 'channel_id': traverse_obj(metadata, ('createdBy', 'id'), ('creator', 'id')),
+ 'channel_url': url_or_none(f'https://rokfin.com/{uploader}') if uploader else None,
+ 'timestamp': timestamp,
+ 'release_timestamp': timestamp if live_status != 'not_live' else None,
+ 'tags': traverse_obj(metadata, ('tags', ..., 'title'), expected_type=str_or_none),
+ 'live_status': live_status,
+ 'availability': self._availability(
+ needs_premium=bool(traverse_obj(metadata, 'premiumPlan', 'premium')),
+ is_private=False, needs_subscription=False, needs_auth=False, is_unlisted=False),
+ # 'comment_count': metadata.get('numComments'), # Data provided by website is wrong
+ '__post_extractor': self.extract_comments(video_id) if video_type == 'post' else None,
+ }
+
+ def _get_comments(self, video_id):
+ pages_total = None
+ for page_n in itertools.count():
+ raw_comments = self._download_json(
+ f'{_API_BASE_URL}comment?postId={video_id[5:]}&page={page_n}&size=50',
+ video_id, note=f'Downloading viewer comments page {page_n + 1}{format_field(pages_total, template=" of %s")}',
+ fatal=False) or {}
+
+ for comment in raw_comments.get('content') or []:
+ yield {
+ 'text': str_or_none(comment.get('comment')),
+ 'author': str_or_none(comment.get('name')),
+ 'id': comment.get('commentId'),
+ 'author_id': comment.get('userId'),
+ 'parent': 'root',
+ 'like_count': int_or_none(comment.get('numLikes')),
+ 'dislike_count': int_or_none(comment.get('numDislikes')),
+ 'timestamp': unified_timestamp(comment.get('postedAt'))
+ }
+
+ pages_total = int_or_none(raw_comments.get('totalPages')) or None
+ is_last = raw_comments.get('last')
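+            # Stop on an empty page, an explicit 'last' flag, or once past the
+            # reported page total; if the API gives no hints, stop defensively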
+ if not raw_comments.get('content') or is_last or (page_n > pages_total if pages_total else is_last is not False):
+ return
+
+
+class RokfinPlaylistBaseIE(InfoExtractor):
+ _TYPES = {
+ 'video': 'post',
+ 'audio': 'post',
+ 'stream': 'stream',
+ 'dead_stream': 'stream',
+ 'stack': 'stack',
+ }
+
+ def _get_video_data(self, metadata):
+ for content in metadata.get('content') or []:
+ media_type = self._TYPES.get(content.get('mediaType'))
+ video_id = content.get('id') if media_type == 'post' else content.get('mediaId')
+ if not media_type or not video_id:
+ continue
+
+ yield self.url_result(f'https://rokfin.com/{media_type}/{video_id}', video_id=f'{media_type}/{video_id}',
+ video_title=str_or_none(traverse_obj(content, ('content', 'contentTitle'))))
+
+
+class RokfinStackIE(RokfinPlaylistBaseIE):
+ IE_NAME = 'rokfin:stack'
+ _VALID_URL = r'https?://(?:www\.)?rokfin\.com/stack/(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'https://www.rokfin.com/stack/271/Tulsi-Gabbard-Portsmouth-Townhall-FULL--Feb-9-2020',
+ 'playlist_count': 8,
+ 'info_dict': {
+ 'id': '271',
+ },
+ }]
+
+ def _real_extract(self, url):
+ list_id = self._match_id(url)
+ return self.playlist_result(self._get_video_data(
+ self._download_json(f'{_API_BASE_URL}stack/{list_id}', list_id)), list_id)
+
+
+class RokfinChannelIE(RokfinPlaylistBaseIE):
+ IE_NAME = 'rokfin:channel'
+ _VALID_URL = r'https?://(?:www\.)?rokfin\.com/(?!((feed/?)|(discover/?)|(channels/?))$)(?P<id>[^/]+)/?$'
+ _TESTS = [{
+ 'url': 'https://rokfin.com/TheConvoCouch',
+ 'playlist_mincount': 100,
+ 'info_dict': {
+ 'id': '12071-new',
+ 'title': 'TheConvoCouch - New',
+ 'description': 'md5:bb622b1bca100209b91cd685f7847f06',
+ },
+ }]
+
+ _TABS = {
+ 'new': 'posts',
+ 'top': 'top',
+ 'videos': 'video',
+ 'podcasts': 'audio',
+ 'streams': 'stream',
+ 'stacks': 'stack',
+ }
+
+ def _real_initialize(self):
+ self._validate_extractor_args()
+
+ def _validate_extractor_args(self):
+ requested_tabs = self._configuration_arg('tab', None)
+ if requested_tabs is not None and (len(requested_tabs) > 1 or requested_tabs[0] not in self._TABS):
+ raise ExtractorError(f'Invalid extractor-arg "tab". Must be one of {", ".join(self._TABS)}', expected=True)
+
+ def _entries(self, channel_id, channel_name, tab):
+ pages_total = None
+ for page_n in itertools.count(0):
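+            # 'posts' and 'top' are served from the user endpoint; media-type tabs
+            # go through the post search endpoint filtered by creator id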
+ if tab in ('posts', 'top'):
+ data_url = f'{_API_BASE_URL}user/{channel_name}/{tab}?page={page_n}&size=50'
+ else:
+ data_url = f'{_API_BASE_URL}post/search/{tab}?page={page_n}&size=50&creator={channel_id}'
+ metadata = self._download_json(
+ data_url, channel_name,
+ note=f'Downloading video metadata page {page_n + 1}{format_field(pages_total, template=" of %s")}')
+
+ yield from self._get_video_data(metadata)
+ pages_total = int_or_none(metadata.get('totalPages')) or None
+ is_last = metadata.get('last')
+ if is_last or (page_n > pages_total if pages_total else is_last is not False):
+ return
+
+ def _real_extract(self, url):
+ channel_name = self._match_id(url)
+ channel_info = self._download_json(f'{_API_BASE_URL}user/{channel_name}', channel_name)
+ channel_id = channel_info['id']
+ tab = self._configuration_arg('tab', default=['new'])[0]
+
+ return self.playlist_result(
+ self._entries(channel_id, channel_name, self._TABS[tab]),
+ f'{channel_id}-{tab}', f'{channel_name} - {tab.title()}', str_or_none(channel_info.get('description')))
diff --git a/hypervideo_dl/extractor/roosterteeth.py b/hypervideo_dl/extractor/roosterteeth.py
index 2c815bd..a55dd4f 100644
--- a/hypervideo_dl/extractor/roosterteeth.py
+++ b/hypervideo_dl/extractor/roosterteeth.py
@@ -1,25 +1,88 @@
# coding: utf-8
-from __future__ import unicode_literals
-
from .common import InfoExtractor
-from ..compat import (
- compat_HTTPError,
- compat_str,
-)
+from ..compat import compat_HTTPError
from ..utils import (
ExtractorError,
int_or_none,
+ join_nonempty,
+ LazyList,
+ parse_qs,
str_or_none,
+ traverse_obj,
+ url_or_none,
urlencode_postdata,
+ urljoin,
+ update_url_query,
)
-class RoosterTeethIE(InfoExtractor):
- _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P<id>[^/?#&]+)'
+class RoosterTeethBaseIE(InfoExtractor):
_NETRC_MACHINE = 'roosterteeth'
+ _API_BASE = 'https://svod-be.roosterteeth.com'
+ _API_BASE_URL = f'{_API_BASE}/api/v1'
+
+ def _perform_login(self, username, password):
+ if self._get_cookies(self._API_BASE_URL).get('rt_access_token'):
+ return
+
+ try:
+ self._download_json(
+ 'https://auth.roosterteeth.com/oauth/token',
+ None, 'Logging in', data=urlencode_postdata({
+ 'client_id': '4338d2b4bdc8db1239360f28e72f0d9ddb1fd01e7a38fbb07b4b1f4ba4564cc5',
+ 'grant_type': 'password',
+ 'username': username,
+ 'password': password,
+ }))
+ except ExtractorError as e:
+ msg = 'Unable to login'
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ resp = self._parse_json(e.cause.read().decode(), None, fatal=False)
+ if resp:
+ error = resp.get('extra_info') or resp.get('error_description') or resp.get('error')
+ if error:
+ msg += ': ' + error
+ self.report_warning(msg)
+
+ def _extract_video_info(self, data):
+ thumbnails = []
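+        # Each image object lists several named size variants in its attributes;
+        # expose every valid URL as a separate thumbnail keyed by variant name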
+ for image in traverse_obj(data, ('included', 'images')):
+ if image.get('type') not in ('episode_image', 'bonus_feature_image'):
+ continue
+ thumbnails.extend([{
+ 'id': name,
+ 'url': url,
+ } for name, url in (image.get('attributes') or {}).items() if url_or_none(url)])
+
+ attributes = data.get('attributes') or {}
+ title = traverse_obj(attributes, 'title', 'display_title')
+ sub_only = attributes.get('is_sponsors_only')
+
+ return {
+ 'id': str(data.get('id')),
+ 'display_id': attributes.get('slug'),
+ 'title': title,
+ 'description': traverse_obj(attributes, 'description', 'caption'),
+ 'series': attributes.get('show_title'),
+ 'season_number': int_or_none(attributes.get('season_number')),
+ 'season_id': attributes.get('season_id'),
+ 'episode': title,
+ 'episode_number': int_or_none(attributes.get('number')),
+ 'episode_id': str_or_none(data.get('uuid')),
+ 'channel_id': attributes.get('channel_id'),
+ 'duration': int_or_none(attributes.get('length')),
+ 'thumbnails': thumbnails,
+ 'availability': self._availability(
+ needs_premium=sub_only, needs_subscription=sub_only, needs_auth=sub_only,
+ is_private=False, is_unlisted=False),
+ 'tags': attributes.get('genres')
+ }
+
+
+class RoosterTeethIE(RoosterTeethBaseIE):
+ _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement',
- 'md5': 'e2bd7764732d785ef797700a2489f212',
'info_dict': {
'id': '9156',
'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement',
@@ -30,19 +93,20 @@ class RoosterTeethIE(InfoExtractor):
'series': 'Million Dollars, But...',
'episode': 'Million Dollars, But... The Game Announcement',
},
+ 'params': {'skip_download': True},
}, {
'url': 'https://roosterteeth.com/watch/rwby-bonus-25',
- 'md5': 'fe8d9d976b272c18a24fe7f1f5830084',
'info_dict': {
- 'id': '31',
+ 'id': '40432',
'display_id': 'rwby-bonus-25',
- 'title': 'Volume 2, World of Remnant 3',
- 'description': 'md5:8d58d3270292ea11da00ea712bbfb009',
- 'episode': 'Volume 2, World of Remnant 3',
- 'channel_id': 'fab60c1c-29cb-43bc-9383-5c3538d9e246',
+ 'title': 'Grimm',
+ 'description': 'md5:f30ff570741213418a8d2c19868b93ab',
+ 'episode': 'Grimm',
+ 'channel_id': '92f780eb-ebfe-4bf5-a3b5-c6ad5460a5f1',
'thumbnail': r're:^https?://.*\.(png|jpe?g)$',
'ext': 'mp4',
},
+ 'params': {'skip_download': True},
}, {
'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31',
'only_matching': True,
@@ -63,40 +127,10 @@ class RoosterTeethIE(InfoExtractor):
'url': 'https://roosterteeth.com/watch/million-dollars-but-season-2-million-dollars-but-the-game-announcement',
'only_matching': True,
}]
- _EPISODE_BASE_URL = 'https://svod-be.roosterteeth.com/api/v1/watch/'
-
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
- try:
- self._download_json(
- 'https://auth.roosterteeth.com/oauth/token',
- None, 'Logging in', data=urlencode_postdata({
- 'client_id': '4338d2b4bdc8db1239360f28e72f0d9ddb1fd01e7a38fbb07b4b1f4ba4564cc5',
- 'grant_type': 'password',
- 'username': username,
- 'password': password,
- }))
- except ExtractorError as e:
- msg = 'Unable to login'
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
- resp = self._parse_json(e.cause.read().decode(), None, fatal=False)
- if resp:
- error = resp.get('extra_info') or resp.get('error_description') or resp.get('error')
- if error:
- msg += ': ' + error
- self.report_warning(msg)
-
- def _real_initialize(self):
- if self._get_cookies(self._EPISODE_BASE_URL).get('rt_access_token'):
- return
- self._login()
def _real_extract(self, url):
display_id = self._match_id(url)
- api_episode_url = self._EPISODE_BASE_URL + display_id
+ api_episode_url = f'{self._API_BASE_URL}/watch/{display_id}'
try:
video_data = self._download_json(
@@ -118,36 +152,62 @@ class RoosterTeethIE(InfoExtractor):
episode = self._download_json(
api_episode_url, display_id,
'Downloading episode JSON metadata')['data'][0]
- attributes = episode['attributes']
- title = attributes.get('title') or attributes['display_title']
- video_id = compat_str(episode['id'])
-
- thumbnails = []
- for image in episode.get('included', {}).get('images', []):
- if image.get('type') in ('episode_image', 'bonus_feature_image'):
- img_attributes = image.get('attributes') or {}
- for k in ('thumb', 'small', 'medium', 'large'):
- img_url = img_attributes.get(k)
- if img_url:
- thumbnails.append({
- 'id': k,
- 'url': img_url,
- })
return {
- 'id': video_id,
'display_id': display_id,
- 'title': title,
- 'description': attributes.get('description') or attributes.get('caption'),
- 'thumbnails': thumbnails,
- 'series': attributes.get('show_title'),
- 'season_number': int_or_none(attributes.get('season_number')),
- 'season_id': attributes.get('season_id'),
- 'episode': title,
- 'episode_number': int_or_none(attributes.get('number')),
- 'episode_id': str_or_none(episode.get('uuid')),
'formats': formats,
- 'channel_id': attributes.get('channel_id'),
- 'duration': int_or_none(attributes.get('length')),
- 'subtitles': subtitles
+ 'subtitles': subtitles,
+ **self._extract_video_info(episode)
+ }
+
+
+class RoosterTeethSeriesIE(RoosterTeethBaseIE):
+ _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/series/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://roosterteeth.com/series/rwby?season=7',
+ 'playlist_count': 13,
+ 'info_dict': {
+ 'id': 'rwby-7',
+ 'title': 'RWBY - Season 7',
}
+ }, {
+ 'url': 'https://roosterteeth.com/series/role-initiative',
+ 'playlist_mincount': 16,
+ 'info_dict': {
+ 'id': 'role-initiative',
+ 'title': 'Role Initiative',
+ }
+ }, {
+ 'url': 'https://roosterteeth.com/series/let-s-play-minecraft?season=9',
+ 'playlist_mincount': 50,
+ 'info_dict': {
+ 'id': 'let-s-play-minecraft-9',
+ 'title': 'Let\'s Play Minecraft - Season 9',
+ }
+ }]
+
+ def _entries(self, series_id, season_number):
+ display_id = join_nonempty(series_id, season_number)
+ # TODO: extract bonus material
+ for data in self._download_json(
+ f'{self._API_BASE_URL}/shows/{series_id}/seasons?order=asc&order_by', display_id)['data']:
+ idx = traverse_obj(data, ('attributes', 'number'))
+ if season_number and idx != season_number:
+ continue
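+            # Ask for up to 1000 episodes per season to avoid paginating listings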
+ season_url = update_url_query(urljoin(self._API_BASE, data['links']['episodes']), {'per_page': 1000})
+ season = self._download_json(season_url, display_id, f'Downloading season {idx} JSON metadata')['data']
+ for episode in season:
+ yield self.url_result(
+ f'https://www.roosterteeth.com{episode["canonical_links"]["self"]}',
+ RoosterTeethIE.ie_key(),
+ **self._extract_video_info(episode))
+
+ def _real_extract(self, url):
+ series_id = self._match_id(url)
+ season_number = traverse_obj(parse_qs(url), ('season', 0), expected_type=int_or_none)
+
+ entries = LazyList(self._entries(series_id, season_number))
+ return self.playlist_result(
+ entries,
+ join_nonempty(series_id, season_number),
+ join_nonempty(entries[0].get('series'), season_number, delim=' - Season '))
diff --git a/hypervideo_dl/extractor/rtbf.py b/hypervideo_dl/extractor/rtbf.py
index f9979d0..4b61fdb 100644
--- a/hypervideo_dl/extractor/rtbf.py
+++ b/hypervideo_dl/extractor/rtbf.py
@@ -85,8 +85,6 @@ class RTBFIE(InfoExtractor):
title = data['title']
is_live = data.get('isLive')
- if is_live:
- title = self._live_title(title)
height_re = r'-(\d+)p\.'
formats = []
diff --git a/hypervideo_dl/extractor/rtl2.py b/hypervideo_dl/extractor/rtl2.py
index 4e3aa03..e291714 100644
--- a/hypervideo_dl/extractor/rtl2.py
+++ b/hypervideo_dl/extractor/rtl2.py
@@ -4,16 +4,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..aes import aes_cbc_decrypt
+from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
from ..compat import (
compat_b64decode,
- compat_ord,
compat_str,
)
from ..utils import (
- bytes_to_intlist,
ExtractorError,
- intlist_to_bytes,
int_or_none,
strip_or_none,
)
@@ -142,17 +139,12 @@ class RTL2YouIE(RTL2YouBaseIE):
self._BACKWERK_BASE_URL + 'stream/video/' + video_id, video_id)
data, iv = compat_b64decode(stream_data['streamUrl']).decode().split(':')
- stream_url = intlist_to_bytes(aes_cbc_decrypt(
- bytes_to_intlist(compat_b64decode(data)),
- bytes_to_intlist(self._AES_KEY),
- bytes_to_intlist(compat_b64decode(iv))
- ))
+ stream_url = unpad_pkcs7(aes_cbc_decrypt_bytes(
+ compat_b64decode(data), self._AES_KEY, compat_b64decode(iv)))
if b'rtl2_you_video_not_found' in stream_url:
raise ExtractorError('video not found', expected=True)
- formats = self._extract_m3u8_formats(
- stream_url[:-compat_ord(stream_url[-1])].decode(),
- video_id, 'mp4', 'm3u8_native')
+ formats = self._extract_m3u8_formats(stream_url.decode(), video_id, 'mp4', 'm3u8_native')
self._sort_formats(formats)
video_data = self._download_json(
diff --git a/hypervideo_dl/extractor/rtnews.py b/hypervideo_dl/extractor/rtnews.py
new file mode 100644
index 0000000..68b6044
--- /dev/null
+++ b/hypervideo_dl/extractor/rtnews.py
@@ -0,0 +1,199 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import js_to_json
+
+
+class RTNewsIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?rt\.com/[^/]+/(?:[^/]+/)?(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://www.rt.com/sport/546301-djokovic-arrives-belgrade-crowds/',
+ 'playlist_mincount': 2,
+ 'info_dict': {
+ 'id': '546301',
+ 'title': 'Crowds gather to greet deported Djokovic as he returns to Serbia (VIDEO)',
+ 'description': 'md5:1d5bfe1a988d81fd74227cfdf93d314d',
+ 'thumbnail': 'https://cdni.rt.com/files/2022.01/article/61e587a085f540102c3386c1.png'
+ },
+ }, {
+ 'url': 'https://www.rt.com/shows/in-question/535980-plot-to-assassinate-julian-assange/',
+ 'playlist_mincount': 1,
+ 'info_dict': {
+ 'id': '535980',
+ 'title': 'The plot to assassinate Julian Assange',
+ 'description': 'md5:55279ce5e4441dc1d16e2e4a730152cd',
+ 'thumbnail': 'https://cdni.rt.com/files/2021.09/article/615226f42030274e8879b53d.png'
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '6152271d85f5400464496162',
+ 'ext': 'mp4',
+ 'title': '6152271d85f5400464496162',
+ },
+ }]
+ }]
+
+ def _entries(self, webpage):
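+        # Article pages embed bare CDN mp4 links with no per-clip metadata, so
+        # dedupe them and reuse the hex basename as both id and title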
+ video_urls = set(re.findall(r'https://cdnv\.rt\.com/.*[a-f0-9]+\.mp4', webpage))
+ for v_url in video_urls:
+ v_id = re.search(r'([a-f0-9]+)\.mp4', v_url).group(1)
+ if v_id:
+ yield {
+ 'id': v_id,
+ 'title': v_id,
+ 'url': v_url,
+ }
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+
+ return {
+ '_type': 'playlist',
+ 'id': id,
+ 'entries': self._entries(webpage),
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
+
+
+class RTDocumentryIE(InfoExtractor):
+ _VALID_URL = r'https?://rtd\.rt\.com/(?:(?:series|shows)/[^/]+|films)/(?P<id>[^/?$&#]+)'
+
+ _TESTS = [{
+ 'url': 'https://rtd.rt.com/films/escobars-hitman/',
+ 'info_dict': {
+ 'id': 'escobars-hitman',
+ 'ext': 'mp4',
+ 'title': "Escobar's Hitman. Former drug-gang killer, now loved and loathed in Colombia",
+ 'description': 'md5:647c76984b7cb9a8b52a567e87448d88',
+ 'thumbnail': 'https://cdni.rt.com/rtd-files/films/escobars-hitman/escobars-hitman_11.jpg',
+ 'average_rating': 8.53,
+ 'duration': 3134.0
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://rtd.rt.com/shows/the-kalashnikova-show-military-secrets-anna-knishenko/iskander-tactical-system-natos-headache/',
+ 'info_dict': {
+ 'id': 'iskander-tactical-system-natos-headache',
+ 'ext': 'mp4',
+ 'title': "Iskander tactical system. NATO's headache | The Kalashnikova Show. Episode 10",
+ 'description': 'md5:da7c24a0aa67bc2bb88c86658508ca87',
+ 'thumbnail': 'md5:89de8ce38c710b7c501ff02d47e2aa89',
+ 'average_rating': 9.27,
+ 'duration': 274.0,
+ 'timestamp': 1605726000,
+ 'view_count': int,
+ 'upload_date': '20201118'
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://rtd.rt.com/series/i-am-hacked-trailer/introduction-to-safe-digital-life-ep2/',
+ 'info_dict': {
+ 'id': 'introduction-to-safe-digital-life-ep2',
+ 'ext': 'mp4',
+ 'title': 'How to Keep your Money away from Hackers | I am Hacked. Episode 2',
+ 'description': 'md5:c46fa9a5af86c0008c45a3940a8cce87',
+ 'thumbnail': 'md5:a5e81b9bf5aed8f5e23d9c053601b825',
+ 'average_rating': 10.0,
+ 'duration': 1524.0,
+ 'timestamp': 1636977600,
+ 'view_count': int,
+ 'upload_date': '20211115'
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
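+        # Descriptive metadata comes from JSON-LD when present; the playable
+        # sources live in the page's inline 'Med' player configuration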
+ ld_json = self._search_json_ld(webpage, None, fatal=False)
+ if not ld_json:
+ self.raise_no_formats('No video/audio found at the provided url.', expected=True)
+ media_json = self._parse_json(
+ self._search_regex(r'(?s)\'Med\'\s*:\s*\[\s*({.+})\s*\]\s*};', webpage, 'media info'),
+ id, transform_source=js_to_json)
+ if 'title' not in ld_json and 'title' in media_json:
+ ld_json['title'] = media_json['title']
+ formats = [{'url': src['file']} for src in media_json.get('sources') or [] if src.get('file')]
+
+ return {
+ 'id': id,
+ 'thumbnail': media_json.get('image'),
+ 'formats': formats,
+ **ld_json
+ }
+
+
+class RTDocumentryPlaylistIE(InfoExtractor):
+ _VALID_URL = r'https?://rtd\.rt\.com/(?:series|shows)/(?P<id>[^/]+)/$'
+
+ _TESTS = [{
+ 'url': 'https://rtd.rt.com/series/i-am-hacked-trailer/',
+ 'playlist_mincount': 6,
+ 'info_dict': {
+ 'id': 'i-am-hacked-trailer',
+ },
+ }, {
+ 'url': 'https://rtd.rt.com/shows/the-kalashnikova-show-military-secrets-anna-knishenko/',
+ 'playlist_mincount': 34,
+ 'info_dict': {
+ 'id': 'the-kalashnikova-show-military-secrets-anna-knishenko',
+ },
+ }]
+
+ def _entries(self, webpage, id):
+ video_urls = set(re.findall(r'list-2__link\s*"\s*href="([^"]+)"', webpage))
+ for v_url in video_urls:
+ if id not in v_url:
+ continue
+ yield self.url_result(
+ 'https://rtd.rt.com%s' % v_url,
+ ie=RTDocumentryIE.ie_key())
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+
+ return {
+ '_type': 'playlist',
+ 'id': id,
+ 'entries': self._entries(webpage, id),
+ }
+
+
+class RuptlyIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?ruptly\.tv/[a-z]{2}/videos/(?P<id>\d+-\d+)'
+
+ _TESTS = [{
+ 'url': 'https://www.ruptly.tv/en/videos/20220112-020-Japan-Double-trouble-Tokyo-zoo-presents-adorable-panda-twins',
+ 'info_dict': {
+ 'id': '20220112-020',
+ 'ext': 'mp4',
+ 'title': 'Japan: Double trouble! Tokyo zoo presents adorable panda twins | Video Ruptly',
+ 'description': 'md5:85a8da5fdb31486f0562daf4360ce75a',
+ 'thumbnail': 'https://storage.ruptly.tv/thumbnails/20220112-020/i6JQKnTNpYuqaXsR/i6JQKnTNpYuqaXsR.jpg'
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ m3u8_url = self._search_regex(r'preview_url"\s?:\s?"(https?://storage\.ruptly\.tv/video_projects/.+\.m3u8)"', webpage, 'm3u8 url', fatal=False)
+ if not m3u8_url:
+ self.raise_no_formats('No video/audio found at the provided url.', expected=True)
+ formats, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, id, ext='mp4')
+ return {
+ 'id': id,
+ 'formats': formats,
+ 'subtitles': subs,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
diff --git a/hypervideo_dl/extractor/rtrfm.py b/hypervideo_dl/extractor/rtrfm.py
new file mode 100644
index 0000000..93d51e8
--- /dev/null
+++ b/hypervideo_dl/extractor/rtrfm.py
@@ -0,0 +1,67 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class RTRFMIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?rtrfm\.com\.au/(?:shows|show-episode)/(?P<id>[^/?\#&]+)'
+ _TESTS = [
+ {
+ 'url': 'https://rtrfm.com.au/shows/breakfast/',
+ 'md5': '46168394d3a5ce237cf47e85d0745413',
+ 'info_dict': {
+ 'id': 'breakfast-2021-11-16',
+ 'ext': 'mp3',
+ 'series': 'Breakfast with Taylah',
+ 'title': r're:^Breakfast with Taylah \d{4}-\d{2}-\d{2}$',
+ 'description': 'md5:0979c3ab1febfbec3f1ccb743633c611',
+ },
+ 'skip': 'ID and md5 changes daily',
+ },
+ {
+ 'url': 'https://rtrfm.com.au/show-episode/breakfast-2021-11-11/',
+ 'md5': '396bedf1e40f96c62b30d4999202a790',
+ 'info_dict': {
+ 'id': 'breakfast-2021-11-11',
+ 'ext': 'mp3',
+ 'series': 'Breakfast with Taylah',
+ 'title': 'Breakfast with Taylah 2021-11-11',
+ 'description': 'md5:0979c3ab1febfbec3f1ccb743633c611',
+ },
+ },
+ {
+ 'url': 'https://rtrfm.com.au/show-episode/breakfast-2020-06-01/',
+ 'md5': '594027f513ec36a24b15d65007a24dff',
+ 'info_dict': {
+ 'id': 'breakfast-2020-06-01',
+ 'ext': 'mp3',
+ 'series': 'Breakfast with Taylah',
+ 'title': 'Breakfast with Taylah 2020-06-01',
+ 'description': r're:^Breakfast with Taylah ',
+ },
+ 'skip': 'This audio has expired',
+ },
+ ]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ show, date, title = self._search_regex(
+ r'''\.playShow(?:From)?\(['"](?P<show>[^'"]+)['"],\s*['"](?P<date>[0-9]{4}-[0-9]{2}-[0-9]{2})['"],\s*['"](?P<title>[^'"]+)['"]''',
+ webpage, 'details', group=('show', 'date', 'title'))
+ url = self._download_json(
+ 'https://restreams.rtrfm.com.au/rzz',
+ show, 'Downloading MP3 URL', query={'n': show, 'd': date})['u']
+        # The mp4 extension is the only error indicator short of attempting the
+        # download: mp4 URLs always fail (403 for current episodes, 404 for missing).
+ if '.mp4' in url:
+ url = None
+ self.raise_no_formats('Expired or no episode on this date', expected=True)
+ return {
+ 'id': '%s-%s' % (show, date),
+ 'title': '%s %s' % (title, date),
+ 'series': title,
+ 'url': url,
+ 'release_date': date,
+ 'description': self._og_search_description(webpage),
+ }
diff --git a/hypervideo_dl/extractor/rtve.py b/hypervideo_dl/extractor/rtve.py
index 59832ee..7a1dc6f 100644
--- a/hypervideo_dl/extractor/rtve.py
+++ b/hypervideo_dl/extractor/rtve.py
@@ -17,7 +17,7 @@ from ..utils import (
qualities,
remove_end,
remove_start,
- std_headers,
+ try_get,
)
_bytes_to_chr = (lambda x: x) if sys.version_info[0] == 2 else (lambda x: map(chr, x))
@@ -70,7 +70,7 @@ class RTVEALaCartaIE(InfoExtractor):
}]
def _real_initialize(self):
- user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8')
+ user_agent_b64 = base64.b64encode(self.get_param('http_headers')['User-Agent'].encode('utf-8')).decode('utf-8')
self._manager = self._download_json(
'http://www.rtve.es/odin/loki/' + user_agent_b64,
None, 'Fetching manager info')['manager']
@@ -160,7 +160,7 @@ class RTVEALaCartaIE(InfoExtractor):
return {
'id': video_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'formats': formats,
'thumbnail': info.get('image'),
'subtitles': subtitles,
@@ -178,6 +178,93 @@ class RTVEALaCartaIE(InfoExtractor):
for s in subs)
+class RTVEAudioIE(RTVEALaCartaIE):
+ IE_NAME = 'rtve.es:audio'
+ IE_DESC = 'RTVE audio'
+ _VALID_URL = r'https?://(?:www\.)?rtve\.es/(alacarta|play)/audios/[^/]+/[^/]+/(?P<id>[0-9]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.rtve.es/alacarta/audios/a-hombros-de-gigantes/palabra-ingeniero-codigos-informaticos-27-04-21/5889192/',
+ 'md5': 'ae06d27bff945c4e87a50f89f6ce48ce',
+ 'info_dict': {
+ 'id': '5889192',
+ 'ext': 'mp3',
+ 'title': 'Códigos informáticos',
+ 'thumbnail': r're:https?://.+/1598856591583.jpg',
+ 'duration': 349.440,
+ 'series': 'A hombros de gigantes',
+ },
+ }, {
+ 'url': 'https://www.rtve.es/play/audios/en-radio-3/ignatius-farray/5791165/',
+ 'md5': '072855ab89a9450e0ba314c717fa5ebc',
+ 'info_dict': {
+ 'id': '5791165',
+ 'ext': 'mp3',
+ 'title': 'Ignatius Farray',
+ 'thumbnail': r're:https?://.+/1613243011863.jpg',
+ 'duration': 3559.559,
+ 'series': 'En Radio 3'
+ },
+ }, {
+ 'url': 'https://www.rtve.es/play/audios/frankenstein-o-el-moderno-prometeo/capitulo-26-ultimo-muerte-victor-juan-jose-plans-mary-shelley/6082623/',
+ 'md5': '0eadab248cc8dd193fa5765712e84d5c',
+ 'info_dict': {
+ 'id': '6082623',
+ 'ext': 'mp3',
+ 'title': 'Capítulo 26 y último: La muerte de Victor',
+ 'thumbnail': r're:https?://.+/1632147445707.jpg',
+ 'duration': 3174.086,
+ 'series': 'Frankenstein o el moderno Prometeo'
+ },
+ }]
+
+ def _extract_png_formats(self, audio_id):
+ """
+        Retrieve the media-related PNG thumbnail, which obfuscates valuable
+        information about the media. That information is decrypted via the
+        base class's _decrypt_url function, yielding media quality and URL.
+ """
+ png = self._download_webpage(
+ 'http://www.rtve.es/ztnr/movil/thumbnail/%s/audios/%s.png' %
+ (self._manager, audio_id),
+ audio_id, 'Downloading url information', query={'q': 'v2'})
+ q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL'])
+ formats = []
+ for quality, audio_url in self._decrypt_url(png):
+ ext = determine_ext(audio_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ audio_url, audio_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ elif ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ audio_url, audio_id, 'dash', fatal=False))
+ else:
+ formats.append({
+ 'format_id': quality,
+ 'quality': q(quality),
+ 'url': audio_url,
+ })
+ self._sort_formats(formats)
+ return formats
+
+ def _real_extract(self, url):
+ audio_id = self._match_id(url)
+ info = self._download_json(
+ 'https://www.rtve.es/api/audios/%s.json' % audio_id,
+ audio_id)['page']['items'][0]
+
+ return {
+ 'id': audio_id,
+ 'title': info['title'].strip(),
+ 'thumbnail': info.get('thumbnail'),
+ 'duration': float_or_none(info.get('duration'), 1000),
+ 'series': try_get(info, lambda x: x['programInfo']['title']),
+ 'formats': self._extract_png_formats(audio_id),
+ }
+
+
class RTVEInfantilIE(RTVEALaCartaIE):
IE_NAME = 'rtve.es:infantil'
IE_DESC = 'RTVE infantil'
@@ -230,7 +317,7 @@ class RTVELiveIE(RTVEALaCartaIE):
return {
'id': video_id,
- 'title': self._live_title(title),
+ 'title': title,
'formats': self._extract_png_formats(vidplayer_id),
'is_live': True,
}
diff --git a/hypervideo_dl/extractor/rtvs.py b/hypervideo_dl/extractor/rtvs.py
index 6573b26..3ea0f18 100644
--- a/hypervideo_dl/extractor/rtvs.py
+++ b/hypervideo_dl/extractor/rtvs.py
@@ -1,11 +1,19 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ traverse_obj,
+ unified_timestamp,
+)
+
class RTVSIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?rtvs\.sk/(?:radio|televizia)/archiv/\d+/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?rtvs\.sk/(?:radio|televizia)/archiv(?:/\d+)?/(?P<id>\d+)/?(?:[#?]|$)'
_TESTS = [{
# radio archive
'url': 'http://www.rtvs.sk/radio/archiv/11224/414872',
@@ -13,23 +21,37 @@ class RTVSIE(InfoExtractor):
'info_dict': {
'id': '414872',
'ext': 'mp3',
- 'title': 'Ostrov pokladov 1 časť.mp3'
- },
- 'params': {
- 'skip_download': True,
+ 'title': 'Ostrov pokladov 1 časť.mp3',
+ 'duration': 2854,
+ 'thumbnail': 'https://www.rtvs.sk/media/a501/image/file/2/0000/b1R8.rtvs.jpg',
+ 'display_id': '135331',
}
}, {
# tv archive
'url': 'http://www.rtvs.sk/televizia/archiv/8249/63118',
- 'md5': '85e2c55cf988403b70cac24f5c086dc6',
'info_dict': {
'id': '63118',
'ext': 'mp4',
'title': 'Amaro Džives - Náš deň',
- 'description': 'Galavečer pri príležitosti Medzinárodného dňa Rómov.'
- },
- 'params': {
- 'skip_download': True,
+ 'description': 'Galavečer pri príležitosti Medzinárodného dňa Rómov.',
+ 'thumbnail': 'https://www.rtvs.sk/media/a501/image/file/2/0031/L7Qm.amaro_dzives_png.jpg',
+ 'timestamp': 1428555900,
+ 'upload_date': '20150409',
+ 'duration': 4986,
+ }
+ }, {
+ # tv archive
+ 'url': 'https://www.rtvs.sk/televizia/archiv/18083?utm_source=web&utm_medium=rozcestnik&utm_campaign=Robin',
+ 'info_dict': {
+ 'id': '18083',
+ 'ext': 'mp4',
+ 'title': 'Robin',
+ 'description': 'md5:2f70505a7b8364491003d65ff7a0940a',
+ 'timestamp': 1636652760,
+ 'display_id': '307655',
+ 'duration': 831,
+ 'upload_date': '20211111',
+ 'thumbnail': 'https://www.rtvs.sk/media/a501/image/file/2/0916/robin.jpg',
}
}]
@@ -37,11 +59,31 @@ class RTVSIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+ iframe_id = self._search_regex(
+ r'<iframe[^>]+id\s*=\s*"player_[^_]+_([0-9]+)"', webpage, 'Iframe ID')
+ iframe_url = self._search_regex(
+ fr'<iframe[^>]+id\s*=\s*"player_[^_]+_{re.escape(iframe_id)}"[^>]+src\s*=\s*"([^"]+)"', webpage, 'Iframe URL')
+
+ webpage = self._download_webpage(iframe_url, video_id, 'Downloading iframe')
+ json_url = self._search_regex(r'var\s+url\s*=\s*"([^"]+)"\s*\+\s*ruurl', webpage, 'json URL')
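+        # The player appends browser fingerprint parameters to the playlist URL;
+        # static Firefox-on-Windows values appear to be accepted by the endpoint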
+ data = self._download_json(f'https:{json_url}b=mozilla&p=win&v=97&f=0&d=1', video_id)
- playlist_url = self._search_regex(
- r'playlist["\']?\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
- 'playlist url', group='url')
+ if data.get('clip'):
+ data['playlist'] = [data['clip']]
- data = self._download_json(
- playlist_url, video_id, 'Downloading playlist')[0]
- return self._parse_jwplayer_data(data, video_id=video_id)
+ if traverse_obj(data, ('playlist', 0, 'sources', 0, 'type')) == 'audio/mp3':
+ formats = [{'url': traverse_obj(data, ('playlist', 0, 'sources', 0, 'src'))}]
+ else:
+ formats = self._extract_m3u8_formats(traverse_obj(data, ('playlist', 0, 'sources', 0, 'src')), video_id)
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'display_id': iframe_id,
+ 'title': traverse_obj(data, ('playlist', 0, 'title')),
+ 'description': traverse_obj(data, ('playlist', 0, 'description')),
+ 'duration': parse_duration(traverse_obj(data, ('playlist', 0, 'length'))),
+ 'thumbnail': traverse_obj(data, ('playlist', 0, 'image')),
+ 'timestamp': unified_timestamp(traverse_obj(data, ('playlist', 0, 'datetime_create'))),
+ 'formats': formats
+ }
diff --git a/hypervideo_dl/extractor/rule34video.py b/hypervideo_dl/extractor/rule34video.py
new file mode 100644
index 0000000..a602a9f
--- /dev/null
+++ b/hypervideo_dl/extractor/rule34video.py
@@ -0,0 +1,65 @@
+# coding: utf-8
+from __future__ import unicode_literals
+import re
+
+from ..utils import parse_duration
+from .common import InfoExtractor
+
+
+class Rule34VideoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?rule34video\.com/videos/(?P<id>\d+)'
+ _TESTS = [
+ {
+ 'url': 'https://rule34video.com/videos/3065157/shot-it-mmd-hmv/',
+ 'md5': 'ffccac2c23799dabbd192621ae4d04f3',
+ 'info_dict': {
+ 'id': '3065157',
+ 'ext': 'mp4',
+ 'title': 'Shot It-(mmd hmv)',
+ 'thumbnail': 'https://rule34video.com/contents/videos_screenshots/3065000/3065157/preview.jpg',
+ 'duration': 347.0,
+ 'age_limit': 18
+ }
+ },
+ {
+ 'url': 'https://rule34video.com/videos/3065296/lara-in-trouble-ep-7-wildeerstudio/',
+ 'md5': '6bb5169f9f6b38cd70882bf2e64f6b86',
+ 'info_dict': {
+ 'id': '3065296',
+ 'ext': 'mp4',
+ 'title': 'Lara in Trouble Ep. 7 [WildeerStudio]',
+ 'thumbnail': 'https://rule34video.com/contents/videos_screenshots/3065000/3065296/preview.jpg',
+ 'duration': 938.0,
+ 'age_limit': 18
+ }
+ },
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ formats = []
+
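+        # The watch page exposes direct per-quality download links; collect each
+        # anchor's URL, container extension and vertical resolution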
+ for mobj in re.finditer(r'<a[^>]+href="(?P<video_url>[^"]+download=true[^"]+)".*>(?P<ext>[^\s]+) (?P<quality>[^<]+)p</a>', webpage):
+ url, ext, quality = mobj.groups()
+ formats.append({
+ 'url': url,
+ 'ext': ext.lower(),
+ 'quality': quality,
+ })
+
+ title = self._html_extract_title(webpage)
+ thumbnail = self._html_search_regex(r'preview_url:\s+\'([^\']+)\'', webpage, 'thumbnail', default=None)
+ duration = self._html_search_regex(r'"icon-clock"></i>\s+<span>((?:\d+:?)+)', webpage, 'duration', default=None)
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'duration': parse_duration(duration),
+ 'age_limit': 18
+ }
diff --git a/hypervideo_dl/extractor/rumble.py b/hypervideo_dl/extractor/rumble.py
index 49c1f44..a0d5f88 100644
--- a/hypervideo_dl/extractor/rumble.py
+++ b/hypervideo_dl/extractor/rumble.py
@@ -11,6 +11,7 @@ from ..utils import (
int_or_none,
parse_iso8601,
try_get,
+ unescapeHTML,
ExtractorError,
)
@@ -28,6 +29,20 @@ class RumbleEmbedIE(InfoExtractor):
'upload_date': '20191020',
}
}, {
+ 'url': 'https://rumble.com/embed/vslb7v',
+ 'md5': '7418035de1a30a178b8af34dc2b6a52b',
+ 'info_dict': {
+ 'id': 'vslb7v',
+ 'ext': 'mp4',
+ 'title': 'Defense Sec. says US Commitment to NATO Defense \'Ironclad\'',
+ 'timestamp': 1645142135,
+ 'upload_date': '20220217',
+ 'channel_url': 'https://rumble.com/c/CyberTechNews',
+ 'channel': 'CTNews',
+ 'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg',
+ 'duration': 901,
+ }
+ }, {
'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
'only_matching': True,
}]
@@ -45,7 +60,7 @@ class RumbleEmbedIE(InfoExtractor):
video = self._download_json(
'https://rumble.com/embedJS/', video_id,
query={'request': 'video', 'v': video_id})
- title = video['title']
+ title = unescapeHTML(video['title'])
formats = []
for height, ua in (video.get('ua') or {}).items():
diff --git a/hypervideo_dl/extractor/rutube.py b/hypervideo_dl/extractor/rutube.py
index d027412..2f753b4 100644
--- a/hypervideo_dl/extractor/rutube.py
+++ b/hypervideo_dl/extractor/rutube.py
@@ -230,9 +230,9 @@ class RutubePlaylistBaseIE(RutubeBaseIE):
return self._extract_playlist(self._match_id(url))
-class RutubeChannelIE(RutubePlaylistBaseIE):
- IE_NAME = 'rutube:channel'
- IE_DESC = 'Rutube channels'
+class RutubeTagsIE(RutubePlaylistBaseIE):
+ IE_NAME = 'rutube:tags'
+ IE_DESC = 'Rutube tags'
_VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)'
_TESTS = [{
'url': 'http://rutube.ru/tags/video/1800/',
@@ -312,3 +312,18 @@ class RutubePlaylistIE(RutubePlaylistBaseIE):
playlist_kind = qs['pl_type'][0]
playlist_id = qs['pl_id'][0]
return self._extract_playlist(playlist_id, item_kind=playlist_kind)
+
+
+class RutubeChannelIE(RutubePlaylistBaseIE):
+ IE_NAME = 'rutube:channel'
+ IE_DESC = 'Rutube channel'
+ _VALID_URL = r'https?://rutube\.ru/channel/(?P<id>\d+)/videos'
+ _TESTS = [{
+ 'url': 'https://rutube.ru/channel/639184/videos/',
+ 'info_dict': {
+ 'id': '639184',
+ },
+ 'playlist_mincount': 133,
+ }]
+
+ _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json'
diff --git a/hypervideo_dl/extractor/rutv.py b/hypervideo_dl/extractor/rutv.py
index 7e0de99..0ea8253 100644
--- a/hypervideo_dl/extractor/rutv.py
+++ b/hypervideo_dl/extractor/rutv.py
@@ -6,7 +6,8 @@ import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
- int_or_none
+ int_or_none,
+ str_to_int
)
@@ -179,8 +180,7 @@ class RUTVIE(InfoExtractor):
'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22',
'rtmp_live': True,
'ext': 'flv',
- 'vbr': int(quality),
- 'quality': preference,
+ 'vbr': str_to_int(quality),
}
elif transport == 'm3u8':
formats.extend(self._extract_m3u8_formats(
@@ -191,9 +191,10 @@ class RUTVIE(InfoExtractor):
'url': url
}
fmt.update({
- 'width': width,
- 'height': height,
+ 'width': int_or_none(quality, default=height, invscale=width, scale=height),
+ 'height': int_or_none(quality, default=height),
'format_id': '%s-%s' % (transport, quality),
+ 'source_preference': preference,
})
formats.append(fmt)
@@ -201,7 +202,7 @@ class RUTVIE(InfoExtractor):
return {
'id': video_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'description': description,
'thumbnail': thumbnail,
'view_count': view_count,
diff --git a/hypervideo_dl/extractor/ruutu.py b/hypervideo_dl/extractor/ruutu.py
index d9cf39d..5a30e33 100644
--- a/hypervideo_dl/extractor/ruutu.py
+++ b/hypervideo_dl/extractor/ruutu.py
@@ -1,6 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
+import json
+import re
+
from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlparse
from ..utils import (
@@ -8,6 +11,8 @@ from ..utils import (
ExtractorError,
find_xpath_attr,
int_or_none,
+ traverse_obj,
+ try_call,
unified_strdate,
url_or_none,
xpath_attr,
@@ -123,6 +128,16 @@ class RuutuIE(InfoExtractor):
]
_API_BASE = 'https://gatling.nelonenmedia.fi'
+ @classmethod
+ def _extract_url(cls, webpage):
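+        # Embedding pages store the video id inside the Drupal.settings JSON blob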
+ settings = try_call(
+ lambda: json.loads(re.search(
+ r'jQuery\.extend\(Drupal\.settings, ({.+?})\);', webpage).group(1), strict=False))
+ video_id = traverse_obj(settings, (
+ 'mediaCrossbowSettings', 'file', 'field_crossbow_video_id', 'und', 0, 'value'))
+ if video_id:
+ return f'http://www.ruutu.fi/video/{video_id}'
+
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/hypervideo_dl/extractor/ruv.py b/hypervideo_dl/extractor/ruv.py
index 8f3cc40..d806ed0 100644
--- a/hypervideo_dl/extractor/ruv.py
+++ b/hypervideo_dl/extractor/ruv.py
@@ -4,6 +4,8 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
determine_ext,
+ parse_duration,
+ traverse_obj,
unified_timestamp,
)
@@ -99,3 +101,89 @@ class RuvIE(InfoExtractor):
'timestamp': timestamp,
'formats': formats,
}
+
+
+class RuvSpilaIE(InfoExtractor):
+ IE_NAME = 'ruv.is:spila'
+ _VALID_URL = r'https?://(?:www\.)?ruv\.is/(?:(?:sjon|ut)varp|(?:krakka|ung)ruv)/spila/.+/(?P<series_id>[0-9]+)/(?P<id>[a-z0-9]+)'
+ _TESTS = [{
+ 'url': 'https://www.ruv.is/sjonvarp/spila/ithrottir/30657/9jcnd4',
+ 'info_dict': {
+ 'id': '9jcnd4',
+ 'ext': 'mp4',
+ 'title': '01.02.2022',
+ 'chapters': 'count:4',
+ 'timestamp': 1643743500,
+ 'upload_date': '20220201',
+ 'thumbnail': 'https://d38kdhuogyllre.cloudfront.net/fit-in/1960x/filters:quality(65)/hd_posters/94boog-iti3jg.jpg',
+ 'description': 'Íþróttafréttir.',
+ 'age_limit': 0,
+ },
+ }, {
+ 'url': 'https://www.ruv.is/utvarp/spila/i-ljosi-sogunnar/23795/7hqkre',
+ 'info_dict': {
+ 'id': '7hqkre',
+ 'ext': 'mp3',
+ 'thumbnail': 'https://d38kdhuogyllre.cloudfront.net/fit-in/1960x/filters:quality(65)/hd_posters/7hqkre-7uepao.jpg',
+ 'description': 'md5:8d7046549daff35e9a3190dc9901a120',
+ 'chapters': [],
+ 'upload_date': '20220204',
+ 'timestamp': 1643965500,
+ 'title': 'Nellie Bly II',
+ 'age_limit': 0,
+ },
+ }, {
+ 'url': 'https://www.ruv.is/ungruv/spila/ungruv/28046/8beuph',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.ruv.is/krakkaruv/spila/krakkafrettir/30712/9jbgb0',
+ 'only_matching': True
+ }]
+
+ def _real_extract(self, url):
+ display_id, series_id = self._match_valid_url(url).group('id', 'series_id')
+ program = self._download_json(
+ 'https://www.ruv.is/gql/', display_id, query={'query': '''{
+ Program(id: %s){
+ title image description short_description
+ episodes(id: {value: "%s"}) {
+ rating title duration file image firstrun description
+ clips {
+ time text
+ }
+ subtitles {
+ name value
+ }
+ }
+ }
+ }''' % (series_id, display_id)})['data']['Program']
+ episode = program['episodes'][0]
+
+ subs = {}
+ for trk in episode.get('subtitles'):
+ if trk.get('name') and trk.get('value'):
+ subs.setdefault(trk['name'], []).append({'url': trk['value'], 'ext': 'vtt'})
+
+ media_url = episode['file']
+ if determine_ext(media_url) == 'm3u8':
+ formats = self._extract_m3u8_formats(media_url, display_id)
+ else:
+ formats = [{'url': media_url}]
+
+ clips = [
+ {'start_time': parse_duration(c.get('time')), 'title': c.get('text')}
+ for c in episode.get('clips') or []]
+
+ return {
+ 'id': display_id,
+ 'title': traverse_obj(program, ('episodes', 0, 'title'), 'title'),
+ 'description': traverse_obj(
+ program, ('episodes', 0, 'description'), 'description', 'short_description',
+ expected_type=lambda x: x or None),
+ 'subtitles': subs,
+ 'thumbnail': episode.get('image', '').replace('$$IMAGESIZE$$', '1960') or None,
+ 'timestamp': unified_timestamp(episode.get('firstrun')),
+ 'formats': formats,
+ 'age_limit': episode.get('rating'),
+ 'chapters': clips
+ }
diff --git a/hypervideo_dl/extractor/safari.py b/hypervideo_dl/extractor/safari.py
index cca4464..7b4571d 100644
--- a/hypervideo_dl/extractor/safari.py
+++ b/hypervideo_dl/extractor/safari.py
@@ -25,14 +25,7 @@ class SafariBaseIE(InfoExtractor):
LOGGED_IN = False
- def _real_initialize(self):
- self._login()
-
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
_, urlh = self._download_webpage_handle(
'https://learning.oreilly.com/accounts/login-check/', None,
'Downloading login page')
diff --git a/hypervideo_dl/extractor/sbs.py b/hypervideo_dl/extractor/sbs.py
index 0a806ee..4090f63 100644
--- a/hypervideo_dl/extractor/sbs.py
+++ b/hypervideo_dl/extractor/sbs.py
@@ -10,7 +10,14 @@ from ..utils import (
class SBSIE(InfoExtractor):
IE_DESC = 'sbs.com.au'
- _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=|/watch/)|news/(?:embeds/)?video/)(?P<id>[0-9]+)'
+ _VALID_URL = r'''(?x)
+ https?://(?:www\.)?sbs\.com\.au/(?:
+ ondemand(?:
+ /video/(?:single/)?|
+ /movie/[^/]+/|
+ .*?\bplay=|/watch/
+ )|news/(?:embeds/)?video/
+ )(?P<id>[0-9]+)'''
_TESTS = [{
# Original URL is handled by the generic IE which finds the iframe:
@@ -46,6 +53,13 @@ class SBSIE(InfoExtractor):
}, {
'url': 'https://www.sbs.com.au/ondemand/watch/1698704451971',
'only_matching': True,
+ }, {
+ 'url': 'https://www.sbs.com.au/ondemand/movie/coherence/1469404227931',
+ 'only_matching': True,
+ }, {
+ 'note': 'Live stream',
+ 'url': 'https://www.sbs.com.au/ondemand/video/1726824003663/sbs-24x7-live-stream-nsw',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -75,4 +89,5 @@ class SBSIE(InfoExtractor):
'ie_key': 'ThePlatform',
'id': video_id,
'url': smuggle_url(self._proto_relative_url(theplatform_url), {'force_smil_url': True}),
+ 'is_live': player_params.get('streamType') == 'live',
}
diff --git a/hypervideo_dl/extractor/scte.py b/hypervideo_dl/extractor/scte.py
index ca1de63..7215cf5 100644
--- a/hypervideo_dl/extractor/scte.py
+++ b/hypervideo_dl/extractor/scte.py
@@ -14,14 +14,7 @@ class SCTEBaseIE(InfoExtractor):
_LOGIN_URL = 'https://www.scte.org/SCTE/Sign_In.aspx'
_NETRC_MACHINE = 'scte'
- def _real_initialize(self):
- self._login()
-
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
login_popup = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login popup')
diff --git a/hypervideo_dl/extractor/senategov.py b/hypervideo_dl/extractor/senategov.py
new file mode 100644
index 0000000..b295184
--- /dev/null
+++ b/hypervideo_dl/extractor/senategov.py
@@ -0,0 +1,213 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_parse_qs,
+ compat_urlparse,
+)
+from ..utils import (
+ ExtractorError,
+ parse_qs,
+ unsmuggle_url,
+)
+
+_COMMITTEES = {
+ 'ag': ('76440', 'http://ag-f.akamaihd.net'),
+ 'aging': ('76442', 'http://aging-f.akamaihd.net'),
+ 'approps': ('76441', 'http://approps-f.akamaihd.net'),
+ 'arch': ('', 'http://ussenate-f.akamaihd.net'),
+ 'armed': ('76445', 'http://armed-f.akamaihd.net'),
+ 'banking': ('76446', 'http://banking-f.akamaihd.net'),
+ 'budget': ('76447', 'http://budget-f.akamaihd.net'),
+ 'cecc': ('76486', 'http://srs-f.akamaihd.net'),
+ 'commerce': ('80177', 'http://commerce1-f.akamaihd.net'),
+ 'csce': ('75229', 'http://srs-f.akamaihd.net'),
+ 'dpc': ('76590', 'http://dpc-f.akamaihd.net'),
+ 'energy': ('76448', 'http://energy-f.akamaihd.net'),
+ 'epw': ('76478', 'http://epw-f.akamaihd.net'),
+ 'ethics': ('76449', 'http://ethics-f.akamaihd.net'),
+ 'finance': ('76450', 'http://finance-f.akamaihd.net'),
+ 'foreign': ('76451', 'http://foreign-f.akamaihd.net'),
+ 'govtaff': ('76453', 'http://govtaff-f.akamaihd.net'),
+ 'help': ('76452', 'http://help-f.akamaihd.net'),
+ 'indian': ('76455', 'http://indian-f.akamaihd.net'),
+ 'intel': ('76456', 'http://intel-f.akamaihd.net'),
+ 'intlnarc': ('76457', 'http://intlnarc-f.akamaihd.net'),
+ 'jccic': ('85180', 'http://jccic-f.akamaihd.net'),
+ 'jec': ('76458', 'http://jec-f.akamaihd.net'),
+ 'judiciary': ('76459', 'http://judiciary-f.akamaihd.net'),
+ 'rpc': ('76591', 'http://rpc-f.akamaihd.net'),
+ 'rules': ('76460', 'http://rules-f.akamaihd.net'),
+ 'saa': ('76489', 'http://srs-f.akamaihd.net'),
+ 'smbiz': ('76461', 'http://smbiz-f.akamaihd.net'),
+ 'srs': ('75229', 'http://srs-f.akamaihd.net'),
+ 'uscc': ('76487', 'http://srs-f.akamaihd.net'),
+ 'vetaff': ('76462', 'http://vetaff-f.akamaihd.net'),
+}
+
+
+class SenateISVPIE(InfoExtractor):
+ _IE_NAME = 'senate.gov:isvp'
+ _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)'
+
+ _TESTS = [{
+ 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
+ 'info_dict': {
+ 'id': 'judiciary031715',
+ 'ext': 'mp4',
+ 'title': 'Integrated Senate Video Player',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false',
+ 'info_dict': {
+ 'id': 'commerce011514',
+ 'ext': 'mp4',
+ 'title': 'Integrated Senate Video Player'
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi',
+ # checksum differs each time
+ 'info_dict': {
+ 'id': 'intel090613',
+ 'ext': 'mp4',
+ 'title': 'Integrated Senate Video Player'
+ }
+ }, {
+ # From http://www.c-span.org/video/?96791-1
+ 'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _search_iframe_url(webpage):
+ mobj = re.search(
+ r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
+ webpage)
+ if mobj:
+ return mobj.group('url')
+
+ def _real_extract(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
+
+ qs = compat_parse_qs(self._match_valid_url(url).group('qs'))
+ if not qs.get('filename') or not qs.get('type') or not qs.get('comm'):
+ raise ExtractorError('Invalid URL', expected=True)
+
+        video_id = re.sub(r'\.mp4$', '', qs['filename'][0])
+
+ webpage = self._download_webpage(url, video_id)
+
+ if smuggled_data.get('force_title'):
+ title = smuggled_data['force_title']
+ else:
+ title = self._html_extract_title(webpage)
+ poster = qs.get('poster')
+ thumbnail = poster[0] if poster else None
+
+ video_type = qs['type'][0]
+ committee = video_type if video_type == 'arch' else qs['comm'][0]
+
+ stream_num, domain = _COMMITTEES[committee]
+
+ formats = []
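+        # Archived hearings ('arch') are served as a single HLS master playlist,
+        # while live streams expose parallel HDS (f4m) and HLS manifests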
+ if video_type == 'arch':
+ filename = video_id if '.' in video_id else video_id + '.mp4'
+ m3u8_url = compat_urlparse.urljoin(domain, 'i/' + filename + '/master.m3u8')
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8')
+ else:
+ hdcore_sign = 'hdcore=3.1.0'
+ url_params = (domain, video_id, stream_num)
+ f4m_url = f'%s/z/%s_1@%s/manifest.f4m?{hdcore_sign}' % url_params
+ m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params
+ for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'):
+                # URLs without the extra param induce a 404 error
+ entry.update({'extra_param_to_segment_url': hdcore_sign})
+ formats.append(entry)
+ for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'):
+ mobj = re.search(r'(?P<tag>(?:-p|-b)).m3u8', entry['url'])
+ if mobj:
+ entry['format_id'] += mobj.group('tag')
+ formats.append(entry)
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ }
+
+
+class SenateGovIE(InfoExtractor):
+ _IE_NAME = 'senate.gov'
+ _VALID_URL = r'https?:\/\/(?:www\.)?(help|appropriations|judiciary|banking|armed-services|finance)\.senate\.gov'
+ _TESTS = [{
+ 'url': 'https://www.help.senate.gov/hearings/vaccines-saving-lives-ensuring-confidence-and-protecting-public-health',
+ 'info_dict': {
+ 'id': 'help090920',
+ 'display_id': 'vaccines-saving-lives-ensuring-confidence-and-protecting-public-health',
+ 'title': 'Vaccines: Saving Lives, Ensuring Confidence, and Protecting Public Health',
+ 'description': 'The U.S. Senate Committee on Health, Education, Labor & Pensions',
+ 'ext': 'mp4',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.appropriations.senate.gov/hearings/watch?hearingid=B8A25434-5056-A066-6020-1F68CB75F0CD',
+ 'info_dict': {
+ 'id': 'appropsA051518',
+ 'display_id': 'watch?hearingid=B8A25434-5056-A066-6020-1F68CB75F0CD',
+ 'title': 'Review of the FY2019 Budget Request for the U.S. Army',
+ 'ext': 'mp4',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.banking.senate.gov/hearings/21st-century-communities-public-transportation-infrastructure-investment-and-fast-act-reauthorization',
+ 'info_dict': {
+ 'id': 'banking041521',
+ 'display_id': '21st-century-communities-public-transportation-infrastructure-investment-and-fast-act-reauthorization',
+ 'title': '21st Century Communities: Public Transportation Infrastructure Investment and FAST Act Reauthorization',
+ 'description': 'The Official website of The United States Committee on Banking, Housing, and Urban Affairs',
+ 'ext': 'mp4',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._generic_id(url)
+ webpage = self._download_webpage(url, display_id)
+ parse_info = parse_qs(self._search_regex(
+ r'<iframe class="[^>"]*streaminghearing[^>"]*"\s[^>]*\bsrc="([^">]*)', webpage, 'hearing URL'))
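+        # The embedded isvp iframe's query string carries the committee key and
+        # filename needed to build the HLS manifest from the table above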
+
+ stream_num, stream_domain = _COMMITTEES[parse_info['comm'][-1]]
+ filename = parse_info['filename'][-1]
+
+ formats = self._extract_m3u8_formats(
+ f'{stream_domain}/i/{filename}_1@{stream_num}/master.m3u8',
+ display_id, ext='mp4')
+ self._sort_formats(formats)
+
+ title = self._html_search_regex(
+ (*self._og_regexes('title'), r'(?s)<title>([^<]*?)</title>'), webpage, 'video title')
+
+ return {
+ 'id': re.sub(r'\.mp4$', '', filename),
+ 'display_id': display_id,
+ 'title': re.sub(r'\s+', ' ', title.split('|')[0]).strip(),
+ 'description': self._og_search_description(webpage, default=None),
+ 'thumbnail': self._og_search_thumbnail(webpage, default=None),
+ 'age_limit': self._rta_search(webpage),
+ 'formats': formats
+ }
diff --git a/hypervideo_dl/extractor/sendtonews.py b/hypervideo_dl/extractor/sendtonews.py
index bc38a0f..858547b 100644
--- a/hypervideo_dl/extractor/sendtonews.py
+++ b/hypervideo_dl/extractor/sendtonews.py
@@ -80,7 +80,7 @@ class SendtoNewsIE(InfoExtractor):
'format_id': '%s-%d' % (determine_protocol(f), tbr),
'tbr': tbr,
})
- # 'tbr' was explicitly set to be prefered over 'height' originally,
+ # 'tbr' was explicitly set to be preferred over 'height' originally,
# So this is being kept unless someone can confirm this is unnecessary
self._sort_formats(info_dict['formats'], ('tbr', 'res'))
diff --git a/hypervideo_dl/extractor/sevenplus.py b/hypervideo_dl/extractor/sevenplus.py
index 210c44a..9867961 100644
--- a/hypervideo_dl/extractor/sevenplus.py
+++ b/hypervideo_dl/extractor/sevenplus.py
@@ -35,7 +35,6 @@ class SevenPlusIE(BrightcoveNewIE):
'episode': 'Wind Surf',
},
'params': {
- 'format': 'bestvideo',
'skip_download': True,
}
}, {
diff --git a/hypervideo_dl/extractor/shahid.py b/hypervideo_dl/extractor/shahid.py
index 42de41a..ab45d9c 100644
--- a/hypervideo_dl/extractor/shahid.py
+++ b/hypervideo_dl/extractor/shahid.py
@@ -79,16 +79,12 @@ class ShahidIE(ShahidBaseIE):
'only_matching': True
}]
- def _real_initialize(self):
- email, password = self._get_login_info()
- if email is None:
- return
-
+ def _perform_login(self, username, password):
try:
user_data = self._download_json(
'https://shahid.mbc.net/wd/service/users/login',
None, 'Logging in', data=json.dumps({
- 'email': email,
+ 'email': username,
'password': password,
'basic': 'false',
}).encode('utf-8'), headers={
diff --git a/hypervideo_dl/extractor/shemaroome.py b/hypervideo_dl/extractor/shemaroome.py
index 142d5dc..45c1291 100644
--- a/hypervideo_dl/extractor/shemaroome.py
+++ b/hypervideo_dl/extractor/shemaroome.py
@@ -2,10 +2,9 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..aes import aes_cbc_decrypt
+from ..aes import aes_cbc_decrypt, unpad_pkcs7
from ..compat import (
compat_b64decode,
- compat_ord,
)
from ..utils import (
bytes_to_intlist,
@@ -16,7 +15,7 @@ from ..utils import (
class ShemarooMeIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)(?:www\.)?shemaroome\.com/(?:movies|shows)/(?P<id>[^?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?shemaroome\.com/(?:movies|shows)/(?P<id>[^?#]+)'
_TESTS = [{
'url': 'https://www.shemaroome.com/movies/dil-hai-tumhaara',
'info_dict': {
@@ -76,9 +75,8 @@ class ShemarooMeIE(InfoExtractor):
url_data = bytes_to_intlist(compat_b64decode(data_json['new_play_url']))
key = bytes_to_intlist(compat_b64decode(data_json['key']))
iv = [0] * 16
- m3u8_url = intlist_to_bytes(aes_cbc_decrypt(url_data, key, iv))
- m3u8_url = m3u8_url[:-compat_ord((m3u8_url[-1]))].decode('ascii')
- formats = self._extract_m3u8_formats(m3u8_url, video_id, fatal=False, headers={'stream_key': data_json['stream_key']})
+ m3u8_url = unpad_pkcs7(intlist_to_bytes(aes_cbc_decrypt(url_data, key, iv))).decode('ascii')
+ formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False, headers={'stream_key': data_json['stream_key']})
self._sort_formats(formats)
release_date = self._html_search_regex(
@@ -91,6 +89,7 @@ class ShemarooMeIE(InfoExtractor):
subtitles.setdefault('EN', []).append({
'url': self._proto_relative_url(sub_url),
})
+ subtitles = self._merge_subtitles(subtitles, m3u8_subs)
description = self._html_search_regex(r'(?s)>Synopsis(</.+?)</', webpage, 'description', fatal=False)
return {
diff --git a/hypervideo_dl/extractor/showroomlive.py b/hypervideo_dl/extractor/showroomlive.py
index efd9d56..1aada69 100644
--- a/hypervideo_dl/extractor/showroomlive.py
+++ b/hypervideo_dl/extractor/showroomlive.py
@@ -73,7 +73,7 @@ class ShowRoomLiveIE(InfoExtractor):
return {
'id': compat_str(room.get('live_id') or broadcaster_id),
- 'title': self._live_title(title),
+ 'title': title,
'description': room.get('description'),
'timestamp': int_or_none(room.get('current_live_started_at')),
'uploader': uploader,
diff --git a/hypervideo_dl/extractor/skeb.py b/hypervideo_dl/extractor/skeb.py
new file mode 100644
index 0000000..81aecb3
--- /dev/null
+++ b/hypervideo_dl/extractor/skeb.py
@@ -0,0 +1,143 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError, determine_ext, parse_qs, traverse_obj
+
+
+class SkebIE(InfoExtractor):
+ _VALID_URL = r'https?://skeb\.jp/@[^/]+/works/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://skeb.jp/@riiru_wm/works/10',
+ 'info_dict': {
+ 'id': '466853',
+ 'title': '内容はおまかせします! by 姫ノ森りぃる@一周年',
+ 'description': 'md5:1ec50901efc3437cfbfe3790468d532d',
+ 'uploader': '姫ノ森りぃる@一周年',
+ 'uploader_id': 'riiru_wm',
+ 'age_limit': 0,
+ 'tags': [],
+ 'url': r're:https://skeb.+',
+ 'thumbnail': r're:https://skeb.+',
+ 'subtitles': {
+ 'jpn': [{
+ 'url': r're:https://skeb.+',
+ 'ext': 'vtt'
+ }]
+ },
+ 'width': 720,
+ 'height': 405,
+ 'duration': 313,
+ 'fps': 30,
+ 'ext': 'mp4',
+ },
+ }, {
+ 'url': 'https://skeb.jp/@furukawa_nob/works/3',
+ 'info_dict': {
+ 'id': '489408',
+ 'title': 'いつもお世話になってお... by 古川ノブ@音楽とVlo...',
+ 'description': 'md5:5adc2e41d06d33b558bf7b1faeb7b9c2',
+ 'uploader': '古川ノブ@音楽とVlogのVtuber',
+ 'uploader_id': 'furukawa_nob',
+ 'age_limit': 0,
+ 'tags': [
+ 'よろしく', '大丈夫', 'お願い', 'でした',
+ '是非', 'O', 'バー', '遊び', 'おはよう',
+ 'オーバ', 'ボイス',
+ ],
+ 'url': r're:https://skeb.+',
+ 'thumbnail': r're:https://skeb.+',
+ 'subtitles': {
+ 'jpn': [{
+ 'url': r're:https://skeb.+',
+ 'ext': 'vtt'
+ }]
+ },
+ 'duration': 98,
+ 'ext': 'mp3',
+ 'vcodec': 'none',
+ 'abr': 128,
+ },
+ }, {
+ 'url': 'https://skeb.jp/@mollowmollow/works/6',
+ 'info_dict': {
+ 'id': '6',
+ 'title': 'ヒロ。\n\n私のキャラク... by 諸々',
+ 'description': 'md5:aa6cbf2ba320b50bce219632de195f07',
+ '_type': 'playlist',
+ 'entries': [{
+ 'id': '486430',
+ 'title': 'ヒロ。\n\n私のキャラク... by 諸々',
+ 'description': 'md5:aa6cbf2ba320b50bce219632de195f07',
+ }, {
+ 'id': '486431',
+ 'title': 'ヒロ。\n\n私のキャラク... by 諸々',
+ }]
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ nuxt_data = self._search_nuxt_data(self._download_webpage(url, video_id), video_id)
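+ # all work metadata is embedded in the page's Nuxt.js payload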
+
+ parent = {
+ 'id': video_id,
+ 'title': nuxt_data.get('title'),
+ 'description': nuxt_data.get('description'),
+ 'uploader': traverse_obj(nuxt_data, ('creator', 'name')),
+ 'uploader_id': traverse_obj(nuxt_data, ('creator', 'screen_name')),
+ 'age_limit': 18 if nuxt_data.get('nsfw') else 0,
+ 'tags': nuxt_data.get('tag_list'),
+ }
+
+ entries = []
+ for item in nuxt_data.get('previews') or []:
+ vid_url = item.get('url')
+ # skip items without a URL or id before inspecting the URL
+ if not vid_url or not item.get('id'):
+ continue
+ given_ext = traverse_obj(item, ('information', 'extension'))
+ preview_ext = determine_ext(vid_url, default_ext=None)
+ if not preview_ext:
+ content_disposition = parse_qs(vid_url)['response-content-disposition'][0]
+ preview_ext = self._search_regex(
+ r'filename="[^"]+\.([^\.]+?)"', content_disposition,
+ 'preview file extension', fatal=False, group=1)
+ if preview_ext not in ('mp4', 'mp3'):
+ continue
+ width, height = traverse_obj(item, ('information', 'width')), traverse_obj(item, ('information', 'height'))
+ if width is not None and height is not None:
+ # the longest side is at most 720px for non-client viewers
+ max_size = max(width, height)
+ width, height = list(x * 720 // max_size for x in (width, height))
+ entries.append({
+ **parent,
+ 'id': str(item['id']),
+ 'url': vid_url,
+ 'thumbnail': item.get('poster_url'),
+ 'subtitles': {
+ 'jpn': [{
+ 'url': item.get('vtt_url'),
+ 'ext': 'vtt',
+ }]
+ } if item.get('vtt_url') else None,
+ 'width': width,
+ 'height': height,
+ 'duration': traverse_obj(item, ('information', 'duration')),
+ 'fps': traverse_obj(item, ('information', 'frame_rate')),
+ 'ext': preview_ext or given_ext,
+ 'vcodec': 'none' if preview_ext == 'mp3' else None,
+ # you'll always get 128kbps MP3 for non-client viewers
+ 'abr': 128 if preview_ext == 'mp3' else None,
+ })
+
+ if not entries:
+ raise ExtractorError('No video/audio attachment found in this commission.', expected=True)
+ elif len(entries) == 1:
+ return entries[0]
+ else:
+ parent.update({
+ '_type': 'playlist',
+ 'entries': entries,
+ })
+ return parent
diff --git a/hypervideo_dl/extractor/sky.py b/hypervideo_dl/extractor/sky.py
index ff2c977..ad1e62d 100644
--- a/hypervideo_dl/extractor/sky.py
+++ b/hypervideo_dl/extractor/sky.py
@@ -105,6 +105,34 @@ class SkyNewsIE(SkyBaseIE):
}
+class SkyNewsStoryIE(SkyBaseIE):
+ IE_NAME = 'sky:news:story'
+ _VALID_URL = r'https?://news\.sky\.com/story/[0-9a-z-]+-(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'https://news.sky.com/story/budget-2021-chancellor-rishi-sunak-vows-address-will-deliver-strong-economy-fit-for-a-new-age-of-optimism-12445425',
+ 'info_dict': {
+ 'id': 'ref:0714acb9-123d-42c8-91b8-5c1bc6c73f20',
+ 'title': 'md5:e408dd7aad63f31a1817bbe40c7d276f',
+ 'description': 'md5:a881e12f49212f92be2befe4a09d288a',
+ 'ext': 'mp4',
+ 'upload_date': '20211027',
+ 'timestamp': 1635317494,
+ 'uploader_id': '6058004172001',
+ }
+ }
+
+ def _real_extract(self, url):
+ article_id = self._match_id(url)
+ webpage = self._download_webpage(url, article_id)
+
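+ # a story page may embed several players; collect them all into a playlist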
+ entries = [self._process_ooyala_element(webpage, sdc_el, url)
+ for sdc_el in re.findall(self._SDC_EL_REGEX, webpage)]
+
+ return self.playlist_result(
+ entries, article_id, self._og_search_title(webpage),
+ self._html_search_meta(['og:description', 'description'], webpage))
+
+
class SkySportsNewsIE(SkyBaseIE):
IE_NAME = 'sky:sports:news'
_VALID_URL = r'https?://(?:www\.)?skysports\.com/([^/]+/)*news/\d+/(?P<id>\d+)'
diff --git a/hypervideo_dl/extractor/skyit.py b/hypervideo_dl/extractor/skyit.py
index 14a4d8d..ddb43c0 100644
--- a/hypervideo_dl/extractor/skyit.py
+++ b/hypervideo_dl/extractor/skyit.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
- compat_str,
compat_parse_qs,
compat_urllib_parse_urlparse,
)
@@ -55,7 +54,7 @@ class SkyItPlayerIE(InfoExtractor):
return {
'id': video_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'formats': formats,
'thumbnail': dict_get(video, ('video_still', 'video_still_medium', 'thumb')),
'description': video.get('short_desc') or None,
@@ -125,9 +124,7 @@ class SkyItVideoLiveIE(SkyItPlayerIE):
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- asset_id = compat_str(self._parse_json(self._search_regex(
- r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
- webpage, 'next data'), display_id)['props']['initialState']['livePage']['content']['asset_id'])
+ asset_id = str(self._search_nextjs_data(webpage, display_id)['props']['initialState']['livePage']['content']['asset_id'])
livestream = self._download_json(
'https://apid.sky.it/vdp/v1/getLivestream',
asset_id, query={'id': asset_id})
diff --git a/hypervideo_dl/extractor/skylinewebcams.py b/hypervideo_dl/extractor/skylinewebcams.py
index b7f8ac7..47bbb76 100644
--- a/hypervideo_dl/extractor/skylinewebcams.py
+++ b/hypervideo_dl/extractor/skylinewebcams.py
@@ -36,7 +36,7 @@ class SkylineWebcamsIE(InfoExtractor):
'id': video_id,
'url': stream_url,
'ext': 'mp4',
- 'title': self._live_title(title),
+ 'title': title,
'description': description,
'is_live': True,
}
diff --git a/hypervideo_dl/extractor/skynewsau.py b/hypervideo_dl/extractor/skynewsau.py
index b1d7795..8e079ee 100644
--- a/hypervideo_dl/extractor/skynewsau.py
+++ b/hypervideo_dl/extractor/skynewsau.py
@@ -9,7 +9,7 @@ from ..utils import (
class SkyNewsAUIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)(?:www\.)?skynews\.com\.au/[^/]+/[^/]+/[^/]+/video/(?P<id>[a-z0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?skynews\.com\.au/[^/]+/[^/]+/[^/]+/video/(?P<id>[a-z0-9]+)'
_TESTS = [{
'url': 'https://www.skynews.com.au/world-news/united-states/incredible-vision-shows-lava-overflowing-from-spains-la-palma-volcano/video/0f4c6243d6903502c01251f228b91a71',
diff --git a/hypervideo_dl/extractor/slideslive.py b/hypervideo_dl/extractor/slideslive.py
index 9409a01..df60846 100644
--- a/hypervideo_dl/extractor/slideslive.py
+++ b/hypervideo_dl/extractor/slideslive.py
@@ -35,9 +35,6 @@ class SlidesLiveIE(InfoExtractor):
'ext': 'mp4',
'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges',
},
- 'params': {
- 'format': 'bestvideo',
- },
}, {
# video_service_name = youtube
'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend',
diff --git a/hypervideo_dl/extractor/sonyliv.py b/hypervideo_dl/extractor/sonyliv.py
index c3ed442..5b6849f 100644
--- a/hypervideo_dl/extractor/sonyliv.py
+++ b/hypervideo_dl/extractor/sonyliv.py
@@ -1,6 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
+import datetime
+import math
+import random
import time
import uuid
@@ -56,17 +59,60 @@ class SonyLIVIE(InfoExtractor):
'only_matching': True,
}]
_GEO_COUNTRIES = ['IN']
- _TOKEN = None
+ _HEADERS = {}
+ _LOGIN_HINT = 'Use "--username <mobile_number>" to login using OTP or "--username token --password <auth_token>" to login using auth token.'
+ _NETRC_MACHINE = 'sonyliv'
+
+ def _get_device_id(self):
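+ # UUIDv4-style identifier plus a millisecond-timestamp suffix, apparently mirroring the site's JS generator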
+ e = int(time.time() * 1000)
+ t = list('xxxxxxxxxxxx4xxxyxxxxxxxxxxxxxxx')
+ for i, c in enumerate(t):
+ n = int((e + 16 * random.random()) % 16) | 0
+ e = math.floor(e / 16)
+ if c == 'x':
+ t[i] = str(n)
+ elif c == 'y':
+ t[i] = '{:x}'.format(3 & n | 8)
+ return ''.join(t) + '-' + str(int(time.time() * 1000))
+
+ def _perform_login(self, username, password):
+ self._HEADERS['device_id'] = self._get_device_id()
+ self._HEADERS['content-type'] = 'application/json'
+
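+ # two login modes: a pre-obtained auth token passed via --password, or an OTP sent to a 10-digit mobile number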
+ if username.lower() == 'token' and len(password) > 1198:
+ self._HEADERS['authorization'] = password
+ self.report_login()
+ return
+ elif len(username) != 10 or not username.isdigit():
+ raise ExtractorError(f'Invalid username/password; {self._LOGIN_HINT}')
+
+ self.report_login()
+ data = '''{"mobileNumber":"%s","channelPartnerID":"MSMIND","country":"IN","timestamp":"%s",
+ "otpSize":6,"loginType":"REGISTERORSIGNIN","isMobileMandatory":true}
+ ''' % (username, datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%MZ"))
+ otp_request_json = self._download_json(
+ 'https://apiv2.sonyliv.com/AGL/1.6/A/ENG/WEB/IN/HR/CREATEOTP-V2',
+ None, note='Sending OTP', data=data.encode(), headers=self._HEADERS)
+ if otp_request_json['resultCode'] == 'KO':
+ raise ExtractorError(otp_request_json['message'], expected=True)
+ otp_code = self._get_tfa_info('OTP')
+ data = '''{"channelPartnerID":"MSMIND","mobileNumber":"%s","country":"IN","otp":"%s",
+ "dmaId":"IN","ageConfirmation":true,"timestamp":"%s","isMobileMandatory":true}
+ ''' % (username, otp_code, datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%MZ"))
+ otp_verify_json = self._download_json(
+ 'https://apiv2.sonyliv.com/AGL/2.0/A/ENG/WEB/IN/HR/CONFIRMOTP-V2',
+ None, note='Verifying OTP', data=data.encode(), headers=self._HEADERS)
+ if otp_verify_json['resultCode'] == 'KO':
+ raise ExtractorError(otp_verify_json['message'], expected=True)
+ self._HEADERS['authorization'] = otp_verify_json['resultObj']['accessToken']
def _call_api(self, version, path, video_id):
- headers = {}
- if self._TOKEN:
- headers['security_token'] = self._TOKEN
try:
return self._download_json(
'https://apiv2.sonyliv.com/AGL/%s/A/ENG/WEB/%s' % (version, path),
- video_id, headers=headers)['resultObj']
+ video_id, headers=self._HEADERS)['resultObj']
except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406 and self._parse_json(
+ e.cause.read().decode(), video_id)['message'] == 'Please subscribe to watch this content':
+ self.raise_login_required(self._LOGIN_HINT, method=None)
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
message = self._parse_json(
e.cause.read().decode(), video_id)['message']
@@ -75,8 +121,8 @@ class SonyLIVIE(InfoExtractor):
raise ExtractorError(message)
raise
- def _real_initialize(self):
- self._TOKEN = self._call_api('1.4', 'ALL/GETTOKEN', None)
+ def _initialize_pre_login(self):
+ self._HEADERS['security_token'] = self._call_api('1.4', 'ALL/GETTOKEN', None)
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/hypervideo_dl/extractor/soundcloud.py b/hypervideo_dl/extractor/soundcloud.py
index 78fecd1..92535f7 100644
--- a/hypervideo_dl/extractor/soundcloud.py
+++ b/hypervideo_dl/extractor/soundcloud.py
@@ -58,7 +58,143 @@ class SoundcloudEmbedIE(InfoExtractor):
return self.url_result(api_url)
-class SoundcloudIE(InfoExtractor):
+class SoundcloudBaseIE(InfoExtractor):
+ _NETRC_MACHINE = 'soundcloud'
+
+ _API_V2_BASE = 'https://api-v2.soundcloud.com/'
+ _BASE_URL = 'https://soundcloud.com/'
+ _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
+ _API_AUTH_QUERY_TEMPLATE = '?client_id=%s'
+ _API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s'
+ _API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s'
+ _access_token = None
+ _HEADERS = {}
+
+ def _store_client_id(self, client_id):
+ self._downloader.cache.store('soundcloud', 'client_id', client_id)
+
+ def _update_client_id(self):
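+ # scrape a fresh public client_id from the site's JS bundles, trying the last-listed script first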
+ webpage = self._download_webpage('https://soundcloud.com/', None)
+ for src in reversed(re.findall(r'<script[^>]+src="([^"]+)"', webpage)):
+ script = self._download_webpage(src, None, fatal=False)
+ if script:
+ client_id = self._search_regex(
+ r'client_id\s*:\s*"([0-9a-zA-Z]{32})"',
+ script, 'client id', default=None)
+ if client_id:
+ self._CLIENT_ID = client_id
+ self._store_client_id(client_id)
+ return
+ raise ExtractorError('Unable to extract client id')
+
+ def _download_json(self, *args, **kwargs):
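+ # wraps all API calls: injects the client_id and retries once with a re-scraped id on 401/403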
+ non_fatal = kwargs.get('fatal') is False
+ if non_fatal:
+ del kwargs['fatal']
+ query = kwargs.get('query', {}).copy()
+ for _ in range(2):
+ query['client_id'] = self._CLIENT_ID
+ kwargs['query'] = query
+ try:
+ return super()._download_json(*args, **compat_kwargs(kwargs))
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403):
+ self._store_client_id(None)
+ self._update_client_id()
+ continue
+ elif non_fatal:
+ self.report_warning(error_to_compat_str(e))
+ return False
+ raise
+
+ def _initialize_pre_login(self):
+ self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf'
+
+ def _perform_login(self, username, password):
+ if username != 'oauth':
+ self.report_warning(
+ 'Login using username and password is not currently supported. '
+ 'Use "--username oauth --password <oauth_token>" to login using an oauth token')
+ self._access_token = password
+ query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
+ payload = {'session': {'access_token': self._access_token}}
+ token_verification = sanitized_Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8'))
+ response = self._download_json(token_verification, None, note='Verifying login token...', fatal=False)
+ if response is not False:
+ self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
+ self.report_login()
+ else:
+ self.report_warning('Provided authorization token seems to be invalid. Continuing as guest')
+
+ r'''
+ def genDevId():
+ def genNumBlock():
+ return ''.join([str(random.randrange(10)) for i in range(6)])
+ return '-'.join([genNumBlock() for i in range(4)])
+
+ payload = {
+ 'client_id': self._CLIENT_ID,
+ 'recaptcha_pubkey': 'null',
+ 'recaptcha_response': 'null',
+ 'credentials': {
+ 'identifier': username,
+ 'password': password
+ },
+ 'signature': self.sign(username, password, self._CLIENT_ID),
+ 'device_id': genDevId(),
+ 'user_agent': self._USER_AGENT
+ }
+
+ query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
+ login = sanitized_Request(self._API_AUTH_URL_PW % query, json.dumps(payload).encode('utf-8'))
+ response = self._download_json(login, None)
+ self._access_token = response.get('session').get('access_token')
+ if not self._access_token:
+ self.report_warning('Unable to get access token, login may have failed')
+ else:
+ self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
+ '''
+
+ # signature generation
+ def sign(self, user, pw, clid):
+ a = 33
+ i = 1
+ s = 440123
+ w = 117
+ u = 1800000
+ l = 1042
+ b = 37
+ k = 37
+ c = 5
+ n = '0763ed7314c69015fd4a0dc16bbf4b90' # _KEY
+ y = '8' # _REV
+ r = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' # _USER_AGENT
+ e = user # _USERNAME
+ t = clid # _CLIENT_ID
+
+ d = '-'.join([str(mInt) for mInt in [a, i, s, w, u, l, b, k]])
+ p = n + y + d + r + e + t + d + n
+ h = p
+
+ m = 8011470
+ f = 0
+
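+ # 24-bit rolling hash over the concatenated payload, apparently ported from the site's obfuscated JS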
+ for f in range(f, len(h)):
+ m = (m >> 1) + ((1 & m) << 23)
+ m += ord(h[f])
+ m &= 16777215
+
+ # c is not even needed
+ out = str(y) + ':' + str(d) + ':' + format(m, 'x') + ':' + str(c)
+
+ return out
+
+ @classmethod
+ def _resolv_url(cls, url):
+ return cls._API_V2_BASE + 'resolve?url=' + url
+
+
+class SoundcloudIE(SoundcloudBaseIE):
"""Information extractor for soundcloud.com
To access the media, the uid of the song and a stream token
must be extracted from the page source and the script must make
@@ -72,8 +208,9 @@ class SoundcloudIE(InfoExtractor):
(?!stations/track)
(?P<uploader>[\w\d-]+)/
(?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#]))
- (?P<title>[\w\d-]+)/?
- (?P<token>[^?]+?)?(?:[?].*)?$)
+ (?P<title>[\w\d-]+)
+ (?:/(?P<token>(?!(?:albums|sets|recommended))[^?]+?))?
+ (?:[?].*)?$)
|(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+)
(?:/?\?secret_token=(?P<secret_token>[^&]+))?)
)
@@ -250,8 +387,6 @@ class SoundcloudIE(InfoExtractor):
},
]
- _API_V2_BASE = 'https://api-v2.soundcloud.com/'
- _BASE_URL = 'https://soundcloud.com/'
_IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg'
_ARTWORK_MAP = {
@@ -267,143 +402,6 @@ class SoundcloudIE(InfoExtractor):
'original': 0,
}
- def _store_client_id(self, client_id):
- self._downloader.cache.store('soundcloud', 'client_id', client_id)
-
- def _update_client_id(self):
- webpage = self._download_webpage('https://soundcloud.com/', None)
- for src in reversed(re.findall(r'<script[^>]+src="([^"]+)"', webpage)):
- script = self._download_webpage(src, None, fatal=False)
- if script:
- client_id = self._search_regex(
- r'client_id\s*:\s*"([0-9a-zA-Z]{32})"',
- script, 'client id', default=None)
- if client_id:
- self._CLIENT_ID = client_id
- self._store_client_id(client_id)
- return
- raise ExtractorError('Unable to extract client id')
-
- def _download_json(self, *args, **kwargs):
- non_fatal = kwargs.get('fatal') is False
- if non_fatal:
- del kwargs['fatal']
- query = kwargs.get('query', {}).copy()
- for _ in range(2):
- query['client_id'] = self._CLIENT_ID
- kwargs['query'] = query
- try:
- return super(SoundcloudIE, self)._download_json(*args, **compat_kwargs(kwargs))
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403):
- self._store_client_id(None)
- self._update_client_id()
- continue
- elif non_fatal:
- self.report_warning(error_to_compat_str(e))
- return False
- raise
-
- def _real_initialize(self):
- self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf'
- self._login()
-
- _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
- _API_AUTH_QUERY_TEMPLATE = '?client_id=%s'
- _API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s'
- _API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s'
- _access_token = None
- _HEADERS = {}
- _NETRC_MACHINE = 'soundcloud'
-
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
- if username == 'oauth' and password is not None:
- self._access_token = password
- query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
- payload = {'session': {'access_token': self._access_token}}
- token_verification = sanitized_Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8'))
- response = self._download_json(token_verification, None, note='Verifying login token...', fatal=False)
- if response is not False:
- self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
- self.report_login()
- else:
- self.report_warning('Provided authorization token seems to be invalid. Continue as guest')
- elif username is not None:
- self.report_warning(
- 'Login using username and password is not currently supported. '
- 'Use "--user oauth --password <oauth_token>" to login using an oauth token')
-
- r'''
- def genDevId():
- def genNumBlock():
- return ''.join([str(random.randrange(10)) for i in range(6)])
- return '-'.join([genNumBlock() for i in range(4)])
-
- payload = {
- 'client_id': self._CLIENT_ID,
- 'recaptcha_pubkey': 'null',
- 'recaptcha_response': 'null',
- 'credentials': {
- 'identifier': username,
- 'password': password
- },
- 'signature': self.sign(username, password, self._CLIENT_ID),
- 'device_id': genDevId(),
- 'user_agent': self._USER_AGENT
- }
-
- query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
- login = sanitized_Request(self._API_AUTH_URL_PW % query, json.dumps(payload).encode('utf-8'))
- response = self._download_json(login, None)
- self._access_token = response.get('session').get('access_token')
- if not self._access_token:
- self.report_warning('Unable to get access token, login may has failed')
- else:
- self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
- '''
-
- # signature generation
- def sign(self, user, pw, clid):
- a = 33
- i = 1
- s = 440123
- w = 117
- u = 1800000
- l = 1042
- b = 37
- k = 37
- c = 5
- n = '0763ed7314c69015fd4a0dc16bbf4b90' # _KEY
- y = '8' # _REV
- r = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' # _USER_AGENT
- e = user # _USERNAME
- t = clid # _CLIENT_ID
-
- d = '-'.join([str(mInt) for mInt in [a, i, s, w, u, l, b, k]])
- p = n + y + d + r + e + t + d + n
- h = p
-
- m = 8011470
- f = 0
-
- for f in range(f, len(h)):
- m = (m >> 1) + ((1 & m) << 23)
- m += ord(h[f])
- m &= 16777215
-
- # c is not even needed
- out = str(y) + ':' + str(d) + ':' + format(m, 'x') + ':' + str(c)
-
- return out
-
- @classmethod
- def _resolv_url(cls, url):
- return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url
-
def _extract_info_dict(self, info, full_title=None, secret_token=None):
track_id = compat_str(info['id'])
title = info['title']
@@ -581,7 +579,7 @@ class SoundcloudIE(InfoExtractor):
return self._extract_info_dict(info, full_title, token)
-class SoundcloudPlaylistBaseIE(SoundcloudIE):
+class SoundcloudPlaylistBaseIE(SoundcloudBaseIE):
def _extract_set(self, playlist, token=None):
playlist_id = compat_str(playlist['id'])
tracks = playlist.get('tracks') or []
@@ -654,7 +652,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
return self._extract_set(info, token)
-class SoundcloudPagedPlaylistBaseIE(SoundcloudIE):
+class SoundcloudPagedPlaylistBaseIE(SoundcloudBaseIE):
def _extract_playlist(self, base_url, playlist_id, playlist_title):
return {
'_type': 'playlist',
@@ -824,6 +822,54 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
track_id, 'Track station: %s' % track['title'])
+class SoundcloudRelatedIE(SoundcloudPagedPlaylistBaseIE):
+ _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<slug>[\w\d-]+/[\w\d-]+)/(?P<relation>albums|sets|recommended)'
+ IE_NAME = 'soundcloud:related'
+ _TESTS = [{
+ 'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/recommended',
+ 'info_dict': {
+ 'id': '1084577272',
+ 'title': 'Sexapil - Pingers 5 (Recommended)',
+ },
+ 'playlist_mincount': 50,
+ }, {
+ 'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/albums',
+ 'info_dict': {
+ 'id': '1084577272',
+ 'title': 'Sexapil - Pingers 5 (Albums)',
+ },
+ 'playlist_mincount': 1,
+ }, {
+ 'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/sets',
+ 'info_dict': {
+ 'id': '1084577272',
+ 'title': 'Sexapil - Pingers 5 (Sets)',
+ },
+ 'playlist_mincount': 4,
+ }]
+
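+ # API v2 endpoint templates, one per supported relation segment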
+ _BASE_URL_MAP = {
+ 'albums': 'tracks/%s/albums',
+ 'sets': 'tracks/%s/playlists_without_albums',
+ 'recommended': 'tracks/%s/related',
+ }
+
+ def _real_extract(self, url):
+ slug, relation = self._match_valid_url(url).group('slug', 'relation')
+
+ track = self._download_json(
+ self._resolv_url(self._BASE_URL + slug),
+ slug, 'Downloading track info', headers=self._HEADERS)
+
+ if track.get('errors'):
+ raise ExtractorError(f'{self.IE_NAME} said: %s' % ','.join(
+ str(err['error_message']) for err in track['errors']), expected=True)
+
+ return self._extract_playlist(
+ self._API_V2_BASE + self._BASE_URL_MAP[relation] % track['id'], str(track['id']),
+ '%s (%s)' % (track.get('title') or slug, relation.capitalize()))
+
+
class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
_VALID_URL = r'https?://api(?:-v2)?\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$'
IE_NAME = 'soundcloud:playlist'
@@ -853,10 +899,10 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
return self._extract_set(data, token)
-class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
+class SoundcloudSearchIE(SoundcloudBaseIE, SearchInfoExtractor):
IE_NAME = 'soundcloud:search'
IE_DESC = 'Soundcloud search'
- _MAX_RESULTS = float('inf')
+ _SEARCH_KEY = 'scsearch'
_TESTS = [{
'url': 'scsearch15:post-avant jazzcore',
'info_dict': {
@@ -865,7 +911,6 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
'playlist_count': 15,
}]
- _SEARCH_KEY = 'scsearch'
_MAX_RESULTS_PER_PAGE = 200
_DEFAULT_RESULTS_PER_PAGE = 50
@@ -894,5 +939,6 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
break
def _get_n_results(self, query, n):
- tracks = self._get_collection('search/tracks', query, limit=n, q=query)
- return self.playlist_result(tracks, query, query)
+ return self.playlist_result(itertools.islice(
+ self._get_collection('search/tracks', query, limit=n, q=query),
+ 0, None if n == float('inf') else n), query, query)
diff --git a/hypervideo_dl/extractor/southpark.py b/hypervideo_dl/extractor/southpark.py
index d497494..942a52d 100644
--- a/hypervideo_dl/extractor/southpark.py
+++ b/hypervideo_dl/extractor/southpark.py
@@ -6,19 +6,18 @@ from .mtv import MTVServicesInfoExtractor
class SouthParkIE(MTVServicesInfoExtractor):
IE_NAME = 'southpark.cc.com'
- _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark(?:\.cc|studios)\.com/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark(?:\.cc|studios)\.com/((?:video-)?clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))'
_FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'
_TESTS = [{
- 'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured',
+ 'url': 'https://southpark.cc.com/video-clips/d7wr06/south-park-you-all-agreed-to-counseling',
'info_dict': {
- 'id': 'a7bff6c2-ed00-11e0-aca6-0026b9414f30',
'ext': 'mp4',
- 'title': 'South Park|Bat Daded',
- 'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.',
- 'timestamp': 1112760000,
- 'upload_date': '20050406',
+ 'title': 'You All Agreed to Counseling',
+ 'description': 'Kenny, Cartman, Stan, and Kyle visit Mr. Mackey and ask for his help getting Mrs. Nelson to come back. Mr. Mackey reveals the only way to get things back to normal is to get the teachers vaccinated.',
+ 'timestamp': 1615352400,
+ 'upload_date': '20210310',
},
}, {
'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1',
@@ -40,11 +39,11 @@ class SouthParkIE(MTVServicesInfoExtractor):
class SouthParkEsIE(SouthParkIE):
IE_NAME = 'southpark.cc.com:español'
- _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/episodios-en-espanol/(?P<id>.+?)(\?|#|$))'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/es/episodios/(?P<id>.+?)(\?|#|$))'
_LANG = 'es'
_TESTS = [{
- 'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate',
+ 'url': 'http://southpark.cc.com/es/episodios/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate',
'info_dict': {
'title': 'Cartman Consigue Una Sonda Anal',
'description': 'Cartman Consigue Una Sonda Anal',
diff --git a/hypervideo_dl/extractor/sovietscloset.py b/hypervideo_dl/extractor/sovietscloset.py
index 7df2375..4bc2263 100644
--- a/hypervideo_dl/extractor/sovietscloset.py
+++ b/hypervideo_dl/extractor/sovietscloset.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
- js_to_json,
try_get,
unified_timestamp
)
@@ -14,17 +13,7 @@ class SovietsClosetBaseIE(InfoExtractor):
def parse_nuxt_jsonp(self, nuxt_jsonp_url, video_id, name):
nuxt_jsonp = self._download_webpage(nuxt_jsonp_url, video_id, note=f'Downloading {name} __NUXT_JSONP__')
- js, arg_keys, arg_vals = self._search_regex(
- r'__NUXT_JSONP__\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)',
- nuxt_jsonp, '__NUXT_JSONP__', group=['js', 'arg_keys', 'arg_vals'])
-
- args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
-
- for key, val in args.items():
- if val in ('undefined', 'void 0'):
- args[key] = 'null'
-
- return self._parse_json(js_to_json(js, args), video_id)['data'][0]
+ return self._search_nuxt_data(nuxt_jsonp, video_id, '__NUXT_JSONP__')
def video_meta(self, video_id, game_name, category_name, episode_number, stream_date):
title = game_name
@@ -78,6 +67,7 @@ class SovietsClosetIE(SovietsClosetBaseIE):
'series': 'The Witcher',
'season': 'Misc',
'episode_number': 13,
+ 'episode': 'Episode 13',
},
},
{
@@ -103,6 +93,7 @@ class SovietsClosetIE(SovietsClosetBaseIE):
'series': 'Arma 3',
'season': 'Zeus Games',
'episode_number': 3,
+ 'episode': 'Episode 3',
},
},
]
diff --git a/hypervideo_dl/extractor/spiegel.py b/hypervideo_dl/extractor/spiegel.py
index 2da32b9..58f2ed3 100644
--- a/hypervideo_dl/extractor/spiegel.py
+++ b/hypervideo_dl/extractor/spiegel.py
@@ -7,7 +7,7 @@ from .jwplatform import JWPlatformIE
class SpiegelIE(InfoExtractor):
_UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
- _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' % _UUID_RE
+ _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:$|[#?])' % _UUID_RE
_TESTS = [{
'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
'md5': '50c7948883ec85a3e431a0a44b7ad1d6',
diff --git a/hypervideo_dl/extractor/sportdeutschland.py b/hypervideo_dl/extractor/sportdeutschland.py
index 94bcaba..15b488a 100644
--- a/hypervideo_dl/extractor/sportdeutschland.py
+++ b/hypervideo_dl/extractor/sportdeutschland.py
@@ -59,12 +59,8 @@ class SportDeutschlandIE(InfoExtractor):
videos = asset.get('videos') or []
if len(videos) > 1:
playlist_id = parse_qs(url).get('playlistId', [None])[0]
- if playlist_id:
- if self.get_param('noplaylist'):
- videos = [videos[int(playlist_id)]]
- self.to_screen('Downloading just a single video because of --no-playlist')
- else:
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % asset_id)
+ if not self._yes_playlist(playlist_id, asset_id):
+ videos = [videos[int(playlist_id)]]
def entries():
for i, video in enumerate(videos, 1):
diff --git a/hypervideo_dl/extractor/srgssr.py b/hypervideo_dl/extractor/srgssr.py
index cbc1c47..f991981 100644
--- a/hypervideo_dl/extractor/srgssr.py
+++ b/hypervideo_dl/extractor/srgssr.py
@@ -7,6 +7,7 @@ from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
+ join_nonempty,
parse_iso8601,
qualities,
try_get,
@@ -94,11 +95,7 @@ class SRGSSRIE(InfoExtractor):
continue
protocol = source.get('protocol')
quality = source.get('quality')
- format_id = []
- for e in (protocol, source.get('encoding'), quality):
- if e:
- format_id.append(e)
- format_id = '-'.join(format_id)
+ format_id = join_nonempty(protocol, source.get('encoding'), quality)
if protocol in ('HDS', 'HLS'):
if source.get('tokenType') == 'AKAMAI':
diff --git a/hypervideo_dl/extractor/steam.py b/hypervideo_dl/extractor/steam.py
index 7f777c4..4ed0fb5 100644
--- a/hypervideo_dl/extractor/steam.py
+++ b/hypervideo_dl/extractor/steam.py
@@ -7,14 +7,13 @@ from ..utils import (
extract_attributes,
ExtractorError,
get_element_by_class,
- js_to_json,
)
class SteamIE(InfoExtractor):
_VALID_URL = r"""(?x)
- https?://store\.steampowered\.com/
- (agecheck/)?
+ https?://(?:store\.steampowered|steamcommunity)\.com/
+ (?:agecheck/)?
(?P<urltype>video|app)/ #If the page is only for videos or for a game
(?P<gameID>\d+)/?
(?P<videoID>\d*)(?P<extra>\??) # For urltype == video we sometimes get the videoID
@@ -27,21 +26,24 @@ class SteamIE(InfoExtractor):
'url': 'http://store.steampowered.com/video/105600/',
'playlist': [
{
- 'md5': '6a294ee0c4b1f47f5bb76a65e31e3592',
+ 'md5': '695242613303ffa2a4c44c9374ddc067',
'info_dict': {
- 'id': '2040428',
+ 'id': '256785003',
'ext': 'mp4',
- 'title': 'Terraria 1.3 Trailer',
- 'playlist_index': 1,
+ 'title': 'Terraria video 256785003',
+ 'thumbnail': r're:^https://cdn\.[^\.]+\.steamstatic\.com',
+ 'n_entries': 2,
}
},
{
- 'md5': '911672b20064ca3263fa89650ba5a7aa',
+ 'md5': '6a294ee0c4b1f47f5bb76a65e31e3592',
'info_dict': {
- 'id': '2029566',
+ 'id': '2040428',
'ext': 'mp4',
- 'title': 'Terraria 1.2 Trailer',
+ 'title': 'Terraria video 2040428',
'playlist_index': 2,
+ 'thumbnail': r're:^https://cdn\.[^\.]+\.steamstatic\.com',
+ 'n_entries': 2,
}
}
],
@@ -53,96 +55,76 @@ class SteamIE(InfoExtractor):
'playlistend': 2,
}
}, {
- 'url': 'http://steamcommunity.com/sharedfiles/filedetails/?id=242472205',
+ 'url': 'https://store.steampowered.com/app/271590/Grand_Theft_Auto_V/',
'info_dict': {
- 'id': 'X8kpJBlzD2E',
+ 'id': '256757115',
+ 'title': 'Grand Theft Auto V video 256757115',
'ext': 'mp4',
- 'upload_date': '20140617',
- 'title': 'FRONTIERS - Trapping',
- 'description': 'md5:bf6f7f773def614054089e5769c12a6e',
- 'uploader': 'AAD Productions',
- 'uploader_id': 'AtomicAgeDogGames',
- }
+ 'thumbnail': r're:^https://cdn\.[^\.]+\.steamstatic\.com',
+ 'n_entries': 20,
+ },
}]
def _real_extract(self, url):
m = self._match_valid_url(url)
fileID = m.group('fileID')
if fileID:
- videourl = url
+ video_url = url
playlist_id = fileID
else:
gameID = m.group('gameID')
playlist_id = gameID
- videourl = self._VIDEO_PAGE_TEMPLATE % playlist_id
+ video_url = self._VIDEO_PAGE_TEMPLATE % playlist_id
- self._set_cookie('steampowered.com', 'mature_content', '1')
+ self._set_cookie('steampowered.com', 'wants_mature_content', '1')
+ self._set_cookie('steampowered.com', 'birthtime', '944006401')
+ self._set_cookie('steampowered.com', 'lastagecheckage', '1-0-2000')
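+ # pre-set age-gate cookies (an arbitrary adult birth date) so most age checks are skipped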
- webpage = self._download_webpage(videourl, playlist_id)
+ webpage = self._download_webpage(video_url, playlist_id)
- if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
- videourl = self._AGECHECK_TEMPLATE % playlist_id
+ if re.search('<div[^>]+>Please enter your birth date to continue:</div>', webpage) is not None:
+ video_url = self._AGECHECK_TEMPLATE % playlist_id
self.report_age_confirmation()
- webpage = self._download_webpage(videourl, playlist_id)
-
- flash_vars = self._parse_json(self._search_regex(
- r'(?s)rgMovieFlashvars\s*=\s*({.+?});', webpage,
- 'flash vars'), playlist_id, js_to_json)
+ webpage = self._download_webpage(video_url, playlist_id)
- playlist_title = None
+ videos = re.findall(r'(<div[^>]+id=[\'"]highlight_movie_(\d+)[\'"][^>]+>)', webpage)
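+ # each <div id="highlight_movie_N"> carries data-{webm,mp4}[-hd]-source and data-poster attributes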
entries = []
- if fileID:
- playlist_title = get_element_by_class('workshopItemTitle', webpage)
- for movie in flash_vars.values():
- if not movie:
- continue
- youtube_id = movie.get('YOUTUBE_VIDEO_ID')
- if not youtube_id:
- continue
+ playlist_title = get_element_by_class('apphub_AppName', webpage)
+ for movie, movie_id in videos:
+ if not movie:
+ continue
+ movie = extract_attributes(movie)
+ if not movie_id:
+ continue
+ entry = {
+ 'id': movie_id,
+ 'title': f'{playlist_title} video {movie_id}',
+ }
+ formats = []
+ if movie:
+ entry['thumbnail'] = movie.get('data-poster')
+ for quality in ('', '-hd'):
+ for ext in ('webm', 'mp4'):
+ video_url = movie.get('data-%s%s-source' % (ext, quality))
+ if video_url:
+ formats.append({
+ 'format_id': ext + quality,
+ 'url': video_url,
+ })
+ self._sort_formats(formats)
+ entry['formats'] = formats
+ entries.append(entry)
+ embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage)
+ for evideos in embedded_videos:
+ evideos = extract_attributes(evideos).get('src')
+ if not evideos:
+ continue
+ video_id = self._search_regex(r'youtube\.com/embed/([0-9A-Za-z_-]{11})', evideos, 'youtube_video_id', default=None)
+ if video_id:
entries.append({
- '_type': 'url',
- 'url': youtube_id,
+ '_type': 'url_transparent',
+ 'id': video_id,
+ 'url': video_id,
'ie_key': 'Youtube',
})
- else:
- playlist_title = get_element_by_class('apphub_AppName', webpage)
- for movie_id, movie in flash_vars.items():
- if not movie:
- continue
- video_id = self._search_regex(r'movie_(\d+)', movie_id, 'video id', fatal=False)
- title = movie.get('MOVIE_NAME')
- if not title or not video_id:
- continue
- entry = {
- 'id': video_id,
- 'title': title.replace('+', ' '),
- }
- formats = []
- flv_url = movie.get('FILENAME')
- if flv_url:
- formats.append({
- 'format_id': 'flv',
- 'url': flv_url,
- })
- highlight_element = self._search_regex(
- r'(<div[^>]+id="highlight_movie_%s"[^>]+>)' % video_id,
- webpage, 'highlight element', fatal=False)
- if highlight_element:
- highlight_attribs = extract_attributes(highlight_element)
- if highlight_attribs:
- entry['thumbnail'] = highlight_attribs.get('data-poster')
- for quality in ('', '-hd'):
- for ext in ('webm', 'mp4'):
- video_url = highlight_attribs.get('data-%s%s-source' % (ext, quality))
- if video_url:
- formats.append({
- 'format_id': ext + quality,
- 'url': video_url,
- })
- if not formats and not self.get_param('ignore_no_formats'):
- continue
- entry['formats'] = formats
- entries.append(entry)
if not entries:
raise ExtractorError('Could not find any videos')
diff --git a/hypervideo_dl/extractor/storyfire.py b/hypervideo_dl/extractor/storyfire.py
index 9c69862..e18a59a 100644
--- a/hypervideo_dl/extractor/storyfire.py
+++ b/hypervideo_dl/extractor/storyfire.py
@@ -5,7 +5,7 @@ import functools
from .common import InfoExtractor
from ..utils import (
- # HEADRequest,
+ format_field,
int_or_none,
OnDemandPagedList,
smuggle_url,
@@ -26,18 +26,6 @@ class StoryFireBaseIE(InfoExtractor):
r'https?://player\.vimeo\.com/external/(\d+)',
video['vimeoVideoURL'], 'vimeo id')
- # video_url = self._request_webpage(
- # HEADRequest(video['vimeoVideoURL']), video_id).geturl()
- # formats = []
- # for v_url, suffix in [(video_url, '_sep'), (video_url.replace('/sep/video/', '/video/'), '')]:
- # formats.extend(self._extract_m3u8_formats(
- # v_url, video_id, 'mp4', 'm3u8_native',
- # m3u8_id='hls' + suffix, fatal=False))
- # formats.extend(self._extract_mpd_formats(
- # v_url.replace('.m3u8', '.mpd'), video_id,
- # mpd_id='dash' + suffix, fatal=False))
- # self._sort_formats(formats)
-
uploader_id = video.get('hostID')
return {
@@ -51,7 +39,6 @@ class StoryFireBaseIE(InfoExtractor):
'Referer': 'https://storyfire.com/',
}
}),
- # 'formats': formats,
'thumbnail': video.get('storyImage'),
'view_count': int_or_none(video.get('views')),
'like_count': int_or_none(video.get('likesCount')),
@@ -60,7 +47,7 @@ class StoryFireBaseIE(InfoExtractor):
'timestamp': int_or_none(video.get('publishDate')),
'uploader': video.get('username'),
'uploader_id': uploader_id,
- 'uploader_url': 'https://storyfire.com/user/%s/video' % uploader_id if uploader_id else None,
+ 'uploader_url': format_field(uploader_id, template='https://storyfire.com/user/%s/video'),
'episode_number': int_or_none(video.get('episodeNumber') or video.get('episode_number')),
}
diff --git a/hypervideo_dl/extractor/streamcz.py b/hypervideo_dl/extractor/streamcz.py
index 58e0b4c..4cb9923 100644
--- a/hypervideo_dl/extractor/streamcz.py
+++ b/hypervideo_dl/extractor/streamcz.py
@@ -1,105 +1,124 @@
# coding: utf-8
-from __future__ import unicode_literals
-
-import hashlib
-import time
+import json
from .common import InfoExtractor
from ..utils import (
+ float_or_none,
int_or_none,
- sanitized_Request,
+ parse_codecs,
+ traverse_obj,
+ urljoin,
)
-def _get_api_key(api_path):
- if api_path.endswith('?'):
- api_path = api_path[:-1]
-
- api_key = 'fb5f58a820353bd7095de526253c14fd'
- a = '{0:}{1:}{2:}'.format(api_key, api_path, int(round(time.time() / 24 / 3600)))
- return hashlib.md5(a.encode('ascii')).hexdigest()
-
-
class StreamCZIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<id>[0-9]+)'
- _API_URL = 'http://www.stream.cz/API'
-
+ _VALID_URL = r'https?://(?:www\.)?(?:stream|televizeseznam)\.cz/[^?#]+/(?P<display_id>[^?#]+)-(?P<id>[0-9]+)'
_TESTS = [{
- 'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti',
- 'md5': '934bb6a6d220d99c010783c9719960d5',
+ 'url': 'https://www.televizeseznam.cz/video/lajna/buh-57953890',
+ 'md5': '40c41ade1464a390a0b447e333df4239',
'info_dict': {
- 'id': '765767',
+ 'id': '57953890',
'ext': 'mp4',
- 'title': 'Peklo na talíři: Éčka pro děti',
- 'description': 'Taška s grónskou pomazánkou a další pekelnosti ZDE',
- 'thumbnail': 're:^http://im.stream.cz/episode/52961d7e19d423f8f06f0100',
- 'duration': 256,
- },
+ 'title': 'Bůh',
+ 'display_id': 'buh',
+ 'description': 'md5:8f5f09b9b7bc67df910486cdd88f7165',
+ 'duration': 1369.6,
+ 'view_count': int,
+ }
}, {
- 'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka',
- 'md5': '849a88c1e1ca47d41403c2ba5e59e261',
+ 'url': 'https://www.stream.cz/kdo-to-mluvi/kdo-to-mluvi-velke-odhaleni-prinasi-novy-porad-uz-od-25-srpna-64087937',
+ 'md5': '41fd358000086a1ccdb068c77809b158',
'info_dict': {
- 'id': '10002447',
+ 'id': '64087937',
'ext': 'mp4',
- 'title': 'Kancelář Blaník: Tři roky pro Mazánka',
- 'description': 'md5:3862a00ba7bf0b3e44806b544032c859',
- 'thumbnail': 're:^http://im.stream.cz/episode/537f838c50c11f8d21320000',
- 'duration': 368,
- },
+ 'title': 'Kdo to mluví? Velké odhalení přináší nový pořad už od 25. srpna',
+ 'display_id': 'kdo-to-mluvi-velke-odhaleni-prinasi-novy-porad-uz-od-25-srpna',
+ 'description': 'md5:97a811000a6460266029d6c1c2ebcd59',
+ 'duration': 50.2,
+ 'view_count': int,
+ }
+ }, {
+ 'url': 'https://www.stream.cz/tajemno/znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili-64147267',
+ 'md5': '3ee4d0be040e8f4a543e67e509d55e3f',
+ 'info_dict': {
+ 'id': '64147267',
+ 'ext': 'mp4',
+ 'title': 'Zničehonic jim skrz střechu prolítnul záhadný předmět. Badatelé vše objasnili',
+ 'display_id': 'znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili',
+ 'description': 'md5:4b8ada6718d34bb011c4e04ca4bc19bf',
+ 'duration': 442.84,
+ 'view_count': int,
+ }
}]
- def _real_extract(self, url):
- video_id = self._match_id(url)
- api_path = '/episode/%s' % video_id
-
- req = sanitized_Request(self._API_URL + api_path)
- req.add_header('Api-Password', _get_api_key(api_path))
- data = self._download_json(req, video_id)
+ def _extract_formats(self, spl_url, video):
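+ # MP4 variants are preferred over the TS streams via source_preference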
+ for ext, pref, streams in (
+ ('ts', -1, traverse_obj(video, ('http_stream', 'qualities'))),
+ ('mp4', 1, video.get('mp4'))):
+ for format_id, stream in (streams or {}).items():
+ if not stream.get('url'):
+ continue
+ yield {
+ 'format_id': f'{format_id}-{ext}',
+ 'ext': ext,
+ 'source_preference': pref,
+ 'url': urljoin(spl_url, stream['url']),
+ 'tbr': float_or_none(stream.get('bandwidth'), scale=1000),
+ 'duration': float_or_none(stream.get('duration'), scale=1000),
+ 'width': traverse_obj(stream, ('resolution', 0)),
+ 'height': traverse_obj(stream, ('resolution', 1)) or int_or_none(format_id.replace('p', '')),
+ **parse_codecs(stream.get('codec')),
+ }
- formats = []
- for quality, video in enumerate(data['video_qualities']):
- for f in video['formats']:
- typ = f['type'].partition('/')[2]
- qlabel = video.get('quality_label')
- formats.append({
- 'format_note': '%s-%s' % (qlabel, typ) if qlabel else typ,
- 'format_id': '%s-%s' % (typ, f['quality']),
- 'url': f['source'],
- 'height': int_or_none(f['quality'].rstrip('p')),
- 'quality': quality,
- })
- self._sort_formats(formats)
+ def _real_extract(self, url):
+ display_id, video_id = self._match_valid_url(url).groups()
- image = data.get('image')
- if image:
- thumbnail = self._proto_relative_url(
- image.replace('{width}', '1240').replace('{height}', '697'),
- scheme='http:',
- )
- else:
- thumbnail = None
+ data = self._download_json(
+ 'https://www.televizeseznam.cz/api/graphql', video_id, 'Downloading GraphQL result',
+ data=json.dumps({
+ 'variables': {'urlName': video_id},
+ 'query': '''
+ query LoadEpisode($urlName : String){ episode(urlName: $urlName){ ...VideoDetailFragmentOnEpisode } }
+ fragment VideoDetailFragmentOnEpisode on Episode {
+ id
+ spl
+ urlName
+ name
+ perex
+ duration
+ views
+ }'''
+ }).encode('utf-8'),
+ headers={'Content-Type': 'application/json;charset=UTF-8'}
+ )['data']['episode']
- stream = data.get('_embedded', {}).get('stream:show', {}).get('name')
- if stream:
- title = '%s: %s' % (stream, data['name'])
- else:
- title = data['name']
+ spl_url = data['spl'] + 'spl2,3'
+ metadata = self._download_json(spl_url, video_id, 'Downloading playlist')
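+ # the playlist endpoint may answer with a JSON redirect ('Location') instead of data; follow it once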
+ if 'Location' in metadata and 'data' not in metadata:
+ spl_url = metadata['Location']
+ metadata = self._download_json(spl_url, video_id, 'Downloading redirected playlist')
+ video = metadata['data']
subtitles = {}
- srt_url = data.get('subtitles_srt')
- if srt_url:
- subtitles['cs'] = [{
- 'ext': 'srt',
- 'url': srt_url,
- }]
+ for subs in video.get('subtitles', {}).values():
+ if not subs.get('language'):
+ continue
+ for ext, sub_url in (subs.get('urls') or {}).items():
+ subtitles.setdefault(subs['language'], []).append({
+ 'ext': ext,
+ 'url': urljoin(spl_url, sub_url)
+ })
+
+ formats = list(self._extract_formats(spl_url, video))
+ self._sort_formats(formats)
return {
'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'formats': formats,
- 'description': data.get('web_site_text'),
- 'duration': int_or_none(data.get('duration')),
+ 'display_id': display_id,
+ 'title': data.get('name'),
+ 'description': data.get('perex'),
+ 'duration': float_or_none(data.get('duration')),
'view_count': int_or_none(data.get('views')),
+ 'formats': formats,
'subtitles': subtitles,
}
diff --git a/hypervideo_dl/extractor/streamff.py b/hypervideo_dl/extractor/streamff.py
new file mode 100644
index 0000000..6b190bb
--- /dev/null
+++ b/hypervideo_dl/extractor/streamff.py
@@ -0,0 +1,31 @@
+# coding: utf-8
+from .common import InfoExtractor
+from ..utils import int_or_none, parse_iso8601
+
+
+class StreamFFIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?streamff\.com/v/(?P<id>[a-zA-Z0-9]+)'
+
+ _TESTS = [{
+ 'url': 'https://streamff.com/v/55cc94',
+ 'md5': '8745a67bb5e5c570738efe7983826370',
+ 'info_dict': {
+ 'id': '55cc94',
+ 'ext': 'mp4',
+ 'title': '55cc94',
+ 'timestamp': 1634764643,
+ 'upload_date': '20211020',
+ 'view_count': int,
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ json_data = self._download_json(f'https://streamff.com/api/videos/{video_id}', video_id)
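+ # 'videoLink' appears to be a site-relative path to the media file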
+ return {
+ 'id': video_id,
+ 'title': json_data.get('name') or video_id,
+ 'url': 'https://streamff.com/%s' % json_data['videoLink'],
+ 'view_count': int_or_none(json_data.get('views')),
+ 'timestamp': parse_iso8601(json_data.get('date')),
+ }
diff --git a/hypervideo_dl/extractor/stripchat.py b/hypervideo_dl/extractor/stripchat.py
new file mode 100644
index 0000000..0d4a0ce
--- /dev/null
+++ b/hypervideo_dl/extractor/stripchat.py
@@ -0,0 +1,66 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+)
+from ..utils import (
+ ExtractorError,
+ lowercase_escape,
+ try_get,
+)
+
+
+class StripchatIE(InfoExtractor):
+ _VALID_URL = r'https?://stripchat\.com/(?P<id>[0-9A-Za-z-_]+)'
+ _TESTS = [{
+ 'url': 'https://stripchat.com/feel_me',
+ 'info_dict': {
+ 'id': 'feel_me',
+ 'ext': 'mp4',
+ 'title': 're:^feel_me [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': str,
+ 'is_live': True,
+ 'age_limit': 18,
+ },
+ 'skip': 'Room is offline',
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(
+ 'https://stripchat.com/%s/' % video_id, video_id,
+ headers=self.geo_verification_headers())
+
+ data = self._parse_json(
+ self._search_regex(
+ r'<script\b[^>]*>\s*window\.__PRELOADED_STATE__\s*=(?P<value>.*?)</script>',
+ webpage, 'data', default='{}', group='value'),
+ video_id, transform_source=lowercase_escape, fatal=False)
+ if not data:
+ raise ExtractorError('Unable to find configuration for stream.')
+
+ if try_get(data, lambda x: x['viewCam']['show'], dict):
+ raise ExtractorError('Model is in private show', expected=True)
+ elif not try_get(data, lambda x: x['viewCam']['model']['isLive'], bool):
+ raise ExtractorError('Model is offline', expected=True)
+
+ server = try_get(data, lambda x: x['viewCam']['viewServers']['flashphoner-hls'], compat_str)
+ host = try_get(data, lambda x: x['config']['data']['hlsStreamHost'], compat_str)
+ model_id = try_get(data, lambda x: x['viewCam']['model']['id'], int)
+
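+ # the HLS URL is assembled from the assigned edge server, stream host and model id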
+ formats = self._extract_m3u8_formats(
+ 'https://b-%s.%s/hls/%d/%d.m3u8' % (server, host, model_id, model_id),
+ video_id, ext='mp4', m3u8_id='hls', fatal=False, live=True)
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video_id,
+ 'description': self._og_search_description(webpage),
+ 'is_live': True,
+ 'formats': formats,
+ # Stripchat declares the RTA meta-tag, but in a non-standard format so _rta_search() can't be used
+ 'age_limit': 18,
+ }
diff --git a/hypervideo_dl/extractor/stv.py b/hypervideo_dl/extractor/stv.py
index d36a4b6..ba5661d 100644
--- a/hypervideo_dl/extractor/stv.py
+++ b/hypervideo_dl/extractor/stv.py
@@ -45,10 +45,7 @@ class STVPlayerIE(InfoExtractor):
ptype, video_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, video_id, fatal=False) or ''
- props = (self._parse_json(self._search_regex(
- r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
- webpage, 'next data', default='{}'), video_id,
- fatal=False) or {}).get('props') or {}
+ props = self._search_nextjs_data(webpage, video_id, default='{}').get('props') or {}
player_api_cache = try_get(
props, lambda x: x['initialReduxState']['playerApiCache']) or {}
diff --git a/hypervideo_dl/extractor/sunporno.py b/hypervideo_dl/extractor/sunporno.py
index 6805116..59b77bf 100644
--- a/hypervideo_dl/extractor/sunporno.py
+++ b/hypervideo_dl/extractor/sunporno.py
@@ -36,8 +36,7 @@ class SunPornoIE(InfoExtractor):
webpage = self._download_webpage(
'http://www.sunporno.com/videos/%s' % video_id, video_id)
- title = self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
description = self._html_search_meta(
'description', webpage, 'description')
thumbnail = self._html_search_regex(
diff --git a/hypervideo_dl/extractor/svt.py b/hypervideo_dl/extractor/svt.py
index 38e0086..8ca62e3 100644
--- a/hypervideo_dl/extractor/svt.py
+++ b/hypervideo_dl/extractor/svt.py
@@ -23,23 +23,27 @@ class SVTBaseIE(InfoExtractor):
is_live = dict_get(video_info, ('live', 'simulcast'), default=False)
m3u8_protocol = 'm3u8' if is_live else 'm3u8_native'
formats = []
+ subtitles = {}
for vr in video_info['videoReferences']:
player_type = vr.get('playerType') or vr.get('format')
vurl = vr['url']
ext = determine_ext(vurl)
if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
vurl, video_id,
ext='mp4', entry_protocol=m3u8_protocol,
- m3u8_id=player_type, fatal=False))
+ m3u8_id=player_type, fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
vurl + '?hdcore=3.3.0', video_id,
f4m_id=player_type, fatal=False))
elif ext == 'mpd':
- if player_type == 'dashhbbtv':
- formats.extend(self._extract_mpd_formats(
- vurl, video_id, mpd_id=player_type, fatal=False))
+ fmts, subs = self._extract_mpd_formats_and_subtitles(
+ vurl, video_id, mpd_id=player_type, fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
else:
formats.append({
'format_id': player_type,
@@ -52,18 +56,19 @@ class SVTBaseIE(InfoExtractor):
countries=self._GEO_COUNTRIES, metadata_available=True)
self._sort_formats(formats)
- subtitles = {}
subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences'))
if isinstance(subtitle_references, list):
for sr in subtitle_references:
subtitle_url = sr.get('url')
subtitle_lang = sr.get('language', 'sv')
if subtitle_url:
+ sub = {
+ 'url': subtitle_url,
+ }
if determine_ext(subtitle_url) == 'm3u8':
- # TODO(yan12125): handle WebVTT in m3u8 manifests
- continue
-
- subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url})
+ # XXX: no way of testing, is it ever hit?
+ sub['ext'] = 'vtt'
+ subtitles.setdefault(subtitle_lang, []).append(sub)
title = video_info.get('title')
@@ -168,7 +173,6 @@ class SVTPlayIE(SVTPlayBaseIE):
},
},
'params': {
- 'format': 'bestvideo',
# skip for now due to download test asserts that segment is > 10000 bytes and svt uses
# init segments that are smaller
# AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B
@@ -204,10 +208,6 @@ class SVTPlayIE(SVTPlayBaseIE):
'only_matching': True,
}]
- def _adjust_title(self, info):
- if info['is_live']:
- info['title'] = self._live_title(info['title'])
-
def _extract_by_video_id(self, video_id, webpage=None):
data = self._download_json(
'https://api.svt.se/videoplayer-api/video/%s' % video_id,
@@ -221,7 +221,6 @@ class SVTPlayIE(SVTPlayBaseIE):
if not title:
title = video_id
info_dict['title'] = title
- self._adjust_title(info_dict)
return info_dict
def _real_extract(self, url):
@@ -252,7 +251,6 @@ class SVTPlayIE(SVTPlayBaseIE):
'title': data['context']['dispatcher']['stores']['MetaStore']['title'],
'thumbnail': thumbnail,
})
- self._adjust_title(info_dict)
return info_dict
svt_id = try_get(
diff --git a/hypervideo_dl/extractor/tagesschau.py b/hypervideo_dl/extractor/tagesschau.py
index 25c2004..6e03d0a 100644
--- a/hypervideo_dl/extractor/tagesschau.py
+++ b/hypervideo_dl/extractor/tagesschau.py
@@ -5,177 +5,63 @@ import re
from .common import InfoExtractor
from ..utils import (
- determine_ext,
js_to_json,
- parse_iso8601,
- parse_filesize,
+ extract_attributes,
+ try_get,
+ int_or_none,
)
-class TagesschauPlayerIE(InfoExtractor):
- IE_NAME = 'tagesschau:player'
- _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?P<kind>audio|video)/(?P=kind)-(?P<id>\d+)~player(?:_[^/?#&]+)?\.html'
-
- _TESTS = [{
- 'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html',
- 'md5': '8d09548d5c15debad38bee3a4d15ca21',
- 'info_dict': {
- 'id': '179517',
- 'ext': 'mp4',
- 'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD',
- 'thumbnail': r're:^https?:.*\.jpg$',
- 'formats': 'mincount:6',
- },
- }, {
- 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
- 'md5': '76e6eec6ebd40740671cf0a2c88617e5',
- 'info_dict': {
- 'id': '29417',
- 'ext': 'mp3',
- 'title': 'Trabi - Bye, bye Rennpappe',
- 'thumbnail': r're:^https?:.*\.jpg$',
- 'formats': 'mincount:2',
- },
- }, {
- 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html',
- 'only_matching': True,
- }]
-
- _FORMATS = {
- 'xs': {'quality': 0},
- 's': {'width': 320, 'height': 180, 'quality': 1},
- 'm': {'width': 512, 'height': 288, 'quality': 2},
- 'l': {'width': 960, 'height': 540, 'quality': 3},
- 'xl': {'width': 1280, 'height': 720, 'quality': 4},
- 'xxl': {'quality': 5},
- }
-
- def _extract_via_api(self, kind, video_id):
- info = self._download_json(
- 'https://www.tagesschau.de/api/multimedia/{0}/{0}-{1}.json'.format(kind, video_id),
- video_id)
- title = info['headline']
- formats = []
- for media in info['mediadata']:
- for format_id, format_url in media.items():
- if determine_ext(format_url) == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- format_url, video_id, 'mp4',
- entry_protocol='m3u8_native', m3u8_id='hls'))
- else:
- formats.append({
- 'url': format_url,
- 'format_id': format_id,
- 'vcodec': 'none' if kind == 'audio' else None,
- })
- self._sort_formats(formats)
- timestamp = parse_iso8601(info.get('date'))
- return {
- 'id': video_id,
- 'title': title,
- 'timestamp': timestamp,
- 'formats': formats,
- }
-
- def _real_extract(self, url):
- mobj = self._match_valid_url(url)
- video_id = mobj.group('id')
-
- # kind = mobj.group('kind').lower()
- # if kind == 'video':
- # return self._extract_via_api(kind, video_id)
-
- # JSON api does not provide some audio formats (e.g. ogg) thus
- # extracting audio via webpage
-
- webpage = self._download_webpage(url, video_id)
-
- title = self._og_search_title(webpage).strip()
- formats = []
-
- for media_json in re.findall(r'({src\s*:\s*["\']http[^}]+type\s*:[^}]+})', webpage):
- media = self._parse_json(js_to_json(media_json), video_id, fatal=False)
- if not media:
- continue
- src = media.get('src')
- if not src:
- return
- quality = media.get('quality')
- kind = media.get('type', '').split('/')[0]
- ext = determine_ext(src)
- f = {
- 'url': src,
- 'format_id': '%s_%s' % (quality, ext) if quality else ext,
- 'ext': ext,
- 'vcodec': 'none' if kind == 'audio' else None,
- }
- f.update(self._FORMATS.get(quality, {}))
- formats.append(f)
-
- self._sort_formats(formats)
-
- thumbnail = self._og_search_thumbnail(webpage)
-
- return {
- 'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'formats': formats,
- }
-
-
class TagesschauIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'
_TESTS = [{
'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
- 'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6',
+ 'md5': '7a7287612fa881a1ae1d087df45c2fd6',
'info_dict': {
- 'id': 'video-102143',
+ 'id': 'video-102143-1',
'ext': 'mp4',
'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
- 'description': '18.07.2015 20:10 Uhr',
- 'thumbnail': r're:^https?:.*\.jpg$',
},
}, {
'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
'md5': '3c54c1f6243d279b706bde660ceec633',
'info_dict': {
- 'id': 'ts-5727',
+ 'id': 'ts-5727-1',
'ext': 'mp4',
- 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
- 'description': 'md5:695c01bfd98b7e313c501386327aea59',
- 'thumbnail': r're:^https?:.*\.jpg$',
+ 'title': 'Ganze Sendung',
},
}, {
# exclusive audio
'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
- 'md5': '76e6eec6ebd40740671cf0a2c88617e5',
+ 'md5': '4cf22023c285f35e99c24d290ba58cc9',
'info_dict': {
- 'id': 'audio-29417',
+ 'id': 'audio-29417-1',
'ext': 'mp3',
- 'title': 'Trabi - Bye, bye Rennpappe',
- 'description': 'md5:8687dda862cbbe2cfb2df09b56341317',
- 'thumbnail': r're:^https?:.*\.jpg$',
+ 'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt',
},
}, {
- # audio in article
'url': 'http://www.tagesschau.de/inland/bnd-303.html',
- 'md5': 'e0916c623e85fc1d2b26b78f299d3958',
+ 'md5': '12cfb212d9325b5ba0d52b625f1aa61c',
'info_dict': {
- 'id': 'bnd-303',
- 'ext': 'mp3',
- 'title': 'Viele Baustellen für neuen BND-Chef',
- 'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4',
- 'thumbnail': r're:^https?:.*\.jpg$',
+ 'id': 'bnd-303-1',
+ 'ext': 'mp4',
+ 'title': 'SPD-Gruppenbild mit Bärbel Bas nach der Fraktionssitzung | dpa',
},
}, {
'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
'info_dict': {
'id': 'afd-parteitag-135',
- 'title': 'Möchtegern-Underdog mit Machtanspruch',
+ 'title': 'AfD',
+ },
+ 'playlist_count': 20,
+ }, {
+ 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
+ 'info_dict': {
+ 'id': 'audio-29417-1',
+ 'ext': 'mp3',
+ 'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt',
},
- 'playlist_count': 2,
}, {
'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
'only_matching': True,
@@ -206,62 +92,6 @@ class TagesschauIE(InfoExtractor):
'only_matching': True,
}]
- @classmethod
- def suitable(cls, url):
- return False if TagesschauPlayerIE.suitable(url) else super(TagesschauIE, cls).suitable(url)
-
- def _extract_formats(self, download_text, media_kind):
- links = re.finditer(
- r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>',
- download_text)
- formats = []
- for l in links:
- link_url = l.group('url')
- if not link_url:
- continue
- format_id = self._search_regex(
- r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID',
- default=determine_ext(link_url))
- format = {
- 'format_id': format_id,
- 'url': l.group('url'),
- 'format_name': l.group('name'),
- }
- title = l.group('title')
- if title:
- if media_kind.lower() == 'video':
- m = re.match(
- r'''(?x)
- Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
- (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
- (?P<vbr>[0-9]+)kbps&\#10;
- Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
- Gr&ouml;&szlig;e:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''',
- title)
- if m:
- format.update({
- 'format_note': m.group('audio_desc'),
- 'vcodec': m.group('vcodec'),
- 'width': int(m.group('width')),
- 'height': int(m.group('height')),
- 'abr': int(m.group('abr')),
- 'vbr': int(m.group('vbr')),
- 'filesize_approx': parse_filesize(m.group('filesize_approx')),
- })
- else:
- m = re.match(
- r'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)',
- title)
- if m:
- format.update({
- 'format_note': '%s, %s' % (m.group('format'), m.group('note')),
- 'vcodec': 'none',
- 'abr': int(m.group('abr')),
- })
- formats.append(format)
- self._sort_formats(formats)
- return formats
-
def _real_extract(self, url):
mobj = self._match_valid_url(url)
video_id = mobj.group('id') or mobj.group('path')
@@ -271,34 +101,46 @@ class TagesschauIE(InfoExtractor):
title = self._html_search_regex(
r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
- webpage, 'title', default=None) or self._og_search_title(webpage)
-
- DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>'
-
- webpage_type = self._og_search_property('type', webpage, default=None)
- if webpage_type == 'website': # Article
- entries = []
- for num, (entry_title, media_kind, download_text) in enumerate(re.findall(
- r'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX,
- webpage), 1):
+ webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False)
+
+ entries = []
+ videos = re.findall(r'<div[^>]+>', webpage)
+ num = 0
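+ # Each embedded player is a <div> whose data-config attribute holds a JSON
+ # player config; scan every div and keep the ones that parse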
+ for video in videos:
+ video = extract_attributes(video).get('data-config')
+ if not video:
+ continue
+ video = self._parse_json(video, video_id, transform_source=js_to_json, fatal=False)
+ video_formats = try_get(video, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray'])
+ if not video_formats:
+ continue
+ num += 1
+ for video_format in video_formats:
+ media_url = video_format.get('_stream') or ''
+ formats = []
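+ # master.m3u8 URLs are expanded into HLS renditions; direct
+ # https://download... links ending in .hi.mp3 are kept as plain audio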
+ if media_url.endswith('master.m3u8'):
+ formats = self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls')
+ elif media_url.endswith('.hi.mp3') and media_url.startswith('https://download'):
+ formats = [{
+ 'url': media_url,
+ 'vcodec': 'none',
+ }]
+ if not formats:
+ continue
entries.append({
'id': '%s-%d' % (display_id, num),
- 'title': '%s' % entry_title,
- 'formats': self._extract_formats(download_text, media_kind),
+ 'title': try_get(video, lambda x: x['mc']['_title']),
+ 'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])),
+ 'formats': formats
})
- if len(entries) > 1:
- return self.playlist_result(entries, display_id, title)
- formats = entries[0]['formats']
- else: # Assume single video
- download_text = self._search_regex(
- DOWNLOAD_REGEX, webpage, 'download links', group='links')
- media_kind = self._search_regex(
- DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind')
- formats = self._extract_formats(download_text, media_kind)
- thumbnail = self._og_search_thumbnail(webpage)
- description = self._html_search_regex(
- r'(?s)<p class="teasertext">(.*?)</p>',
- webpage, 'description', default=None)
+ if len(entries) > 1:
+ return self.playlist_result(entries, display_id, title)
+ formats = entries[0]['formats']
+ video_info = self._search_json_ld(webpage, video_id)
+ description = video_info.get('description')
+ thumbnail = self._og_search_thumbnail(webpage) or video_info.get('thumbnail')
+ timestamp = video_info.get('timestamp')
+ title = title or video_info.get('description')
self._sort_formats(formats)
@@ -307,5 +149,6 @@ class TagesschauIE(InfoExtractor):
'title': title,
'thumbnail': thumbnail,
'formats': formats,
+ 'timestamp': timestamp,
'description': description,
}
diff --git a/hypervideo_dl/extractor/teachable.py b/hypervideo_dl/extractor/teachable.py
index 37eae82..232eaa5 100644
--- a/hypervideo_dl/extractor/teachable.py
+++ b/hypervideo_dl/extractor/teachable.py
@@ -40,8 +40,7 @@ class TeachableBaseIE(InfoExtractor):
if self._logged_in:
return
- username, password = self._get_login_info(
- netrc_machine=self._SITES.get(site, site))
+ username, password = self._get_login_info(netrc_machine=self._SITES.get(site, site))
if username is None:
return
diff --git a/hypervideo_dl/extractor/teamtreehouse.py b/hypervideo_dl/extractor/teamtreehouse.py
index d347e97..64522ec 100644
--- a/hypervideo_dl/extractor/teamtreehouse.py
+++ b/hypervideo_dl/extractor/teamtreehouse.py
@@ -51,17 +51,14 @@ class TeamTreeHouseIE(InfoExtractor):
}]
_NETRC_MACHINE = 'teamtreehouse'
- def _real_initialize(self):
- email, password = self._get_login_info()
- if email is None:
- return
+ def _perform_login(self, username, password):
signin_page = self._download_webpage(
'https://teamtreehouse.com/signin',
None, 'Downloading signin page')
data = self._form_hidden_inputs('new_user_session', signin_page)
data.update({
- 'user_session[email]': email,
+ 'user_session[email]': username,
'user_session[password]': password,
})
error_message = get_element_by_class('error-message', self._download_webpage(
diff --git a/hypervideo_dl/extractor/ted.py b/hypervideo_dl/extractor/ted.py
index f09f1a3..b5c7e35 100644
--- a/hypervideo_dl/extractor/ted.py
+++ b/hypervideo_dl/extractor/ted.py
@@ -1,274 +1,105 @@
-from __future__ import unicode_literals
-
-import json
+import itertools
import re
from .common import InfoExtractor
-from ..compat import (
- compat_str,
- compat_urlparse
-)
from ..utils import (
- extract_attributes,
- float_or_none,
int_or_none,
+ str_to_int,
try_get,
url_or_none,
+ unified_strdate,
+ parse_duration,
)
-class TEDIE(InfoExtractor):
- IE_NAME = 'ted'
- _VALID_URL = r'''(?x)
- (?P<proto>https?://)
- (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
- (
- (?P<type_playlist>playlists(?:/(?P<playlist_id>\d+))?) # We have a playlist
- |
- ((?P<type_talk>talks)) # We have a simple talk
- |
- (?P<type_watch>watch)/[^/]+/[^/]+
- )
- (/lang/(.*?))? # The url may contain the language
- /(?P<name>[\w-]+) # Here goes the name and then ".html"
- .*)$
- '''
+class TedBaseIE(InfoExtractor):
+ _VALID_URL_BASE = r'https?://www\.ted\.com/(?:{type})(?:/lang/[^/#?]+)?/(?P<id>[\w-]+)'
+
+ def _parse_playlist(self, playlist):
+ for entry in try_get(playlist, lambda x: x['videos']['nodes'], list):
+ if entry.get('__typename') == 'Video' and entry.get('canonicalUrl'):
+ yield self.url_result(entry['canonicalUrl'], TedTalkIE.ie_key())
+
+
+class TedTalkIE(TedBaseIE):
+ _VALID_URL = TedBaseIE._VALID_URL_BASE.format(type='talks')
_TESTS = [{
- 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
- 'md5': 'b0ce2b05ca215042124fbc9e3886493a',
- 'info_dict': {
- 'id': '102',
- 'ext': 'mp4',
- 'title': 'The illusion of consciousness',
- 'description': ('Philosopher Dan Dennett makes a compelling '
- 'argument that not only don\'t we understand our own '
- 'consciousness, but that half the time our brains are '
- 'actively fooling us.'),
- 'uploader': 'Dan Dennett',
- 'width': 853,
- 'duration': 1308,
- 'view_count': int,
- 'comment_count': int,
- 'tags': list,
- },
- 'params': {
- 'skip_download': True,
- },
- }, {
- # missing HTTP bitrates
- 'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
- 'info_dict': {
- 'id': '6069',
- 'ext': 'mp4',
- 'title': 'The beauty and power of algorithms',
- 'thumbnail': r're:^https?://.+\.jpg',
- 'description': 'md5:734e352710fb00d840ab87ae31aaf688',
- 'uploader': 'Vishal Sikka',
- },
- 'params': {
- 'skip_download': True,
- },
- }, {
- 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
- 'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
- 'info_dict': {
- 'id': '1972',
- 'ext': 'mp4',
- 'title': 'Be passionate. Be courageous. Be your best.',
- 'uploader': 'Gabby Giffords and Mark Kelly',
- 'description': 'md5:5174aed4d0f16021b704120360f72b92',
- 'duration': 1128,
- },
- 'params': {
- 'skip_download': True,
- },
- }, {
- 'url': 'http://www.ted.com/playlists/who_are_the_hackers',
- 'info_dict': {
- 'id': '10',
- 'title': 'Who are the hackers?',
- 'description': 'md5:49a0dbe8fb76d81a0e64b4a80af7f15a'
- },
- 'playlist_mincount': 6,
- }, {
- # contains a youtube video
- 'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
- 'add_ie': ['Youtube'],
- 'info_dict': {
- 'id': '_ZG8HBuDjgc',
- 'ext': 'webm',
- 'title': 'Douglas Adams: Parrots the Universe and Everything',
- 'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
- 'uploader': 'University of California Television (UCTV)',
- 'uploader_id': 'UCtelevision',
- 'upload_date': '20080522',
- },
- 'params': {
- 'skip_download': True,
- },
- }, {
- # no nativeDownloads
- 'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
+ 'url': 'https://www.ted.com/talks/candace_parker_how_to_break_down_barriers_and_not_accept_limits',
+ 'md5': '47e82c666d9c3261d4fe74748a90aada',
'info_dict': {
- 'id': '1792',
+ 'id': '86532',
'ext': 'mp4',
- 'title': 'The orchestra in my mouth',
- 'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
- 'uploader': 'Tom Thum',
+ 'title': 'How to break down barriers and not accept limits',
+ 'description': 'md5:000707cece219d1e165b11550d612331',
'view_count': int,
- 'comment_count': int,
- 'tags': list,
+ 'tags': ['personal growth', 'equality', 'activism', 'motivation', 'social change', 'sports'],
+ 'uploader': 'Candace Parker',
+ 'duration': 676.0,
+ 'upload_date': '20220114',
+ 'release_date': '20211201',
+ 'thumbnail': r're:http.*\.jpg',
},
- 'params': {
- 'skip_download': True,
- },
- }, {
- # with own formats and private Youtube external
- 'url': 'https://www.ted.com/talks/spencer_wells_a_family_tree_for_humanity',
- 'only_matching': True,
}]
- _NATIVE_FORMATS = {
- 'low': {'width': 320, 'height': 180},
- 'medium': {'width': 512, 'height': 288},
- 'high': {'width': 854, 'height': 480},
- }
-
- def _extract_info(self, webpage):
- info_json = self._search_regex(
- r'(?s)q\(\s*"\w+.init"\s*,\s*({.+?})\)\s*</script>',
- webpage, 'info json')
- return json.loads(info_json)
-
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url, re.VERBOSE)
- if m.group('type').startswith('embed'):
- desktop_url = m.group('proto') + 'www' + m.group('urlmain')
- return self.url_result(desktop_url, 'TED')
- name = m.group('name')
- if m.group('type_talk'):
- return self._talk_info(url, name)
- elif m.group('type_watch'):
- return self._watch_info(url, name)
- else:
- return self._playlist_videos_info(url, name)
-
- def _playlist_videos_info(self, url, name):
- '''Returns the videos of the playlist'''
-
- webpage = self._download_webpage(url, name,
- 'Downloading playlist webpage')
-
- playlist_entries = []
- for entry in re.findall(r'(?s)<[^>]+data-ga-context=["\']playlist["\'][^>]*>', webpage):
- attrs = extract_attributes(entry)
- entry_url = compat_urlparse.urljoin(url, attrs['href'])
- playlist_entries.append(self.url_result(entry_url, self.ie_key()))
-
- final_url = self._og_search_url(webpage, fatal=False)
- playlist_id = (
- re.match(self._VALID_URL, final_url).group('playlist_id')
- if final_url else None)
-
- return self.playlist_result(
- playlist_entries, playlist_id=playlist_id,
- playlist_title=self._og_search_title(webpage, fatal=False),
- playlist_description=self._og_search_description(webpage))
-
- def _talk_info(self, url, video_name):
- webpage = self._download_webpage(url, video_name)
-
- info = self._extract_info(webpage)
-
- data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info
- talk_info = data['talks'][0]
-
- title = talk_info['title'].strip()
-
- downloads = talk_info.get('downloads') or {}
- native_downloads = downloads.get('nativeDownloads') or talk_info.get('nativeDownloads') or {}
-
- formats = [{
- 'url': format_url,
- 'format_id': format_id,
- } for (format_id, format_url) in native_downloads.items() if format_url is not None]
-
- subtitled_downloads = downloads.get('subtitledDownloads') or {}
- for lang, subtitled_download in subtitled_downloads.items():
- for q in self._NATIVE_FORMATS:
- q_url = subtitled_download.get(q)
- if not q_url:
- continue
- formats.append({
- 'url': q_url,
- 'format_id': '%s-%s' % (q, lang),
- 'language': lang,
- })
-
- if formats:
- for f in formats:
- finfo = self._NATIVE_FORMATS.get(f['format_id'].split('-')[0])
- if finfo:
- f.update(finfo)
-
- player_talk = talk_info['player_talks'][0]
-
- resources_ = player_talk.get('resources') or talk_info.get('resources')
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ talk_info = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['videoData']
+ video_id = talk_info['id']
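+ # playerData arrives as a JSON-encoded string inside the Next.js payload,
+ # hence the extra _parse_json round trip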
+ playerData = self._parse_json(talk_info.get('playerData'), video_id)
http_url = None
- for format_id, resources in resources_.items():
+ formats, subtitles = [], {}
+ for format_id, resources in (playerData.get('resources') or {}).items():
if format_id == 'hls':
- if not isinstance(resources, dict):
- continue
- stream_url = url_or_none(resources.get('stream'))
+ stream_url = url_or_none(try_get(resources, lambda x: x['stream']))
if not stream_url:
continue
- formats.extend(self._extract_m3u8_formats(
- stream_url, video_name, 'mp4', m3u8_id=format_id,
- fatal=False))
- else:
- if not isinstance(resources, list):
- continue
- if format_id == 'h264':
- for resource in resources:
- h264_url = resource.get('file')
- if not h264_url:
- continue
- bitrate = int_or_none(resource.get('bitrate'))
- formats.append({
- 'url': h264_url,
- 'format_id': '%s-%sk' % (format_id, bitrate),
- 'tbr': bitrate,
- })
- if re.search(r'\d+k', h264_url):
- http_url = h264_url
- elif format_id == 'rtmp':
- streamer = talk_info.get('streamer')
- if not streamer:
+ m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
+ stream_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)
+ formats.extend(m3u8_formats)
+ subtitles = self._merge_subtitles(subtitles, m3u8_subs)
+ continue
+
+ if not isinstance(resources, list):
+ continue
+ if format_id == 'h264':
+ for resource in resources:
+ h264_url = resource.get('file')
+ if not h264_url:
continue
- for resource in resources:
- formats.append({
- 'format_id': '%s-%s' % (format_id, resource.get('name')),
- 'url': streamer,
- 'play_path': resource['file'],
- 'ext': 'flv',
- 'width': int_or_none(resource.get('width')),
- 'height': int_or_none(resource.get('height')),
- 'tbr': int_or_none(resource.get('bitrate')),
- })
+ bitrate = int_or_none(resource.get('bitrate'))
+ formats.append({
+ 'url': h264_url,
+ 'format_id': '%s-%sk' % (format_id, bitrate),
+ 'tbr': bitrate,
+ })
+ if re.search(r'\d+k', h264_url):
+ http_url = h264_url
+ elif format_id == 'rtmp':
+ streamer = talk_info.get('streamer')
+ if not streamer:
+ continue
+ formats.extend({
+ 'format_id': '%s-%s' % (format_id, resource.get('name')),
+ 'url': streamer,
+ 'play_path': resource['file'],
+ 'ext': 'flv',
+ 'width': int_or_none(resource.get('width')),
+ 'height': int_or_none(resource.get('height')),
+ 'tbr': int_or_none(resource.get('bitrate')),
+ } for resource in resources if resource.get('file'))
- m3u8_formats = list(filter(
- lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
- formats))
if http_url:
+ m3u8_formats = [f for f in formats if f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none']
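+ # Clone each HLS rendition into a progressive HTTP format by substituting
+ # its bitrate into the known direct-download URL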
for m3u8_format in m3u8_formats:
bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
if not bitrate:
continue
bitrate_url = re.sub(r'\d+k', bitrate, http_url)
if not self._is_valid_url(
- bitrate_url, video_name, '%s bitrate' % bitrate):
+ bitrate_url, video_id, '%s bitrate' % bitrate):
continue
f = m3u8_format.copy()
f.update({
@@ -289,79 +120,123 @@ class TEDIE(InfoExtractor):
})
if not formats:
- external = player_talk.get('external')
- if isinstance(external, dict):
- service = external.get('service')
- if isinstance(service, compat_str):
- ext_url = None
- if service.lower() == 'youtube':
- ext_url = external.get('code')
- return self.url_result(ext_url or external['uri'])
+ external = playerData.get('external') or {}
+ service = external.get('service') or ''
+ ext_url = external.get('code') if service.lower() == 'youtube' else None
+ return self.url_result(ext_url or external['uri'])
self._sort_formats(formats)
- video_id = compat_str(talk_info['id'])
+ thumbnail = playerData.get('thumb') or self._og_search_property('image', webpage)
+ if thumbnail:
+ # trim thumbnail resize parameters
+ thumbnail = thumbnail.split('?')[0]
return {
'id': video_id,
- 'title': title,
- 'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
- 'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
- 'description': self._og_search_description(webpage),
- 'subtitles': self._get_subtitles(video_id, talk_info),
+ 'title': talk_info.get('title') or self._og_search_title(webpage),
+ 'uploader': talk_info.get('presenterDisplayName'),
+ 'thumbnail': thumbnail,
+ 'description': talk_info.get('description') or self._og_search_description(webpage),
+ 'subtitles': subtitles,
'formats': formats,
- 'duration': float_or_none(talk_info.get('duration')),
- 'view_count': int_or_none(data.get('viewed_count')),
- 'comment_count': int_or_none(
- try_get(data, lambda x: x['comments']['count'])),
- 'tags': try_get(talk_info, lambda x: x['tags'], list),
+ 'duration': talk_info.get('duration') or parse_duration(self._og_search_property('video:duration', webpage)),
+ 'view_count': str_to_int(talk_info.get('viewedCount')),
+ 'upload_date': unified_strdate(talk_info.get('publishedAt')),
+ 'release_date': unified_strdate(talk_info.get('recordedOn')),
+ 'tags': try_get(playerData, lambda x: x['targeting']['tag'].split(',')),
}
- def _get_subtitles(self, video_id, talk_info):
- sub_lang_list = {}
- for language in try_get(
- talk_info,
- (lambda x: x['downloads']['languages'],
- lambda x: x['languages']), list):
- lang_code = language.get('languageCode') or language.get('ianaCode')
- if not lang_code:
- continue
- sub_lang_list[lang_code] = [
- {
- 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
- 'ext': ext,
- }
- for ext in ['ted', 'srt']
- ]
- return sub_lang_list
- def _watch_info(self, url, name):
- webpage = self._download_webpage(url, name)
+class TedSeriesIE(TedBaseIE):
+ _VALID_URL = fr'{TedBaseIE._VALID_URL_BASE.format(type=r"series")}(?:#season_(?P<season>\d+))?'
+ _TESTS = [{
+ 'url': 'https://www.ted.com/series/small_thing_big_idea',
+ 'info_dict': {
+ 'id': '3',
+ 'title': 'Small Thing Big Idea',
+ 'series': 'Small Thing Big Idea',
+ 'description': 'md5:6869ca52cec661aef72b3e9f7441c55c'
+ },
+ 'playlist_mincount': 16,
+ }, {
+ 'url': 'https://www.ted.com/series/the_way_we_work#season_2',
+ 'info_dict': {
+ 'id': '8_2',
+ 'title': 'The Way We Work Season 2',
+ 'series': 'The Way We Work',
+ 'description': 'md5:59469256e533e1a48c4aa926a382234c',
+ 'season_number': 2
+ },
+ 'playlist_mincount': 8,
+ }]
- config_json = self._html_search_regex(
- r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
- webpage, 'config', default=None)
- if not config_json:
- embed_url = self._search_regex(
- r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
- return self.url_result(self._proto_relative_url(embed_url))
- config = json.loads(config_json)['config']
- video_url = config['video']['url']
- thumbnail = config.get('image', {}).get('url')
+ def _real_extract(self, url):
+ display_id, season = self._match_valid_url(url).group('id', 'season')
+ webpage = self._download_webpage(url, display_id, 'Downloading series webpage')
+ info = self._search_nextjs_data(webpage, display_id)['props']['pageProps']
- title = self._html_search_regex(
- r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
- description = self._html_search_regex(
- [
- r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
- r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
- ],
- webpage, 'description', fatal=False)
+ entries = itertools.chain.from_iterable(
+ self._parse_playlist(s) for s in info['seasons'] if season in [None, s.get('seasonNumber')])
- return {
- 'id': name,
- 'url': video_url,
- 'title': title,
- 'thumbnail': thumbnail,
- 'description': description,
- }
+ series_id = try_get(info, lambda x: x['series']['id'])
+ series_name = try_get(info, lambda x: x['series']['name']) or self._og_search_title(webpage, fatal=False)
+
+ return self.playlist_result(
+ entries,
+ f'{series_id}_{season}' if season and series_id else series_id,
+ f'{series_name} Season {season}' if season else series_name,
+ self._og_search_description(webpage),
+ series=series_name, season_number=int_or_none(season))
+
+
+class TedPlaylistIE(TedBaseIE):
+ _VALID_URL = TedBaseIE._VALID_URL_BASE.format(type=r'playlists(?:/\d+)?')
+ _TESTS = [{
+ 'url': 'https://www.ted.com/playlists/171/the_most_popular_talks_of_all',
+ 'info_dict': {
+ 'id': '171',
+ 'title': 'The most popular talks of all time',
+ 'description': 'md5:d2f22831dc86c7040e733a3cb3993d78'
+ },
+ 'playlist_mincount': 25,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ playlist = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['playlist']
+
+ return self.playlist_result(
+ self._parse_playlist(playlist), playlist.get('id'),
+ playlist.get('title') or self._og_search_title(webpage, default='').replace(' | TED Talks', '') or None,
+ self._og_search_description(webpage))
+
+
+class TedEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://embed(?:-ssl)?\.ted\.com/'
+
+ _TESTS = [{
+ 'url': 'https://embed.ted.com/talks/janet_stovall_how_to_get_serious_about_diversity_and_inclusion_in_the_workplace',
+ 'info_dict': {
+ 'id': '21802',
+ 'ext': 'mp4',
+ 'title': 'How to get serious about diversity and inclusion in the workplace',
+ 'description': 'md5:0978aafe396e05341f8ecc795d22189d',
+ 'view_count': int,
+ 'tags': list,
+ 'uploader': 'Janet Stovall',
+ 'duration': 664.0,
+ 'upload_date': '20180822',
+ 'release_date': '20180719',
+ 'thumbnail': r're:http.*\.jpg',
+ },
+ }]
+
+ @classmethod
+ def _extract_urls(cls, webpage):
+ return [mobj.group('url') for mobj in re.finditer(
+ fr'<iframe[^>]+?src=(["\'])(?P<url>{cls._VALID_URL}.+?)\1', webpage)]
+
+ def _real_extract(self, url):
+ return self.url_result(re.sub(r'://embed(-ssl)?', '://www', url), TedTalkIE.ie_key())
diff --git a/hypervideo_dl/extractor/tele5.py b/hypervideo_dl/extractor/tele5.py
index 0d9cf75..c7beee1 100644
--- a/hypervideo_dl/extractor/tele5.py
+++ b/hypervideo_dl/extractor/tele5.py
@@ -1,19 +1,15 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
-from .common import InfoExtractor
-from .jwplatform import JWPlatformIE
-from .nexx import NexxIE
+from .dplay import DPlayIE
+from ..compat import compat_urlparse
from ..utils import (
- NO_DEFAULT,
- parse_qs,
- smuggle_url,
+ ExtractorError,
+ extract_attributes,
)
-class Tele5IE(InfoExtractor):
+class Tele5IE(DPlayIE):
_VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_GEO_COUNTRIES = ['DE']
_TESTS = [{
@@ -28,6 +24,7 @@ class Tele5IE(InfoExtractor):
'params': {
'skip_download': True,
},
+ 'skip': 'No longer available: "404 Seite nicht gefunden"',
}, {
# jwplatform, nexx unavailable
'url': 'https://www.tele5.de/filme/ghoul-das-geheimnis-des-friedhofmonsters/',
@@ -42,7 +39,20 @@ class Tele5IE(InfoExtractor):
'params': {
'skip_download': True,
},
- 'add_ie': [JWPlatformIE.ie_key()],
+ 'skip': 'No longer available, redirects to Filme page',
+ }, {
+ 'url': 'https://tele5.de/mediathek/angel-of-mine/',
+ 'info_dict': {
+ 'id': '1252360',
+ 'ext': 'mp4',
+ 'upload_date': '20220109',
+ 'timestamp': 1641762000,
+ 'title': 'Angel of Mine',
+ 'description': 'md5:a72546a175e1286eb3251843a52d1ad7',
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ },
}, {
'url': 'https://www.tele5.de/kalkofes-mattscheibe/video-clips/politik-und-gesellschaft?ve_id=1551191',
'only_matching': True,
@@ -64,45 +74,18 @@ class Tele5IE(InfoExtractor):
}]
def _real_extract(self, url):
- qs = parse_qs(url)
- video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0]
-
- NEXX_ID_RE = r'\d{6,}'
- JWPLATFORM_ID_RE = r'[a-zA-Z0-9]{8}'
-
- def nexx_result(nexx_id):
- return self.url_result(
- 'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id,
- ie=NexxIE.ie_key(), video_id=nexx_id)
-
- nexx_id = jwplatform_id = None
-
- if video_id:
- if re.match(NEXX_ID_RE, video_id):
- return nexx_result(video_id)
- elif re.match(JWPLATFORM_ID_RE, video_id):
- jwplatform_id = video_id
-
- if not nexx_id:
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
-
- def extract_id(pattern, name, default=NO_DEFAULT):
- return self._html_search_regex(
- (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern,
- r'\s+id\s*=\s*["\']player_(%s)' % pattern,
- r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name,
- default=default)
-
- nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None)
- if nexx_id:
- return nexx_result(nexx_id)
-
- if not jwplatform_id:
- jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id')
-
- return self.url_result(
- smuggle_url(
- 'jwplatform:%s' % jwplatform_id,
- {'geo_countries': self._GEO_COUNTRIES}),
- ie=JWPlatformIE.ie_key(), video_id=jwplatform_id)
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ player_element = self._search_regex(r'(<hyoga-player\b[^>]+?>)', webpage, 'video player')
+ player_info = extract_attributes(player_element)
+ asset_id, country, realm = (player_info[x] for x in ('assetid', 'locale', 'realm', ))
+ endpoint = compat_urlparse.urlparse(player_info['endpoint']).hostname
+ source_type = player_info.get('sourcetype')
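+ # When a source type is set, the disco-api endpoint hostname is
+ # namespaced with it: <sourcetype>-<endpoint hostname>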
+ if source_type:
+ endpoint = '%s-%s' % (source_type, endpoint)
+ try:
+ return self._get_disco_api_info(url, asset_id, endpoint, realm, country)
+ except ExtractorError as e:
+ if getattr(e, 'message', '') == 'Missing deviceId in context':
+ self.report_drm(video_id)
+ raise
diff --git a/hypervideo_dl/extractor/telebruxelles.py b/hypervideo_dl/extractor/telebruxelles.py
index a0353fe..9e8c89b 100644
--- a/hypervideo_dl/extractor/telebruxelles.py
+++ b/hypervideo_dl/extractor/telebruxelles.py
@@ -69,7 +69,7 @@ class TeleBruxellesIE(InfoExtractor):
return {
'id': article_id or display_id,
'display_id': display_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'description': description,
'formats': formats,
'is_live': is_live,
diff --git a/hypervideo_dl/extractor/telegram.py b/hypervideo_dl/extractor/telegram.py
new file mode 100644
index 0000000..2dfa261
--- /dev/null
+++ b/hypervideo_dl/extractor/telegram.py
@@ -0,0 +1,37 @@
+from .common import InfoExtractor
+
+
+class TelegramEmbedIE(InfoExtractor):
+ IE_NAME = 'telegram:embed'
+ _VALID_URL = r'https?://t\.me/(?P<channel_name>[^/]+)/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://t.me/europa_press/613',
+ 'info_dict': {
+ 'id': '613',
+ 'ext': 'mp4',
+ 'title': 'Europa Press',
+ 'description': 'md5:6ce2d7e8d56eda16d80607b23db7b252',
+ 'thumbnail': r're:^https?:\/\/cdn.*?telesco\.pe\/file\/\w+',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ webpage_embed = self._download_webpage(f'{url}?embed=1', video_id)
+
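+ # The bare ?embed=1 page is scraped for the <video> source and thumbnail;
+ # title and description come from the regular page's meta tags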
+ formats = [{
+ 'url': self._proto_relative_url(self._search_regex(
+ '<video[^>]+src="([^"]+)"', webpage_embed, 'source')),
+ 'ext': 'mp4',
+ }]
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage, fatal=True),
+ 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage, fatal=True),
+ 'thumbnail': self._search_regex(r'tgme_widget_message_video_thumb"[^>]+background-image:url\(\'([^\']+)\'\)',
+ webpage_embed, 'thumbnail'),
+ 'formats': formats,
+ }
diff --git a/hypervideo_dl/extractor/telemundo.py b/hypervideo_dl/extractor/telemundo.py
index 18552a0..ebcecf5 100644
--- a/hypervideo_dl/extractor/telemundo.py
+++ b/hypervideo_dl/extractor/telemundo.py
@@ -1,4 +1,4 @@
-# coding=utf-8
+# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
@@ -34,8 +34,7 @@ class TelemundoIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- metadata = self._parse_json(
- self._search_regex(r'<[^>]+id="__NEXT_DATA__"[^>]+>([^<]+)', webpage, 'JSON metadata'), video_id)
+ metadata = self._search_nextjs_data(webpage, video_id)
redirect_url = try_get(
metadata,
lambda x: x['props']['initialState']['video']['associatedPlaylists'][0]['videos'][0]['videoAssets'][0]['publicUrl'])
diff --git a/hypervideo_dl/extractor/telequebec.py b/hypervideo_dl/extractor/telequebec.py
index 800d87b..4bef2fe 100644
--- a/hypervideo_dl/extractor/telequebec.py
+++ b/hypervideo_dl/extractor/telequebec.py
@@ -43,9 +43,6 @@ class TeleQuebecIE(TeleQuebecBaseIE):
'uploader_id': '6150020952001',
'upload_date': '20200512',
},
- 'params': {
- 'format': 'bestvideo',
- },
'add_ie': ['BrightcoveNew'],
}, {
'url': 'https://zonevideo.telequebec.tv/media/55267/le-soleil/passe-partout',
@@ -58,9 +55,6 @@ class TeleQuebecIE(TeleQuebecBaseIE):
'upload_date': '20200625',
'timestamp': 1593090307,
},
- 'params': {
- 'format': 'bestvideo',
- },
'add_ie': ['BrightcoveNew'],
}, {
# no description
@@ -157,9 +151,6 @@ class TeleQuebecEmissionIE(InfoExtractor):
'timestamp': 1588713424,
'uploader_id': '6150020952001',
},
- 'params': {
- 'format': 'bestvideo',
- },
}, {
'url': 'http://bancpublic.telequebec.tv/emissions/emission-49/31986/jeunes-meres-sous-pression',
'only_matching': True,
@@ -220,9 +211,6 @@ class TeleQuebecVideoIE(TeleQuebecBaseIE):
'timestamp': 1603115930,
'uploader_id': '6101674910001',
},
- 'params': {
- 'format': 'bestvideo',
- },
}, {
'url': 'https://video.telequebec.tv/player-live/28527',
'only_matching': True,
diff --git a/hypervideo_dl/extractor/tennistv.py b/hypervideo_dl/extractor/tennistv.py
index a39a2fc..58fdece 100644
--- a/hypervideo_dl/extractor/tennistv.py
+++ b/hypervideo_dl/extractor/tennistv.py
@@ -30,11 +30,9 @@ class TennisTVIE(InfoExtractor):
'skip': 'Requires email and password of a subscribed account',
}
_NETRC_MACHINE = 'tennistv'
+ _session_token = None
- def _login(self):
- username, password = self._get_login_info()
- if not username or not password:
- raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+ def _perform_login(self, username, password):
login_form = {
'Email': username,
@@ -63,7 +61,8 @@ class TennisTVIE(InfoExtractor):
self._session_token = login_result['sessionToken']
def _real_initialize(self):
- self._login()
+ if not self._session_token:
+ self.raise_login_required('Login info is needed for this website', method='password')
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/hypervideo_dl/extractor/tenplay.py b/hypervideo_dl/extractor/tenplay.py
index c810cfd..5c7b545 100644
--- a/hypervideo_dl/extractor/tenplay.py
+++ b/hypervideo_dl/extractor/tenplay.py
@@ -7,6 +7,7 @@ import base64
from .common import InfoExtractor
from ..utils import (
HEADRequest,
+ int_or_none,
urlencode_postdata,
)
@@ -15,6 +16,28 @@ class TenPlayIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/]+/)+(?P<id>tpv\d{6}[a-z]{5})'
_NETRC_MACHINE = '10play'
_TESTS = [{
+ 'url': 'https://10play.com.au/neighbours/web-extras/season-39/nathan-borg-is-the-first-aussie-actor-with-a-cochlear-implant-to-join-neighbours/tpv210128qupwd',
+ 'info_dict': {
+ 'id': '6226844312001',
+ 'ext': 'mp4',
+ 'title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours',
+ 'alt_title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours',
+ 'description': 'md5:a02d0199c901c2dd4c796f1e7dd0de43',
+ 'duration': 186,
+ 'season': 39,
+ 'series': 'Neighbours',
+ 'thumbnail': r're:https://.*\.jpg',
+ 'uploader': 'Channel 10',
+ 'age_limit': 15,
+ 'timestamp': 1611810000,
+ 'upload_date': '20210128',
+ 'uploader_id': '2199827728001',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'Only available in Australia',
+ }, {
'url': 'https://10play.com.au/todd-sampsons-body-hack/episodes/season-4/episode-7/tpv200921kvngh',
'info_dict': {
'id': '6192880312001',
@@ -58,16 +81,21 @@ class TenPlayIE(InfoExtractor):
'email': username,
'password': password,
}))
- return "Bearer " + data['jwt']['accessToken']
+ return 'Bearer ' + data['jwt']['accessToken']
def _real_extract(self, url):
content_id = self._match_id(url)
- _token = self._get_bearer_token(content_id)
data = self._download_json(
'https://10play.com.au/api/v1/videos/' + content_id, content_id)
+ headers = {}
+
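+ # Only member-gated videos need a bearer token from the login flow;
+ # everything else is fetched anonymously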
+ if data.get('memberGated') is True:
+ _token = self._get_bearer_token(content_id)
+ headers = {'Authorization': _token}
+
_video_url = self._download_json(
data.get('playbackApiEndpoint'), content_id, 'Downloading video JSON',
- headers={'Authorization': _token}).get('source')
+ headers=headers).get('source')
m3u8_url = self._request_webpage(HEADRequest(
_video_url), content_id).geturl()
if '10play-not-in-oz' in m3u8_url:
@@ -77,12 +105,16 @@ class TenPlayIE(InfoExtractor):
return {
'formats': formats,
+ 'subtitles': {'en': [{'url': data.get('captionUrl')}]} if data.get('captionUrl') else None,
'id': data.get('altId') or content_id,
- 'title': data.get('title'),
+ 'duration': data.get('duration'),
+ 'title': data.get('subtitle'),
+ 'alt_title': data.get('title'),
'description': data.get('description'),
'age_limit': self._AUS_AGES.get(data.get('classification')),
- 'series': data.get('showName'),
- 'season': data.get('showContentSeason'),
+ 'series': data.get('tvShow'),
+ 'season': int_or_none(data.get('season')),
+ 'episode_number': int_or_none(data.get('episode')),
'timestamp': data.get('published'),
'thumbnail': data.get('imageUrl'),
'uploader': 'Channel 10',
diff --git a/hypervideo_dl/extractor/tf1.py b/hypervideo_dl/extractor/tf1.py
index 669eb50..44785bc 100644
--- a/hypervideo_dl/extractor/tf1.py
+++ b/hypervideo_dl/extractor/tf1.py
@@ -29,7 +29,6 @@ class TF1IE(InfoExtractor):
'params': {
# Sometimes wat serves the whole file with the --test option
'skip_download': True,
- 'format': 'bestvideo',
},
}, {
'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html',
diff --git a/hypervideo_dl/extractor/theta.py b/hypervideo_dl/extractor/theta.py
index 3b65436..8b6d70a 100644
--- a/hypervideo_dl/extractor/theta.py
+++ b/hypervideo_dl/extractor/theta.py
@@ -6,7 +6,7 @@ from ..utils import try_get
class ThetaStreamIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?theta\.tv/(?!video/)(?P<id>[a-z0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?theta\.tv/(?!video/)(?P<id>[a-z0-9-]+)'
_TESTS = [{
'url': 'https://www.theta.tv/davirus',
'skip': 'The live may have ended',
@@ -25,6 +25,14 @@ class ThetaStreamIE(InfoExtractor):
'title': 'Mystery Science Theatre 3000 24/7 Powered by the THETA Network.',
'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+\.jpg',
}
+ }, {
+ 'url': 'https://www.theta.tv/contv-anime',
+ 'info_dict': {
+ 'id': 'ConTVAnime',
+ 'ext': 'mp4',
+ 'title': 'CONTV ANIME 24/7. Powered by THETA Network.',
+ 'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+\.jpg',
+ }
}]
def _real_extract(self, url):
diff --git a/hypervideo_dl/extractor/thisav.py b/hypervideo_dl/extractor/thisav.py
index 4af286e..6bb00b3 100644
--- a/hypervideo_dl/extractor/thisav.py
+++ b/hypervideo_dl/extractor/thisav.py
@@ -37,9 +37,7 @@ class ThisAVIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- title = remove_end(self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title'),
- ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站')
+ title = remove_end(self._html_extract_title(webpage), ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站')
video_url = self._html_search_regex(
r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None)
if video_url:
diff --git a/hypervideo_dl/extractor/thisoldhouse.py b/hypervideo_dl/extractor/thisoldhouse.py
index a3d9b40..8a1d173 100644
--- a/hypervideo_dl/extractor/thisoldhouse.py
+++ b/hypervideo_dl/extractor/thisoldhouse.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..utils import HEADRequest
class ThisOldHouseIE(InfoExtractor):
@@ -15,6 +16,11 @@ class ThisOldHouseIE(InfoExtractor):
'description': 'In the workshop, Tom Silva and Kevin O\'Connor build a storage bench for an entryway.',
'timestamp': 1442548800,
'upload_date': '20150918',
+ 'duration': 674,
+ 'view_count': int,
+ 'average_rating': 0,
+ 'thumbnail': r're:^https?://.*\.jpg\?\d+$',
+ 'display_id': 'how-to-build-a-storage-bench',
},
'params': {
'skip_download': True,
@@ -41,7 +47,12 @@ class ThisOldHouseIE(InfoExtractor):
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- video_id = self._search_regex(
- r'<iframe[^>]+src=[\'"](?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})',
- webpage, 'video id')
+ if 'To Unlock This content' in webpage:
+ self.raise_login_required(method='cookies')
+ video_url = self._search_regex(
+ r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})[^\'"]*)[\'"]',
+ webpage, 'video url')
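+ # For subscriber-gated embeds, follow the redirect with a HEAD request and
+ # hand the final URL straight to the Zype extractor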
+ if 'subscription_required=true' in video_url or 'c-entry-group-labels__image' in webpage:
+ return self.url_result(self._request_webpage(HEADRequest(video_url), display_id).geturl(), 'Zype', display_id)
+ video_id = self._search_regex(r'(?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})', video_url, 'video id')
return self.url_result(self._ZYPE_TMPL % video_id, 'Zype', video_id)
diff --git a/hypervideo_dl/extractor/threeqsdn.py b/hypervideo_dl/extractor/threeqsdn.py
index bb76103..00a51dc 100644
--- a/hypervideo_dl/extractor/threeqsdn.py
+++ b/hypervideo_dl/extractor/threeqsdn.py
@@ -9,6 +9,7 @@ from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
+ join_nonempty,
parse_iso8601,
)
@@ -110,8 +111,7 @@ class ThreeQSDNIE(InfoExtractor):
subtitles = self._merge_subtitles(subtitles, subs)
elif source_type == 'hls':
fmts, subs = self._extract_m3u8_formats_and_subtitles(
- source, video_id, 'mp4', 'm3u8' if live else 'm3u8_native',
- m3u8_id='hls', fatal=False)
+ source, video_id, 'mp4', live=live, m3u8_id='hls', fatal=False)
formats.extend(fmts)
subtitles = self._merge_subtitles(subtitles, subs)
elif source_type == 'progressive':
@@ -119,24 +119,16 @@ class ThreeQSDNIE(InfoExtractor):
src = s.get('src')
if not (src and self._is_valid_url(src, video_id)):
continue
- width = None
- format_id = ['http']
ext = determine_ext(src)
- if ext:
- format_id.append(ext)
height = int_or_none(s.get('height'))
- if height:
- format_id.append('%dp' % height)
- if aspect:
- width = int(height * aspect)
formats.append({
'ext': ext,
- 'format_id': '-'.join(format_id),
+ 'format_id': join_nonempty('http', ext, height and '%dp' % height),
'height': height,
'source_preference': 0,
'url': src,
'vcodec': 'none' if height == 0 else None,
- 'width': width,
+ 'width': int(height * aspect) if height and aspect else None,
})
# It seems like this would be correctly handled by default
# However, unless someone can confirm this, the old
@@ -155,7 +147,7 @@ class ThreeQSDNIE(InfoExtractor):
return {
'id': video_id,
- 'title': self._live_title(title) if live else title,
+ 'title': title,
'thumbnail': config.get('poster') or None,
'description': config.get('description') or None,
'timestamp': parse_iso8601(config.get('upload_date')),
diff --git a/hypervideo_dl/extractor/threespeak.py b/hypervideo_dl/extractor/threespeak.py
new file mode 100644
index 0000000..fe6a955
--- /dev/null
+++ b/hypervideo_dl/extractor/threespeak.py
@@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ try_get,
+ unified_strdate,
+)
+
+
+class ThreeSpeakIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?3speak\.tv/watch\?v\=[^/]+/(?P<id>[^/$&#?]+)'
+
+ _TESTS = [{
+ 'url': 'https://3speak.tv/watch?v=dannyshine/wjgoxyfy',
+ 'info_dict': {
+ 'id': 'wjgoxyfy',
+ 'ext': 'mp4',
+ 'title': 'Can People who took the Vax think Critically',
+ 'uploader': 'dannyshine',
+ 'description': 'md5:181aa7ccb304afafa089b5af3bca7a10',
+ 'tags': ['sex', 'covid', 'antinatalism', 'comedy', 'vaccines'],
+ 'thumbnail': 'https://img.3speakcontent.co/wjgoxyfy/thumbnails/default.png',
+ 'upload_date': '20211021',
+ 'duration': 2703.867833,
+ 'filesize': 1620054781,
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ json_str = self._html_search_regex(r'JSON\.parse\(\'([^\']+)\'\)', webpage, 'json')
+ # The JSON string itself is escaped, hence the double parsing
+ data_json = self._parse_json(self._parse_json(f'"{json_str}"', id), id)
+ video_json = self._parse_json(data_json['json_metadata'], id)
+ formats, subtitles = [], {}
+ og_m3u8 = self._html_search_regex(r'<meta\s?property=\"ogvideo\"\s?content=\"([^\"]+)\">', webpage, 'og m3u8', fatal=False)
+ if og_m3u8:
+ https_frmts, https_subs = self._extract_m3u8_formats_and_subtitles(og_m3u8, id, fatal=False, m3u8_id='https')
+ formats.extend(https_frmts)
+ subtitles = self._merge_subtitles(subtitles, https_subs)
+ ipfs_m3u8 = try_get(video_json, lambda x: x['video']['info']['ipfs'])
+ if ipfs_m3u8:
+ ipfs_frmts, ipfs_subs = self._extract_m3u8_formats_and_subtitles(f'https://ipfs.3speak.tv/ipfs/{ipfs_m3u8}',
+ id, fatal=False, m3u8_id='ipfs')
+ formats.extend(ipfs_frmts)
+ subtitles = self._merge_subtitles(subtitles, ipfs_subs)
+ mp4_file = try_get(video_json, lambda x: x['video']['info']['file'])
+ if mp4_file:
+ formats.append({
+ 'url': f'https://threespeakvideo.b-cdn.net/{id}/{mp4_file}',
+ 'ext': 'mp4',
+ 'format_id': 'https-mp4',
+ 'duration': try_get(video_json, lambda x: x['video']['info']['duration']),
+ 'filesize': try_get(video_json, lambda x: x['video']['info']['filesize']),
+ 'quality': 11,
+ 'format_note': 'Original file',
+ })
+ self._sort_formats(formats)
+ return {
+ 'id': id,
+ 'title': data_json.get('title') or data_json.get('root_title'),
+ 'uploader': data_json.get('author'),
+ 'description': try_get(video_json, lambda x: x['video']['content']['description']),
+ 'tags': try_get(video_json, lambda x: x['video']['content']['tags']),
+ 'thumbnail': try_get(video_json, lambda x: x['image'][0]),
+ 'upload_date': unified_strdate(data_json.get('created')),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+
+class ThreeSpeakUserIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?3speak\.tv/user/(?P<id>[^/$&?#]+)'
+
+ _TESTS = [{
+ 'url': 'https://3speak.tv/user/theycallmedan',
+ 'info_dict': {
+ 'id': 'theycallmedan',
+ },
+ 'playlist_mincount': 115,
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ entries = [
+ self.url_result(
+ 'https://3speak.tv/watch?v=%s' % video,
+ ie=ThreeSpeakIE.ie_key())
+ for video in re.findall(r'data-payout\s?\=\s?\"([^\"]+)\"', webpage) if video
+ ]
+ return self.playlist_result(entries, id)
diff --git a/hypervideo_dl/extractor/tiktok.py b/hypervideo_dl/extractor/tiktok.py
index 1db6327..c1d6c54 100644
--- a/hypervideo_dl/extractor/tiktok.py
+++ b/hypervideo_dl/extractor/tiktok.py
@@ -8,10 +8,18 @@ import time
import json
from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
+from ..compat import (
+ compat_urllib_parse_unquote,
+ compat_urllib_parse_urlparse
+)
from ..utils import (
ExtractorError,
+ HEADRequest,
+ get_first,
int_or_none,
+ join_nonempty,
+ LazyList,
+ srt_subtitles_timecode,
str_or_none,
traverse_obj,
try_get,
@@ -21,25 +29,38 @@ from ..utils import (
class TikTokBaseIE(InfoExtractor):
- _APP_VERSION = '20.9.3'
- _MANIFEST_APP_VERSION = '291'
+ _APP_VERSIONS = [('20.9.3', '293'), ('20.4.3', '243'), ('20.2.1', '221'), ('20.1.2', '212'), ('20.0.4', '204')]
+ _WORKING_APP_VERSION = None
_APP_NAME = 'trill'
_AID = 1180
- _API_HOSTNAME = 'api-t2.tiktokv.com'
+ _API_HOSTNAME = 'api-h2.tiktokv.com'
_UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s'
- QUALITIES = ('360p', '540p', '720p')
+ _WEBPAGE_HOST = 'https://www.tiktok.com/'
+ QUALITIES = ('360p', '540p', '720p', '1080p')
+
+ def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True,
+ note='Downloading API JSON', errnote='Unable to download API page'):
+ self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for _ in range(160)))
+ webpage_cookies = self._get_cookies(self._WEBPAGE_HOST)
+ if webpage_cookies.get('sid_tt'):
+ self._set_cookie(self._API_HOSTNAME, 'sid_tt', webpage_cookies['sid_tt'].value)
+ return self._download_json(
+ 'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id,
+ fatal=fatal, note=note, errnote=errnote, headers={
+ 'User-Agent': f'com.ss.android.ugc.trill/{manifest_app_version} (Linux; U; Android 10; en_US; Pixel 4; Build/QQ3A.200805.001; Cronet/58.0.2991.0)',
+ 'Accept': 'application/json',
+ }, query=query)
- def _call_api(self, ep, query, video_id, fatal=True,
- note='Downloading API JSON', errnote='Unable to download API page'):
- real_query = {
+ def _build_api_query(self, query, app_version, manifest_app_version):
+ return {
**query,
- 'version_name': self._APP_VERSION,
- 'version_code': self._MANIFEST_APP_VERSION,
- 'build_number': self._APP_VERSION,
- 'manifest_version_code': self._MANIFEST_APP_VERSION,
- 'update_version_code': self._MANIFEST_APP_VERSION,
- 'openudid': ''.join(random.choice('0123456789abcdef') for i in range(16)),
- 'uuid': ''.join([random.choice(string.digits) for num in range(16)]),
+ 'version_name': app_version,
+ 'version_code': manifest_app_version,
+ 'build_number': app_version,
+ 'manifest_version_code': manifest_app_version,
+ 'update_version_code': manifest_app_version,
+ 'openudid': ''.join(random.choice('0123456789abcdef') for _ in range(16)),
+ 'uuid': ''.join([random.choice(string.digits) for _ in range(16)]),
'_rticket': int(time.time() * 1000),
'ts': int(time.time()),
'device_brand': 'Google',
@@ -66,13 +87,61 @@ class TikTokBaseIE(InfoExtractor):
'as': 'a1qwert123',
'cp': 'cbfhckdckkde1',
}
- self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for i in range(160)))
- return self._download_json(
- 'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id,
- fatal=fatal, note=note, errnote=errnote, headers={
- 'User-Agent': f'com.ss.android.ugc.trill/{self._MANIFEST_APP_VERSION} (Linux; U; Android 10; en_US; Pixel 4; Build/QQ3A.200805.001; Cronet/58.0.2991.0)',
- 'Accept': 'application/json',
- }, query=real_query)
+
+ def _call_api(self, ep, query, video_id, fatal=True,
+ note='Downloading API JSON', errnote='Unable to download API page'):
+ if not self._WORKING_APP_VERSION:
+ app_version = self._configuration_arg('app_version', [''], ie_key=TikTokIE.ie_key())[0]
+ manifest_app_version = self._configuration_arg('manifest_app_version', [''], ie_key=TikTokIE.ie_key())[0]
+ if app_version and manifest_app_version:
+ self._WORKING_APP_VERSION = (app_version, manifest_app_version)
+ self.write_debug('Imported app version combo from extractor arguments')
+ elif app_version or manifest_app_version:
+ self.report_warning('Only one of the two required version params is passed as extractor arguments', only_once=True)
+
+ if self._WORKING_APP_VERSION:
+ app_version, manifest_app_version = self._WORKING_APP_VERSION
+ real_query = self._build_api_query(query, app_version, manifest_app_version)
+ return self._call_api_impl(ep, real_query, manifest_app_version, video_id, fatal, note, errnote)
+
+ for count, (app_version, manifest_app_version) in enumerate(self._APP_VERSIONS, start=1):
+ real_query = self._build_api_query(query, app_version, manifest_app_version)
+ try:
+ res = self._call_api_impl(ep, real_query, manifest_app_version, video_id, fatal, note, errnote)
+ self._WORKING_APP_VERSION = (app_version, manifest_app_version)
+ return res
+ except ExtractorError as e:
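+ # A JSON parse error at position 0 means an empty response, i.e. the API rejected this app version combo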
+ if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
+ if count == len(self._APP_VERSIONS):
+ if fatal:
+ raise e
+ else:
+ self.report_warning(str(e.cause or e.msg))
+ return
+ self.report_warning('%s. Retrying... (attempt %s of %s)' % (str(e.cause or e.msg), count, len(self._APP_VERSIONS)))
+ continue
+ raise e
+
+ def _get_subtitles(self, aweme_detail, aweme_id):
+ # TODO: Extract text positioning info
+ subtitles = {}
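+ # Auto-captions arrive as JSON utterances; convert them into SRT cues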
+ captions_info = traverse_obj(
+ aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict, default=[])
+ for caption in captions_info:
+ caption_url = traverse_obj(caption, ('url', 'url_list', ...), expected_type=url_or_none, get_all=False)
+ if not caption_url:
+ continue
+ caption_json = self._download_json(
+ caption_url, aweme_id, note='Downloading captions', errnote='Unable to download captions', fatal=False)
+ if not caption_json:
+ continue
+ subtitles.setdefault(caption.get('language', 'en'), []).append({
+ 'ext': 'srt',
+ 'data': '\n\n'.join(
+ f'{i + 1}\n{srt_subtitles_timecode(line["start_time"] / 1000)} --> {srt_subtitles_timecode(line["end_time"] / 1000)}\n{line["text"]}'
+ for i, line in enumerate(caption_json['utterances']) if line.get('text'))
+ })
+ return subtitles
def _parse_aweme_video_app(self, aweme_detail):
aweme_id = aweme_detail['aweme_id']
@@ -107,8 +176,8 @@ class TikTokBaseIE(InfoExtractor):
'acodec': 'aac',
'source_preference': -2 if 'aweme/v1' in url else -1, # Downloads from API might get blocked
**add_meta, **parsed_meta,
- 'format_note': ' '.join(filter(None, (
- add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else '')))
+ 'format_note': join_nonempty(
+ add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' ')
} for url in addr.get('url_list') or []]
# Hack: Add direct video links first to prioritize them when removing duplicate formats
@@ -118,7 +187,7 @@ class TikTokBaseIE(InfoExtractor):
'format_id': 'play_addr',
'format_note': 'Direct video',
'vcodec': 'h265' if traverse_obj(
- video_info, 'is_bytevc1', 'is_h265') else 'h264', # Always h264?
+ video_info, 'is_bytevc1', 'is_h265') else 'h264', # TODO: Check for "direct iOS" videos, like https://www.tiktok.com/@cookierun_dev/video/7039716639834656002
'width': video_info.get('width'),
'height': video_info.get('height'),
}))
@@ -156,6 +225,10 @@ class TikTokBaseIE(InfoExtractor):
}))
self._remove_duplicate_formats(formats)
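+ # Propagate the web session cookie to each format's host so downloads stay authenticated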
+ auth_cookie = self._get_cookies(self._WEBPAGE_HOST).get('sid_tt')
+ if auth_cookie:
+ for f in formats:
+ self._set_cookie(compat_urllib_parse_urlparse(f['url']).hostname, 'sid_tt', auth_cookie.value)
self._sort_formats(formats, ('quality', 'codec', 'size', 'br'))
thumbnails = []
@@ -175,6 +248,7 @@ class TikTokBaseIE(InfoExtractor):
user_url = self._UPLOADER_URL_FORMAT % (traverse_obj(author_info,
'sec_uid', 'id', 'uid', 'unique_id',
expected_type=str_or_none, get_all=False))
+ labels = traverse_obj(aweme_detail, ('hybrid_label', ..., 'text'), expected_type=str, default=[])
contained_music_track = traverse_obj(
music_info, ('matched_song', 'title'), ('matched_pgc_sound', 'title'), expected_type=str)
@@ -189,8 +263,8 @@ class TikTokBaseIE(InfoExtractor):
return {
'id': aweme_id,
- 'title': aweme_detail['desc'],
- 'description': aweme_detail['desc'],
+ 'title': aweme_detail.get('desc'),
+ 'description': aweme_detail.get('desc'),
'view_count': int_or_none(stats_info.get('play_count')),
'like_count': int_or_none(stats_info.get('digg_count')),
'repost_count': int_or_none(stats_info.get('share_count')),
@@ -204,18 +278,24 @@ class TikTokBaseIE(InfoExtractor):
'artist': music_author,
'timestamp': int_or_none(aweme_detail.get('create_time')),
'formats': formats,
+ 'subtitles': self.extract_subtitles(aweme_detail, aweme_id),
'thumbnails': thumbnails,
- 'duration': int_or_none(traverse_obj(video_info, 'duration', ('download_addr', 'duration')), scale=1000)
+ 'duration': int_or_none(traverse_obj(video_info, 'duration', ('download_addr', 'duration')), scale=1000),
+ 'availability': self._availability(
+ is_private='Private' in labels,
+ needs_subscription='Friends only' in labels,
+ is_unlisted='Followers only' in labels)
}
def _parse_aweme_video_web(self, aweme_detail, webpage_url):
video_info = aweme_detail['video']
- author_info = traverse_obj(aweme_detail, 'author', 'authorInfo', default={})
+ author_info = traverse_obj(aweme_detail, 'authorInfo', 'author', expected_type=dict, default={})
music_info = aweme_detail.get('music') or {}
stats_info = aweme_detail.get('stats') or {}
user_url = self._UPLOADER_URL_FORMAT % (traverse_obj(author_info,
'secUid', 'id', 'uid', 'uniqueId',
- expected_type=str_or_none, get_all=False))
+ expected_type=str_or_none, get_all=False)
+ or aweme_detail.get('authorSecId'))
formats = []
play_url = video_info.get('playAddr')
@@ -267,8 +347,8 @@ class TikTokBaseIE(InfoExtractor):
'comment_count': int_or_none(stats_info.get('commentCount')),
'timestamp': int_or_none(aweme_detail.get('createTime')),
'creator': str_or_none(author_info.get('nickname')),
- 'uploader': str_or_none(author_info.get('uniqueId')),
- 'uploader_id': str_or_none(author_info.get('id')),
+ 'uploader': str_or_none(author_info.get('uniqueId') or aweme_detail.get('author')),
+ 'uploader_id': str_or_none(author_info.get('id') or aweme_detail.get('authorId')),
'uploader_url': user_url,
'track': str_or_none(music_info.get('title')),
'album': str_or_none(music_info.get('album')) or None,
@@ -307,6 +387,9 @@ class TikTokIE(TikTokBaseIE):
'like_count': int,
'repost_count': int,
'comment_count': int,
+ 'artist': 'Ysrbeats',
+ 'album': 'Lehanga',
+ 'track': 'Lehanga',
}
}, {
'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en',
@@ -330,16 +413,98 @@ class TikTokIE(TikTokBaseIE):
'like_count': int,
'repost_count': int,
'comment_count': int,
+ 'artist': 'Evan Todd, Jessica Keenan Wynn, Alice Lee, Barrett Wilbert Weed & Jon Eidson',
+ 'track': 'Big Fun',
}
}, {
- # Promoted content/ad
- 'url': 'https://www.tiktok.com/@MS4wLjABAAAAAR29F6J2Ktu0Daw03BJyXPNoRQ-W7U5a0Mn3lVCq2rQhjOd_WNLclHUoFgwX8Eno/video/6932675057474981122',
- 'only_matching': True,
+ # Banned audio, only available on the app
+ 'url': 'https://www.tiktok.com/@barudakhb_/video/6984138651336838402',
+ 'info_dict': {
+ 'id': '6984138651336838402',
+ 'ext': 'mp4',
+ 'title': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥',
+ 'description': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥',
+ 'uploader': 'barudakhb_',
+ 'creator': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6',
+ 'uploader_id': '6974687867511718913',
+ 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d',
+ 'track': 'Boka Dance',
+ 'artist': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6',
+ 'timestamp': 1626121503,
+ 'duration': 18,
+ 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
+ 'upload_date': '20210712',
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ }
+ }, {
+ # Sponsored video, only available with feed workaround
+ 'url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_/video/7042692929109986561',
+ 'info_dict': {
+ 'id': '7042692929109986561',
+ 'ext': 'mp4',
+ 'title': 'Slap and Run!',
+ 'description': 'Slap and Run!',
+ 'uploader': 'user440922249',
+ 'creator': 'Slap And Run',
+ 'uploader_id': '7036055384943690754',
+ 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_',
+ 'track': 'Promoted Music',
+ 'timestamp': 1639754738,
+ 'duration': 30,
+ 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
+ 'upload_date': '20211217',
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ },
+ 'expected_warnings': ['Video not available']
+ }, {
+ # Video without title and description
+ 'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694',
+ 'info_dict': {
+ 'id': '7059698374567611694',
+ 'ext': 'mp4',
+ 'title': 'tiktok video #7059698374567611694',
+ 'description': '',
+ 'uploader': 'pokemonlife22',
+ 'creator': 'Pokemon',
+ 'uploader_id': '6820838815978423302',
+ 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
+ 'track': 'original sound',
+ 'timestamp': 1643714123,
+ 'duration': 6,
+ 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
+ 'upload_date': '20220201',
+ 'artist': 'Pokemon',
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ },
+ 'expected_warnings': ['Video not available', 'Creating a generic title']
+ }, {
+ # Auto-captions available
+ 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758',
+ 'only_matching': True
}]
def _extract_aweme_app(self, aweme_id):
- aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id,
- note='Downloading video details', errnote='Unable to download video details')['aweme_detail']
+ try:
+ aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id,
+ note='Downloading video details', errnote='Unable to download video details').get('aweme_detail')
+ if not aweme_detail:
+ raise ExtractorError('Video not available', video_id=aweme_id)
+ except ExtractorError as e:
+ self.report_warning(f'{e}; Retrying with feed workaround')
+ feed_list = self._call_api('feed', {'aweme_id': aweme_id}, aweme_id,
+ note='Downloading video feed', errnote='Unable to download video feed').get('aweme_list') or []
+ aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None)
+ if not aweme_detail:
+ raise ExtractorError('Unable to find video in feed', video_id=aweme_id)
return self._parse_aweme_video_app(aweme_detail)
def _real_extract(self, url):
@@ -353,19 +518,23 @@ class TikTokIE(TikTokBaseIE):
# If we only call once, we get a 403 when downloading the video.
self._download_webpage(url, video_id)
webpage = self._download_webpage(url, video_id, note='Downloading video webpage')
- json_string = self._search_regex(
- r'id=\"__NEXT_DATA__\"\s+type=\"application\/json\"\s*[^>]+>\s*(?P<json_string_ld>[^<]+)',
- webpage, 'json_string', group='json_string_ld')
- json_data = self._parse_json(json_string, video_id)
- props_data = try_get(json_data, lambda x: x['props'], expected_type=dict)
-
- # Chech statusCode for success
- status = props_data.get('pageProps').get('statusCode')
+ next_data = self._search_nextjs_data(webpage, video_id, default='{}')
+
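+ # The page embeds its data either as __NEXT_DATA__ or as window['SIGI_STATE']; try both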
+ if next_data:
+ status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode'), expected_type=int) or 0
+ video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct'), expected_type=dict)
+ else:
+ sigi_json = self._search_regex(
+ r'>\s*window\[[\'"]SIGI_STATE[\'"]\]\s*=\s*(?P<sigi_state>{.+});',
+ webpage, 'sigi data', group='sigi_state')
+ sigi_data = self._parse_json(sigi_json, video_id)
+ status = traverse_obj(sigi_data, ('VideoPage', 'statusCode'), expected_type=int) or 0
+ video_data = traverse_obj(sigi_data, ('ItemModule', video_id), expected_type=dict)
+
if status == 0:
- return self._parse_aweme_video_web(props_data['pageProps']['itemInfo']['itemStruct'], url)
+ return self._parse_aweme_video_web(video_data, url)
elif status == 10216:
raise ExtractorError('This video is private', expected=True)
-
raise ExtractorError('Video not available', video_id=video_id)
@@ -378,6 +547,16 @@ class TikTokUserIE(TikTokBaseIE):
'info_dict': {
'id': '6935371178089399301',
'title': 'corgibobaa',
+ 'thumbnail': r're:https://.+_1080x1080\.webp'
+ },
+ 'expected_warnings': ['Retrying']
+ }, {
+ 'url': 'https://www.tiktok.com/@6820838815978423302',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'id': '6820838815978423302',
+ 'title': '6820838815978423302',
+ 'thumbnail': r're:https://.+_1080x1080\.webp'
},
'expected_warnings': ['Retrying']
}, {
@@ -386,6 +565,7 @@ class TikTokUserIE(TikTokBaseIE):
'info_dict': {
'id': '79005827461758976',
'title': 'meme',
+ 'thumbnail': r're:https://.+_1080x1080\.webp'
},
'expected_warnings': ['Retrying']
}]
@@ -409,14 +589,14 @@ class TikTokUserIE(TikTokBaseIE):
cursor = data_json['cursor']
'''
- def _entries_api(self, webpage, user_id, username):
+ def _video_entries_api(self, webpage, user_id, username):
query = {
'user_id': user_id,
'count': 21,
'max_cursor': 0,
'min_cursor': 0,
'retry_type': 'no_retry',
- 'device_id': ''.join(random.choice(string.digits) for i in range(19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api.
+ 'device_id': ''.join(random.choice(string.digits) for _ in range(19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api.
}
max_retries = self.get_param('extractor_retries', 3)
@@ -432,23 +612,139 @@ class TikTokUserIE(TikTokBaseIE):
continue
raise
break
+ yield from post_list.get('aweme_list', [])
+ if not post_list.get('has_more'):
+ break
+ query['max_cursor'] = post_list['max_cursor']
+
+ def _entries_api(self, user_id, videos):
+ for video in videos:
+ yield {
+ **self._parse_aweme_video_app(video),
+ 'extractor_key': TikTokIE.ie_key(),
+ 'extractor': 'TikTok',
+ 'webpage_url': f'https://tiktok.com/@{user_id}/video/{video["aweme_id"]}',
+ }
+
+ def _real_extract(self, url):
+ user_name = self._match_id(url)
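+ # Fetch with Facebook's link-preview User-Agent to get a crawlable page, then read the numeric user ID from the snssdk:// deep link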
+ webpage = self._download_webpage(url, user_name, headers={
+ 'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)'
+ })
+ user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID', default=None) or user_name
+
+ videos = LazyList(self._video_entries_api(webpage, user_id, user_name))
+ thumbnail = traverse_obj(videos, (0, 'author', 'avatar_larger', 'url_list', 0))
+
+ return self.playlist_result(self._entries_api(user_id, videos), user_id, user_name, thumbnail=thumbnail)
+
+
+class TikTokBaseListIE(TikTokBaseIE):
+ def _entries(self, list_id, display_id):
+ query = {
+ self._QUERY_NAME: list_id,
+ 'cursor': 0,
+ 'count': 20,
+ 'type': 5,
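+ # Some endpoints reject a randomized device_id, so it is set per-request here rather than in _call_api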
+ 'device_id': ''.join(random.choice(string.digits) for _ in range(19))
+ }
+
+ max_retries = self.get_param('extractor_retries', 3)
+ for page in itertools.count(1):
+ for retries in itertools.count():
+ try:
+ post_list = self._call_api(self._API_ENDPOINT, query, display_id,
+ note='Downloading video list page %d%s' % (page, f' (attempt {retries})' if retries != 0 else ''),
+ errnote='Unable to download video list')
+ except ExtractorError as e:
+ if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0 and retries != max_retries:
+ self.report_warning('%s. Retrying...' % str(e.cause or e.msg))
+ continue
+ raise
+ break
for video in post_list.get('aweme_list', []):
yield {
**self._parse_aweme_video_app(video),
- 'ie_key': TikTokIE.ie_key(),
+ 'extractor_key': TikTokIE.ie_key(),
'extractor': 'TikTok',
+ 'webpage_url': f'https://tiktok.com/@_/video/{video["aweme_id"]}',
}
if not post_list.get('has_more'):
break
- query['max_cursor'] = post_list['max_cursor']
+ query['cursor'] = post_list['cursor']
def _real_extract(self, url):
- user_name = self._match_id(url)
- webpage = self._download_webpage(url, user_name, headers={
+ list_id = self._match_id(url)
+ return self.playlist_result(self._entries(list_id, list_id), list_id)
+
+
+class TikTokSoundIE(TikTokBaseListIE):
+ IE_NAME = 'tiktok:sound'
+ _VALID_URL = r'https?://(?:www\.)?tiktok\.com/music/[\w\.-]+-(?P<id>[\d]+)[/?#&]?'
+ _QUERY_NAME = 'music_id'
+ _API_ENDPOINT = 'music/aweme'
+ _TESTS = [{
+ 'url': 'https://www.tiktok.com/music/Build-a-Btch-6956990112127585029?lang=en',
+ 'playlist_mincount': 100,
+ 'info_dict': {
+ 'id': '6956990112127585029'
+ },
+ 'expected_warnings': ['Retrying']
+ }, {
+ # Actual entries are fewer than the listed video count
+ 'url': 'https://www.tiktok.com/music/jiefei-soap-remix-7036843036118469381',
+ 'playlist_mincount': 2182,
+ 'info_dict': {
+ 'id': '7036843036118469381'
+ },
+ 'expected_warnings': ['Retrying']
+ }]
+
+
+class TikTokEffectIE(TikTokBaseListIE):
+ IE_NAME = 'tiktok:effect'
+ _VALID_URL = r'https?://(?:www\.)?tiktok\.com/sticker/[\w\.-]+-(?P<id>[\d]+)[/?#&]?'
+ _QUERY_NAME = 'sticker_id'
+ _API_ENDPOINT = 'sticker/aweme'
+ _TESTS = [{
+ 'url': 'https://www.tiktok.com/sticker/MATERIAL-GWOOORL-1258156',
+ 'playlist_mincount': 100,
+ 'info_dict': {
+ 'id': '1258156',
+ },
+ 'expected_warnings': ['Retrying']
+ }, {
+ # Different entries between mobile and web, depending on region
+ 'url': 'https://www.tiktok.com/sticker/Elf-Friend-479565',
+ 'only_matching': True
+ }]
+
+
+class TikTokTagIE(TikTokBaseListIE):
+ IE_NAME = 'tiktok:tag'
+ _VALID_URL = r'https?://(?:www\.)?tiktok\.com/tag/(?P<id>[^/?#&]+)'
+ _QUERY_NAME = 'ch_id'
+ _API_ENDPOINT = 'challenge/aweme'
+ _TESTS = [{
+ 'url': 'https://tiktok.com/tag/hello2018',
+ 'playlist_mincount': 39,
+ 'info_dict': {
+ 'id': '46294678',
+ 'title': 'hello2018',
+ },
+ 'expected_warnings': ['Retrying']
+ }, {
+ 'url': 'https://tiktok.com/tag/fypシ?is_copy_url=0&is_from_webapp=v1',
+ 'only_matching': True
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id, headers={
'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)'
})
- user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID')
- return self.playlist_result(self._entries_api(webpage, user_id, user_name), user_id, user_name)
+ tag_id = self._html_search_regex(r'snssdk\d*://challenge/detail/(\d+)', webpage, 'tag ID')
+ return self.playlist_result(self._entries(tag_id, display_id), tag_id, display_id)
class DouyinIE(TikTokIE):
@@ -534,12 +830,12 @@ class DouyinIE(TikTokIE):
'comment_count': int,
}
}]
- _APP_VERSION = '9.6.0'
- _MANIFEST_APP_VERSION = '960'
+ _APP_VERSIONS = [('9.6.0', '960')]
_APP_NAME = 'aweme'
_AID = 1128
_API_HOSTNAME = 'aweme.snssdk.com'
_UPLOADER_URL_FORMAT = 'https://www.douyin.com/user/%s'
+ _WEBPAGE_HOST = 'https://www.douyin.com/'
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -559,5 +855,40 @@ class DouyinIE(TikTokIE):
render_data = self._parse_json(
render_data_json, video_id, transform_source=compat_urllib_parse_unquote)
- return self._parse_aweme_video_web(
- traverse_obj(render_data, (..., 'aweme', 'detail'), get_all=False), url)
+ return self._parse_aweme_video_web(get_first(render_data, ('aweme', 'detail')), url)
+
+
+class TikTokVMIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:vm|vt)\.tiktok\.com/(?P<id>\w+)'
+ IE_NAME = 'vm.tiktok'
+
+ _TESTS = [{
+ 'url': 'https://vm.tiktok.com/ZSe4FqkKd',
+ 'info_dict': {
+ 'id': '7023491746608712966',
+ 'ext': 'mp4',
+ 'title': 'md5:5607564db90271abbbf8294cca77eddd',
+ 'description': 'md5:5607564db90271abbbf8294cca77eddd',
+ 'duration': 11,
+ 'upload_date': '20211026',
+ 'uploader_id': '7007385080558846981',
+ 'creator': 'Memes',
+ 'artist': 'Memes',
+ 'track': 'original sound',
+ 'uploader': 'susmandem',
+ 'timestamp': 1635284105,
+ 'thumbnail': r're:https://.+\.webp.*',
+ 'like_count': int,
+ 'view_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
+ 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAXcNoOEOxVyBzuII_E--T0MeCrLP0ay1Sm6x_n3dluiWEoWZD0VlQOytwad4W0i0n',
+ }
+ }, {
+ 'url': 'https://vt.tiktok.com/ZSe4FqkKd',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
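+ # Short vm/vt links only redirect; resolve the canonical URL with a HEAD request and hand it to TikTokIE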
+ return self.url_result(self._request_webpage(
+ HEADRequest(url), self._match_id(url), headers={'User-Agent': 'facebookexternalhit/1.1'}).geturl(), TikTokIE)
diff --git a/hypervideo_dl/extractor/toggo.py b/hypervideo_dl/extractor/toggo.py
new file mode 100644
index 0000000..da5f0c4
--- /dev/null
+++ b/hypervideo_dl/extractor/toggo.py
@@ -0,0 +1,73 @@
+from .common import InfoExtractor
+from ..utils import int_or_none, parse_qs
+
+
+class ToggoIE(InfoExtractor):
+ IE_NAME = 'toggo'
+ _VALID_URL = r'https?://(?:www\.)?toggo\.de/[\w-]+/folge/(?P<id>[\w-]+)'
+ _TESTS = [{
+ 'url': 'https://www.toggo.de/weihnachtsmann--co-kg/folge/ein-geschenk-fuer-zwei',
+ 'info_dict': {
+ 'id': 'VEP2977',
+ 'ext': 'mp4',
+ 'title': 'Ein Geschenk für zwei',
+ 'display_id': 'ein-geschenk-fuer-zwei',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'description': 'md5:b7715915bfa47824b4e4ad33fb5962f8',
+ 'release_timestamp': 1637259179,
+ 'series': 'Weihnachtsmann & Co. KG',
+ 'season': 'Weihnachtsmann & Co. KG',
+ 'season_number': 1,
+ 'season_id': 'VST118',
+ 'episode': 'Ein Geschenk für zwei',
+ 'episode_number': 7,
+ 'episode_id': 'VEP2977',
+ 'timestamp': 1581935960,
+ 'uploader_id': '6057955896001',
+ 'upload_date': '20200217',
+ },
+ 'params': {'skip_download': True},
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ data = self._download_json(
+ f'https://production-n.toggo.de/api/assetstore/vod/asset/{display_id}', display_id)['data']
+
+ brightcove_id = next(
+ x['value'] for x in data['custom_fields'] if x.get('key') == 'video-cloud-id')
+ info = self._downloader.get_info_extractor('BrightcoveNew').extract(
+ f'http://players.brightcove.net/6057955896001/default_default/index.html?videoId={brightcove_id}')
+
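+ # The clear (non-DRM) DASH rendition sits at a path parallel to the cenc one; rewrite fragment URLs to use it and flag FairPlay manifests as DRM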
+ for f in info['formats']:
+ if '/dash/live/cenc/' in f.get('fragment_base_url', ''):
+ # Get hidden non-DRM format
+ f['fragment_base_url'] = f['fragment_base_url'].replace('/cenc/', '/clear/')
+ f['has_drm'] = False
+
+ if '/fairplay/' in f.get('manifest_url', ''):
+ f['has_drm'] = True
+
+ thumbnails = [{
+ 'id': name,
+ 'url': url,
+ 'width': int_or_none(next(iter(parse_qs(url).get('width', [])), None)),
+ } for name, url in (data.get('images') or {}).items()]
+
+ return {
+ **info,
+ 'id': data.get('id'),
+ 'display_id': display_id,
+ 'title': data.get('title'),
+ 'language': data.get('language'),
+ 'thumbnails': thumbnails,
+ 'description': data.get('description'),
+ 'release_timestamp': data.get('earliest_start_date'),
+ 'series': data.get('series_title'),
+ 'season': data.get('season_title'),
+ 'season_number': data.get('season_no'),
+ 'season_id': data.get('season_id'),
+ 'episode': data.get('title'),
+ 'episode_number': data.get('episode_no'),
+ 'episode_id': data.get('id'),
+ }
diff --git a/hypervideo_dl/extractor/tokentube.py b/hypervideo_dl/extractor/tokentube.py
index d636211..579623f 100644
--- a/hypervideo_dl/extractor/tokentube.py
+++ b/hypervideo_dl/extractor/tokentube.py
@@ -6,7 +6,10 @@ import re
from .common import InfoExtractor
from ..utils import (
+ clean_html,
+ get_element_by_class,
parse_count,
+ remove_end,
unified_strdate,
js_to_json,
OnDemandPagedList,
@@ -35,7 +38,7 @@ class TokentubeIE(InfoExtractor):
'id': '3950239124',
'ext': 'mp4',
'title': 'Linux Ubuntu Studio perus käyttö',
- 'description': 'md5:854ff1dc732ff708976de2880ea32050',
+ 'description': 'md5:46077d0daaba1974f2dc381257f9d64c',
'uploader': 'jyrilehtonen',
'upload_date': '20210825',
},
@@ -45,7 +48,7 @@ class TokentubeIE(InfoExtractor):
'id': '3582463289',
'ext': 'mp4',
'title': 'Police for Freedom - toiminta aloitetaan Suomessa ❤️??',
- 'description': 'md5:cd92e620d7f5fa162e8410d0fc9a08be',
+ 'description': 'md5:37ebf1cb44264e0bf23ed98b337ee63e',
'uploader': 'Voitontie',
'upload_date': '20210428',
}
@@ -90,7 +93,10 @@ class TokentubeIE(InfoExtractor):
r'<a\s*class="place-left"[^>]+>(.+?)</a>',
webpage, 'uploader', fatal=False)
- description = self._html_search_meta('description', webpage)
+ description = (clean_html(get_element_by_class('p-d-txt', webpage))
+ or self._html_search_meta(('og:description', 'description', 'twitter:description'), webpage))
+
+ description = remove_end(description, 'Category')
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/tonline.py b/hypervideo_dl/extractor/tonline.py
index cc11eae..9b6a40d 100644
--- a/hypervideo_dl/extractor/tonline.py
+++ b/hypervideo_dl/extractor/tonline.py
@@ -2,7 +2,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import int_or_none, join_nonempty
class TOnlineIE(InfoExtractor):
@@ -30,13 +30,8 @@ class TOnlineIE(InfoExtractor):
asset_source = asset.get('source') or asset.get('source2')
if not asset_source:
continue
- formats_id = []
- for field_key in ('type', 'profile'):
- field_value = asset.get(field_key)
- if field_value:
- formats_id.append(field_value)
formats.append({
- 'format_id': '-'.join(formats_id),
+ 'format_id': join_nonempty('type', 'profile', from_dict=asset),
'url': asset_source,
})
diff --git a/hypervideo_dl/extractor/toutv.py b/hypervideo_dl/extractor/toutv.py
index 6c84c21..1d5da10 100644
--- a/hypervideo_dl/extractor/toutv.py
+++ b/hypervideo_dl/extractor/toutv.py
@@ -40,17 +40,14 @@ class TouTvIE(RadioCanadaIE):
}]
_CLIENT_KEY = '90505c8d-9c34-4f34-8da1-3a85bdc6d4f4'
- def _real_initialize(self):
- email, password = self._get_login_info()
- if email is None:
- return
+ def _perform_login(self, username, password):
try:
self._access_token = self._download_json(
'https://services.radio-canada.ca/toutv/profiling/accounts/login',
None, 'Logging in', data=json.dumps({
'ClientId': self._CLIENT_KEY,
'ClientSecret': '34026772-244b-49b6-8b06-317b30ac9a20',
- 'Email': email,
+ 'Email': username,
'Password': password,
'Scope': 'id.write media-validation.read',
}).encode(), headers={
diff --git a/hypervideo_dl/extractor/traileraddict.py b/hypervideo_dl/extractor/traileraddict.py
index 10100fb..514f479 100644
--- a/hypervideo_dl/extractor/traileraddict.py
+++ b/hypervideo_dl/extractor/traileraddict.py
@@ -24,8 +24,7 @@ class TrailerAddictIE(InfoExtractor):
name = mobj.group('movie') + '/' + mobj.group('trailer_name')
webpage = self._download_webpage(url, name)
- title = self._search_regex(r'<title>(.+?)</title>',
- webpage, 'video title').replace(' - Trailer Addict', '')
+ title = self._html_extract_title(webpage, 'video title').replace(' - Trailer Addict', '')
view_count_str = self._search_regex(
r'<span class="views_n">([0-9,.]+)</span>',
webpage, 'view count', fatal=False)
diff --git a/hypervideo_dl/extractor/trovo.py b/hypervideo_dl/extractor/trovo.py
index ec55f41..65ea13d 100644
--- a/hypervideo_dl/extractor/trovo.py
+++ b/hypervideo_dl/extractor/trovo.py
@@ -7,6 +7,7 @@ import json
from .common import InfoExtractor
from ..utils import (
ExtractorError,
+ format_field,
int_or_none,
str_or_none,
try_get,
@@ -17,13 +18,18 @@ class TrovoBaseIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.)?trovo\.live/'
_HEADERS = {'Origin': 'https://trovo.live'}
+ def _call_api(self, video_id, query=None, data=None):
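+ # All Trovo data comes from a single GraphQL gateway; centralize the request here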
+ return self._download_json(
+ 'https://gql.trovo.live/', video_id, query=query, data=data,
+ headers={'Accept': 'application/json'})
+
def _extract_streamer_info(self, data):
streamer_info = data.get('streamerInfo') or {}
username = streamer_info.get('userName')
return {
'uploader': streamer_info.get('nickName'),
'uploader_id': str_or_none(streamer_info.get('uid')),
- 'uploader_url': 'https://trovo.live/' + username if username else None,
+ 'uploader_url': format_field(username, template='https://trovo.live/%s'),
}
@@ -32,9 +38,8 @@ class TrovoIE(TrovoBaseIE):
def _real_extract(self, url):
username = self._match_id(url)
- live_info = self._download_json(
- 'https://gql.trovo.live/', username, query={
- 'query': '''{
+ live_info = self._call_api(username, query={
+ 'query': '''{
getLiveInfo(params: {userName: "%s"}) {
isLive
programInfo {
@@ -53,12 +58,12 @@ class TrovoIE(TrovoBaseIE):
}
}
}''' % username,
- })['data']['getLiveInfo']
+ })['data']['getLiveInfo']
if live_info.get('isLive') == 0:
raise ExtractorError('%s is offline' % username, expected=True)
program_info = live_info['programInfo']
program_id = program_info['id']
- title = self._live_title(program_info['title'])
+ title = program_info['title']
formats = []
for stream_info in (program_info.get('streamInfo') or []):
@@ -104,6 +109,7 @@ class TrovoVodIE(TrovoBaseIE):
'comments': 'mincount:8',
'categories': ['Grand Theft Auto V'],
},
+ 'skip': '404'
}, {
'url': 'https://trovo.live/clip/lc-5285890810184026005',
'only_matching': True,
@@ -111,15 +117,14 @@ class TrovoVodIE(TrovoBaseIE):
def _real_extract(self, url):
vid = self._match_id(url)
- resp = self._download_json(
- 'https://gql.trovo.live/', vid, data=json.dumps([{
- 'query': '''{
+ resp = self._call_api(vid, data=json.dumps([{
+ 'query': '''{
batchGetVodDetailInfo(params: {vids: ["%s"]}) {
VodDetailInfos
}
}''' % vid,
- }, {
- 'query': '''{
+ }, {
+ 'query': '''{
getCommentList(params: {appInfo: {postID: "%s"}, pageSize: 1000000000, preview: {}}) {
commentList {
author {
@@ -133,9 +138,7 @@ class TrovoVodIE(TrovoBaseIE):
}
}
}''' % vid,
- }]).encode(), headers={
- 'Content-Type': 'application/json',
- })
+ }]).encode())
vod_detail_info = resp[0]['data']['batchGetVodDetailInfo']['VodDetailInfos'][vid]
vod_info = vod_detail_info['vodInfo']
title = vod_info['title']
@@ -197,7 +200,7 @@ class TrovoVodIE(TrovoBaseIE):
return info
-class TrovoChannelBaseIE(InfoExtractor):
+class TrovoChannelBaseIE(TrovoBaseIE):
def _get_vod_json(self, page, uid):
raise NotImplementedError('This method must be implemented by subclasses')
@@ -215,7 +218,7 @@ class TrovoChannelBaseIE(InfoExtractor):
def _real_extract(self, url):
id = self._match_id(url)
- uid = str(self._download_json('https://gql.trovo.live/', id, query={
+ uid = str(self._call_api(id, query={
'query': '{getLiveInfo(params:{userName:"%s"}){streamerInfo{uid}}}' % id
})['data']['getLiveInfo']['streamerInfo']['uid'])
return self.playlist_result(self._entries(uid), playlist_id=uid)
@@ -223,7 +226,7 @@ class TrovoChannelBaseIE(InfoExtractor):
class TrovoChannelVodIE(TrovoChannelBaseIE):
_VALID_URL = r'trovovod:(?P<id>[^\s]+)'
- IE_DESC = 'All VODs of a trovo.live channel, "trovovod" keyword'
+ IE_DESC = 'All VODs of a trovo.live channel; "trovovod:" prefix'
_TESTS = [{
'url': 'trovovod:OneTappedYou',
@@ -237,14 +240,14 @@ class TrovoChannelVodIE(TrovoChannelBaseIE):
_TYPE = 'video'
def _get_vod_json(self, page, uid):
- return self._download_json('https://gql.trovo.live/', uid, query={
+ return self._call_api(uid, query={
'query': self._QUERY % (page, uid)
})['data']['getChannelLtvVideoInfos']
class TrovoChannelClipIE(TrovoChannelBaseIE):
_VALID_URL = r'trovoclip:(?P<id>[^\s]+)'
- IE_DESC = 'All Clips of a trovo.live channel, "trovoclip" keyword'
+ IE_DESC = 'All Clips of a trovo.live channel; "trovoclip:" prefix'
_TESTS = [{
'url': 'trovoclip:OneTappedYou',
@@ -258,6 +261,6 @@ class TrovoChannelClipIE(TrovoChannelBaseIE):
_TYPE = 'clip'
def _get_vod_json(self, page, uid):
- return self._download_json('https://gql.trovo.live/', uid, query={
+ return self._call_api(uid, query={
'query': self._QUERY % (page, uid)
})['data']['getChannelClipVideoInfos']
diff --git a/hypervideo_dl/extractor/trueid.py b/hypervideo_dl/extractor/trueid.py
new file mode 100644
index 0000000..fc98303
--- /dev/null
+++ b/hypervideo_dl/extractor/trueid.py
@@ -0,0 +1,139 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+ determine_ext,
+ ExtractorError,
+ int_or_none,
+ parse_age_limit,
+ traverse_obj,
+ unified_timestamp,
+ url_or_none
+)
+
+
+class TrueIDIE(InfoExtractor):
+ _VALID_URL = r'https?://(?P<domain>vn\.trueid\.net|trueid\.(?:id|ph))/(?:movie|series/[^/]+)/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://trueid.id/movie/XYNlDOZZJzL6/pengabdi-setan/',
+ 'md5': '2552c7535125885901f1a2a4bcf32ca3',
+ 'info_dict': {
+ 'id': 'XYNlDOZZJzL6',
+ 'ext': 'mp4',
+ 'title': 'Pengabdi Setan',
+ 'display_id': 'pengabdi-setan',
+ 'description': 'md5:b0b41df08601e85e5291496c9bbe52cd',
+ 'timestamp': 1600243511,
+ 'categories': ['Film Indonesia', 'Horror', 'Mystery'],
+ 'release_timestamp': 1593536400,
+ 'release_year': 1982,
+ 'cast': list,
+ 'thumbnail': 'https://cms.dmpcdn.com/movie/2020/09/18/8b6e35c0-f97f-11ea-81fe-c52fc9dd314f_original.png',
+ 'upload_date': '20200916',
+ 'release_date': '20200630',
+ },
+ 'expected_warnings': ['Video is geo restricted.']
+ }, {
+ 'url': 'https://trueid.id/series/zZOBVPb62EwR/qXY73rwyl7oj/one-piece-ep-1/',
+ 'md5': '1c6d976049bc3c89a8a25aed2c3fb081',
+ 'info_dict': {
+ 'id': 'qXY73rwyl7oj',
+ 'ext': 'mp4',
+ 'title': 'One Piece Ep. 1',
+ 'display_id': 'one-piece-ep-1',
+ 'description': 'md5:13226d603bd03c4150a1cf5758e842ea',
+ 'timestamp': 1610421085,
+ 'categories': ['Animation & Cartoon', 'Kids & Family', 'Adventure'],
+ 'release_timestamp': 1612112400,
+ 'release_year': 1999,
+ 'age_limit': 7,
+ 'cast': ['Kounosuke Uda', 'Junji Shimizu'],
+ 'thumbnail': 'https://cms.dmpcdn.com/movie/2021/01/13/f84e9e70-5562-11eb-9fe2-dd6c2099a468_original.png',
+ 'upload_date': '20210112',
+ 'release_date': '20210131',
+ },
+ 'expected_warnings': ['Video is geo restricted.']
+ }, {
+ 'url': 'https://vn.trueid.net/series/7DNPM7Bpa9wv/pwLgEQ4Xbda2/haikyu-vua-bong-chuyen-phan-1/',
+ 'info_dict': {
+ 'id': 'pwLgEQ4Xbda2',
+ 'ext': 'mp4',
+ 'title': 'Haikyu!!: Vua Bóng Chuyền Phần 1 - Tập 1',
+ 'display_id': 'haikyu-vua-bong-chuyen-phan-1-tap-1',
+ 'description': 'md5:0374dd44d247799169449ee30cca963a',
+ 'timestamp': 1629270901,
+ 'categories': ['Anime', 'Phim Hài', 'Phim Học Đường', 'Phim Thể Thao', 'Shounen'],
+ 'release_timestamp': 1629270720,
+ 'release_year': 2014,
+ 'age_limit': 13,
+ 'thumbnail': 'https://cms.dmpcdn.com/movie/2021/09/28/b6e7ec00-2039-11ec-8436-974544e5841f_webp_original.jpg',
+ 'upload_date': '20210818',
+ 'release_date': '20210818',
+ },
+ 'expected_warnings': ['Video is geo restricted.']
+ }, {
+ 'url': 'https://trueid.ph/series/l8rvvAw7Jwv8/l8rvvAw7Jwv8/naruto-trailer/',
+ 'only_matching': True,
+ }]
+ _CUSTOM_RATINGS = {
+ 'PG': 7,
+ }
+
+ def _real_extract(self, url):
+ domain, video_id = self._match_valid_url(url).group('domain', 'id')
+ webpage = self._download_webpage(url, video_id)
+ initial_data = traverse_obj(
+ self._search_nextjs_data(webpage, video_id, fatal=False), ('props', 'pageProps', 'initialContentData'), default={})
+
+ try:
+ stream_data = self._download_json(
+ f'https://{domain}/cmsPostProxy/contents/video/{video_id}/streamer?os=android', video_id, data=b'')['data']
+ except ExtractorError as e:
+ if not isinstance(e.cause, compat_HTTPError):
+ raise e
+ errmsg = self._parse_json(e.cause.read().decode(), video_id)['meta']['message']
+ if 'country' in errmsg:
+ self.raise_geo_restricted(
+ errmsg, [initial_data['display_country']] if initial_data.get('display_country') else None, True)
+ else:
+ self.raise_no_formats(errmsg, video_id=video_id)
+
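+ # The streamer endpoint may return an HLS or DASH manifest or a direct file; pick the parser by extension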
+ if stream_data:
+ stream_url = stream_data['stream']['stream_url']
+ stream_ext = determine_ext(stream_url)
+ if stream_ext == 'm3u8':
+ formats, subs = self._extract_m3u8_formats_and_subtitles(stream_url, video_id, 'mp4')
+ elif stream_ext == 'mpd':
+ formats, subs = self._extract_mpd_formats_and_subtitles(stream_url, video_id)
+ else:
+ formats = [{'url': stream_url}]
+
+ thumbnails = [
+ {'id': thumb_key, 'url': thumb_url}
+ for thumb_key, thumb_url in (initial_data.get('thumb_list') or {}).items()
+ if url_or_none(thumb_url)]
+
+ return {
+ 'id': video_id,
+ 'title': initial_data.get('title') or self._html_search_regex(
+ [r'Nonton (?P<name>.+) Gratis',
+ r'Xem (?P<name>.+) Miễn phí',
+ r'Watch (?P<name>.+) Free'], webpage, 'title', group='name'),
+ 'display_id': initial_data.get('slug_title'),
+ 'description': initial_data.get('synopsis'),
+ 'timestamp': unified_timestamp(initial_data.get('create_date')),
+ # 'duration': int_or_none(initial_data.get('duration'), invscale=60), # the duration field must be accurate to at least the second
+ 'categories': traverse_obj(initial_data, ('article_category_details', ..., 'name')),
+ 'release_timestamp': unified_timestamp(initial_data.get('publish_date')),
+ 'release_year': int_or_none(initial_data.get('release_year')),
+ 'formats': formats,
+ 'subtitles': subs,
+ 'thumbnails': thumbnails,
+ 'age_limit': self._CUSTOM_RATINGS.get(initial_data.get('rate')) or parse_age_limit(initial_data.get('rate')),
+ 'cast': traverse_obj(initial_data, (('actor', 'director'), ...)),
+ 'view_count': int_or_none(initial_data.get('count_views')),
+ 'like_count': int_or_none(initial_data.get('count_likes')),
+ 'average_rating': int_or_none(initial_data.get('count_ratings')),
+ }
diff --git a/hypervideo_dl/extractor/tubitv.py b/hypervideo_dl/extractor/tubitv.py
index 2e9b325..31feb9a 100644
--- a/hypervideo_dl/extractor/tubitv.py
+++ b/hypervideo_dl/extractor/tubitv.py
@@ -54,10 +54,7 @@ class TubiTvIE(InfoExtractor):
},
}]
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
+ def _perform_login(self, username, password):
self.report_login()
form_data = {
'username': username,
@@ -72,9 +69,6 @@ class TubiTvIE(InfoExtractor):
raise ExtractorError(
'Login failed (invalid username/password)', expected=True)
- def _real_initialize(self):
- self._login()
-
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._download_json(
@@ -107,6 +101,9 @@ class TubiTvIE(InfoExtractor):
'url': self._proto_relative_url(sub_url),
})
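+ # Tubi encodes episode info in the title as "S01:E02 - Name"; parse it out when present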
+ season_number, episode_number, episode_title = self._search_regex(
+ r'^S(\d+):E(\d+) - (.+)', title, 'episode info', fatal=False, group=(1, 2, 3), default=(None, None, None))
+
return {
'id': video_id,
'title': title,
@@ -117,6 +114,9 @@ class TubiTvIE(InfoExtractor):
'duration': int_or_none(video_data.get('duration')),
'uploader_id': video_data.get('publisher_id'),
'release_year': int_or_none(video_data.get('year')),
+ 'season_number': int_or_none(season_number),
+ 'episode_number': int_or_none(episode_number),
+ 'episode_title': episode_title
}
@@ -132,9 +132,11 @@ class TubiTvShowIE(InfoExtractor):
def _entries(self, show_url, show_name):
show_webpage = self._download_webpage(show_url, show_name)
+
show_json = self._parse_json(self._search_regex(
- r"window\.__data\s*=\s*({.+?});\s*</script>",
- show_webpage, 'data',), show_name, transform_source=js_to_json)['video']
+ r'window\.__data\s*=\s*({[^<]+});\s*</script>',
+ show_webpage, 'data'), show_name, transform_source=js_to_json)['video']
+
for episode_id in show_json['fullContentById'].keys():
yield self.url_result(
'tubitv:%s' % episode_id,
diff --git a/hypervideo_dl/extractor/tumblr.py b/hypervideo_dl/extractor/tumblr.py
index adc3701..8086f61 100644
--- a/hypervideo_dl/extractor/tumblr.py
+++ b/hypervideo_dl/extractor/tumblr.py
@@ -6,6 +6,7 @@ from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
+ traverse_obj,
urlencode_postdata
)
@@ -14,39 +15,130 @@ class TumblrIE(InfoExtractor):
_VALID_URL = r'https?://(?P<blog_name>[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])'
_NETRC_MACHINE = 'tumblr'
_LOGIN_URL = 'https://www.tumblr.com/login'
+ _OAUTH_URL = 'https://www.tumblr.com/api/v2/oauth2/token'
_TESTS = [{
'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes',
'md5': '479bb068e5b16462f5176a6828829767',
'info_dict': {
'id': '54196191430',
'ext': 'mp4',
- 'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...',
- 'description': 'md5:37db8211e40b50c7c44e95da14f630b7',
- 'thumbnail': r're:http://.*\.jpg',
+ 'title': 'md5:dfac39636969fe6bf1caa2d50405f069',
+ 'description': 'md5:390ab77358960235b6937ab3b8528956',
+ 'uploader_id': 'tatianamaslanydaily',
+ 'uploader_url': 'https://tatianamaslanydaily.tumblr.com/',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 127,
+ 'like_count': int,
+ 'repost_count': int,
+ 'age_limit': 0,
+ 'tags': ['Orphan Black', 'Tatiana Maslany', 'Interview', 'Video', 'OB S1 DVD Extras'],
+ }
+ }, {
+ 'note': 'multiple formats',
+ 'url': 'https://maskofthedragon.tumblr.com/post/626907179849564160/mona-talking-in-english',
+ 'md5': 'f43ff8a8861712b6cf0e0c2bd84cfc68',
+ 'info_dict': {
+ 'id': '626907179849564160',
+ 'ext': 'mp4',
+ 'title': 'Mona\xa0“talking” in\xa0“english”',
+ 'description': 'md5:082a3a621530cb786ad2b7592a6d9e2c',
+ 'uploader_id': 'maskofthedragon',
+ 'uploader_url': 'https://maskofthedragon.tumblr.com/',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 7,
+ 'like_count': int,
+ 'repost_count': int,
+ 'age_limit': 0,
+ 'tags': 'count:19',
+ },
+ 'params': {
+ 'format': 'hd',
+ },
+ }, {
+ 'note': 'non-iframe video (with related posts)',
+ 'url': 'https://shieldfoss.tumblr.com/post/675519763813908480',
+ 'md5': '12bdb75661ef443bffe5a4dac1dbf118',
+ 'info_dict': {
+ 'id': '675519763813908480',
+ 'ext': 'mp4',
+ 'title': 'Shieldfoss',
+ 'uploader_id': 'nerviovago',
+ 'uploader_url': 'https://nerviovago.tumblr.com/',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'like_count': int,
+ 'repost_count': int,
+ 'age_limit': 0,
+ 'tags': [],
}
}, {
- 'url': 'http://5sostrum.tumblr.com/post/90208453769/yall-forgetting-the-greatest-keek-of-them-all',
- 'md5': 'bf348ef8c0ef84fbf1cbd6fa6e000359',
+ 'note': 'dashboard only (original post)',
+ 'url': 'https://jujanon.tumblr.com/post/159704441298/my-baby-eating',
+ 'md5': '029f7c91ab386701b211e3d494d2d95e',
'info_dict': {
- 'id': '90208453769',
+ 'id': '159704441298',
'ext': 'mp4',
- 'title': '5SOS STRUM ;]',
- 'description': 'md5:dba62ac8639482759c8eb10ce474586a',
- 'thumbnail': r're:http://.*\.jpg',
+ 'title': 'md5:ba79365861101f4911452728d2950561',
+ 'description': 'md5:773738196cea76b6996ec71e285bdabc',
+ 'uploader_id': 'jujanon',
+ 'uploader_url': 'https://jujanon.tumblr.com/',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'like_count': int,
+ 'repost_count': int,
+ 'age_limit': 0,
+ 'tags': ['crabs', 'my video', 'my pets'],
}
}, {
- 'url': 'http://hdvideotest.tumblr.com/post/130323439814/test-description-for-my-hd-video',
- 'md5': '7ae503065ad150122dc3089f8cf1546c',
+ 'note': 'dashboard only (reblog)',
+ 'url': 'https://bartlebyshop.tumblr.com/post/180294460076/duality-of-bird',
+ 'md5': '04334e7cadb1af680d162912559f51a5',
'info_dict': {
- 'id': '130323439814',
+ 'id': '180294460076',
'ext': 'mp4',
- 'title': 'HD Video Testing \u2014 Test description for my HD video',
- 'description': 'md5:97cc3ab5fcd27ee4af6356701541319c',
- 'thumbnail': r're:http://.*\.jpg',
- },
- 'params': {
- 'format': 'hd',
+ 'title': 'duality of bird',
+ 'description': 'duality of bird',
+ 'uploader_id': 'todaysbird',
+ 'uploader_url': 'https://todaysbird.tumblr.com/',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'like_count': int,
+ 'repost_count': int,
+ 'age_limit': 0,
+ 'tags': [],
+ }
+ }, {
+ 'note': 'dashboard only (external)',
+ 'url': 'https://afloweroutofstone.tumblr.com/post/675661759168823296/the-blues-remembers-everything-the-country-forgot',
+ 'info_dict': {
+ 'id': 'q67_fd7b8SU',
+ 'ext': 'mp4',
+ 'title': 'The Blues Remembers Everything the Country Forgot',
+ 'alt_title': 'The Blues Remembers Everything the Country Forgot',
+ 'description': 'md5:1a6b4097e451216835a24c1023707c79',
+ 'release_date': '20201224',
+ 'creator': 'md5:c2239ba15430e87c3b971ba450773272',
+ 'uploader': 'Moor Mother - Topic',
+ 'upload_date': '20201223',
+ 'uploader_id': 'UCxrMtFBRkFvQJ_vVM4il08w',
+ 'uploader_url': 'http://www.youtube.com/channel/UCxrMtFBRkFvQJ_vVM4il08w',
+ 'thumbnail': r're:^https?://i.ytimg.com/.*',
+ 'channel': 'Moor Mother - Topic',
+ 'channel_id': 'UCxrMtFBRkFvQJ_vVM4il08w',
+ 'channel_url': 'https://www.youtube.com/channel/UCxrMtFBRkFvQJ_vVM4il08w',
+ 'channel_follower_count': int,
+ 'duration': 181,
+ 'view_count': int,
+ 'like_count': int,
+ 'age_limit': 0,
+ 'categories': ['Music'],
+ 'tags': 'count:7',
+ 'live_status': 'not_live',
+ 'playable_in_embed': True,
+ 'availability': 'public',
+ 'track': 'The Blues Remembers Everything the Country Forgot',
+ 'artist': 'md5:c2239ba15430e87c3b971ba450773272',
+ 'album': 'Brass',
+ 'release_year': 2020,
},
+ 'add_ie': ['Youtube'],
}, {
'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching',
'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab',
@@ -60,16 +152,51 @@ class TumblrIE(InfoExtractor):
'uploader_id': '1638622',
'uploader': 'naked-yogi',
},
- 'add_ie': ['Vidme'],
+ # 'add_ie': ['Vidme'],
+ 'skip': 'dead embedded video host'
}, {
- 'url': 'http://camdamage.tumblr.com/post/98846056295/',
- 'md5': 'a9e0c8371ea1ca306d6554e3fecf50b6',
+ 'url': 'https://prozdvoices.tumblr.com/post/673201091169681408/what-recording-voice-acting-sounds-like',
+ 'md5': 'a0063fc8110e6c9afe44065b4ea68177',
'info_dict': {
- 'id': '105463834',
+ 'id': 'eomhW5MLGWA',
'ext': 'mp4',
- 'title': 'Cam Damage-HD 720p',
- 'uploader': 'John Moyer',
- 'uploader_id': 'user32021558',
+ 'title': 'what recording voice acting sounds like',
+ 'description': 'md5:1da3faa22d0e0b1d8b50216c284ee798',
+ 'uploader': 'ProZD',
+ 'upload_date': '20220112',
+ 'uploader_id': 'ProZD',
+ 'uploader_url': 'http://www.youtube.com/user/ProZD',
+ 'thumbnail': r're:^https?://i.ytimg.com/.*',
+ 'channel': 'ProZD',
+ 'channel_id': 'UC6MFZAOHXlKK1FI7V0XQVeA',
+ 'channel_url': 'https://www.youtube.com/channel/UC6MFZAOHXlKK1FI7V0XQVeA',
+ 'channel_follower_count': int,
+ 'duration': 20,
+ 'view_count': int,
+ 'like_count': int,
+ 'age_limit': 0,
+ 'categories': ['Film & Animation'],
+ 'tags': [],
+ 'live_status': 'not_live',
+ 'playable_in_embed': True,
+ 'availability': 'public',
+ },
+ 'add_ie': ['Youtube'],
+ }, {
+ 'url': 'https://dominustempori.tumblr.com/post/673572712813297664/youtubes-all-right-for-some-pretty-cool',
+ 'md5': '203e9eb8077e3f45bfaeb4c86c1467b8',
+ 'info_dict': {
+ 'id': '87816359',
+ 'ext': 'mov',
+ 'title': 'Harold Ramis',
+ 'description': 'md5:be8e68cbf56ce0785c77f0c6c6dfaf2c',
+ 'uploader': 'Resolution Productions Group',
+ 'uploader_id': 'resolutionproductions',
+ 'uploader_url': 'https://vimeo.com/resolutionproductions',
+ 'upload_date': '20140227',
+ 'thumbnail': r're:^https?://i.vimeocdn.com/video/.*',
+ 'timestamp': 1393523719,
+ 'duration': 291,
},
'add_ie': ['Vimeo'],
}, {
@@ -86,127 +213,180 @@ class TumblrIE(InfoExtractor):
'like_count': int,
'comment_count': int,
'repost_count': int,
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'timestamp': 1455940159,
+ 'view_count': int,
},
'add_ie': ['Vine'],
}, {
- 'url': 'http://vitasidorkina.tumblr.com/post/134652425014/joskriver-victoriassecret-invisibility-or',
- 'md5': '01c12ceb82cbf6b2fe0703aa56b3ad72',
+ 'url': 'https://silami.tumblr.com/post/84250043974/my-bad-river-flows-in-you-impression-on-maschine',
+ 'md5': '3c92d7c3d867f14ccbeefa2119022277',
'info_dict': {
- 'id': '-7LnUPGlSo',
+ 'id': 'nYtvtTPuTl',
'ext': 'mp4',
- 'title': 'Video by victoriassecret',
- 'description': 'Invisibility or flight…which superpower would YOU choose? #VSFashionShow #ThisOrThat',
- 'uploader_id': 'victoriassecret',
- 'thumbnail': r're:^https?://.*\.jpg'
+ 'title': 'Video by silbulterman',
+ 'description': '#maschine',
+ 'uploader_id': '242859024',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'timestamp': 1398801174,
+ 'like_count': int,
+ 'uploader': 'Sil',
+ 'channel': 'silbulterman',
+ 'comment_count': int,
+ 'upload_date': '20140429',
},
'add_ie': ['Instagram'],
}]
- def _real_initialize(self):
- self._login()
+ _providers = {
+ 'instagram': 'Instagram',
+ 'vimeo': 'Vimeo',
+ 'vine': 'Vine',
+ 'youtube': 'Youtube',
+ }
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
+ _ACCESS_TOKEN = None
+ def _initialize_pre_login(self):
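+ # Tumblr's login page embeds a public API token; scrape it for both login and metadata requests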
login_page = self._download_webpage(
- self._LOGIN_URL, None, 'Downloading login page')
-
- login_form = self._hidden_inputs(login_page)
- login_form.update({
- 'user[email]': username,
- 'user[password]': password
- })
-
- response, urlh = self._download_webpage_handle(
- self._LOGIN_URL, None, 'Logging in',
- data=urlencode_postdata(login_form), headers={
- 'Content-Type': 'application/x-www-form-urlencoded',
- 'Referer': self._LOGIN_URL,
- })
+ self._LOGIN_URL, None, 'Downloading login page', fatal=False)
+ if login_page:
+ self._ACCESS_TOKEN = self._search_regex(
+ r'"API_TOKEN":\s*"(\w+)"', login_page, 'API access token', fatal=False)
+ if not self._ACCESS_TOKEN:
+ self.report_warning('Failed to get access token; metadata will be missing and some videos may not work')
- # Successful login
- if '/dashboard' in urlh.geturl():
+ def _perform_login(self, username, password):
+ if not self._ACCESS_TOKEN:
return
- login_errors = self._parse_json(
- self._search_regex(
- r'RegistrationForm\.errors\s*=\s*(\[.+?\])\s*;', response,
- 'login errors', default='[]'),
- None, fatal=False)
- if login_errors:
- raise ExtractorError(
- 'Unable to login: %s' % login_errors[0], expected=True)
-
- self.report_warning('Login has probably failed')
+ self._download_json(
+ self._OAUTH_URL, None, 'Logging in',
+ data=urlencode_postdata({
+ 'password': password,
+ 'grant_type': 'password',
+ 'username': username,
+ }), headers={
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ 'Authorization': f'Bearer {self._ACCESS_TOKEN}',
+ },
+ errnote='Login failed', fatal=False)
def _real_extract(self, url):
- m_url = self._match_valid_url(url)
- video_id = m_url.group('id')
- blog = m_url.group('blog_name')
+ blog, video_id = self._match_valid_url(url).groups()
- url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
+ url = f'http://{blog}.tumblr.com/post/{video_id}/'
webpage, urlh = self._download_webpage_handle(url, video_id)
redirect_url = urlh.geturl()
- if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'):
- raise ExtractorError(
- 'This Tumblr may contain sensitive media. '
- 'Disable safe mode in your account settings '
- 'at https://www.tumblr.com/settings/account#safe_mode',
- expected=True)
+ api_only = bool(self._search_regex(
+ r'(tumblr.com|^)/(safe-mode|login_required|blog/view)',
+ redirect_url, 'redirect', default=None))
+
+ if api_only and not self._ACCESS_TOKEN:
+ raise ExtractorError('Cannot get data for dashboard-only post without access token')
+
+ post_json = {}
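+ # With a token, fetch the post via the v2 permalink API; for reblogs the original post content is under 'trail'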
+ if self._ACCESS_TOKEN:
+ post_json = traverse_obj(
+ self._download_json(
+ f'https://www.tumblr.com/api/v2/blog/{blog}/posts/{video_id}/permalink',
+ video_id, headers={'Authorization': f'Bearer {self._ACCESS_TOKEN}'}, fatal=False),
+ ('response', 'timeline', 'elements', 0)) or {}
+ content_json = traverse_obj(post_json, ('trail', 0, 'content'), ('content')) or []
+ video_json = next(
+ (item for item in content_json if item.get('type') == 'video'), {})
+ media_json = video_json.get('media') or {}
+ if api_only and not media_json.get('url') and not video_json.get('url'):
+ raise ExtractorError('Failed to find video data for dashboard-only post')
+
+ if not media_json.get('url') and video_json.get('url'):
+ # external video host
+ return self.url_result(
+ video_json['url'],
+ self._providers.get(video_json.get('provider'), 'Generic'))
+
+ video_url = self._og_search_video_url(webpage, default=None)
+ duration = None
+ formats = []
+
+ # iframes can supply duration and sometimes additional formats, so check for one
iframe_url = self._search_regex(
- r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
+ fr'src=\'(https?://www\.tumblr\.com/video/{blog}/{video_id}/[^\']+)\'',
webpage, 'iframe url', default=None)
- if iframe_url is None:
- return self.url_result(redirect_url, 'Generic')
+ if iframe_url:
+ iframe = self._download_webpage(
+ iframe_url, video_id, 'Downloading iframe page',
+ headers={'Referer': redirect_url})
- iframe = self._download_webpage(iframe_url, video_id, 'Downloading iframe page')
+ options = self._parse_json(
+ self._search_regex(
+ r'data-crt-options=(["\'])(?P<options>.+?)\1', iframe,
+ 'hd video url', default='', group='options'),
+ video_id, fatal=False)
+ if options:
+ duration = int_or_none(options.get('duration'))
- duration = None
- sources = []
-
- sd_url = self._search_regex(
- r'<source[^>]+src=(["\'])(?P<url>.+?)\1', iframe,
- 'sd video url', default=None, group='url')
- if sd_url:
- sources.append((sd_url, 'sd'))
-
- options = self._parse_json(
- self._search_regex(
- r'data-crt-options=(["\'])(?P<options>.+?)\1', iframe,
- 'hd video url', default='', group='options'),
- video_id, fatal=False)
- if options:
- duration = int_or_none(options.get('duration'))
- hd_url = options.get('hdUrl')
- if hd_url:
- sources.append((hd_url, 'hd'))
-
- formats = [{
- 'url': video_url,
- 'ext': 'mp4',
- 'format_id': format_id,
- 'height': int_or_none(self._search_regex(
- r'/(\d{3,4})$', video_url, 'height', default=None)),
- 'quality': quality,
- } for quality, (video_url, format_id) in enumerate(sources)]
+ hd_url = options.get('hdUrl')
+ if hd_url:
+ # there are multiple formats; extract them
+ # ignore other sources of width/height data as they may be wrong
+ sources = []
+ sd_url = self._search_regex(
+ r'<source[^>]+src=(["\'])(?P<url>.+?)\1', iframe,
+ 'sd video url', default=None, group='url')
+ if sd_url:
+ sources.append((sd_url, 'sd'))
+ sources.append((hd_url, 'hd'))
+
+ formats = [{
+ 'url': video_url,
+ 'format_id': format_id,
+ 'height': int_or_none(self._search_regex(
+ r'_(\d+)\.\w+$', video_url, 'height', default=None)),
+ 'quality': quality,
+ } for quality, (video_url, format_id) in enumerate(sources)]
+
+ if not media_json.get('url') and not video_url and not iframe_url:
+ # external video host (but we weren't able to figure it out from the api)
+ iframe_url = self._search_regex(
+ r'src=["\'](https?://safe\.txmblr\.com/svc/embed/inline/[^"\']+)["\']',
+ webpage, 'embed iframe url', default=None)
+ return self.url_result(iframe_url or redirect_url, 'Generic')
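+ # Fall back to the single progressive file from the API or og:video, with dimensions from whichever source has them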
+ formats = formats or [{
+ 'url': media_json.get('url') or video_url,
+ 'width': int_or_none(
+ media_json.get('width') or self._og_search_property('video:width', webpage, default=None)),
+ 'height': int_or_none(
+ media_json.get('height') or self._og_search_property('video:height', webpage, default=None)),
+ }]
self._sort_formats(formats)
- # The only place where you can get a title, it's not complete,
- # but searching in other places doesn't work for all videos
- video_title = self._html_search_regex(
- r'(?s)<title>(?P<title>.*?)(?: \| Tumblr)?</title>',
- webpage, 'title')
+ # the url we're extracting from might be an original post or it might be a reblog.
+ # if it's a reblog, og:description will be the reblogger's comment, not the uploader's.
+ # content_json is always the op, so if it exists but has no text, there's no description
+ if content_json:
+ description = '\n\n'.join((
+ item.get('text') for item in content_json if item.get('type') == 'text')) or None
+ else:
+ description = self._og_search_description(webpage, default=None)
+ uploader_id = traverse_obj(post_json, 'reblogged_root_name', 'blog_name')
return {
'id': video_id,
- 'title': video_title,
- 'description': self._og_search_description(webpage, default=None),
- 'thumbnail': self._og_search_thumbnail(webpage, default=None),
+ 'title': post_json.get('summary') or (blog if api_only else self._html_search_regex(
+ r'(?s)<title>(?P<title>.*?)(?: \| Tumblr)?</title>', webpage, 'title')),
+ 'description': description,
+ 'thumbnail': (traverse_obj(video_json, ('poster', 0, 'url'))
+ or self._og_search_thumbnail(webpage, default=None)),
+ 'uploader_id': uploader_id,
+ 'uploader_url': f'https://{uploader_id}.tumblr.com/' if uploader_id else None,
'duration': duration,
+ 'like_count': post_json.get('like_count'),
+ 'repost_count': post_json.get('reblog_count'),
+ 'age_limit': {True: 18, False: 0}.get(post_json.get('is_nsfw')),
+ 'tags': post_json.get('tags'),
'formats': formats,
}
diff --git a/hypervideo_dl/extractor/tunein.py b/hypervideo_dl/extractor/tunein.py
index c7a5f5a..7e51de8 100644
--- a/hypervideo_dl/extractor/tunein.py
+++ b/hypervideo_dl/extractor/tunein.py
@@ -62,7 +62,7 @@ class TuneInBaseIE(InfoExtractor):
return {
'id': content_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'formats': formats,
'thumbnail': thumbnail,
'location': location,
diff --git a/hypervideo_dl/extractor/turner.py b/hypervideo_dl/extractor/turner.py
index 32125bc..519dc32 100644
--- a/hypervideo_dl/extractor/turner.py
+++ b/hypervideo_dl/extractor/turner.py
@@ -205,7 +205,7 @@ class TurnerBaseIE(AdobePassIE):
return {
'id': video_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'formats': formats,
'subtitles': subtitles,
'thumbnails': thumbnails,
diff --git a/hypervideo_dl/extractor/tv2.py b/hypervideo_dl/extractor/tv2.py
index e085153..977da30 100644
--- a/hypervideo_dl/extractor/tv2.py
+++ b/hypervideo_dl/extractor/tv2.py
@@ -19,7 +19,7 @@ from ..utils import (
class TV2IE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?tv2\.no/v\d*/(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.tv2.no/v/916509/',
'info_dict': {
@@ -33,6 +33,9 @@ class TV2IE(InfoExtractor):
'view_count': int,
'categories': list,
},
+ }, {
+ 'url': 'http://www.tv2.no/v2/916509',
+ 'only_matching': True,
}]
_PROTOCOLS = ('HLS', 'DASH')
_GEO_COUNTRIES = ['NO']
@@ -78,9 +81,7 @@ class TV2IE(InfoExtractor):
elif ext == 'm3u8':
if not data.get('drmProtected'):
formats.extend(self._extract_m3u8_formats(
- video_url, video_id, 'mp4',
- 'm3u8' if is_live else 'm3u8_native',
- m3u8_id=format_id, fatal=False))
+ video_url, video_id, 'mp4', live=is_live, m3u8_id=format_id, fatal=False))
elif ext == 'mpd':
formats.extend(self._extract_mpd_formats(
video_url, video_id, format_id, fatal=False))
@@ -103,7 +104,7 @@ class TV2IE(InfoExtractor):
return {
'id': video_id,
'url': video_url,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'description': strip_or_none(asset.get('description')),
'thumbnails': thumbnails,
'timestamp': parse_iso8601(asset.get('live_broadcast_time') or asset.get('update_time')),
@@ -241,9 +242,7 @@ class KatsomoIE(InfoExtractor):
elif ext == 'm3u8':
if not data.get('drmProtected'):
formats.extend(self._extract_m3u8_formats(
- video_url, video_id, 'mp4',
- 'm3u8' if is_live else 'm3u8_native',
- m3u8_id=format_id, fatal=False))
+ video_url, video_id, 'mp4', live=is_live, m3u8_id=format_id, fatal=False))
elif ext == 'mpd':
formats.extend(self._extract_mpd_formats(
video_url, video_id, format_id, fatal=False))
@@ -268,7 +267,7 @@ class KatsomoIE(InfoExtractor):
return {
'id': video_id,
'url': video_url,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'description': strip_or_none(asset.get('description')),
'thumbnails': thumbnails,
'timestamp': parse_iso8601(asset.get('createTime')),
diff --git a/hypervideo_dl/extractor/tv2dk.py b/hypervideo_dl/extractor/tv2dk.py
index 8bd5fd6..ec5cbdf 100644
--- a/hypervideo_dl/extractor/tv2dk.py
+++ b/hypervideo_dl/extractor/tv2dk.py
@@ -41,8 +41,16 @@ class TV2DKIE(InfoExtractor):
'duration': 1347,
'view_count': int,
},
- 'params': {
- 'skip_download': True,
+ 'add_ie': ['Kaltura'],
+ }, {
+ 'url': 'https://www.tv2lorry.dk/gadekamp/gadekamp-6-hoejhuse-i-koebenhavn',
+ 'info_dict': {
+ 'id': '1_7iwll9n0',
+ 'ext': 'mp4',
+ 'upload_date': '20211027',
+ 'title': 'Gadekamp #6 - Højhuse i København',
+ 'uploader_id': 'tv2lorry',
+ 'timestamp': 1635345229,
},
'add_ie': ['Kaltura'],
}, {
@@ -91,11 +99,14 @@ class TV2DKIE(InfoExtractor):
add_entry(partner_id, kaltura_id)
if not entries:
kaltura_id = self._search_regex(
- r'entry_id\s*:\s*["\']([0-9a-z_]+)', webpage, 'kaltura id')
+ (r'entry_id\s*:\s*["\']([0-9a-z_]+)',
+ r'\\u002FentryId\\u002F(\w+)\\u002F'), webpage, 'kaltura id')
partner_id = self._search_regex(
(r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage,
'partner id')
add_entry(partner_id, kaltura_id)
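+ # don't wrap a lone video in a playlist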
+ if len(entries) == 1:
+ return entries[0]
return self.playlist_result(entries)
diff --git a/hypervideo_dl/extractor/tver.py b/hypervideo_dl/extractor/tver.py
index 943b3eb..9ff3136 100644
--- a/hypervideo_dl/extractor/tver.py
+++ b/hypervideo_dl/extractor/tver.py
@@ -5,15 +5,16 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
+ ExtractorError,
int_or_none,
remove_start,
smuggle_url,
- try_get,
+ traverse_obj,
)
class TVerIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?P<path>(?:corner|episode|feature)/(?P<id>f?\d+))'
+ _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?P<path>corner|episode|feature|lp|tokyo2020/video)/(?P<id>[fc]?\d+)'
# videos are only available for 7 days
_TESTS = [{
'url': 'https://tver.jp/corner/f0062178',
@@ -28,6 +29,15 @@ class TVerIE(InfoExtractor):
# subtitle = ' '
'url': 'https://tver.jp/corner/f0068870',
'only_matching': True,
+ }, {
+ 'url': 'https://tver.jp/lp/f0009694',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://tver.jp/lp/c0000239',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://tver.jp/tokyo2020/video/6264525510001',
+ 'only_matching': True,
}]
_TOKEN = None
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
@@ -38,13 +48,20 @@ class TVerIE(InfoExtractor):
def _real_extract(self, url):
path, video_id = self._match_valid_url(url).groups()
- main = self._download_json(
- 'https://api.tver.jp/v4/' + path, video_id,
- query={'token': self._TOKEN})['main']
- p_id = main['publisher_id']
- service = remove_start(main['service'], 'ts_')
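+ # 'lp' landing pages carry the canonical corner/episode/feature path in a to_href attribute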
+ if path == 'lp':
+ webpage = self._download_webpage(url, video_id)
+ redirect_path = self._search_regex(r'to_href="([^"]+)', webpage, 'redirect path')
+ path, video_id = self._match_valid_url(f'https://tver.jp{redirect_path}').groups()
+ api_response = self._download_json(f'https://api.tver.jp/v4/{path}/{video_id}', video_id, query={'token': self._TOKEN})
+ p_id = traverse_obj(api_response, ('main', 'publisher_id'))
+ if not p_id:
+ error_msg, expected = traverse_obj(api_response, ('episode', 0, 'textbar', 0, ('text', 'longer')), get_all=False), True
+ if not error_msg:
+ error_msg, expected = 'Failed to extract publisher ID', False
+ raise ExtractorError(error_msg, expected=expected)
+ service = remove_start(traverse_obj(api_response, ('main', 'service')), 'ts_')
- r_id = main['reference_id']
+ r_id = traverse_obj(api_response, ('main', 'reference_id'))
if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'):
r_id = 'ref:' + r_id
bc_url = smuggle_url(
@@ -53,8 +70,8 @@ class TVerIE(InfoExtractor):
return {
'_type': 'url_transparent',
- 'description': try_get(main, lambda x: x['note'][0]['text'], compat_str),
- 'episode_number': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])),
+ 'description': traverse_obj(api_response, ('main', 'note', 0, 'text'), expected_type=compat_str),
+ 'episode_number': int_or_none(traverse_obj(api_response, ('main', 'ext', 'episode_number'), expected_type=compat_str)),
'url': bc_url,
'ie_key': 'BrightcoveNew',
}
diff --git a/hypervideo_dl/extractor/tvnet.py b/hypervideo_dl/extractor/tvnet.py
index 4222ff9..aa1e9d9 100644
--- a/hypervideo_dl/extractor/tvnet.py
+++ b/hypervideo_dl/extractor/tvnet.py
@@ -111,9 +111,7 @@ class TVNetIE(InfoExtractor):
continue
stream_urls.add(stream_url)
formats.extend(self._extract_m3u8_formats(
- stream_url, video_id, 'mp4',
- entry_protocol='m3u8' if is_live else 'm3u8_native',
- m3u8_id='hls', fatal=False))
+ stream_url, video_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False))
self._sort_formats(formats)
# better support for radio streams
@@ -130,9 +128,6 @@ class TVNetIE(InfoExtractor):
r'data-image=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage,
'thumbnail', default=None, group='url'))
- if is_live:
- title = self._live_title(title)
-
view_count = int_or_none(self._search_regex(
r'(?s)<div[^>]+\bclass=["\'].*?view-count[^>]+>.*?(\d+).*?</div>',
webpage, 'view count', default=None))
diff --git a/hypervideo_dl/extractor/tvopengr.py b/hypervideo_dl/extractor/tvopengr.py
new file mode 100644
index 0000000..a11cdc6
--- /dev/null
+++ b/hypervideo_dl/extractor/tvopengr.py
@@ -0,0 +1,128 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ get_elements_text_and_html_by_attribute,
+ scale_thumbnails_to_max_format_width,
+ unescapeHTML,
+)
+
+
+class TVOpenGrBaseIE(InfoExtractor):
+ def _return_canonical_url(self, url, video_id):
+ webpage = self._download_webpage(url, video_id)
+ canonical_url = self._og_search_url(webpage)
+ title = self._og_search_title(webpage)
+ return self.url_result(canonical_url, ie=TVOpenGrWatchIE.ie_key(), video_id=video_id, video_title=title)
+
+
+class TVOpenGrWatchIE(TVOpenGrBaseIE):
+ IE_NAME = 'tvopengr:watch'
+ IE_DESC = 'tvopen.gr (and ethnos.gr) videos'
+ _VALID_URL = r'https?://(?P<netloc>(?:www\.)?(?:tvopen|ethnos)\.gr)/watch/(?P<id>\d+)/(?P<slug>[^/]+)'
+ _API_ENDPOINT = 'https://www.tvopen.gr/templates/data/player'
+
+ _TESTS = [{
+ 'url': 'https://www.ethnos.gr/watch/101009/nikoskaprabelosdenexoymekanenanasthenhsemethmethmetallaxhomikron',
+ 'md5': '8728570e3a72e0f8d9475ba94859fdc1',
+ 'info_dict': {
+ 'id': '101009',
+ 'title': 'md5:51f68773dcb6c70498cd326f45fefdf0',
+ 'display_id': 'nikoskaprabelosdenexoymekanenanasthenhsemethmethmetallaxhomikron',
+ 'description': 'md5:78fff49f18fb3effe41b070e5c7685d6',
+ 'thumbnail': 'https://opentv-static.siliconweb.com/imgHandler/1920/d573ba71-ec5f-43c6-b4cb-d181f327d3a8.jpg',
+ 'ext': 'mp4',
+ 'upload_date': '20220109',
+ 'timestamp': 1641686400,
+ },
+ }, {
+ 'url': 'https://www.tvopen.gr/watch/100979/se28099agapaomenalla7cepeisodio267cmhthrargiapashskakias',
+ 'md5': '38f98a1be0c577db4ea2d1b1c0770c48',
+ 'info_dict': {
+ 'id': '100979',
+ 'title': 'md5:e021f3001e16088ee40fa79b20df305b',
+ 'display_id': 'se28099agapaomenalla7cepeisodio267cmhthrargiapashskakias',
+ 'description': 'md5:ba17db53954134eb8d625d199e2919fb',
+ 'thumbnail': 'https://opentv-static.siliconweb.com/imgHandler/1920/9bb71cf1-21da-43a9-9d65-367950fde4e3.jpg',
+ 'ext': 'mp4',
+ 'upload_date': '20220108',
+ 'timestamp': 1641600000,
+ },
+ }]
+
+ def _extract_formats_and_subs(self, response, video_id):
+ formats, subs = [], {}
+ for format_id, format_url in response.items():
+ if format_id not in ('stream', 'httpstream', 'mpegdash'):
+ continue
+ ext = determine_ext(format_url)
+ if ext == 'm3u8':
+ formats_, subs_ = self._extract_m3u8_formats_and_subtitles(
+ format_url, video_id, 'mp4', m3u8_id=format_id,
+ fatal=False)
+ elif ext == 'mpd':
+ formats_, subs_ = self._extract_mpd_formats_and_subtitles(
+ format_url, video_id, 'mp4', fatal=False)
+ else:
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ })
+ continue
+ formats.extend(formats_)
+ self._merge_subtitles(subs_, target=subs)
+ self._sort_formats(formats)
+ return formats, subs
+
+ def _real_extract(self, url):
+ netloc, video_id, display_id = self._match_valid_url(url).group('netloc', 'id', 'slug')
+ if netloc.find('tvopen.gr') == -1:
+ return self._return_canonical_url(url, video_id)
+ webpage = self._download_webpage(url, video_id)
+ info = self._search_json_ld(webpage, video_id, expected_type='VideoObject')
+ info['formats'], info['subtitles'] = self._extract_formats_and_subs(
+ self._download_json(self._API_ENDPOINT, video_id, query={'cid': video_id}),
+ video_id)
+ info['thumbnails'] = scale_thumbnails_to_max_format_width(
+ info['formats'], info['thumbnails'], r'(?<=/imgHandler/)\d+')
+ description, _html = next(get_elements_text_and_html_by_attribute('class', 'description', webpage), ('', ''))
+ if description and _html.startswith('<span '):
+ info['description'] = description
+ info['id'] = video_id
+ info['display_id'] = display_id
+ return info
+
+
+class TVOpenGrEmbedIE(TVOpenGrBaseIE):
+ IE_NAME = 'tvopengr:embed'
+ IE_DESC = 'tvopen.gr embedded videos'
+ _VALID_URL = r'(?:https?:)?//(?:(?:www|cdn)\.)?(?:tvopen|ethnos)\.gr/embed/(?P<id>\d+)'
+ _EMBED_RE = re.compile(rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''')
+
+ _TESTS = [{
+ 'url': 'https://cdn.ethnos.gr/embed/100963',
+ 'md5': '2da147881f45571d81662d94d086628b',
+ 'info_dict': {
+ 'id': '100963',
+ 'display_id': 'koronoiosapotoysdieythyntestonsxoleionselftestgiaosoysdenbrhkan',
+ 'title': 'md5:2c71876fadf0cda6043da0da5fca2936',
+ 'description': 'md5:17482b4432e5ed30eccd93b05d6ea509',
+ 'thumbnail': 'https://opentv-static.siliconweb.com/imgHandler/1920/5804e07f-799a-4247-a696-33842c94ca37.jpg',
+ 'ext': 'mp4',
+ 'upload_date': '20220108',
+ 'timestamp': 1641600000,
+ },
+ }]
+
+ @classmethod
+ def _extract_urls(cls, webpage):
+ for mobj in cls._EMBED_RE.finditer(webpage):
+ yield unescapeHTML(mobj.group('url'))
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ return self._return_canonical_url(url, video_id)
diff --git a/hypervideo_dl/extractor/tvp.py b/hypervideo_dl/extractor/tvp.py
index 1e42b33..48e2c6e 100644
--- a/hypervideo_dl/extractor/tvp.py
+++ b/hypervideo_dl/extractor/tvp.py
@@ -2,35 +2,40 @@
from __future__ import unicode_literals
import itertools
+import random
import re
from .common import InfoExtractor
from ..utils import (
- clean_html,
determine_ext,
+ dict_get,
ExtractorError,
- get_element_by_attribute,
+ int_or_none,
+ js_to_json,
orderedSet,
+ str_or_none,
+ try_get,
)
class TVPIE(InfoExtractor):
IE_NAME = 'tvp'
IE_DESC = 'Telewizja Polska'
- _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|polandin\.com)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)'
_TESTS = [{
+ # TVPlayer 2 in js wrapper
'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536',
- 'md5': 'a21eb0aa862f25414430f15fdfb9e76c',
'info_dict': {
'id': '194536',
'ext': 'mp4',
'title': 'Czas honoru, odc. 13 – Władek',
'description': 'md5:437f48b93558370b031740546b696e24',
+ 'age_limit': 12,
},
}, {
+ # TVPlayer legacy
'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
- 'md5': 'b0005b542e5b4de643a9690326ab1257',
'info_dict': {
'id': '17916176',
'ext': 'mp4',
@@ -38,16 +43,63 @@ class TVPIE(InfoExtractor):
'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
},
}, {
- # page id is not the same as video id(#7799)
- 'url': 'https://wiadomosci.tvp.pl/33908820/28092017-1930',
- 'md5': '84cd3c8aec4840046e5ab712416b73d0',
+ # TVPlayer 2 in iframe
+ 'url': 'https://wiadomosci.tvp.pl/50725617/dzieci-na-sprzedaz-dla-homoseksualistow',
'info_dict': {
- 'id': '33908820',
+ 'id': '50725617',
'ext': 'mp4',
- 'title': 'Wiadomości, 28.09.2017, 19:30',
- 'description': 'Wydanie główne codziennego serwisu informacyjnego.'
+ 'title': 'Dzieci na sprzedaż dla homoseksualistów',
+ 'description': 'md5:7d318eef04e55ddd9f87a8488ac7d590',
+ 'age_limit': 12,
},
- 'skip': 'HTTP Error 404: Not Found',
+ }, {
+ # TVPlayer 2 in client-side rendered website (regional; window.__newsData)
+ 'url': 'https://warszawa.tvp.pl/25804446/studio-yayo',
+ 'info_dict': {
+ 'id': '25804446',
+ 'ext': 'mp4',
+ 'title': 'Studio Yayo',
+ 'upload_date': '20160616',
+ 'timestamp': 1466075700,
+ }
+ }, {
+ # TVPlayer 2 in client-side rendered website (tvp.info; window.__videoData)
+ 'url': 'https://www.tvp.info/52880236/09042021-0800',
+ 'info_dict': {
+ 'id': '52880236',
+ 'ext': 'mp4',
+ 'title': '09.04.2021, 08:00',
+ },
+ }, {
+ # client-side rendered (regional) program (playlist) page
+ 'url': 'https://opole.tvp.pl/9660819/rozmowa-dnia',
+ 'info_dict': {
+ 'id': '9660819',
+ 'description': 'Od poniedziałku do piątku o 18:55',
+ 'title': 'Rozmowa dnia',
+ },
+ 'playlist_mincount': 1800,
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ # ABC-specific video embedding
+ # moved to https://bajkowakraina.tvp.pl/wideo/50981130,teleranek,51027049,zubr,51116450
+ 'url': 'https://abc.tvp.pl/48636269/zubry-odc-124',
+ 'info_dict': {
+ 'id': '48320456',
+ 'ext': 'mp4',
+ 'title': 'Teleranek, Żubr',
+ },
+ 'skip': 'unavailable',
+ }, {
+ # yet another vue page
+ 'url': 'https://jp2.tvp.pl/46925618/filmy',
+ 'info_dict': {
+ 'id': '46925618',
+ 'title': 'Filmy',
+ },
+ 'playlist_mincount': 19,
}, {
'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
'only_matching': True,
@@ -66,137 +118,344 @@ class TVPIE(InfoExtractor):
}, {
'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji',
'only_matching': True,
+ }, {
+ 'url': 'https://tvp.info/49193823/teczowe-flagi-na-pomnikach-prokuratura-wszczela-postepowanie-wieszwiecej',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.tvpparlament.pl/retransmisje-vod/inne/wizyta-premiera-mateusza-morawieckiego-w-firmie-berotu-sp-z-oo/48857277',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://polandin.com/47942651/pln-10-billion-in-subsidies-transferred-to-companies-pm',
+ 'only_matching': True,
}]
+ def _parse_vue_website_data(self, webpage, page_id):
+ website_data = self._search_regex([
+ # website - regional sites (regiony), tvp.info
+ # directory - jp2.tvp.pl
+ r'window\.__(?:website|directory)Data\s*=\s*({(?:.|\s)+?});',
+ ], webpage, 'website data')
+ if not website_data:
+ return None
+ return self._parse_json(website_data, page_id, transform_source=js_to_json)
+
+ def _extract_vue_video(self, video_data, page_id=None):
+ if isinstance(video_data, str):
+ video_data = self._parse_json(video_data, page_id, transform_source=js_to_json)
+ thumbnails = []
+ image = video_data.get('image')
+ if image:
+ for thumb in (image if isinstance(image, list) else [image]):
+ thmb_url = str_or_none(thumb.get('url'))
+ if thmb_url:
+ thumbnails.append({
+ 'url': thmb_url,
+ })
+ is_website = video_data.get('type') == 'website'
+ if is_website:
+ url = video_data['url']
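+ # vod.tvp.pl sometimes links /<digits>/<slug> paths; rewrite them to the canonical /website/<slug>,<id> form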
+ fucked_up_url_parts = re.match(r'https?://vod\.tvp\.pl/(\d+)/([^/?#]+)', url)
+ if fucked_up_url_parts:
+ url = f'https://vod.tvp.pl/website/{fucked_up_url_parts.group(2)},{fucked_up_url_parts.group(1)}'
+ else:
+ url = 'tvp:' + str_or_none(video_data.get('_id') or page_id)
+ return {
+ '_type': 'url_transparent',
+ 'id': str_or_none(video_data.get('_id') or page_id),
+ 'url': url,
+ 'ie_key': 'TVPEmbed' if not is_website else 'TVPWebsite',
+ 'title': str_or_none(video_data.get('title')),
+ 'description': str_or_none(video_data.get('lead')),
+ 'timestamp': int_or_none(video_data.get('release_date_long')),
+ 'duration': int_or_none(video_data.get('duration')),
+ 'thumbnails': thumbnails,
+ }
+
+ def _handle_vuejs_page(self, url, webpage, page_id):
+ # vue client-side rendered sites (all regional pages + tvp.info)
+ video_data = self._search_regex([
+ r'window\.__(?:news|video)Data\s*=\s*({(?:.|\s)+?})\s*;',
+ ], webpage, 'video data', default=None)
+ if video_data:
+ return self._extract_vue_video(video_data, page_id=page_id)
+ # paged playlists
+ website_data = self._parse_vue_website_data(webpage, page_id)
+ if website_data:
+ entries = self._vuejs_entries(url, website_data, page_id)
+
+ return {
+ '_type': 'playlist',
+ 'id': page_id,
+ 'title': str_or_none(website_data.get('title')),
+ 'description': str_or_none(website_data.get('lead')),
+ 'entries': entries,
+ }
+ raise ExtractorError('Could not extract video/website data')
+
+ def _vuejs_entries(self, url, website_data, page_id):
+
+ def extract_videos(wd):
+ if wd.get('latestVideo'):
+ yield self._extract_vue_video(wd['latestVideo'])
+ for video in wd.get('videos') or []:
+ yield self._extract_vue_video(video)
+ for video in wd.get('items') or []:
+ yield self._extract_vue_video(video)
+
+ yield from extract_videos(website_data)
+
+ if (website_data.get('items_total_count') or 0) > (website_data.get('items_per_page') or 0):
+ for page in itertools.count(2):
+ page_website_data = self._parse_vue_website_data(
+ self._download_webpage(url, page_id, note='Downloading page #%d' % page,
+ query={'page': page}),
+ page_id)
+ if not page_website_data.get('videos') and not page_website_data.get('items'):
+ break
+ yield from extract_videos(page_website_data)
+
def _real_extract(self, url):
page_id = self._match_id(url)
- webpage = self._download_webpage(url, page_id)
+ webpage, urlh = self._download_webpage_handle(url, page_id)
+
+ # The URL may redirect to a VOD
+ # example: https://vod.tvp.pl/48463890/wadowickie-spotkania-z-janem-pawlem-ii
+ if TVPWebsiteIE.suitable(urlh.url):
+ return self.url_result(urlh.url, ie=TVPWebsiteIE.ie_key(), video_id=page_id)
+
+ if re.search(
+ r'window\.__(?:video|news|website|directory)Data\s*=',
+ webpage):
+ return self._handle_vuejs_page(url, webpage, page_id)
+
+ # classic server-side rendered sites
video_id = self._search_regex([
+ r'<iframe[^>]+src="[^"]*?embed\.php\?(?:[^&]+&)*ID=(\d+)',
r'<iframe[^>]+src="[^"]*?object_id=(\d+)',
r"object_id\s*:\s*'(\d+)'",
- r'data-video-id="(\d+)"'], webpage, 'video id', default=page_id)
+ r'data-video-id="(\d+)"',
+
+ # abc.tvp.pl - somehow there is more than one video ID that seems to refer to the same video
+ # the first one is referred to as "copyid" and seems to be unused by the website
+ r'<script>\s*tvpabc\.video\.init\(\s*\d+,\s*(\d+)\s*\)\s*</script>',
+ ], webpage, 'video id', default=page_id)
return {
'_type': 'url_transparent',
'url': 'tvp:' + video_id,
'description': self._og_search_description(
- webpage, default=None) or self._html_search_meta(
- 'description', webpage, default=None),
+ webpage, default=None) or (self._html_search_meta(
+ 'description', webpage, default=None)
+ if '//s.tvp.pl/files/portal/v' in webpage else None),
'thumbnail': self._og_search_thumbnail(webpage, default=None),
'ie_key': 'TVPEmbed',
}
+class TVPStreamIE(InfoExtractor):
+ IE_NAME = 'tvp:stream'
+ _VALID_URL = r'(?:tvpstream:|https?://tvpstream\.vod\.tvp\.pl/(?:\?(?:[^&]+[&;])*channel_id=)?)(?P<id>\d*)'
+ _TESTS = [{
+ # untestable as "video" id changes many times across a day
+ 'url': 'https://tvpstream.vod.tvp.pl/?channel_id=1455',
+ 'only_matching': True,
+ }, {
+ 'url': 'tvpstream:39821455',
+ 'only_matching': True,
+ }, {
+ # the default stream when you provide no channel_id, most probably TVP Info
+ 'url': 'tvpstream:',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://tvpstream.vod.tvp.pl/',
+ 'only_matching': True,
+ }]
+
+ _PLAYER_BOX_RE = r'<div\s[^>]*id\s*=\s*["\']?tvp_player_box["\']?[^>]+data-%s-id\s*=\s*["\']?(\d+)'
+ _BUTTON_RE = r'<div\s[^>]*data-channel-id=["\']?%s["\']?[^>]*\sdata-title=(?:"([^"]*)"|\'([^\']*)\')[^>]*\sdata-stationname=(?:"([^"]*)"|\'([^\']*)\')'
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+ channel_url = self._proto_relative_url('//tvpstream.vod.tvp.pl/?channel_id=%s' % (channel_id or 'default'))
+ webpage = self._download_webpage(channel_url, channel_id, 'Downloading channel webpage')
+ if not channel_id:
+ channel_id = self._search_regex(self._PLAYER_BOX_RE % 'channel',
+ webpage, 'default channel id')
+ video_id = self._search_regex(self._PLAYER_BOX_RE % 'video',
+ webpage, 'video id')
+ audition_title, station_name = self._search_regex(
+ self._BUTTON_RE % (re.escape(channel_id)), webpage,
+ 'audition title and station name',
+ group=(1, 2))
+ return {
+ '_type': 'url_transparent',
+ 'id': channel_id,
+ 'url': 'tvp:%s' % video_id,
+ 'title': audition_title,
+ 'alt_title': station_name,
+ 'is_live': True,
+ 'ie_key': 'TVPEmbed',
+ }
+
+
class TVPEmbedIE(InfoExtractor):
IE_NAME = 'tvp:embed'
IE_DESC = 'Telewizja Polska'
- _VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)'
+ _VALID_URL = r'''(?x)
+ (?:
+ tvp:
+ |https?://
+ (?:[^/]+\.)?
+ (?:tvp(?:parlament)?\.pl|tvp\.info|polandin\.com)/
+ (?:sess/
+ (?:tvplayer\.php\?.*?object_id
+ |TVPlayer2/(?:embed|api)\.php\?.*[Ii][Dd])
+ |shared/details\.php\?.*?object_id)
+ =)
+ (?P<id>\d+)
+ '''
_TESTS = [{
'url': 'tvp:194536',
- 'md5': 'a21eb0aa862f25414430f15fdfb9e76c',
'info_dict': {
'id': '194536',
'ext': 'mp4',
'title': 'Czas honoru, odc. 13 – Władek',
+ 'description': 'md5:76649d2014f65c99477be17f23a4dead',
+ 'age_limit': 12,
},
}, {
- # not available
- 'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268',
- 'md5': '8c9cd59d16edabf39331f93bf8a766c7',
+ 'url': 'https://www.tvp.pl/sess/tvplayer.php?object_id=51247504&amp;autoplay=false',
'info_dict': {
- 'id': '22670268',
+ 'id': '51247504',
'ext': 'mp4',
- 'title': 'Panorama, 07.12.2015, 15:40',
+ 'title': 'Razmova 091220',
},
- 'skip': 'Transmisja została zakończona lub materiał niedostępny',
}, {
- 'url': 'tvp:22670268',
+ # TVPlayer2 embed URL
+ 'url': 'https://tvp.info/sess/TVPlayer2/embed.php?ID=50595757',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://wiadomosci.tvp.pl/sess/TVPlayer2/api.php?id=51233452',
+ 'only_matching': True,
+ }, {
+ # pulsembed on dziennik.pl
+ 'url': 'https://www.tvp.pl/shared/details.php?copy_id=52205981&object_id=52204505&autoplay=false&is_muted=false&allowfullscreen=true&template=external-embed/video/iframe-video.html',
'only_matching': True,
}]
+ @staticmethod
+ def _extract_urls(webpage, **kw):
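+ # _VALID_URL[4:] drops the leading (?x) flag, which must not reappear in the middle of the combined pattern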
+ return [m.group('embed') for m in re.finditer(
+ r'(?x)<iframe[^>]+?src=(["\'])(?P<embed>%s)' % TVPEmbedIE._VALID_URL[4:],
+ webpage)]
+
def _real_extract(self, url):
video_id = self._match_id(url)
+ # it could be anything that is a valid JS function name
+ callback = random.choice((
+ 'jebac_pis',
+ 'jebacpis',
+ 'ziobro',
+ 'sasin70',
+ 'sasin_przejebal_70_milionow_PLN',
+ 'tvp_is_a_state_propaganda_service',
+ ))
+
webpage = self._download_webpage(
- 'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id)
-
- error = self._html_search_regex(
- r'(?s)<p[^>]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)</p>',
- webpage, 'error', default=None) or clean_html(
- get_element_by_attribute('class', 'msg error', webpage))
- if error:
- raise ExtractorError('%s said: %s' % (
- self.IE_NAME, clean_html(error)), expected=True)
-
- title = self._search_regex(
- r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1',
- webpage, 'title', group='title')
- series_title = self._search_regex(
- r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1',
- webpage, 'series', group='series', default=None)
- if series_title:
- title = '%s, %s' % (series_title, title)
-
- thumbnail = self._search_regex(
- r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None)
-
- video_url = self._search_regex(
- r'0:{src:([\'"])(?P<url>.*?)\1', webpage,
- 'formats', group='url', default=None)
- if not video_url or 'material_niedostepny.mp4' in video_url:
- video_url = self._download_json(
- 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id,
- video_id)['video_url']
+ ('https://www.tvp.pl/sess/TVPlayer2/api.php?id=%s'
+ + '&@method=getTvpConfig&@callback=%s') % (video_id, callback), video_id)
+
+ # stripping JSONP padding
+ datastr = webpage[15 + len(callback):-3]
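+ # on failure the callback is invoked with (null, [{'desc': ...}, ...]) instead of the config object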
+ if datastr.startswith('null,'):
+ error = self._parse_json(datastr[5:], video_id)
+ raise ExtractorError(error[0]['desc'])
+
+ content = self._parse_json(datastr, video_id)['content']
+ info = content['info']
+ is_live = try_get(info, lambda x: x['isLive'], bool)
formats = []
- video_url_base = self._search_regex(
- r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)',
- video_url, 'video base url', default=None)
- if video_url_base:
- # TODO: <Group> found instead of <AdaptationSet> in MPD manifest.
- # It's not mentioned in MPEG-DASH standard. Figure that out.
- # formats.extend(self._extract_mpd_formats(
- # video_url_base + '.ism/video.mpd',
- # video_id, mpd_id='dash', fatal=False))
- formats.extend(self._extract_ism_formats(
- video_url_base + '.ism/Manifest',
- video_id, 'mss', fatal=False))
- formats.extend(self._extract_f4m_formats(
- video_url_base + '.ism/video.f4m',
- video_id, f4m_id='hds', fatal=False))
- m3u8_formats = self._extract_m3u8_formats(
- video_url_base + '.ism/video.m3u8', video_id,
- 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
- self._sort_formats(m3u8_formats)
- m3u8_formats = list(filter(
- lambda f: f.get('vcodec') != 'none', m3u8_formats))
- formats.extend(m3u8_formats)
- for i, m3u8_format in enumerate(m3u8_formats, 2):
- http_url = '%s-%d.mp4' % (video_url_base, i)
- if self._is_valid_url(http_url, video_id):
- f = m3u8_format.copy()
- f.update({
- 'url': http_url,
- 'format_id': f['format_id'].replace('hls', 'http'),
- 'protocol': 'http',
- })
- formats.append(f)
- else:
- formats = [{
- 'format_id': 'direct',
- 'url': video_url,
- 'ext': determine_ext(video_url, 'mp4'),
- }]
+ for file in content['files']:
+ video_url = file.get('url')
+ if not video_url:
+ continue
+ if video_url.endswith('.m3u8'):
+ formats.extend(self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', fatal=False, live=is_live))
+ elif video_url.endswith('.mpd'):
+ if is_live:
+ # live DASH doesn't work with either ffmpeg or the native downloader
+ continue
+ formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False))
+ elif video_url.endswith('.f4m'):
+ formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False))
+ elif video_url.endswith('.ism/manifest'):
+ formats.extend(self._extract_ism_formats(video_url, video_id, ism_id='mss', fatal=False))
+ else:
+ # a direct progressive file (mp4, wmv or similar)
+ quality = file.get('quality', {})
+ formats.append({
+ 'format_id': 'direct',
+ 'url': video_url,
+ 'ext': determine_ext(video_url, file['type']),
+ 'fps': int_or_none(quality.get('fps')),
+ 'tbr': int_or_none(quality.get('bitrate')),
+ 'width': int_or_none(quality.get('width')),
+ 'height': int_or_none(quality.get('height')),
+ })
self._sort_formats(formats)
- return {
+ title = dict_get(info, ('subtitle', 'title', 'seoTitle'))
+ description = dict_get(info, ('description', 'seoDescription'))
+ thumbnails = []
+ for thumb in content.get('posters') or ():
+ thumb_url = thumb.get('src')
+ if not thumb_url or '{width}' in thumb_url or '{height}' in thumb_url:
+ continue
+ thumbnails.append({
+ 'url': thumb_url,
+ 'width': thumb.get('width'),
+ 'height': thumb.get('height'),
+ })
+ age_limit = try_get(info, lambda x: x['ageGroup']['minAge'], int)
+ if age_limit == 1:
+ age_limit = 0
+ duration = try_get(info, lambda x: x['duration'], int) if not is_live else None
+
+ subtitles = {}
+ for sub in content.get('subtitles') or []:
+ if not sub.get('url'):
+ continue
+ subtitles.setdefault(sub['lang'], []).append({
+ 'url': sub['url'],
+ 'ext': sub.get('type'),
+ })
+
+ info_dict = {
'id': video_id,
'title': title,
- 'thumbnail': thumbnail,
+ 'description': description,
+ 'thumbnails': thumbnails,
+ 'age_limit': age_limit,
+ 'is_live': is_live,
+ 'duration': duration,
'formats': formats,
+ 'subtitles': subtitles,
}
+ # vod.tvp.pl
+ if info.get('vortalName') == 'vod':
+ info_dict.update({
+ 'title': '%s, %s' % (info.get('title'), info.get('subtitle')),
+ 'series': info.get('title'),
+ 'season': info.get('season'),
+ 'episode_number': info.get('episode'),
+ })
+
+ return info_dict
+
class TVPWebsiteIE(InfoExtractor):
IE_NAME = 'tvp:series'
@@ -204,18 +463,20 @@ class TVPWebsiteIE(InfoExtractor):
_TESTS = [{
# series
- 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video',
+ 'url': 'https://vod.tvp.pl/website/wspaniale-stulecie,17069012/video',
'info_dict': {
- 'id': '38678312',
+ 'id': '17069012',
},
- 'playlist_count': 115,
+ 'playlist_count': 312,
}, {
# film
- 'url': 'https://vod.tvp.pl/website/gloria,35139666',
+ 'url': 'https://vod.tvp.pl/website/krzysztof-krawczyk-cale-moje-zycie,51374466',
'info_dict': {
- 'id': '36637049',
+ 'id': '51374509',
'ext': 'mp4',
- 'title': 'Gloria, Gloria',
+ 'title': 'Krzysztof Krawczyk – całe moje życie, Krzysztof Krawczyk – całe moje życie',
+ 'description': 'md5:2e80823f00f5fc263555482f76f8fa42',
+ 'age_limit': 12,
},
'params': {
'skip_download': True,
diff --git a/hypervideo_dl/extractor/tvplay.py b/hypervideo_dl/extractor/tvplay.py
index fbafb41..b5dbc55 100644
--- a/hypervideo_dl/extractor/tvplay.py
+++ b/hypervideo_dl/extractor/tvplay.py
@@ -12,9 +12,9 @@ from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
- parse_duration,
parse_iso8601,
qualities,
+ traverse_obj,
try_get,
update_url_query,
url_or_none,
@@ -369,7 +369,6 @@ class ViafreeIE(InfoExtractor):
'upload_date': '20201217'
},
'params': {
- 'format': 'bestvideo',
'skip_download': True
}
}, {
@@ -432,77 +431,96 @@ class ViafreeIE(InfoExtractor):
class TVPlayHomeIE(InfoExtractor):
- _VALID_URL = r'https?://(?:tv3?)?play\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/(?:[^/]+/)*[^/?#&]+-(?P<id>\d+)'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:tv3?)?
+ play\.(?:tv3|skaties)\.(?P<country>lv|lt|ee)/
+ (?P<live>lives/)?
+ [^?#&]+(?:episode|programme|clip)-(?P<id>\d+)
+ '''
_TESTS = [{
- 'url': 'https://tvplay.tv3.lt/aferistai-n-7/aferistai-10047125/',
+ 'url': 'https://play.tv3.lt/series/gauju-karai-karveliai,serial-2343791/serija-8,episode-2343828',
'info_dict': {
- 'id': '366367',
+ 'id': '2343828',
'ext': 'mp4',
- 'title': 'Aferistai',
- 'description': 'Aferistai. Kalėdinė pasaka.',
- 'series': 'Aferistai [N-7]',
- 'season': '1 sezonas',
+ 'title': 'Gaujų karai. Karveliai (2021) | S01E08: Serija 8',
+ 'description': 'md5:f6fcfbb236429f05531131640dfa7c81',
+ 'duration': 2710,
+ 'season': 'Gaujų karai. Karveliai',
'season_number': 1,
- 'duration': 464,
- 'timestamp': 1394209658,
- 'upload_date': '20140307',
- 'age_limit': 18,
+ 'release_year': 2021,
+ 'episode': 'Serija 8',
+ 'episode_number': 8,
},
'params': {
- 'skip_download': True,
+ 'skip_download': 'm3u8',
},
}, {
- 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/vinas-melo-labak-10280317/',
- 'only_matching': True,
+ 'url': 'https://play.tv3.lt/series/moterys-meluoja-geriau-n-7,serial-2574652/serija-25,episode-3284937',
+ 'info_dict': {
+ 'id': '3284937',
+ 'ext': 'mp4',
+ 'season': 'Moterys meluoja geriau [N-7]',
+ 'season_number': 14,
+ 'release_year': 2021,
+ 'episode': 'Serija 25',
+ 'episode_number': 25,
+ 'title': 'Moterys meluoja geriau [N-7] (2021) | S14|E25: Serija 25',
+ 'description': 'md5:c6926e9710f1a126f028fbe121eddb79',
+ 'duration': 2440,
+ },
+ 'skip': '404'
}, {
- 'url': 'https://tvplay.tv3.ee/cool-d-ga-mehhikosse/cool-d-ga-mehhikosse-10044354/',
+ 'url': 'https://play.tv3.lt/lives/tv6-lt,live-2838694/optibet-a-lygos-rungtynes-marijampoles-suduva--vilniaus-riteriai,programme-3422014',
'only_matching': True,
}, {
- 'url': 'https://play.tv3.lt/aferistai-10047125',
+ 'url': 'https://tv3play.skaties.lv/series/women-lie-better-lv,serial-1024464/women-lie-better-lv,episode-1038762',
'only_matching': True,
}, {
- 'url': 'https://tv3play.skaties.lv/vinas-melo-labak-10280317',
+ 'url': 'https://play.tv3.ee/series/_,serial-2654462/_,episode-2654474',
'only_matching': True,
}, {
- 'url': 'https://play.tv3.ee/cool-d-ga-mehhikosse-10044354',
+ 'url': 'https://tv3play.skaties.lv/clips/tv3-zinas-valsti-lidz-15novembrim-bus-majsede,clip-3464509',
'only_matching': True,
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
+ country, is_live, video_id = self._match_valid_url(url).groups()
- asset = self._download_json(
- urljoin(url, '/sb/public/asset/' + video_id), video_id)
+ api_path = 'lives/programmes' if is_live else 'vods'
+ data = self._download_json(
+ urljoin(url, f'/api/products/{api_path}/{video_id}?platform=BROWSER&lang={country.upper()}'),
+ video_id)
- m3u8_url = asset['movie']['contentUrl']
- video_id = asset['assetId']
- asset_title = asset['title']
- title = asset_title['title']
-
- formats = self._extract_m3u8_formats(
- m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
+ video_type = 'CATCHUP' if is_live else 'MOVIE'
+ stream_id = data['programRecordingId'] if is_live else video_id
+ stream = self._download_json(
+ urljoin(url, f'/api/products/{stream_id}/videos/playlist?videoType={video_type}&platform=BROWSER'), video_id)
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ stream['sources']['HLS'][0]['src'], video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
self._sort_formats(formats)
- thumbnails = None
- image_url = asset.get('imageUrl')
- if image_url:
- thumbnails = [{
- 'url': urljoin(url, image_url),
- 'ext': 'jpg',
- }]
-
- metadata = asset.get('metadata') or {}
+ thumbnails = set(traverse_obj(
+ data, (('galary', 'images', 'artworks'), ..., ..., ('miniUrl', 'mainUrl')), expected_type=url_or_none))
return {
'id': video_id,
- 'title': title,
- 'description': asset_title.get('summaryLong') or asset_title.get('summaryShort'),
- 'thumbnails': thumbnails,
- 'duration': parse_duration(asset_title.get('runTime')),
- 'series': asset.get('tvSeriesTitle'),
- 'season': asset.get('tvSeasonTitle'),
- 'season_number': int_or_none(metadata.get('seasonNumber')),
- 'episode': asset_title.get('titleBrief'),
- 'episode_number': int_or_none(metadata.get('episodeNumber')),
+ 'title': self._resolve_title(data),
+ 'description': traverse_obj(data, 'description', 'lead'),
+ 'duration': int_or_none(data.get('duration')),
+ 'season': traverse_obj(data, ('season', 'serial', 'title')),
+ 'season_number': int_or_none(traverse_obj(data, ('season', 'number'))),
+ 'episode': data.get('title'),
+ 'episode_number': int_or_none(data.get('episode')),
+ 'release_year': int_or_none(traverse_obj(data, ('season', 'serial', 'year'))),
+ 'thumbnails': [{'url': url, 'ext': 'jpg'} for url in thumbnails],
'formats': formats,
+ 'subtitles': subtitles,
}
+
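+ # try_get swallows the KeyError/TypeError raised when any of these fields is missing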
+ @staticmethod
+ def _resolve_title(data):
+ return try_get(data, lambda x: (
+ f'{data["season"]["serial"]["title"]} ({data["season"]["serial"]["year"]}) | '
+ f'S{data["season"]["number"]:02d}E{data["episode"]:02d}: {data["title"]}'
+ )) or data.get('title')
diff --git a/hypervideo_dl/extractor/tvplayer.py b/hypervideo_dl/extractor/tvplayer.py
index 8f8686a..5970596 100644
--- a/hypervideo_dl/extractor/tvplayer.py
+++ b/hypervideo_dl/extractor/tvplayer.py
@@ -80,7 +80,7 @@ class TVPlayerIE(InfoExtractor):
return {
'id': resource_id,
'display_id': display_id,
- 'title': self._live_title(title),
+ 'title': title,
'formats': formats,
'is_live': True,
}
diff --git a/hypervideo_dl/extractor/twitcasting.py b/hypervideo_dl/extractor/twitcasting.py
index 3acf1b1..5c4d26c 100644
--- a/hypervideo_dl/extractor/twitcasting.py
+++ b/hypervideo_dl/extractor/twitcasting.py
@@ -8,22 +8,27 @@ from .common import InfoExtractor
from ..downloader.websocket import has_websockets
from ..utils import (
clean_html,
+ ExtractorError,
float_or_none,
get_element_by_class,
get_element_by_id,
parse_duration,
qualities,
str_to_int,
+ traverse_obj,
try_get,
unified_timestamp,
urlencode_postdata,
urljoin,
- ExtractorError,
)
class TwitCastingIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/(?:movie|twplayer)/(?P<id>\d+)'
+ _M3U8_HEADERS = {
+ 'Origin': 'https://twitcasting.tv',
+ 'Referer': 'https://twitcasting.tv/',
+ }
_TESTS = [{
'url': 'https://twitcasting.tv/ivetesangalo/movie/2357609',
'md5': '745243cad58c4681dc752490f7540d7f',
@@ -60,6 +65,16 @@ class TwitCastingIE(InfoExtractor):
'skip_download': True,
'videopassword': 'abc',
},
+ }, {
+ 'note': 'archive is split in 2 parts',
+ 'url': 'https://twitcasting.tv/loft_heaven/movie/685979292',
+ 'info_dict': {
+ 'id': '685979292',
+ 'ext': 'mp4',
+ 'title': '南波一海のhear_here “ナタリー望月哲さんに聞く編集と「渋谷系狂騒曲」”',
+ 'duration': 6964.599334,
+ },
+ 'playlist_mincount': 2,
}]
def _real_extract(self, url):
@@ -70,66 +85,49 @@ class TwitCastingIE(InfoExtractor):
if video_password:
request_data = urlencode_postdata({
'password': video_password,
- })
- webpage = self._download_webpage(
+ }, encoding='utf-8')
+ webpage, urlh = self._download_webpage_handle(
url, video_id, data=request_data,
headers={'Origin': 'https://twitcasting.tv'})
+ if urlh.geturl() != url and request_data:
+ webpage = self._download_webpage(
+ urlh.geturl(), video_id, data=request_data,
+ headers={'Origin': 'https://twitcasting.tv'},
+ note='Retrying authentication')
+ # we have to check here, as the first response can contain the password input form even if the password is correct
+ if re.search(r'<form\s+method="POST">\s*<input\s+[^>]+?name="password"', webpage):
+ raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
title = (clean_html(get_element_by_id('movietitle', webpage))
or self._html_search_meta(['og:title', 'twitter:title'], webpage, fatal=True))
- video_js_data = {}
- m3u8_url = self._search_regex(
- r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
- webpage, 'm3u8 url', group='url', default=None)
- if not m3u8_url:
- video_js_data = self._parse_json(self._search_regex(
- r'data-movie-playlist=(["\'])(?P<url>(?:(?!\1).)+)',
- webpage, 'movie playlist', group='url', default='[{}]'), video_id)
- if isinstance(video_js_data, dict):
- video_js_data = list(video_js_data.values())[0]
- video_js_data = video_js_data[0]
- m3u8_url = try_get(video_js_data, lambda x: x['source']['url'])
-
- stream_server_data = self._download_json(
- 'https://twitcasting.tv/streamserver.php?target=%s&mode=client' % uploader_id, video_id,
- 'Downloading live info', fatal=False)
-
- is_live = 'data-status="online"' in webpage
- formats = []
- if is_live and not m3u8_url:
- m3u8_url = 'https://twitcasting.tv/%s/metastream.m3u8' % uploader_id
- if is_live and has_websockets and stream_server_data:
- qq = qualities(['base', 'mobilesource', 'main'])
- for mode, ws_url in stream_server_data['llfmp4']['streams'].items():
- formats.append({
- 'url': ws_url,
- 'format_id': 'ws-%s' % mode,
- 'ext': 'mp4',
- 'quality': qq(mode),
- 'protocol': 'websocket_frag', # TwitCasting simply sends moof atom directly over WS
- })
+ video_js_data = try_get(
+ webpage,
+ lambda x: self._parse_json(self._search_regex(
+ r'data-movie-playlist=\'([^\']+?)\'',
+ x, 'movie playlist', default=None), video_id)['2'], list)
- thumbnail = video_js_data.get('thumbnailUrl') or self._og_search_thumbnail(webpage)
+ thumbnail = traverse_obj(video_js_data, (0, 'thumbnailUrl')) or self._og_search_thumbnail(webpage)
description = clean_html(get_element_by_id(
'authorcomment', webpage)) or self._html_search_meta(
['description', 'og:description', 'twitter:description'], webpage)
- duration = float_or_none(video_js_data.get(
- 'duration'), 1000) or parse_duration(clean_html(
- get_element_by_class('tw-player-duration-time', webpage)))
+ duration = (try_get(video_js_data, lambda x: sum(float_or_none(y.get('duration')) for y in x) / 1000)
+ or parse_duration(clean_html(get_element_by_class('tw-player-duration-time', webpage))))
view_count = str_to_int(self._search_regex(
- r'Total\s*:\s*([\d,]+)\s*Views', webpage, 'views', None))
+ (r'Total\s*:\s*([\d,]+)\s*Views', r'総視聴者\s*:\s*([\d,]+)\s*</'), webpage, 'views', None))
timestamp = unified_timestamp(self._search_regex(
r'data-toggle="true"[^>]+datetime="([^"]+)"',
webpage, 'datetime', None))
- if m3u8_url:
- formats.extend(self._extract_m3u8_formats(
- m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', live=is_live))
- self._sort_formats(formats)
+ stream_server_data = self._download_json(
+ 'https://twitcasting.tv/streamserver.php?target=%s&mode=client' % uploader_id, video_id,
+ 'Downloading live info', fatal=False)
- return {
- 'id': video_id,
+ is_live = 'data-status="online"' in webpage
+ if not traverse_obj(stream_server_data, 'llfmp4') and is_live:
+ self.raise_login_required(method='cookies')
+
+ base_dict = {
'title': title,
'description': description,
'thumbnail': thumbnail,
@@ -137,10 +135,75 @@ class TwitCastingIE(InfoExtractor):
'uploader_id': uploader_id,
'duration': duration,
'view_count': view_count,
- 'formats': formats,
'is_live': is_live,
}
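+ # m3u8 sources, in order of preference: the explicit data-movie-url, the playlist JSON, then the live metastream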
+ def find_dmu(x):
+ data_movie_url = self._search_regex(
+ r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
+ x, 'm3u8 url', group='url', default=None)
+ if data_movie_url:
+ return [data_movie_url]
+
+ m3u8_urls = (try_get(webpage, find_dmu, list)
+ or traverse_obj(video_js_data, (..., 'source', 'url'))
+ or ([f'https://twitcasting.tv/{uploader_id}/metastream.m3u8'] if is_live else None))
+ if not m3u8_urls:
+ raise ExtractorError('Failed to get m3u8 playlist')
+
+ if is_live:
+ m3u8_url = m3u8_urls[0]
+ formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4', m3u8_id='hls',
+ live=True, headers=self._M3U8_HEADERS)
+
+ if traverse_obj(stream_server_data, ('hls', 'source')):
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4', m3u8_id='source',
+ live=True, query={'mode': 'source'},
+ note='Downloading source quality m3u8',
+ headers=self._M3U8_HEADERS, fatal=False))
+
+ if has_websockets:
+ qq = qualities(['base', 'mobilesource', 'main'])
+ streams = traverse_obj(stream_server_data, ('llfmp4', 'streams')) or {}
+ for mode, ws_url in streams.items():
+ formats.append({
+ 'url': ws_url,
+ 'format_id': 'ws-%s' % mode,
+ 'ext': 'mp4',
+ 'quality': qq(mode),
+ 'source_preference': -10,
+ # TwitCasting simply sends moof atom directly over WS
+ 'protocol': 'websocket_frag',
+ })
+
+ self._sort_formats(formats, ('source',))
+
+ infodict = {
+ 'formats': formats
+ }
+ else:
+ infodict = {
+ '_type': 'multi_video',
+ 'entries': [{
+ 'id': f'{video_id}-{num}',
+ 'url': m3u8_url,
+ 'ext': 'mp4',
+ # Requesting the manifests here will cause download to fail.
+ # So use ffmpeg instead. See: https://github.com/hypervideo/hypervideo/issues/382
+ 'protocol': 'm3u8',
+ 'http_headers': self._M3U8_HEADERS,
+ **base_dict,
+ } for (num, m3u8_url) in enumerate(m3u8_urls)],
+ }
+
+ return {
+ 'id': video_id,
+ **base_dict,
+ **infodict,
+ }
+
class TwitCastingLiveIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<id>[^/]+)/?(?:[#?]|$)'
@@ -161,6 +224,17 @@ class TwitCastingLiveIE(InfoExtractor):
r'tw-sound-flag-open-link" data-id="(\d+)" style=',),
webpage, 'current live ID', default=None)
if not current_live:
+ # fetch the unfiltered /show page to find running livestreams; we can't get the ID of a password-protected livestream above
+ webpage = self._download_webpage(
+ f'https://twitcasting.tv/{uploader_id}/show/', uploader_id,
+ note='Downloading live history')
+ is_live = self._search_regex(r'(?s)(<span\s*class="tw-movie-thumbnail-badge"\s*data-status="live">\s*LIVE)', webpage, 'is live?', default=None)
+ if is_live:
+ # take the first movie entry; a currently running live is always listed first
+ current_live = self._search_regex(
+ r'(?s)<a\s+class="tw-movie-thumbnail"\s*href="/[^/]+/movie/(?P<video_id>\d+)"\s*>.+?</a>',
+ webpage, 'current live ID 2', default=None, group='video_id')
+ if not current_live:
raise ExtractorError('The user is not currently live')
return self.url_result('https://twitcasting.tv/%s/movie/%s' % (uploader_id, current_live))
diff --git a/hypervideo_dl/extractor/twitch.py b/hypervideo_dl/extractor/twitch.py
index be70bee..10de74c 100644
--- a/hypervideo_dl/extractor/twitch.py
+++ b/hypervideo_dl/extractor/twitch.py
@@ -24,6 +24,8 @@ from ..utils import (
parse_iso8601,
parse_qs,
qualities,
+ str_or_none,
+ traverse_obj,
try_get,
unified_timestamp,
update_url_query,
@@ -52,16 +54,10 @@ class TwitchBaseIE(InfoExtractor):
'VideoAccessToken_Clip': '36b89d2507fce29e5ca551df756d27c1cfe079e2609642b4390aa4c35796eb11',
'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c',
'VideoMetadata': '226edb3e692509f727fd56821f5653c05740242c82b0388883e0c0e75dcbf687',
+ 'VideoPlayer_ChapterSelectButtonVideo': '8d2793384aac3773beab5e59bd5d6f585aedb923d292800119e03d40cd0f9b41',
}
- def _real_initialize(self):
- self._login()
-
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
def fail(message):
raise ExtractorError(
'Unable to login. Twitch said: %s' % message, expected=True)
@@ -249,6 +245,38 @@ class TwitchVodIE(TwitchBaseIE):
}, {
'url': 'https://player.twitch.tv/?video=480452374',
'only_matching': True,
+ }, {
+ 'url': 'https://www.twitch.tv/videos/635475444',
+ 'info_dict': {
+ 'id': 'v635475444',
+ 'ext': 'mp4',
+ 'title': 'Riot Games',
+ 'duration': 11643,
+ 'uploader': 'Riot Games',
+ 'uploader_id': 'riotgames',
+ 'timestamp': 1590770569,
+ 'upload_date': '20200529',
+ 'chapters': [
+ {
+ 'start_time': 0,
+ 'end_time': 573,
+ 'title': 'League of Legends'
+ },
+ {
+ 'start_time': 573,
+ 'end_time': 3922,
+ 'title': 'Legends of Runeterra'
+ },
+ {
+ 'start_time': 3922,
+ 'end_time': 11643,
+ 'title': 'Art'
+ }
+ ],
+ },
+ 'params': {
+ 'skip_download': True
+ }
}]
def _download_info(self, item_id):
@@ -259,16 +287,24 @@ class TwitchVodIE(TwitchBaseIE):
'channelLogin': '',
'videoID': item_id,
},
+ }, {
+ 'operationName': 'VideoPlayer_ChapterSelectButtonVideo',
+ 'variables': {
+ 'includePrivate': False,
+ 'videoID': item_id,
+ },
}],
- 'Downloading stream metadata GraphQL')[0]['data']
- video = data.get('video')
+ 'Downloading stream metadata GraphQL')
+
+ video = traverse_obj(data, (0, 'data', 'video'))
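+ # the second GraphQL operation returns the "moments" used as chapter markers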
+ if video is not None:
+ video['moments'] = traverse_obj(data, (1, 'data', 'video', 'moments', 'edges', ..., 'node'))
+
if video is None:
raise ExtractorError(
'Video %s does not exist' % item_id, expected=True)
return self._extract_info_gql(video, item_id)
- @staticmethod
- def _extract_info(info):
+ def _extract_info(self, info):
status = info.get('status')
if status == 'recording':
is_live = True
@@ -302,18 +338,39 @@ class TwitchVodIE(TwitchBaseIE):
'timestamp': parse_iso8601(info.get('recorded_at')),
'view_count': int_or_none(info.get('views')),
'is_live': is_live,
+ 'was_live': True,
}
- @staticmethod
- def _extract_info_gql(info, item_id):
+ def _extract_moments(self, info, item_id):
+ for moment in info.get('moments') or []:
+ start_time = int_or_none(moment.get('positionMilliseconds'), 1000)
+ duration = int_or_none(moment.get('durationMilliseconds'), 1000)
+ name = str_or_none(moment.get('description'))
+
+ if start_time is None or duration is None:
+ self.report_warning(f'Important chapter information missing for chapter {name}', item_id)
+ continue
+ yield {
+ 'start_time': start_time,
+ 'end_time': start_time + duration,
+ 'title': name,
+ }
+
+ def _extract_info_gql(self, info, item_id):
vod_id = info.get('id') or item_id
# id backward compatibility for download archives
if vod_id[0] != 'v':
vod_id = 'v%s' % vod_id
thumbnail = url_or_none(info.get('previewThumbnailURL'))
+ is_live = None
if thumbnail:
- for p in ('width', 'height'):
- thumbnail = thumbnail.replace('{%s}' % p, '0')
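+ # a "404_processing" placeholder thumbnail means the VOD is still being recorded, i.e. the stream is live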
+ if thumbnail.endswith('/404_processing_{width}x{height}.png'):
+ is_live, thumbnail = True, None
+ else:
+ is_live = False
+ for p in ('width', 'height'):
+ thumbnail = thumbnail.replace('{%s}' % p, '0')
+
return {
'id': vod_id,
'title': info.get('title') or 'Untitled Broadcast',
@@ -324,6 +381,9 @@ class TwitchVodIE(TwitchBaseIE):
'uploader_id': try_get(info, lambda x: x['owner']['login'], compat_str),
'timestamp': unified_timestamp(info.get('publishedAt')),
'view_count': int_or_none(info.get('viewCount')),
+ 'chapters': list(self._extract_moments(info, item_id)),
+ 'is_live': is_live,
+ 'was_live': True,
}
def _real_extract(self, url):
@@ -836,7 +896,7 @@ class TwitchStreamIE(TwitchBaseIE):
return {
'id': stream_id,
'display_id': channel_name,
- 'title': self._live_title(title),
+ 'title': title,
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
@@ -981,7 +1041,7 @@ class TwitchClipsIE(TwitchBaseIE):
'title': clip.get('title') or video_id,
'formats': formats,
'duration': int_or_none(clip.get('durationSeconds')),
- 'views': int_or_none(clip.get('viewCount')),
+ 'view_count': int_or_none(clip.get('viewCount')),
'timestamp': unified_timestamp(clip.get('createdAt')),
'thumbnails': thumbnails,
'creator': try_get(clip, lambda x: x['broadcaster']['displayName'], compat_str),
diff --git a/hypervideo_dl/extractor/twitter.py b/hypervideo_dl/extractor/twitter.py
index 485b781..8ccc38e 100644
--- a/hypervideo_dl/extractor/twitter.py
+++ b/hypervideo_dl/extractor/twitter.py
@@ -13,8 +13,10 @@ from ..compat import (
from ..utils import (
dict_get,
ExtractorError,
+ format_field,
float_or_none,
int_or_none,
+ traverse_obj,
try_get,
strip_or_none,
unified_timestamp,
@@ -55,7 +57,7 @@ class TwitterBaseIE(InfoExtractor):
def _extract_formats_from_vmap_url(self, vmap_url, video_id):
vmap_url = url_or_none(vmap_url)
if not vmap_url:
- return []
+ return [], {}
vmap_data = self._download_xml(vmap_url, video_id)
formats = []
subtitles = {}
@@ -88,6 +90,9 @@ class TwitterBaseIE(InfoExtractor):
headers = {
'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw',
}
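+ # authenticated API calls require the ct0 cookie to be echoed back as the x-csrf-token header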
+ token = self._get_cookies(self._API_BASE).get('ct0')
+ if token:
+ headers['x-csrf-token'] = token.value
if not self._GUEST_TOKEN:
self._GUEST_TOKEN = self._download_json(
self._API_BASE + 'guest/activate.json', video_id,
@@ -468,7 +473,7 @@ class TwitterIE(TwitterBaseIE):
'uploader': uploader,
'timestamp': unified_timestamp(status.get('created_at')),
'uploader_id': uploader_id,
- 'uploader_url': 'https://twitter.com/' + uploader_id if uploader_id else None,
+ 'uploader_url': format_field(uploader_id, template='https://twitter.com/%s'),
'like_count': int_or_none(status.get('favorite_count')),
'repost_count': int_or_none(status.get('retweet_count')),
'comment_count': int_or_none(status.get('reply_count')),
@@ -485,7 +490,7 @@ class TwitterIE(TwitterBaseIE):
fmts, subs = self._extract_variant_formats(variant, twid)
subtitles = self._merge_subtitles(subtitles, subs)
formats.extend(fmts)
- self._sort_formats(formats)
+ self._sort_formats(formats, ('res', 'br', 'size', 'proto')) # the codecs of the http formats are unknown
thumbnails = []
media_url = media.get('media_url_https') or media.get('media_url')
@@ -508,7 +513,7 @@ class TwitterIE(TwitterBaseIE):
'duration': float_or_none(video_info.get('duration_millis'), 1000),
})
- media = try_get(status, lambda x: x['extended_entities']['media'][0])
+ media = traverse_obj(status, ((None, 'quoted_status'), 'extended_entities', 'media', 0), get_all=False)
if media and media.get('type') != 'photo':
extract_from_video_info(media)
else:
diff --git a/hypervideo_dl/extractor/udemy.py b/hypervideo_dl/extractor/udemy.py
index 74f638e..88b2310 100644
--- a/hypervideo_dl/extractor/udemy.py
+++ b/hypervideo_dl/extractor/udemy.py
@@ -168,14 +168,7 @@ class UdemyIE(InfoExtractor):
self._handle_error(response)
return response
- def _real_initialize(self):
- self._login()
-
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
login_popup = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login popup')
diff --git a/hypervideo_dl/extractor/uol.py b/hypervideo_dl/extractor/uol.py
index 4a2a97f..1baee0b 100644
--- a/hypervideo_dl/extractor/uol.py
+++ b/hypervideo_dl/extractor/uol.py
@@ -95,7 +95,6 @@ class UOLIE(InfoExtractor):
if v:
query[k] = v
f_url = update_url_query(f_url, query)
- format_id = format_id
if format_id == 'HLS':
m3u8_formats = self._extract_m3u8_formats(
f_url, media_id, 'mp4', 'm3u8_native',
diff --git a/hypervideo_dl/extractor/urplay.py b/hypervideo_dl/extractor/urplay.py
index 753ffa4..eb2ab26 100644
--- a/hypervideo_dl/extractor/urplay.py
+++ b/hypervideo_dl/extractor/urplay.py
@@ -4,7 +4,11 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
dict_get,
+ ExtractorError,
int_or_none,
+ ISO639Utils,
+ parse_age_limit,
+ try_get,
unified_timestamp,
)
@@ -23,9 +27,10 @@ class URPlayIE(InfoExtractor):
'upload_date': '20171214',
'series': 'UR Samtiden - Livet, universum och rymdens märkliga musik',
'duration': 2269,
- 'categories': ['Kultur & historia'],
+ 'categories': ['Vetenskap & teknik'],
'tags': ['Kritiskt tänkande', 'Vetenskap', 'Vetenskaplig verksamhet'],
'episode': 'Om vetenskap, kritiskt tänkande och motstånd',
+ 'age_limit': 15,
},
}, {
'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde',
@@ -50,11 +55,16 @@ class URPlayIE(InfoExtractor):
video_id = self._match_id(url)
url = url.replace('skola.se/Produkter', 'play.se/program')
webpage = self._download_webpage(url, video_id)
- vid = int(video_id)
- accessible_episodes = self._parse_json(self._html_search_regex(
- r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"',
- webpage, 'urplayer data'), video_id)['accessibleEpisodes']
- urplayer_data = next(e for e in accessible_episodes if e.get('id') == vid)
+ urplayer_data = self._search_nextjs_data(webpage, video_id, fatal=False) or {}
+ if urplayer_data:
+ urplayer_data = try_get(urplayer_data, lambda x: x['props']['pageProps']['program'], dict)
+ if not urplayer_data:
+ raise ExtractorError('Unable to parse __NEXT_DATA__')
+ else:
+ accessible_episodes = self._parse_json(self._html_search_regex(
+ r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"',
+ webpage, 'urplayer data'), video_id)['accessibleEpisodes']
+ urplayer_data = next(e for e in accessible_episodes if e.get('id') == int_or_none(video_id))
episode = urplayer_data['title']
host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect']
@@ -72,11 +82,28 @@ class URPlayIE(InfoExtractor):
self._sort_formats(formats)
subtitles = {}
- subs = urplayer_streams.get("sweComplete", {}).get("tt", {}).get("location")
- if subs:
- subtitles.setdefault('Svenska', []).append({
- 'url': subs,
- })
+
+ def parse_lang_code(code):
+ "3-character language code or None (utils candidate)"
+ if code is None:
+ return
+ lang = code.lower()
+ if not ISO639Utils.long2short(lang):
+ lang = ISO639Utils.short2long(lang)
+ return lang or None
+
+ for k, v in (urplayer_data['streamingInfo'].get('sweComplete') or {}).items():
+ if (k in ('sd', 'hd') or not isinstance(v, dict)):
+ continue
+ lang, sttl_url = (v.get(kk) for kk in ('language', 'location', ))
+ if not sttl_url:
+ continue
+ lang = parse_lang_code(lang)
+ if not lang:
+ continue
+ sttl = subtitles.get(lang) or []
+ sttl.append({'ext': k, 'url': sttl_url, })
+ subtitles[lang] = sttl
image = urplayer_data.get('image') or {}
thumbnails = []
@@ -98,7 +125,6 @@ class URPlayIE(InfoExtractor):
return {
'id': video_id,
- 'subtitles': subtitles,
'title': '%s : %s' % (series_title, episode) if series_title else episode,
'description': urplayer_data.get('description'),
'thumbnails': thumbnails,
@@ -111,4 +137,7 @@ class URPlayIE(InfoExtractor):
'season': series.get('label'),
'episode': episode,
'episode_number': int_or_none(urplayer_data.get('episodeNumber')),
+ 'age_limit': parse_age_limit(min(try_get(a, lambda x: x['from'], int) or 0
+ for a in urplayer_data.get('ageRanges', []))),
+ 'subtitles': subtitles,
}
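
The parse_lang_code helper above normalizes both 2- and 3-letter codes to a 3-letter ISO 639-2 code before keying the subtitles dict. A self-contained sketch of the same logic, with a toy mapping table standing in for the internal ISO639Utils:

    # Toy mapping in place of ISO639Utils (internal to the library's utils)
    _SHORT2LONG = {'sv': 'swe', 'en': 'eng'}
    _LONG2SHORT = {v: k for k, v in _SHORT2LONG.items()}

    def parse_lang_code(code):
        # Same shape as the helper above: accept 'sv' or 'swe', return 'swe';
        # return None for unknown or missing codes.
        if code is None:
            return None
        lang = code.lower()
        if lang not in _LONG2SHORT:       # not already a 3-letter code
            lang = _SHORT2LONG.get(lang)  # try expanding a 2-letter code
        return lang or None

    assert parse_lang_code('SV') == parse_lang_code('swe') == 'swe'
    assert parse_lang_code('xx') is None
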
diff --git a/hypervideo_dl/extractor/ustream.py b/hypervideo_dl/extractor/ustream.py
index 8b75879..4a7a8f8 100644
--- a/hypervideo_dl/extractor/ustream.py
+++ b/hypervideo_dl/extractor/ustream.py
@@ -13,6 +13,7 @@ from ..utils import (
ExtractorError,
int_or_none,
float_or_none,
+ join_nonempty,
mimetype2ext,
str_or_none,
)
@@ -139,8 +140,8 @@ class UstreamIE(InfoExtractor):
content_type = stream['contentType']
kind = content_type.split('/')[0]
f = {
- 'format_id': '-'.join(filter(None, [
- 'dash', kind, str_or_none(stream.get('bitrate'))])),
+ 'format_id': join_nonempty(
+ 'dash', kind, str_or_none(stream.get('bitrate'))),
'protocol': 'http_dash_segments',
# TODO: generate a MPD doc for external players?
'url': encode_data_uri(b'<MPD/>', 'text/xml'),
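
join_nonempty, used above in place of '-'.join(filter(None, ...)), can be approximated as follows; the signature is inferred from this call site, and the real helper is more featureful:

    def join_nonempty(*values, delim='-'):
        # Drop falsy values, stringify the rest, and join with the delimiter
        return delim.join(str(v) for v in values if v)

    assert join_nonempty('dash', 'video', None, 512) == 'dash-video-512'
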
diff --git a/hypervideo_dl/extractor/utreon.py b/hypervideo_dl/extractor/utreon.py
index 4a25f0c..4986635 100644
--- a/hypervideo_dl/extractor/utreon.py
+++ b/hypervideo_dl/extractor/utreon.py
@@ -13,7 +13,7 @@ from ..utils import (
class UtreonIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)(?:www\.)?utreon.com/v/(?P<id>[a-zA-Z0-9_-]+)'
+ _VALID_URL = r'https?://(?:www\.)?utreon\.com/v/(?P<id>[a-zA-Z0-9_-]+)'
_TESTS = [{
'url': 'https://utreon.com/v/z_I7ikQbuDw',
'info_dict': {
diff --git a/hypervideo_dl/extractor/varzesh3.py b/hypervideo_dl/extractor/varzesh3.py
index 81313dc..32655b9 100644
--- a/hypervideo_dl/extractor/varzesh3.py
+++ b/hypervideo_dl/extractor/varzesh3.py
@@ -42,8 +42,7 @@ class Varzesh3IE(InfoExtractor):
video_url = self._search_regex(
r'<source[^>]+src="([^"]+)"', webpage, 'video url')
- title = remove_start(self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title'), 'ویدیو ورزش 3 | ')
+ title = remove_start(self._html_extract_title(webpage), 'ویدیو ورزش 3 | ')
description = self._html_search_regex(
r'(?s)<div class="matn">(.+?)</div>',
diff --git a/hypervideo_dl/extractor/veo.py b/hypervideo_dl/extractor/veo.py
index 4e57a52..d87bb5b 100644
--- a/hypervideo_dl/extractor/veo.py
+++ b/hypervideo_dl/extractor/veo.py
@@ -6,13 +6,14 @@ from .common import InfoExtractor
from ..utils import (
int_or_none,
mimetype2ext,
+ str_or_none,
unified_timestamp,
url_or_none,
)
class VeoIE(InfoExtractor):
- _VALID_URL = r'https?://app\.veo\.co/matches/(?P<id>[0-9A-Za-z-]+)'
+ _VALID_URL = r'https?://app\.veo\.co/matches/(?P<id>[0-9A-Za-z-_]+)'
_TESTS = [{
'url': 'https://app.veo.co/matches/20201027-last-period/',
@@ -24,7 +25,11 @@ class VeoIE(InfoExtractor):
'upload_date': '20201028',
'timestamp': 1603847208,
'duration': 1916,
+ 'view_count': int,
}
+ }, {
+ 'url': 'https://app.veo.co/matches/20220313-2022-03-13_u15m-plsjq-vs-csl/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -36,39 +41,41 @@ class VeoIE(InfoExtractor):
video_data = self._download_json(
'https://app.veo.co/api/app/matches/%s/videos' % video_id, video_id, 'Downloading video data')
- title = metadata.get('title')
- thumbnail = url_or_none(metadata.get('thumbnail'))
-
- timestamp = unified_timestamp(metadata.get('created'))
- duration = int_or_none(metadata.get('duration'))
- view_count = int_or_none(metadata.get('view_count'))
-
formats = []
for fmt in video_data:
- mimetype = fmt.get('mime_type')
+ mimetype = str_or_none(fmt.get('mime_type'))
+ format_url = url_or_none(fmt.get('url'))
# skip configuration file for panoramic video
- if mimetype == 'video/mp2t':
+ if not format_url or mimetype == 'video/mp2t':
continue
+
height = int_or_none(fmt.get('height'))
- bitrate = int_or_none(fmt.get('bit_rate'), scale=1000)
- render_type = fmt.get('render_type')
+ render_type = str_or_none(fmt.get('render_type'))
+ format_id = f'{render_type}-{height}p' if render_type and height else None
+
+ # Veo returns panoramic video information even if panoramic video is not available.
+ # e.g. https://app.veo.co/matches/20201027-last-period/
+ if render_type == 'panorama':
+ if not self._is_valid_url(format_url, video_id, format_id):
+ continue
+
formats.append({
- 'url': url_or_none(fmt.get('url')),
- 'format_id': '%s-%sp' % (render_type, height),
+ 'url': format_url,
+ 'format_id': format_id,
'ext': mimetype2ext(mimetype),
'width': int_or_none(fmt.get('width')),
'height': height,
- 'vbr': bitrate
+ 'vbr': int_or_none(fmt.get('bit_rate'), scale=1000),
})
self._sort_formats(formats)
return {
'id': video_id,
- 'title': title,
+ 'title': str_or_none(metadata.get('title')),
'formats': formats,
- 'thumbnail': thumbnail,
- 'timestamp': timestamp,
- 'view_count': view_count,
- 'duration': duration
+ 'thumbnail': url_or_none(metadata.get('thumbnail')),
+ 'timestamp': unified_timestamp(metadata.get('created')),
+ 'view_count': int_or_none(metadata.get('view_count')),
+ 'duration': int_or_none(metadata.get('duration')),
}
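
The panorama guard above keeps a format only when _is_valid_url confirms the URL actually resolves, since Veo advertises panoramic entries even when none exist. A rough standalone equivalent, assuming a HEAD request is an adequate probe:

    import urllib.request

    def is_valid_url(url, timeout=5):
        """Rough stand-in for _is_valid_url: True when the server answers a
        HEAD request without error. The real method also logs and reports."""
        try:
            with urllib.request.urlopen(
                    urllib.request.Request(url, method='HEAD'), timeout=timeout):
                return True
        except Exception:
            return False
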
diff --git a/hypervideo_dl/extractor/veoh.py b/hypervideo_dl/extractor/veoh.py
index 1c44c14..d9afb56 100644
--- a/hypervideo_dl/extractor/veoh.py
+++ b/hypervideo_dl/extractor/veoh.py
@@ -5,21 +5,30 @@ from ..utils import (
int_or_none,
parse_duration,
qualities,
+ try_get
)
class VeohIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|embed|iphone/#_Watch)/(?P<id>(?:v|e|yapi-)[\da-zA-Z]+)'
+ _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|videos|embed|iphone/#_Watch)/(?P<id>(?:v|e|yapi-)[\da-zA-Z]+)'
_TESTS = [{
'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
- 'md5': '9e7ecc0fd8bbee7a69fe38953aeebd30',
+ 'md5': '620e68e6a3cff80086df3348426c9ca3',
'info_dict': {
'id': 'v56314296nk7Zdmz3',
'ext': 'mp4',
'title': 'Straight Backs Are Stronger',
+ 'description': 'md5:203f976279939a6dc664d4001e13f5f4',
+ 'thumbnail': 're:https://fcache\\.veoh\\.com/file/f/th56314296\\.jpg(\\?.*)?',
'uploader': 'LUMOback',
- 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
+ 'duration': 46,
+ 'view_count': int,
+ 'average_rating': int,
+ 'comment_count': int,
+ 'age_limit': 0,
+ 'categories': ['technology_and_gaming'],
+ 'tags': ['posture', 'posture', 'sensor', 'back', 'pain', 'wearable', 'tech', 'lumo'],
},
}, {
'url': 'http://www.veoh.com/embed/v56314296nk7Zdmz3',
@@ -51,30 +60,36 @@ class VeohIE(InfoExtractor):
}, {
'url': 'http://www.veoh.com/watch/e152215AJxZktGS',
'only_matching': True,
- }]
-
- def _extract_video(self, source):
- return {
- 'id': source.get('videoId'),
- 'title': source.get('title'),
- 'description': source.get('description'),
- 'thumbnail': source.get('highResImage') or source.get('medResImage'),
- 'uploader': source.get('username'),
- 'duration': int_or_none(source.get('length')),
- 'view_count': int_or_none(source.get('views')),
- 'age_limit': 18 if source.get('isMature') == 'true' or source.get('isSexy') == 'true' else 0,
- 'formats': self._extract_formats(source),
+ }, {
+ 'url': 'https://www.veoh.com/videos/v16374379WA437rMH',
+ 'md5': 'cceb73f3909063d64f4b93d4defca1b3',
+ 'info_dict': {
+ 'id': 'v16374379WA437rMH',
+ 'ext': 'mp4',
+ 'title': 'Phantasmagoria 2, pt. 1-3',
+ 'description': 'Phantasmagoria: a Puzzle of Flesh',
+ 'thumbnail': 're:https://fcache\\.veoh\\.com/file/f/th16374379\\.jpg(\\?.*)?',
+ 'uploader': 'davidspackage',
+ 'duration': 968,
+ 'view_count': int,
+ 'average_rating': int,
+ 'comment_count': int,
+ 'age_limit': 18,
+ 'categories': ['technology_and_gaming', 'gaming'],
+ 'tags': ['puzzle', 'of', 'flesh'],
}
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
- video = self._download_json(
+ metadata = self._download_json(
'https://www.veoh.com/watch/getVideo/' + video_id,
- video_id)['video']
+ video_id)
+ video = metadata['video']
title = video['title']
thumbnail_url = None
- q = qualities(['HQ', 'Regular'])
+ q = qualities(['Regular', 'HQ'])
formats = []
for f_id, f_url in video.get('src', {}).items():
if not f_url:
@@ -89,6 +104,12 @@ class VeohIE(InfoExtractor):
})
self._sort_formats(formats)
+ categories = metadata.get('categoryPath')
+ if not categories:
+ category = try_get(video, lambda x: x['category'].strip().removeprefix('category_'))  # str.removeprefix requires Python 3.9+; on older interpreters try_get swallows the AttributeError and yields None
+ categories = [category] if category else None
+ tags = video.get('tags')
+
return {
'id': video_id,
'title': title,
@@ -100,4 +121,7 @@ class VeohIE(InfoExtractor):
'formats': formats,
'average_rating': int_or_none(video.get('rating')),
'comment_count': int_or_none(video.get('numOfComments')),
+ 'age_limit': 18 if video.get('contentRatingId') == 2 else 0,
+ 'categories': categories,
+ 'tags': tags.split(', ') if tags else None,
}
diff --git a/hypervideo_dl/extractor/vgtv.py b/hypervideo_dl/extractor/vgtv.py
index b6131ff..9d6090b 100644
--- a/hypervideo_dl/extractor/vgtv.py
+++ b/hypervideo_dl/extractor/vgtv.py
@@ -195,9 +195,7 @@ class VGTVIE(XstreamIE):
hls_url = streams.get('hls')
if hls_url:
formats.extend(self._extract_m3u8_formats(
- hls_url, video_id, 'mp4',
- entry_protocol='m3u8' if is_live else 'm3u8_native',
- m3u8_id='hls', fatal=False))
+ hls_url, video_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False))
hds_url = streams.get('hds')
if hds_url:
@@ -242,7 +240,7 @@ class VGTVIE(XstreamIE):
info.update({
'id': video_id,
- 'title': self._live_title(data['title']) if is_live else data['title'],
+ 'title': data['title'],
'description': data['description'],
'thumbnail': data['images']['main'] + '?t[]=900x506q80',
'timestamp': data['published'],
diff --git a/hypervideo_dl/extractor/vice.py b/hypervideo_dl/extractor/vice.py
index ca4d3ed..c8c3055 100644
--- a/hypervideo_dl/extractor/vice.py
+++ b/hypervideo_dl/extractor/vice.py
@@ -290,7 +290,6 @@ class ViceArticleIE(ViceBaseIE):
},
'params': {
'skip_download': True,
- 'format': 'bestvideo',
},
'add_ie': [ViceIE.ie_key()],
}, {
diff --git a/hypervideo_dl/extractor/videa.py b/hypervideo_dl/extractor/videa.py
index 512ade7..90d7050 100644
--- a/hypervideo_dl/extractor/videa.py
+++ b/hypervideo_dl/extractor/videa.py
@@ -111,7 +111,6 @@ class VideaIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
-
video_page = self._download_webpage(url, video_id)
if 'videa.hu/player' in url:
@@ -146,7 +145,7 @@ class VideaIE(InfoExtractor):
compat_b64decode(b64_info), key), video_id)
video = xpath_element(info, './video', 'video')
- if not video:
+ if video is None:
raise ExtractorError(xpath_element(
info, './error', fatal=True), expected=True)
sources = xpath_element(
@@ -163,9 +162,9 @@ class VideaIE(InfoExtractor):
source_exp = source.get('exp')
if not (source_url and source_name):
continue
- hash_value = None
- if hash_values:
- hash_value = xpath_text(hash_values, 'hash_value_' + source_name)
+ hash_value = (
+ xpath_text(hash_values, 'hash_value_' + source_name)
+ if hash_values is not None else None)
if hash_value and source_exp:
source_url = update_url_query(source_url, {
'md5': hash_value,
diff --git a/hypervideo_dl/extractor/videocampus_sachsen.py b/hypervideo_dl/extractor/videocampus_sachsen.py
new file mode 100644
index 0000000..96e9857
--- /dev/null
+++ b/hypervideo_dl/extractor/videocampus_sachsen.py
@@ -0,0 +1,96 @@
+# coding: utf-8
+from .common import InfoExtractor
+
+
+class VideocampusSachsenIE(InfoExtractor):
+ _VALID_URL = r'''(?x)https?://videocampus\.sachsen\.de/(?:
+ m/(?P<tmp_id>[0-9a-f]+)|
+ (?:category/)?video/(?P<display_id>[\w-]+)/(?P<id>[0-9a-f]{32})
+ )'''
+
+ _TESTS = [
+ {
+ 'url': 'https://videocampus.sachsen.de/m/e0d6c8ce6e394c188f1342f1ab7c50ed6fc4490b808699801def5cb2e46d76ca7367f622a9f516c542ffb805b24d6b643bd7c81f385acaac4c59081b87a2767b',
+ 'info_dict': {
+ 'id': 'e6b9349905c1628631f175712250f2a1',
+ 'title': 'Konstruktiver Entwicklungsprozess Vorlesung 7',
+ 'ext': 'mp4',
+ },
+ },
+ {
+ 'url': 'https://videocampus.sachsen.de/video/Was-ist-selbstgesteuertes-Lernen/fc99c527e4205b121cb7c74433469262',
+ 'info_dict': {
+ 'id': 'fc99c527e4205b121cb7c74433469262',
+ 'title': 'Was ist selbstgesteuertes Lernen?',
+ 'display_id': 'Was-ist-selbstgesteuertes-Lernen',
+ 'ext': 'mp4',
+ },
+ },
+ {
+ 'url': 'https://videocampus.sachsen.de/category/video/Tutorial-zur-Nutzung-von-Adobe-Connect-aus-Veranstalter-Sicht/09d4ed029002eb1bdda610f1103dd54c/100',
+ 'info_dict': {
+ 'id': '09d4ed029002eb1bdda610f1103dd54c',
+ 'title': 'Tutorial zur Nutzung von Adobe Connect aus Veranstalter-Sicht',
+ 'display_id': 'Tutorial-zur-Nutzung-von-Adobe-Connect-aus-Veranstalter-Sicht',
+ 'ext': 'mp4',
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ video_id, tmp_id, display_id = self._match_valid_url(url).group('id', 'tmp_id', 'display_id')
+ webpage = self._download_webpage(url, video_id or tmp_id, fatal=False) or ''
+
+ if not tmp_id:
+ video_id = self._html_search_regex(
+ r'src="https?://videocampus\.sachsen\.de/media/embed\?key=([0-9a-f]+)&',
+ webpage, 'video_id')
+
+ title = self._html_search_regex(
+ (r'<h1>(?P<content>[^<]+)</h1>', *self._meta_regex('title')),
+ webpage, 'title', group='content', fatal=False)
+
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ f'https://videocampus.sachsen.de/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8',
+ video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'display_id': display_id,
+ 'formats': formats,
+ 'subtitles': subtitles
+ }
+
+
+class VideocampusSachsenEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://videocampus\.sachsen\.de/media/embed\?key=(?P<id>[0-9a-f]+)'
+
+ _TESTS = [
+ {
+ 'url': 'https://videocampus.sachsen.de/media/embed?key=fc99c527e4205b121cb7c74433469262',
+ 'info_dict': {
+ 'id': 'fc99c527e4205b121cb7c74433469262',
+ 'title': 'Was ist selbstgesteuertes Lernen?',
+ 'ext': 'mp4',
+ },
+ }
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ title = self._html_search_regex(r'<img[^>]*title="([^"<]+)"', webpage, 'title', fatal=False)
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ f'https://videocampus.sachsen.de/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8',
+ video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
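
The new extractor hinges on the _VALID_URL pattern distinguishing opaque /m/<tmp_id> links from canonical /video/<display_id>/<32-hex-id> links. A quick standalone check of that regex, with the pattern copied verbatim from the hunk and URLs taken from its _TESTS:

    import re

    _VALID_URL = r'''(?x)https?://videocampus\.sachsen\.de/(?:
        m/(?P<tmp_id>[0-9a-f]+)|
        (?:category/)?video/(?P<display_id>[\w-]+)/(?P<id>[0-9a-f]{32})
    )'''

    m = re.match(_VALID_URL, 'https://videocampus.sachsen.de/video/'
                 'Was-ist-selbstgesteuertes-Lernen/fc99c527e4205b121cb7c74433469262')
    assert m and m.group('id') == 'fc99c527e4205b121cb7c74433469262'
    assert re.match(_VALID_URL,
                    'https://videocampus.sachsen.de/m/e0d6c8ce6e39').group('tmp_id')
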
diff --git a/hypervideo_dl/extractor/vidio.py b/hypervideo_dl/extractor/vidio.py
index 571448b..6bfb8d4 100644
--- a/hypervideo_dl/extractor/vidio.py
+++ b/hypervideo_dl/extractor/vidio.py
@@ -6,6 +6,7 @@ from .common import InfoExtractor
from ..utils import (
clean_html,
ExtractorError,
+ format_field,
get_element_by_class,
int_or_none,
parse_iso8601,
@@ -22,11 +23,7 @@ class VidioBaseIE(InfoExtractor):
_LOGIN_URL = 'https://www.vidio.com/users/login'
_NETRC_MACHINE = 'vidio'
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
def is_logged_in():
res = self._download_json(
'https://www.vidio.com/interactions.json', None, 'Checking if logged in', fatal=False) or {}
@@ -62,10 +59,9 @@ class VidioBaseIE(InfoExtractor):
'Unable to log in: %s. %s' % (reason, clean_html(subreason)), expected=True)
raise ExtractorError('Unable to log in')
- def _real_initialize(self):
+ def _initialize_pre_login(self):
self._api_key = self._download_json(
'https://www.vidio.com/auth', None, data=b'')['api_key']
- self._login()
def _call_api(self, url, video_id, note=None):
return self._download_json(url, video_id, note=note, headers={
@@ -160,7 +156,7 @@ class VidioIE(VidioBaseIE):
'uploader': user.get('name'),
'timestamp': parse_iso8601(video.get('created_at')),
'uploader_id': username,
- 'uploader_url': 'https://www.vidio.com/@' + username if username else None,
+ 'uploader_url': format_field(username, template='https://www.vidio.com/@%s'),
'channel': channel.get('name'),
'channel_id': str_or_none(channel.get('id')),
'view_count': get_count('view_count'),
@@ -291,5 +287,5 @@ class VidioLiveIE(VidioBaseIE):
'uploader': user.get('name'),
'timestamp': parse_iso8601(stream_meta.get('start_time')),
'uploader_id': username,
- 'uploader_url': 'https://www.vidio.com/@' + username if username else None,
+ 'uploader_url': format_field(username, template='https://www.vidio.com/@%s'),
}
diff --git a/hypervideo_dl/extractor/vidlii.py b/hypervideo_dl/extractor/vidlii.py
index f477425..a63919f 100644
--- a/hypervideo_dl/extractor/vidlii.py
+++ b/hypervideo_dl/extractor/vidlii.py
@@ -5,9 +5,12 @@ import re
from .common import InfoExtractor
from ..utils import (
+ HEADRequest,
+ format_field,
float_or_none,
get_element_by_id,
int_or_none,
+ str_to_int,
strip_or_none,
unified_strdate,
urljoin,
@@ -36,6 +39,25 @@ class VidLiiIE(InfoExtractor):
'tags': ['Vidlii', 'Jan', 'Videogames'],
}
}, {
+ 'url': 'https://www.vidlii.com/watch?v=zTAtaAgOLKt',
+ 'md5': '5778f7366aa4c569b77002f8bf6b614f',
+ 'info_dict': {
+ 'id': 'zTAtaAgOLKt',
+ 'ext': 'mp4',
+ 'title': 'FULPTUBE SUCKS.',
+ 'description': 'md5:087b2ca355d4c8f8f77e97c43e72d711',
+ 'thumbnail': 'https://www.vidlii.com/usfi/thmp/zTAtaAgOLKt.jpg',
+ 'uploader': 'Homicide',
+ 'uploader_url': 'https://www.vidlii.com/user/Homicide',
+ 'upload_date': '20210612',
+ 'duration': 89,
+ 'view_count': int,
+ 'comment_count': int,
+ 'average_rating': float,
+ 'categories': ['News & Politics'],
+ 'tags': ['fulp', 'tube', 'sucks', 'bad', 'fulptube'],
+ },
+ }, {
'url': 'https://www.vidlii.com/embed?v=tJluaH4BJ3v&a=0',
'only_matching': True,
}]
@@ -45,10 +67,20 @@ class VidLiiIE(InfoExtractor):
webpage = self._download_webpage(
'https://www.vidlii.com/watch?v=%s' % video_id, video_id)
-
- video_url = self._search_regex(
- r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1', webpage,
- 'video url', group='url')
+ formats = []
+
+ sources = [source[1] for source in re.findall(
+ r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1',
+ webpage) or []]
+ for source in sources:
+ height = int(self._search_regex(r'(\d+)\.mp4', source, 'height', default=360))
+ if self._request_webpage(HEADRequest(source), video_id, f'Checking {height}p url', errnote=False):
+ formats.append({
+ 'url': source,
+ 'format_id': f'{height}p',
+ 'height': height,
+ })
+ self._sort_formats(formats)
title = self._search_regex(
(r'<h1>([^<]+)</h1>', r'<title>([^<]+) - VidLii<'), webpage,
@@ -71,7 +103,7 @@ class VidLiiIE(InfoExtractor):
uploader = self._search_regex(
r'<div[^>]+class=["\']wt_person[^>]+>\s*<a[^>]+\bhref=["\']/user/[^>]+>([^<]+)',
webpage, 'uploader', fatal=False)
- uploader_url = 'https://www.vidlii.com/user/%s' % uploader if uploader else None
+ uploader_url = format_field(uploader, template='https://www.vidlii.com/user/%s')
upload_date = unified_strdate(self._html_search_meta(
'datePublished', webpage, default=None) or self._search_regex(
@@ -82,9 +114,9 @@ class VidLiiIE(InfoExtractor):
default=None) or self._search_regex(
r'duration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
- view_count = int_or_none(self._search_regex(
- (r'<strong>(\d+)</strong> views',
- r'Views\s*:\s*<strong>(\d+)</strong>'),
+ view_count = str_to_int(self._search_regex(
+ (r'<strong>([,0-9]+)</strong> views',
+ r'Views\s*:\s*<strong>([,0-9]+)</strong>'),
webpage, 'view count', fatal=False))
comment_count = int_or_none(self._search_regex(
@@ -109,11 +141,11 @@ class VidLiiIE(InfoExtractor):
return {
'id': video_id,
- 'url': video_url,
'title': title,
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
+ 'formats': formats,
'uploader_url': uploader_url,
'upload_date': upload_date,
'duration': duration,
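
The new VidLii loop above derives a height from each source filename and keeps only URLs that answer a HEAD probe. A condensed sketch of that loop with the probe injected as a callable; the example URL is illustrative:

    import re

    def build_formats(sources, url_ok):
        """Height comes from the '<n>.mp4' filename (360 assumed otherwise, per
        the hunk's default); a source is kept only when the url_ok probe
        (a HEAD request upstream) passes."""
        formats = []
        for source in sources:
            m = re.search(r'(\d+)\.mp4', source)
            height = int(m.group(1)) if m else 360
            if url_ok(source):
                formats.append({'url': source, 'format_id': f'{height}p',
                                'height': height})
        return formats

    assert build_formats(['https://cdn.example/v/720.mp4'],
                         lambda u: True)[0]['height'] == 720
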
diff --git a/hypervideo_dl/extractor/viewlift.py b/hypervideo_dl/extractor/viewlift.py
index c3b2e86..4627f66 100644
--- a/hypervideo_dl/extractor/viewlift.py
+++ b/hypervideo_dl/extractor/viewlift.py
@@ -9,6 +9,7 @@ from ..utils import (
ExtractorError,
int_or_none,
parse_age_limit,
+ traverse_obj,
)
@@ -32,26 +33,33 @@ class ViewLiftBaseIE(InfoExtractor):
}
_TOKENS = {}
- def _call_api(self, site, path, video_id, query):
- token = self._TOKENS.get(site)
- if not token:
- token_query = {'site': site}
- email, password = self._get_login_info(netrc_machine=site)
- if email:
- resp = self._download_json(
- self._API_BASE + 'identity/signin', video_id,
- 'Logging in', query=token_query, data=json.dumps({
- 'email': email,
- 'password': password,
- }).encode())
- else:
- resp = self._download_json(
- self._API_BASE + 'identity/anonymous-token', video_id,
- 'Downloading authorization token', query=token_query)
- self._TOKENS[site] = token = resp['authorizationToken']
- return self._download_json(
- self._API_BASE + path, video_id,
- headers={'Authorization': token}, query=query)
+ def _fetch_token(self, site, url):
+ if self._TOKENS.get(site):
+ return
+
+ cookies = self._get_cookies(url)
+ if cookies and cookies.get('token'):
+ self._TOKENS[site] = self._search_regex(r'22authorizationToken\%22:\%22([^\%]+)\%22', cookies['token'].value, 'token')
+ if not self._TOKENS.get(site):
+ self.raise_login_required('Cookies are needed to download from this website (a logged-in session is not required)', method='cookies')
+
+ def _call_api(self, site, path, video_id, url, query):
+ self._fetch_token(site, url)
+ try:
+ return self._download_json(
+ self._API_BASE + path, video_id, headers={'Authorization': self._TOKENS.get(site)}, query=query)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ webpage = e.cause.read().decode()
+ try:
+ error_message = traverse_obj(json.loads(webpage), 'errorMessage', 'message')
+ except json.JSONDecodeError:
+ raise ExtractorError(f'{site} said: {webpage}', cause=e.cause)
+ if error_message:
+ if 'has not purchased' in error_message:
+ self.raise_login_required(method='cookies')
+ raise ExtractorError(error_message, expected=True)
+ raise
class ViewLiftEmbedIE(ViewLiftBaseIE):
@@ -96,27 +104,24 @@ class ViewLiftEmbedIE(ViewLiftBaseIE):
site = domain.split('.')[-2]
if site in self._SITE_MAP:
site = self._SITE_MAP[site]
- try:
- content_data = self._call_api(
- site, 'entitlement/video/status', film_id, {
- 'id': film_id
- })['video']
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- error_message = self._parse_json(e.cause.read().decode(), film_id).get('errorMessage')
- if error_message == 'User does not have a valid subscription or has not purchased this content.':
- self.raise_login_required()
- raise ExtractorError(error_message, expected=True)
- raise
+
+ content_data = self._call_api(
+ site, 'entitlement/video/status', film_id, url, {
+ 'id': film_id
+ })['video']
gist = content_data['gist']
title = gist['title']
video_assets = content_data['streamingInfo']['videoAssets']
- formats = []
- mpeg_video_assets = video_assets.get('mpeg') or []
- for video_asset in mpeg_video_assets:
+ hls_url = video_assets.get('hls')
+ formats, subtitles = [], {}
+ if hls_url:
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+
+ for video_asset in video_assets.get('mpeg') or []:
video_asset_url = video_asset.get('url')
- if not video_asset:
+ if not video_asset_url:
continue
bitrate = int_or_none(video_asset.get('bitrate'))
height = int_or_none(self._search_regex(
@@ -130,13 +135,17 @@ class ViewLiftEmbedIE(ViewLiftBaseIE):
'vcodec': video_asset.get('codec'),
})
- hls_url = video_assets.get('hls')
- if hls_url:
- formats.extend(self._extract_m3u8_formats(
- hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
- self._sort_formats(formats)
+ subs = {}
+ for sub in traverse_obj(content_data, ('contentDetails', 'closedCaptions')) or []:
+ sub_url = sub.get('url')
+ if not sub_url:
+ continue
+ subs.setdefault(sub.get('language', 'English'), []).append({
+ 'url': sub_url,
+ })
- info = {
+ self._sort_formats(formats)
+ return {
'id': film_id,
'title': title,
'description': gist.get('description'),
@@ -145,14 +154,15 @@ class ViewLiftEmbedIE(ViewLiftBaseIE):
'age_limit': parse_age_limit(content_data.get('parentalRating')),
'timestamp': int_or_none(gist.get('publishDate'), 1000),
'formats': formats,
+ 'subtitles': self._merge_subtitles(subs, subtitles),
+ 'categories': traverse_obj(content_data, ('categories', ..., 'title')),
+ 'tags': traverse_obj(content_data, ('tags', ..., 'title')),
}
- for k in ('categories', 'tags'):
- info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')]
- return info
class ViewLiftIE(ViewLiftBaseIE):
IE_NAME = 'viewlift'
+ _API_BASE = 'https://prod-api-cached-2.viewlift.com/'
_VALID_URL = r'https?://(?:www\.)?(?P<domain>%s)(?P<path>(?:/(?:films/title|show|(?:news/)?videos?|watch))?/(?P<id>[^?#]+))' % ViewLiftBaseIE._DOMAINS_REGEX
_TESTS = [{
'url': 'http://www.snagfilms.com/films/title/lost_for_life',
@@ -222,24 +232,111 @@ class ViewLiftIE(ViewLiftBaseIE):
}, {
'url': 'https://www.marquee.tv/watch/sadlerswells-sacredmonsters',
'only_matching': True,
+ }, { # Free film with language code
+ 'url': 'https://www.hoichoi.tv/bn/films/title/shuyopoka',
+ 'info_dict': {
+ 'id': '7a7a9d33-1f4c-4771-9173-ee4fb6dbf196',
+ 'ext': 'mp4',
+ 'title': 'Shuyopoka',
+ 'description': 'md5:e28f2fb8680096a69c944d37c1fa5ffc',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'upload_date': '20211006',
+ 'series': None
+ },
+ 'params': {'skip_download': True},
+ }, { # Free film
+ 'url': 'https://www.hoichoi.tv/films/title/dadu-no1',
+ 'info_dict': {
+ 'id': '0000015b-b009-d126-a1db-b81ff3780000',
+ 'ext': 'mp4',
+ 'title': 'Dadu No.1',
+ 'description': 'md5:605cba408e51a79dafcb824bdeded51e',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'upload_date': '20210827',
+ 'series': None
+ },
+ 'params': {'skip_download': True},
+ }, { # Free episode
+ 'url': 'https://www.hoichoi.tv/webseries/case-jaundice-s01-e01',
+ 'info_dict': {
+ 'id': 'f779e07c-30c8-459c-8612-5a834ab5e5ba',
+ 'ext': 'mp4',
+ 'title': 'Humans Vs. Corona',
+ 'description': 'md5:ca30a682b4528d02a3eb6d0427dd0f87',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'upload_date': '20210830',
+ 'series': 'Case Jaundice'
+ },
+ 'params': {'skip_download': True},
+ }, { # Free video
+ 'url': 'https://www.hoichoi.tv/videos/1549072415320-six-episode-02-hindi',
+ 'info_dict': {
+ 'id': 'b41fa1ce-aca6-47b6-b208-283ff0a2de30',
+ 'ext': 'mp4',
+ 'title': 'Woman in red - Hindi',
+ 'description': 'md5:9d21edc1827d32f8633eb67c2054fc31',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'upload_date': '20211006',
+ 'series': 'Six (Hindi)'
+ },
+ 'params': {'skip_download': True},
+ }, { # Free episode
+ 'url': 'https://www.hoichoi.tv/shows/watch-asian-paints-moner-thikana-online-season-1-episode-1',
+ 'info_dict': {
+ 'id': '1f45d185-8500-455c-b88d-13252307c3eb',
+ 'ext': 'mp4',
+ 'title': 'Jisshu Sengupta',
+ 'description': 'md5:ef6ffae01a3d83438597367400f824ed',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'upload_date': '20211004',
+ 'series': 'Asian Paints Moner Thikana'
+ },
+ 'params': {'skip_download': True},
+ }, { # Free series
+ 'url': 'https://www.hoichoi.tv/shows/watch-moner-thikana-bengali-web-series-online',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'id': 'watch-moner-thikana-bengali-web-series-online',
+ },
+ }, { # Premium series
+ 'url': 'https://www.hoichoi.tv/shows/watch-byomkesh-bengali-web-series-online',
+ 'playlist_mincount': 14,
+ 'info_dict': {
+ 'id': 'watch-byomkesh-bengali-web-series-online',
+ },
+ }, { # Premium movie
+ 'url': 'https://www.hoichoi.tv/movies/detective-2020',
+ 'only_matching': True
}]
@classmethod
def suitable(cls, url):
return False if ViewLiftEmbedIE.suitable(url) else super(ViewLiftIE, cls).suitable(url)
+ def _show_entries(self, domain, seasons):
+ for season in seasons:
+ for episode in season.get('episodes') or []:
+ path = traverse_obj(episode, ('gist', 'permalink'))
+ if path:
+ yield self.url_result(f'https://www.{domain}{path}', ie=self.ie_key())
+
def _real_extract(self, url):
domain, path, display_id = self._match_valid_url(url).groups()
site = domain.split('.')[-2]
if site in self._SITE_MAP:
site = self._SITE_MAP[site]
modules = self._call_api(
- site, 'content/pages', display_id, {
+ site, 'content/pages', display_id, url, {
'includeContent': 'true',
'moduleOffset': 1,
'path': path,
'site': site,
})['modules']
+
+ seasons = next((m['contentData'][0]['seasons'] for m in modules if m.get('moduleType') == 'ShowDetailModule'), None)
+ if seasons:
+ return self.playlist_result(self._show_entries(domain, seasons), display_id)
+
film_id = next(m['contentData'][0]['gist']['id'] for m in modules if m.get('moduleType') == 'VideoDetailModule')
return {
'_type': 'url_transparent',
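
_fetch_token above digs the authorizationToken out of a percent-encoded JSON cookie with a regex. An equivalent approach is to decode the cookie and parse it as JSON; the cookie layout here is inferred from that regex and is an assumption:

    import json
    from urllib.parse import unquote

    def token_from_cookie(cookie_value):
        """Decode-then-parse alternative to the regex above; assumes the cookie
        is a percent-encoded JSON object with an 'authorizationToken' key."""
        try:
            return json.loads(unquote(cookie_value)).get('authorizationToken')
        except (ValueError, AttributeError):
            return None

    cookie = '%7B%22authorizationToken%22%3A%22abc.def%22%7D'  # {"authorizationToken":"abc.def"}
    assert token_from_cookie(cookie) == 'abc.def'
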
diff --git a/hypervideo_dl/extractor/viki.py b/hypervideo_dl/extractor/viki.py
index acb5ae5..8a93079 100644
--- a/hypervideo_dl/extractor/viki.py
+++ b/hypervideo_dl/extractor/viki.py
@@ -19,7 +19,7 @@ class VikiBaseIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/'
_API_URL_TEMPLATE = 'https://api.viki.io%s'
- _DEVICE_ID = '86085977d' # used for android api
+ _DEVICE_ID = '112395910d'
_APP = '100005a'
_APP_VERSION = '6.11.3'
_APP_SECRET = 'd96704b180208dbb2efa30fe44c48bd8690441af9f567ba8fd710a72badc85198f7472'
@@ -99,14 +99,7 @@ class VikiBaseIE(InfoExtractor):
self.raise_login_required(message)
self._raise_error(message)
- def _real_initialize(self):
- self._login()
-
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
self._token = self._call_api(
'sessions.json', None, 'Logging in', fatal=False,
data={'username': username, 'password': password}).get('token')
@@ -135,9 +128,6 @@ class VikiIE(VikiBaseIE):
'uploader': 'FCC',
'upload_date': '20201127',
},
- 'params': {
- 'format': 'bestvideo',
- },
}, {
'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
'info_dict': {
@@ -151,9 +141,6 @@ class VikiIE(VikiBaseIE):
'duration': 3570,
'episode_number': 14,
},
- 'params': {
- 'format': 'bestvideo',
- },
'skip': 'Blocked in the US',
}, {
# clip
@@ -203,9 +190,6 @@ class VikiIE(VikiBaseIE):
'age_limit': 13,
'episode_number': 1,
},
- 'params': {
- 'format': 'bestvideo',
- },
}, {
# youtube external
'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
@@ -241,9 +225,6 @@ class VikiIE(VikiBaseIE):
'title': 'Love In Magic',
'age_limit': 13,
},
- 'params': {
- 'format': 'bestvideo',
- },
}]
def _real_extract(self, url):
@@ -265,7 +246,7 @@ class VikiIE(VikiBaseIE):
} for thumbnail_id, thumbnail in (video.get('images') or {}).items() if thumbnail.get('url')]
resp = self._call_api(
- 'playback_streams/%s.json?drms=dt1,dt2&device_id=%s' % (video_id, self._DEVICE_ID),
+ 'playback_streams/%s.json?drms=dt3&device_id=%s' % (video_id, self._DEVICE_ID),
video_id, 'Downloading video streams JSON')['main'][0]
stream_id = try_get(resp, lambda x: x['properties']['track']['stream_id'])
@@ -276,10 +257,13 @@ class VikiIE(VikiBaseIE):
} for ext in ('srt', 'vtt')]) for lang in (video.get('subtitle_completions') or {}).keys())
mpd_url = resp['url']
- # 1080p is hidden in another mpd which can be found in the current manifest content
+ # 720p is hidden in another MPD which can be found in the current manifest content
mpd_content = self._download_webpage(mpd_url, video_id, note='Downloading initial MPD manifest')
mpd_url = self._search_regex(
r'(?mi)<BaseURL>(http.+.mpd)', mpd_content, 'new manifest', default=mpd_url)
+ if 'mpdhd_high' not in mpd_url and 'sig=' not in mpd_url:
+ # Modify the URL to get 1080p
+ mpd_url = mpd_url.replace('mpdhd', 'mpdhd_high')
formats = self._extract_mpd_formats(mpd_url, video_id)
self._sort_formats(formats)
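
The Viki change above chases the <BaseURL> pointer inside the first MPD and then requests the 'mpdhd_high' variant when the URL is unsigned. The URL rewriting, isolated as a pure function (URL shapes taken from the comments in the hunk; no request is made):

    import re

    def upgrade_mpd_url(mpd_url, mpd_content):
        # Follow the <BaseURL> pointer embedded in the first manifest, if any
        m = re.search(r'(?mi)<BaseURL>(http.+?\.mpd)', mpd_content)
        if m:
            mpd_url = m.group(1)
        # Ask for the higher-quality variant when the URL is unsigned
        if 'mpdhd_high' not in mpd_url and 'sig=' not in mpd_url:
            mpd_url = mpd_url.replace('mpdhd', 'mpdhd_high')
        return mpd_url

    assert upgrade_mpd_url('https://x/mpdhd/a.mpd', '') == 'https://x/mpdhd_high/a.mpd'
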
diff --git a/hypervideo_dl/extractor/vimeo.py b/hypervideo_dl/extractor/vimeo.py
index 9fb5475..4f025a5 100644
--- a/hypervideo_dl/extractor/vimeo.py
+++ b/hypervideo_dl/extractor/vimeo.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
import base64
import functools
-import json
import re
import itertools
@@ -17,8 +16,9 @@ from ..compat import (
from ..utils import (
clean_html,
determine_ext,
- dict_get,
ExtractorError,
+ get_element_by_class,
+ HEADRequest,
js_to_json,
int_or_none,
merge_dicts,
@@ -26,10 +26,8 @@ from ..utils import (
parse_filesize,
parse_iso8601,
parse_qs,
- RegexNotFoundError,
sanitized_Request,
smuggle_url,
- std_headers,
str_or_none,
try_get,
unified_timestamp,
@@ -37,6 +35,7 @@ from ..utils import (
urlencode_postdata,
urljoin,
unescapeHTML,
+ urlhandle_detect_ext,
)
@@ -45,12 +44,7 @@ class VimeoBaseInfoExtractor(InfoExtractor):
_LOGIN_REQUIRED = False
_LOGIN_URL = 'https://vimeo.com/log_in'
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- if self._LOGIN_REQUIRED:
- raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
- return
+ def _perform_login(self, username, password):
webpage = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
token, vuid = self._extract_xsrft_and_vuid(webpage)
@@ -76,6 +70,10 @@ class VimeoBaseInfoExtractor(InfoExtractor):
expected=True)
raise ExtractorError('Unable to log in')
+ def _real_initialize(self):
+ if self._LOGIN_REQUIRED and not self._get_cookies('https://vimeo.com').get('vuid'):
+ self._raise_login_required()
+
def _get_video_password(self):
password = self.get_param('videopassword')
if password is None:
@@ -119,26 +117,29 @@ class VimeoBaseInfoExtractor(InfoExtractor):
self._set_cookie('vimeo.com', name, value)
def _vimeo_sort_formats(self, formats):
- # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps
- # at the same time without actual units specified. This lead to wrong sorting.
- # But since hypervideo prefers 'res,fps' anyway, 'field_preference' is not needed
- self._sort_formats(formats)
+ # Note: Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps
+ # at the same time without actual units specified.
+ self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'source'))
def _parse_config(self, config, video_id):
video_data = config['video']
video_title = video_data['title']
live_event = video_data.get('live_event') or {}
is_live = live_event.get('status') == 'started'
+ request = config.get('request') or {}
formats = []
- config_files = video_data.get('files') or config['request'].get('files', {})
- for f in config_files.get('progressive', []):
+ subtitles = {}
+
+ config_files = video_data.get('files') or request.get('files') or {}
+ for f in (config_files.get('progressive') or []):
video_url = f.get('url')
if not video_url:
continue
formats.append({
'url': video_url,
'format_id': 'http-%s' % f.get('quality'),
+ 'source_preference': 10,
'width': int_or_none(f.get('width')),
'height': int_or_none(f.get('height')),
'fps': int_or_none(f.get('fps')),
@@ -148,7 +149,7 @@ class VimeoBaseInfoExtractor(InfoExtractor):
# TODO: fix handling of 308 status code returned for live archive manifest requests
sep_pattern = r'/sep/video/'
for files_type in ('hls', 'dash'):
- for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items():
+ for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items():
manifest_url = cdn_data.get('url')
if not manifest_url:
continue
@@ -162,21 +163,23 @@ class VimeoBaseInfoExtractor(InfoExtractor):
sep_manifest_urls = [(format_id, manifest_url)]
for f_id, m_url in sep_manifest_urls:
if files_type == 'hls':
- formats.extend(self._extract_m3u8_formats(
- m_url, video_id, 'mp4',
- 'm3u8' if is_live else 'm3u8_native', m3u8_id=f_id,
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
+ m_url, video_id, 'mp4', live=is_live, m3u8_id=f_id,
note='Downloading %s m3u8 information' % cdn_name,
- fatal=False))
+ fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
elif files_type == 'dash':
if 'json=1' in m_url:
real_m_url = (self._download_json(m_url, video_id, fatal=False) or {}).get('url')
if real_m_url:
m_url = real_m_url
- mpd_formats = self._extract_mpd_formats(
+ fmts, subs = self._extract_mpd_formats_and_subtitles(
m_url.replace('/master.json', '/master.mpd'), video_id, f_id,
'Downloading %s MPD information' % cdn_name,
fatal=False)
- formats.extend(mpd_formats)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
live_archive = live_event.get('archive') or {}
live_archive_source_url = live_archive.get('source_url')
@@ -187,18 +190,15 @@ class VimeoBaseInfoExtractor(InfoExtractor):
'quality': 10,
})
- subtitles = {}
- text_tracks = config['request'].get('text_tracks')
- if text_tracks:
- for tt in text_tracks:
- subtitles[tt['lang']] = [{
- 'ext': 'vtt',
- 'url': urljoin('https://vimeo.com', tt['url']),
- }]
+ for tt in (request.get('text_tracks') or []):
+ subtitles.setdefault(tt['lang'], []).append({
+ 'ext': 'vtt',
+ 'url': urljoin('https://vimeo.com', tt['url']),
+ })
thumbnails = []
if not is_live:
- for key, thumb in video_data.get('thumbs', {}).items():
+ for key, thumb in (video_data.get('thumbs') or {}).items():
thumbnails.append({
'id': key,
'width': int_or_none(key),
@@ -213,14 +213,25 @@ class VimeoBaseInfoExtractor(InfoExtractor):
owner = video_data.get('owner') or {}
video_uploader_url = owner.get('url')
+ duration = int_or_none(video_data.get('duration'))
+ chapter_data = try_get(config, lambda x: x['embed']['chapters']) or []
+ chapters = [{
+ 'title': current_chapter.get('title'),
+ 'start_time': current_chapter.get('timecode'),
+ 'end_time': next_chapter.get('timecode'),
+ } for current_chapter, next_chapter in zip(chapter_data, chapter_data[1:] + [{'timecode': duration}])]
+ if chapters and chapters[0]['start_time']: # Chapters may not start from 0
+ chapters[:0] = [{'title': '<Untitled>', 'start_time': 0, 'end_time': chapters[0]['start_time']}]
+
return {
'id': str_or_none(video_data.get('id')) or video_id,
- 'title': self._live_title(video_title) if is_live else video_title,
+ 'title': video_title,
'uploader': owner.get('name'),
'uploader_id': video_uploader_url.split('/')[-1] if video_uploader_url else None,
'uploader_url': video_uploader_url,
'thumbnails': thumbnails,
- 'duration': int_or_none(video_data.get('duration')),
+ 'duration': duration,
+ 'chapters': chapters or None,
'formats': formats,
'subtitles': subtitles,
'is_live': is_live,
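
The chapter construction above pairs each chapter with its successor to derive end_time, closes the last chapter at the video duration, and prepends an '<Untitled>' entry when playback doesn't start inside a chapter. The same logic as a standalone function:

    def build_chapters(chapter_data, duration):
        chapters = [{
            'title': cur.get('title'),
            'start_time': cur.get('timecode'),
            'end_time': nxt.get('timecode'),
        } for cur, nxt in zip(chapter_data, chapter_data[1:] + [{'timecode': duration}])]
        if chapters and chapters[0]['start_time']:  # chapters may not start from 0
            chapters[:0] = [{'title': '<Untitled>', 'start_time': 0,
                             'end_time': chapters[0]['start_time']}]
        return chapters

    assert build_chapters([{'title': 'Intro', 'timecode': 5}], 60) == [
        {'title': '<Untitled>', 'start_time': 0, 'end_time': 5},
        {'title': 'Intro', 'start_time': 5, 'end_time': 60},
    ]
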
@@ -232,27 +243,26 @@ class VimeoBaseInfoExtractor(InfoExtractor):
query['unlisted_hash'] = unlisted_hash
download_data = self._download_json(
url, video_id, fatal=False, query=query,
- headers={'X-Requested-With': 'XMLHttpRequest'})
- if download_data:
- source_file = download_data.get('source_file')
- if isinstance(source_file, dict):
- download_url = source_file.get('download_url')
- if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'):
- source_name = source_file.get('public_name', 'Original')
- if self._is_valid_url(download_url, video_id, '%s video' % source_name):
- ext = (try_get(
- source_file, lambda x: x['extension'],
- compat_str) or determine_ext(
- download_url, None) or 'mp4').lower()
- return {
- 'url': download_url,
- 'ext': ext,
- 'width': int_or_none(source_file.get('width')),
- 'height': int_or_none(source_file.get('height')),
- 'filesize': parse_filesize(source_file.get('size')),
- 'format_id': source_name,
- 'quality': 1,
- }
+ headers={'X-Requested-With': 'XMLHttpRequest'},
+ expected_status=(403, 404)) or {}
+ source_file = download_data.get('source_file')
+ download_url = try_get(source_file, lambda x: x['download_url'])
+ if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'):
+ source_name = source_file.get('public_name', 'Original')
+ if self._is_valid_url(download_url, video_id, '%s video' % source_name):
+ ext = (try_get(
+ source_file, lambda x: x['extension'],
+ compat_str) or determine_ext(
+ download_url, None) or 'mp4').lower()
+ return {
+ 'url': download_url,
+ 'ext': ext,
+ 'width': int_or_none(source_file.get('width')),
+ 'height': int_or_none(source_file.get('height')),
+ 'filesize': parse_filesize(source_file.get('size')),
+ 'format_id': source_name,
+ 'quality': 1,
+ }
jwt_response = self._download_json(
'https://vimeo.com/_rv/viewer', video_id, note='Downloading jwt token', fatal=False) or {}
@@ -261,15 +271,19 @@ class VimeoBaseInfoExtractor(InfoExtractor):
headers = {'Authorization': 'jwt %s' % jwt_response['jwt']}
original_response = self._download_json(
f'https://api.vimeo.com/videos/{video_id}', video_id,
- headers=headers, fatal=False) or {}
- for download_data in original_response.get('download') or {}:
+ headers=headers, fatal=False, expected_status=(403, 404)) or {}
+ for download_data in original_response.get('download') or []:
download_url = download_data.get('link')
if not download_url or download_data.get('quality') != 'source':
continue
- query = parse_qs(download_url)
+ ext = determine_ext(parse_qs(download_url).get('filename', [''])[0].lower(), default_ext=None)
+ if not ext:
+ urlh = self._request_webpage(
+ HEADRequest(download_url), video_id, fatal=False, note='Determining source extension')
+ ext = urlh and urlhandle_detect_ext(urlh)
return {
'url': download_url,
- 'ext': determine_ext(query.get('filename', [''])[0].lower()),
+ 'ext': ext or 'unknown_video',
'format_id': download_data.get('public_name', 'Original'),
'width': int_or_none(download_data.get('width')),
'height': int_or_none(download_data.get('height')),
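
The extension detection above prefers the 'filename' query parameter and falls back to a HEAD request only when that yields nothing. A simplified sketch of the query-string path alone (urlhandle_detect_ext's header inspection is not modeled; the library's parse_qs takes a full URL, so urlparse(...).query stands in for it here):

    from urllib.parse import parse_qs, urlparse

    def ext_from_download_url(download_url):
        """Return the lowercased extension of the 'filename' query parameter,
        or None when there is no usable filename."""
        filename = (parse_qs(urlparse(download_url).query).get('filename') or [''])[0].lower()
        return filename.rsplit('.', 1)[1] if '.' in filename else None

    assert ext_from_download_url('https://dl.example/video?filename=clip.MOV') == 'mov'
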
@@ -294,7 +308,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
)?
vimeo(?:pro)?\.com/
(?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
- (?:.*?/)?
+ (?:[^/]+/)*?
(?:
(?:
play_redirect_hls|
@@ -313,7 +327,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'info_dict': {
'id': '56015672',
'ext': 'mp4',
- 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
+ 'title': "youtube-dl test video '' ä↭𝕐-BaW jenozKc",
'description': 'md5:2d3305bad981a06ff79f027f19865021',
'timestamp': 1355990239,
'upload_date': '20121220',
@@ -326,6 +340,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'params': {
'format': 'best[protocol=https]',
},
+ 'skip': 'No longer available'
},
{
'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876',
@@ -342,6 +357,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
'duration': 1595,
'upload_date': '20130610',
'timestamp': 1370893156,
+ 'license': 'by',
+ 'thumbnail': 'https://i.vimeocdn.com/video/440260469-19b0d92fca3bd84066623b53f1eb8aaa3980c6c809e2d67b6b39ab7b4a77a344-d_960',
+ 'view_count': int,
+ 'comment_count': int,
+ 'like_count': int,
},
'params': {
'format': 'best[protocol=https]',
@@ -349,7 +369,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
},
{
'url': 'http://player.vimeo.com/video/54469442',
- 'md5': '619b811a4417aa4abe78dc653becf511',
+ 'md5': 'b3e7f4d2cbb53bd7dc3bb6ff4ed5cfbd',
'note': 'Videos that embed the url in the player page',
'info_dict': {
'id': '54469442',
@@ -360,11 +380,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader_id': 'businessofsoftware',
'duration': 3610,
'description': None,
+ 'thumbnail': 'https://i.vimeocdn.com/video/376682406-f34043e7b766af6bef2af81366eacd6724f3fc3173179a11a97a1e26587c9529-d_1280',
},
'params': {
'format': 'best[protocol=https]',
},
- 'expected_warnings': ['Unable to download JSON metadata'],
},
{
'url': 'http://vimeo.com/68375962',
@@ -381,6 +401,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader': 'Jaime Marquínez Ferrándiz',
'duration': 10,
'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f',
+ 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_960',
+ 'view_count': int,
+ 'comment_count': int,
+ 'like_count': int,
},
'params': {
'format': 'best[protocol=https]',
@@ -403,15 +427,19 @@ class VimeoIE(VimeoBaseInfoExtractor):
'timestamp': 1380339469,
'upload_date': '20130928',
'duration': 187,
+ 'thumbnail': 'https://i.vimeocdn.com/video/450239872-a05512d9b1e55d707a7c04365c10980f327b06d966351bc403a5d5d65c95e572-d_1280',
+ 'view_count': int,
+ 'comment_count': int,
+ 'like_count': int,
},
- 'expected_warnings': ['Unable to download JSON metadata'],
+ 'params': {'format': 'http-1080p'},
},
{
'url': 'http://vimeo.com/76979871',
'note': 'Video with subtitles',
'info_dict': {
'id': '76979871',
- 'ext': 'mp4',
+ 'ext': 'mov',
'title': 'The New Vimeo Player (You Know, For Videos)',
'description': 'md5:2ec900bf97c3f389378a96aee11260ea',
'timestamp': 1381846109,
@@ -420,7 +448,14 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader_id': 'staff',
'uploader': 'Vimeo Staff',
'duration': 62,
- }
+ 'subtitles': {
+ 'de': [{'ext': 'vtt'}],
+ 'en': [{'ext': 'vtt'}],
+ 'es': [{'ext': 'vtt'}],
+ 'fr': [{'ext': 'vtt'}],
+ },
+ },
+ 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
},
{
# from https://www.ouya.tv/game/Pier-Solar-and-the-Great-Architects/
@@ -433,6 +468,8 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader': 'Tulio Gonçalves',
'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user28849593',
'uploader_id': 'user28849593',
+ 'duration': 118,
+ 'thumbnail': 'https://i.vimeocdn.com/video/478636036-c18440305ef3df9decfb6bf207a61fe39d2d17fa462a96f6f2d93d30492b037d-d_1280',
},
},
{
@@ -449,6 +486,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
'timestamp': 1324343742,
'upload_date': '20111220',
'description': 'md5:ae23671e82d05415868f7ad1aec21147',
+ 'duration': 60,
+ 'comment_count': int,
+ 'view_count': int,
+ 'thumbnail': 'https://i.vimeocdn.com/video/231174622-dd07f015e9221ff529d451e1cc31c982b5d87bfafa48c4189b1da72824ee289a-d_1280',
+ 'like_count': int,
},
},
{
@@ -464,8 +506,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader': 'Framework Studio',
'description': 'md5:f2edc61af3ea7a5592681ddbb683db73',
'upload_date': '20200225',
+ 'duration': 176,
+ 'thumbnail': 'https://i.vimeocdn.com/video/859377297-836494a4ef775e9d4edbace83937d9ad34dc846c688c0c419c0e87f7ab06c4b3-d_1280',
+ 'uploader_url': 'https://vimeo.com/frameworkla',
},
- 'expected_warnings': ['Unable to download JSON metadata'],
},
{
# only available via https://vimeo.com/channels/tributes/6213729 and
@@ -483,11 +527,15 @@ class VimeoIE(VimeoBaseInfoExtractor):
'timestamp': 1250886430,
'upload_date': '20090821',
'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6',
+ 'duration': 321,
+ 'comment_count': int,
+ 'view_count': int,
+ 'thumbnail': 'https://i.vimeocdn.com/video/22728298-bfc22146f930de7cf497821c7b0b9f168099201ecca39b00b6bd31fcedfca7a6-d_1280',
+ 'like_count': int,
},
'params': {
'skip_download': True,
},
- 'expected_warnings': ['Unable to download JSON metadata'],
},
{
# redirects to ondemand extractor and should be passed through it
@@ -507,7 +555,6 @@ class VimeoIE(VimeoBaseInfoExtractor):
'params': {
'skip_download': True,
},
- 'expected_warnings': ['Unable to download JSON metadata'],
'skip': 'this page is no longer available.',
},
{
@@ -517,10 +564,17 @@ class VimeoIE(VimeoBaseInfoExtractor):
'id': '68375962',
'ext': 'mp4',
'title': 'youtube-dl password protected test video',
+ 'timestamp': 1371200155,
+ 'upload_date': '20130614',
'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128',
'uploader_id': 'user18948128',
'uploader': 'Jaime Marquínez Ferrándiz',
'duration': 10,
+ 'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f',
+ 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_960',
+ 'view_count': int,
+ 'comment_count': int,
+ 'like_count': int,
},
'params': {
'format': 'best[protocol=https]',
@@ -550,12 +604,18 @@ class VimeoIE(VimeoBaseInfoExtractor):
'info_dict': {
'id': '119195465',
'ext': 'mp4',
- 'title': 'youtube-dl test video \'ä"BaW_jenozKc',
+ 'title': "youtube-dl test video '' ä↭𝕐-BaW jenozKc",
'uploader': 'Philipp Hagemeister',
'uploader_id': 'user20132939',
'description': 'md5:fa7b6c6d8db0bdc353893df2f111855b',
'upload_date': '20150209',
'timestamp': 1423518307,
+ 'thumbnail': 'https://i.vimeocdn.com/video/default_1280',
+ 'duration': 10,
+ 'like_count': int,
+ 'uploader_url': 'https://vimeo.com/user20132939',
+ 'view_count': int,
+ 'comment_count': int,
},
'params': {
'format': 'best[protocol=https]',
@@ -568,13 +628,94 @@ class VimeoIE(VimeoBaseInfoExtractor):
'only_matching': True,
},
{
+ 'note': 'Direct URL with hash',
'url': 'https://vimeo.com/160743502/abd0e13fb4',
- 'only_matching': True,
+ 'info_dict': {
+ 'id': '160743502',
+ 'ext': 'mp4',
+ 'uploader': 'Julian Tryba',
+ 'uploader_id': 'aliniamedia',
+ 'title': 'Harrisville New Hampshire',
+ 'timestamp': 1459259666,
+ 'upload_date': '20160329',
+ 'release_timestamp': 1459259666,
+ 'license': 'by-nc',
+ 'duration': 159,
+ 'comment_count': int,
+ 'thumbnail': 'https://i.vimeocdn.com/video/562802436-585eeb13b5020c6ac0f171a2234067938098f84737787df05ff0d767f6d54ee9-d_1280',
+ 'like_count': int,
+ 'uploader_url': 'https://vimeo.com/aliniamedia',
+ 'release_date': '20160329',
+ },
+ 'params': {'skip_download': True},
+ },
+ {
+ 'url': 'https://vimeo.com/138909882',
+ 'info_dict': {
+ 'id': '138909882',
+ 'ext': 'mp4',
+ 'title': 'Eastnor Castle 2015 Firework Champions - The Promo!',
+ 'description': 'md5:5967e090768a831488f6e74b7821b3c1',
+ 'uploader_id': 'fireworkchampions',
+ 'uploader': 'Firework Champions',
+ 'upload_date': '20150910',
+ 'timestamp': 1441901895,
+ },
+ 'params': {
+ 'skip_download': True,
+ 'format': 'Original',
+ },
+ },
+ {
+ 'url': 'https://vimeo.com/channels/staffpicks/143603739',
+ 'info_dict': {
+ 'id': '143603739',
+ 'ext': 'mp4',
+ 'uploader': 'Karim Huu Do',
+ 'timestamp': 1445846953,
+ 'upload_date': '20151026',
+ 'title': 'The Shoes - Submarine Feat. Blaine Harrison',
+ 'uploader_id': 'karimhd',
+ 'description': 'md5:8e2eea76de4504c2e8020a9bcfa1e843',
+ 'channel_id': 'staffpicks',
+ 'duration': 336,
+ 'comment_count': int,
+ 'view_count': int,
+ 'thumbnail': 'https://i.vimeocdn.com/video/541243181-b593db36a16db2f0096f655da3f5a4dc46b8766d77b0f440df937ecb0c418347-d_1280',
+ 'like_count': int,
+ 'uploader_url': 'https://vimeo.com/karimhd',
+ 'channel_url': 'https://vimeo.com/channels/staffpicks',
+ },
+ 'params': {'skip_download': 'm3u8'},
},
{
# requires passing unlisted_hash(a52724358e) to load_download_config request
'url': 'https://vimeo.com/392479337/a52724358e',
'only_matching': True,
+ },
+ {
+ # similar, but all numeric: ID must be 581039021, not 9603038895
+ # issue #29690
+ 'url': 'https://vimeo.com/581039021/9603038895',
+ 'info_dict': {
+ 'id': '581039021',
+ 'ext': 'mp4',
+ 'timestamp': 1627621014,
+ 'release_timestamp': 1627621014,
+ 'duration': 976,
+ 'comment_count': int,
+ 'thumbnail': 'https://i.vimeocdn.com/video/1202249320-4ddb2c30398c0dc0ee059172d1bd5ea481ad12f0e0e3ad01d2266f56c744b015-d_1280',
+ 'like_count': int,
+ 'uploader_url': 'https://vimeo.com/txwestcapital',
+ 'release_date': '20210730',
+ 'uploader': 'Christopher Inks',
+ 'title': 'Thursday, July 29, 2021 BMA Evening Video Update',
+ 'uploader_id': 'txwestcapital',
+ 'upload_date': '20210730',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}
# https://gettingthingsdone.com/workflowmap/
# vimeo embed with check-password page protected by Referer header
@@ -623,8 +764,36 @@ class VimeoIE(VimeoBaseInfoExtractor):
raise ExtractorError('Wrong video password', expected=True)
return checked
- def _real_initialize(self):
- self._login()
+ def _extract_from_api(self, video_id, unlisted_hash=None):
+ token = self._download_json(
+ 'https://vimeo.com/_rv/jwt', video_id, headers={
+ 'X-Requested-With': 'XMLHttpRequest'
+ })['token']
+ api_url = 'https://api.vimeo.com/videos/' + video_id
+ if unlisted_hash:
+ api_url += ':' + unlisted_hash
+ video = self._download_json(
+ api_url, video_id, headers={
+ 'Authorization': 'jwt ' + token,
+ }, query={
+ 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays',
+ })
+ info = self._parse_config(self._download_json(
+ video['config_url'], video_id), video_id)
+ self._vimeo_sort_formats(info['formats'])
+ get_timestamp = lambda x: parse_iso8601(video.get(x + '_time'))
+ info.update({
+ 'description': video.get('description'),
+ 'license': video.get('license'),
+ 'release_timestamp': get_timestamp('release'),
+ 'timestamp': get_timestamp('created'),
+ 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])),
+ })
+ connections = try_get(
+ video, lambda x: x['metadata']['connections'], dict) or {}
+ for k in ('comment', 'like'):
+ info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total']))
+ return info
def _try_album_password(self, url):
album_id = self._search_regex(
@@ -666,54 +835,26 @@ class VimeoIE(VimeoBaseInfoExtractor):
def _real_extract(self, url):
url, data = unsmuggle_url(url, {})
- headers = std_headers.copy()
+ headers = self.get_param('http_headers').copy()
if 'http_headers' in data:
headers.update(data['http_headers'])
if 'Referer' not in headers:
headers['Referer'] = url
# Extract ID from URL
- video_id, unlisted_hash = self._match_valid_url(url).groups()
+ mobj = self._match_valid_url(url).groupdict()
+ video_id, unlisted_hash = mobj['id'], mobj.get('unlisted_hash')
if unlisted_hash:
- token = self._download_json(
- 'https://vimeo.com/_rv/jwt', video_id, headers={
- 'X-Requested-With': 'XMLHttpRequest'
- })['token']
- video = self._download_json(
- 'https://api.vimeo.com/videos/%s:%s' % (video_id, unlisted_hash),
- video_id, headers={
- 'Authorization': 'jwt ' + token,
- }, query={
- 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays',
- })
- info = self._parse_config(self._download_json(
- video['config_url'], video_id), video_id)
- self._vimeo_sort_formats(info['formats'])
- get_timestamp = lambda x: parse_iso8601(video.get(x + '_time'))
- info.update({
- 'description': video.get('description'),
- 'license': video.get('license'),
- 'release_timestamp': get_timestamp('release'),
- 'timestamp': get_timestamp('created'),
- 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])),
- })
- connections = try_get(
- video, lambda x: x['metadata']['connections'], dict) or {}
- for k in ('comment', 'like'):
- info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total']))
- return info
+ return self._extract_from_api(video_id, unlisted_hash)
orig_url = url
is_pro = 'vimeopro.com/' in url
- is_player = '://player.vimeo.com/video/' in url
if is_pro:
# some videos require portfolio_id to be present in player url
# https://github.com/ytdl-org/youtube-dl/issues/20070
url = self._extract_url(url, self._download_webpage(url, video_id))
if not url:
url = 'https://vimeo.com/' + video_id
- elif is_player:
- url = 'https://player.vimeo.com/video/' + video_id
elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')):
url = 'https://vimeo.com/' + video_id
@@ -734,14 +875,25 @@ class VimeoIE(VimeoBaseInfoExtractor):
expected=True)
raise
- # Now we begin extracting as much information as we can from what we
- # retrieved. First we extract the information common to all extractors,
- # and latter we extract those that are Vimeo specific.
- self.report_extraction(video_id)
+ if '://player.vimeo.com/video/' in url:
+ config = self._parse_json(self._search_regex(
+ r'\bconfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id)
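+            # view == 4 indicates a password-protected player page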
+ if config.get('view') == 4:
+ config = self._verify_player_video_password(
+ redirect_url, video_id, headers)
+ info = self._parse_config(config, video_id)
+ self._vimeo_sort_formats(info['formats'])
+ return info
+
+ if re.search(r'<form[^>]+?id="pw_form"', webpage):
+ video_password = self._get_video_password()
+ token, vuid = self._extract_xsrft_and_vuid(webpage)
+ webpage = self._verify_video_password(
+ redirect_url, video_id, video_password, token, vuid)
vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None)
if vimeo_config:
- seed_status = vimeo_config.get('seed_status', {})
+ seed_status = vimeo_config.get('seed_status') or {}
if seed_status.get('state') == 'failed':
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, seed_status['title']),
@@ -750,70 +902,41 @@ class VimeoIE(VimeoBaseInfoExtractor):
cc_license = None
timestamp = None
video_description = None
+ info_dict = {}
+ config_url = None
- # Extract the config JSON
- try:
- try:
- config_url = self._html_search_regex(
- r' data-config-url="(.+?)"', webpage,
- 'config URL', default=None)
- if not config_url:
- # Sometimes new react-based page is served instead of old one that require
- # different config URL extraction approach (see
- # https://github.com/ytdl-org/youtube-dl/pull/7209)
- page_config = self._parse_json(self._search_regex(
- r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});',
- webpage, 'page config'), video_id)
- config_url = page_config['player']['config_url']
- cc_license = page_config.get('cc_license')
- timestamp = try_get(
- page_config, lambda x: x['clip']['uploaded_on'],
- compat_str)
- video_description = clean_html(dict_get(
- page_config, ('description', 'description_html_escaped')))
- config = self._download_json(config_url, video_id)
- except RegexNotFoundError:
- # For pro videos or player.vimeo.com urls
- # We try to find out to which variable is assigned the config dic
- m_variable_name = re.search(r'(\w)\.video\.id', webpage)
- if m_variable_name is not None:
- config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))]
- else:
- config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']
- config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;')
- config_re.append(r'\bconfig\s*=\s*({.+?})\s*;')
- config = self._search_regex(config_re, webpage, 'info section',
- flags=re.DOTALL)
- config = json.loads(config)
- except Exception as e:
- if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
- raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option')
-
- if re.search(r'<form[^>]+?id="pw_form"', webpage) is not None:
- if '_video_password_verified' in data:
- raise ExtractorError('video password verification failed!')
- video_password = self._get_video_password()
- token, vuid = self._extract_xsrft_and_vuid(webpage)
- self._verify_video_password(
- redirect_url, video_id, video_password, token, vuid)
- return self._real_extract(
- smuggle_url(redirect_url, {'_video_password_verified': 'verified'}))
- else:
- raise ExtractorError('Unable to extract info section',
- cause=e)
- else:
- if config.get('view') == 4:
- config = self._verify_player_video_password(redirect_url, video_id, headers)
-
+ channel_id = self._search_regex(
+ r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None)
+ if channel_id:
+ config_url = self._html_search_regex(
+ r'\bdata-config-url="([^"]+)"', webpage, 'config URL', default=None)
+ video_description = clean_html(get_element_by_class('description', webpage))
+ info_dict.update({
+ 'channel_id': channel_id,
+ 'channel_url': 'https://vimeo.com/channels/' + channel_id,
+ })
+ if not config_url:
+ page_config = self._parse_json(self._search_regex(
+ r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});',
+ webpage, 'page config', default='{}'), video_id, fatal=False)
+ if not page_config:
+ return self._extract_from_api(video_id)
+ config_url = page_config['player']['config_url']
+ cc_license = page_config.get('cc_license')
+ clip = page_config.get('clip') or {}
+ timestamp = clip.get('uploaded_on')
+ video_description = clean_html(
+ clip.get('description') or page_config.get('description_html_escaped'))
+ config = self._download_json(config_url, video_id)
video = config.get('video') or {}
vod = video.get('vod') or {}
def is_rented():
if '>You rented this title.<' in webpage:
return True
- if config.get('user', {}).get('purchased'):
+ if try_get(config, lambda x: x['user']['purchased']):
return True
- for purchase_option in vod.get('purchase_options', []):
+ for purchase_option in (vod.get('purchase_options') or []):
if purchase_option.get('purchased'):
return True
label = purchase_option.get('label_string')
@@ -828,14 +951,14 @@ class VimeoIE(VimeoBaseInfoExtractor):
'https://player.vimeo.com/player/%s' % feature_id,
{'force_feature_id': True}), 'Vimeo')
- # Extract video description
if not video_description:
video_description = self._html_search_regex(
r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>',
webpage, 'description', default=None)
if not video_description:
video_description = self._html_search_meta(
- 'description', webpage, default=None)
+ ['description', 'og:description', 'twitter:description'],
+ webpage, default=None)
if not video_description and is_pro:
orig_webpage = self._download_webpage(
orig_url, video_id,
@@ -844,24 +967,17 @@ class VimeoIE(VimeoBaseInfoExtractor):
if orig_webpage:
video_description = self._html_search_meta(
'description', orig_webpage, default=None)
- if not video_description and not is_player:
+ if not video_description:
self.report_warning('Cannot find video description')
- # Extract upload date
if not timestamp:
timestamp = self._search_regex(
r'<time[^>]+datetime="([^"]+)"', webpage,
'timestamp', default=None)
- try:
- view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count'))
- like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count'))
- comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count'))
- except RegexNotFoundError:
- # This info is only available in vimeo.com/{id} urls
- view_count = None
- like_count = None
- comment_count = None
+ view_count = int_or_none(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count', default=None))
+ like_count = int_or_none(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count', default=None))
+ comment_count = int_or_none(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count', default=None))
formats = []
@@ -881,11 +997,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
r'<link[^>]+rel=["\']license["\'][^>]+href=(["\'])(?P<license>(?:(?!\1).)+)\1',
webpage, 'license', default=None, group='license')
- channel_id = self._search_regex(
- r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None)
- channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None
-
- info_dict = {
+ info_dict.update({
'formats': formats,
'timestamp': unified_timestamp(timestamp),
'description': video_description,
@@ -894,18 +1006,14 @@ class VimeoIE(VimeoBaseInfoExtractor):
'like_count': like_count,
'comment_count': comment_count,
'license': cc_license,
- 'channel_id': channel_id,
- 'channel_url': channel_url,
- }
-
- info_dict = merge_dicts(info_dict, info_dict_config, json_ld)
+ })
- return info_dict
+ return merge_dicts(info_dict, info_dict_config, json_ld)
class VimeoOndemandIE(VimeoIE):
IE_NAME = 'vimeo:ondemand'
- _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/([^/]+/)?(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?:[^/]+/)?(?P<id>[^/?#&]+)'
_TESTS = [{
# ondemand video not available via https://vimeo.com/id
'url': 'https://vimeo.com/ondemand/20704',
@@ -917,9 +1025,15 @@ class VimeoOndemandIE(VimeoIE):
'uploader': 'גם סרטים',
'uploader_url': r're:https?://(?:www\.)?vimeo\.com/gumfilms',
'uploader_id': 'gumfilms',
- 'description': 'md5:4c027c965e439de4baab621e48b60791',
+ 'description': 'md5:aeeba3dbd4d04b0fa98a4fdc9c639998',
'upload_date': '20140906',
'timestamp': 1410032453,
+ 'thumbnail': 'https://i.vimeocdn.com/video/488238335-d7bf151c364cff8d467f1b73784668fe60aae28a54573a35d53a1210ae283bd8-d_1280',
+ 'comment_count': int,
+ 'license': 'https://creativecommons.org/licenses/by-nc-nd/3.0/',
+ 'duration': 53,
+ 'view_count': int,
+ 'like_count': int,
},
'params': {
'format': 'best[protocol=https]',
@@ -938,6 +1052,11 @@ class VimeoOndemandIE(VimeoIE):
'description': 'md5:c3c46a90529612c8279fb6af803fc0df',
'upload_date': '20150502',
'timestamp': 1430586422,
+ 'duration': 121,
+ 'comment_count': int,
+ 'view_count': int,
+ 'thumbnail': 'https://i.vimeocdn.com/video/517077723-7066ae1d9a79d3eb361334fb5d58ec13c8f04b52f8dd5eadfbd6fb0bcf11f613-d_1280',
+ 'like_count': int,
},
'params': {
'skip_download': True,
@@ -967,7 +1086,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
'id': 'tributes',
'title': 'Vimeo Tributes',
},
- 'playlist_mincount': 25,
+ 'playlist_mincount': 22,
}]
_BASE_URL_TEMPL = 'https://vimeo.com/channels/%s'
@@ -1128,10 +1247,10 @@ class VimeoGroupsIE(VimeoChannelIE):
IE_NAME = 'vimeo:group'
_VALID_URL = r'https://vimeo\.com/groups/(?P<id>[^/]+)(?:/(?!videos?/\d+)|$)'
_TESTS = [{
- 'url': 'https://vimeo.com/groups/kattykay',
+ 'url': 'https://vimeo.com/groups/meetup',
'info_dict': {
- 'id': 'kattykay',
- 'title': 'Katty Kay',
+ 'id': 'meetup',
+ 'title': 'Vimeo Meetup!',
},
'playlist_mincount': 27,
}]
@@ -1152,8 +1271,10 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):
'uploader': 'Richard Hardwick',
'uploader_id': 'user21297594',
'description': "Comedian Dick Hardwick's five minute demo filmed in front of a live theater audience.\nEdit by Doug Mattocks",
+ 'duration': 304,
+ 'thumbnail': 'https://i.vimeocdn.com/video/450115033-43303819d9ebe24c2630352e18b7056d25197d09b3ae901abdac4c4f1d68de71-d_1280',
+ 'uploader_url': 'https://vimeo.com/user21297594',
},
- 'expected_warnings': ['Unable to download JSON metadata'],
}, {
'note': 'video player needs Referer',
'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053',
@@ -1184,9 +1305,6 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):
'skip': 'video gone',
}]
- def _real_initialize(self):
- self._login()
-
def _real_extract(self, url):
page_url, video_id = self._match_valid_url(url).groups()
data = self._download_json(
@@ -1228,9 +1346,6 @@ class VimeoWatchLaterIE(VimeoChannelIE):
'only_matching': True,
}]
- def _real_initialize(self):
- self._login()
-
def _page_url(self, base_url, pagenum):
url = '%s/page:%d/' % (base_url, pagenum)
request = sanitized_Request(url)
diff --git a/hypervideo_dl/extractor/vimm.py b/hypervideo_dl/extractor/vimm.py
new file mode 100644
index 0000000..060b92b
--- /dev/null
+++ b/hypervideo_dl/extractor/vimm.py
@@ -0,0 +1,69 @@
+# coding: utf-8
+from .common import InfoExtractor
+
+
+class VimmIE(InfoExtractor):
+ IE_NAME = 'Vimm:stream'
+ _VALID_URL = r'https?://(?:www\.)?vimm\.tv/(?:c/)?(?P<id>[0-9a-z-]+)$'
+ _TESTS = [{
+ 'url': 'https://www.vimm.tv/c/calimeatwagon',
+ 'info_dict': {
+ 'id': 'calimeatwagon',
+ 'ext': 'mp4',
+ 'title': 're:^calimeatwagon [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'live_status': 'is_live',
+ },
+ 'skip': 'Live',
+ }, {
+ 'url': 'https://www.vimm.tv/octaafradio',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+
+ formats, subs = self._extract_m3u8_formats_and_subtitles(
+ f'https://www.vimm.tv/hls/{channel_id}.m3u8', channel_id, 'mp4', m3u8_id='hls', live=True)
+ self._sort_formats(formats)
+
+ return {
+ 'id': channel_id,
+ 'title': channel_id,
+ 'is_live': True,
+ 'formats': formats,
+ 'subtitles': subs,
+ }
+
+
+class VimmRecordingIE(InfoExtractor):
+ IE_NAME = 'Vimm:recording'
+ _VALID_URL = r'https?://(?:www\.)?vimm\.tv/c/(?P<channel_id>[0-9a-z-]+)\?v=(?P<video_id>[0-9A-Za-z]+)'
+ _TESTS = [{
+ 'url': 'https://www.vimm.tv/c/kaldewei?v=2JZsrPTFxsSz',
+ 'md5': '15122ee95baa32a548e4a3e120b598f1',
+ 'info_dict': {
+ 'id': '2JZsrPTFxsSz',
+ 'ext': 'mp4',
+ 'title': 'VIMM - [DE/GER] Kaldewei Live - In Farbe und Bunt',
+ 'uploader_id': 'kaldewei',
+ },
+ }]
+
+ def _real_extract(self, url):
+ channel_id, video_id = self._match_valid_url(url).groups()
+
+ webpage = self._download_webpage(url, video_id)
+ title = self._og_search_title(webpage)
+
+ formats, subs = self._extract_m3u8_formats_and_subtitles(
+ f'https://d211qfrkztakg3.cloudfront.net/{channel_id}/{video_id}/index.m3u8', video_id, 'mp4', m3u8_id='hls', live=False)
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'is_live': False,
+ 'uploader_id': channel_id,
+ 'formats': formats,
+ 'subtitles': subs,
+ }
diff --git a/hypervideo_dl/extractor/vine.py b/hypervideo_dl/extractor/vine.py
index 07fce0d..e59b103 100644
--- a/hypervideo_dl/extractor/vine.py
+++ b/hypervideo_dl/extractor/vine.py
@@ -6,6 +6,7 @@ from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
determine_ext,
+ format_field,
int_or_none,
unified_timestamp,
)
@@ -92,7 +93,7 @@ class VineIE(InfoExtractor):
username = data.get('username')
- alt_title = 'Vine by %s' % username if username else None
+ alt_title = format_field(username, template='Vine by %s')
return {
'id': video_id,
diff --git a/hypervideo_dl/extractor/viu.py b/hypervideo_dl/extractor/viu.py
index 1b34c52..3cfca89 100644
--- a/hypervideo_dl/extractor/viu.py
+++ b/hypervideo_dl/extractor/viu.py
@@ -1,55 +1,32 @@
# coding: utf-8
from __future__ import unicode_literals
-import json
import re
+import json
+import uuid
+import random
+import urllib.parse
from .common import InfoExtractor
-from ..compat import (
- compat_kwargs,
- compat_str,
- compat_urlparse,
- compat_urllib_request,
-)
+from ..compat import compat_str
from ..utils import (
ExtractorError,
int_or_none,
+ strip_or_none,
try_get,
smuggle_url,
unsmuggle_url,
+ url_or_none,
)
class ViuBaseIE(InfoExtractor):
- def _real_initialize(self):
- viu_auth_res = self._request_webpage(
- 'https://www.viu.com/api/apps/v2/authenticate', None,
- 'Requesting Viu auth', query={
- 'acct': 'test',
- 'appid': 'viu_desktop',
- 'fmt': 'json',
- 'iid': 'guest',
- 'languageid': 'default',
- 'platform': 'desktop',
- 'userid': 'guest',
- 'useridtype': 'guest',
- 'ver': '1.0'
- }, headers=self.geo_verification_headers())
- self._auth_token = viu_auth_res.info()['X-VIU-AUTH']
-
- def _call_api(self, path, *args, **kwargs):
- headers = self.geo_verification_headers()
- headers.update({
- 'X-VIU-AUTH': self._auth_token
- })
- headers.update(kwargs.get('headers', {}))
- kwargs['headers'] = headers
+ def _call_api(self, path, *args, headers={}, **kwargs):
response = self._download_json(
- 'https://www.viu.com/api/' + path, *args,
- **compat_kwargs(kwargs))['response']
+ f'https://www.viu.com/api/{path}', *args, **kwargs,
+ headers={**self.geo_verification_headers(), **headers})['response']
if response.get('status') != 'success':
- raise ExtractorError('%s said: %s' % (
- self.IE_NAME, response['message']), expected=True)
+ raise ExtractorError(f'{self.IE_NAME} said: {response["message"]}', expected=True)
return response
@@ -101,6 +78,7 @@ class ViuIE(ViuBaseIE):
tdirforwhole = video_data.get('tdirforwhole')
# #EXT-X-BYTERANGE is not supported by native hls downloader
# and ffmpeg (#10955)
+ # FIXME: It is supported in hypervideo
# hls_file = video_data.get('hlsfile')
hls_file = video_data.get('jwhlsfile')
if url_path and tdirforwhole and hls_file:
@@ -110,10 +88,9 @@ class ViuIE(ViuBaseIE):
# r'(/hlsc_)[a-z]+(\d+\.m3u8)',
# r'\1whe\2', video_data['href'])
m3u8_url = video_data['href']
- formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4')
self._sort_formats(formats)
- subtitles = {}
for key, value in video_data.items():
mobj = re.match(r'^subtitle_(?P<lang>[^_]+)_(?P<ext>(vtt|srt))', key)
if not mobj:
@@ -227,42 +204,63 @@ class ViuOTTIE(InfoExtractor):
'zh-cn': 2,
'en-us': 3,
}
- _user_info = None
+
+ _user_token = None
+ _auth_codes = {}
def _detect_error(self, response):
- code = response.get('status', {}).get('code')
- if code > 0:
+ code = try_get(response, lambda x: x['status']['code'])
+ if code and code > 0:
message = try_get(response, lambda x: x['status']['message'])
- raise ExtractorError('%s said: %s (%s)' % (
- self.IE_NAME, message, code), expected=True)
- return response['data']
-
- def _raise_login_required(self):
- raise ExtractorError(
- 'This video requires login. '
- 'Specify --username and --password or --netrc (machine: %s) '
- 'to provide account credentials.' % self._NETRC_MACHINE,
- expected=True)
+ raise ExtractorError(f'{self.IE_NAME} said: {message} ({code})', expected=True)
+ return response.get('data') or {}
def _login(self, country_code, video_id):
- if not self._user_info:
+ if self._user_token is None:
username, password = self._get_login_info()
- if username is None or password is None:
+ if username is None:
return
+ headers = {
+ 'Authorization': f'Bearer {self._auth_codes[country_code]}',
+ 'Content-Type': 'application/json'
+ }
+ data = self._download_json(
+ 'https://api-gateway-global.viu.com/api/account/validate',
+ video_id, 'Validating email address', headers=headers,
+ data=json.dumps({
+ 'principal': username,
+ 'provider': 'email'
+ }).encode())
+ if not data.get('exists'):
+ raise ExtractorError('Invalid email address')
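+            # the email exists; now exchange the credentials for an identity token via the login endpoint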
data = self._download_json(
- compat_urllib_request.Request(
- 'https://www.viu.com/ott/%s/index.php' % country_code, method='POST'),
- video_id, 'Logging in', errnote=False, fatal=False,
- query={'r': 'user/login'},
+ 'https://api-gateway-global.viu.com/api/auth/login',
+ video_id, 'Logging in', headers=headers,
data=json.dumps({
- 'username': username,
+ 'email': username,
'password': password,
- 'platform_flag_label': 'web',
+ 'provider': 'email',
}).encode())
- self._user_info = self._detect_error(data)['user']
-
- return self._user_info
+ self._detect_error(data)
+ self._user_token = data.get('identity')
+        # the stored auth code must be replaced with the logged-in user's token, otherwise subsequent API requests will fail again
+ self._auth_codes[country_code] = data.get('token')
+ return self._user_token
+
+ def _get_token(self, country_code, video_id):
+ rand = ''.join(random.choice('0123456789') for _ in range(10))
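+        # the random 10-digit value in 'v' presumably acts only as a cache-buster for the token endpoint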
+ return self._download_json(
+ f'https://api-gateway-global.viu.com/api/auth/token?v={rand}000', video_id,
+ headers={'Content-Type': 'application/json'}, note='Getting bearer token',
+ data=json.dumps({
+ 'countryCode': country_code.upper(),
+ 'platform': 'browser',
+ 'platformFlagLabel': 'web',
+ 'language': 'en',
+ 'uuid': str(uuid.uuid4()),
+ 'carrierId': '0'
+ }).encode('utf-8'))['token']
def _real_extract(self, url):
url, idata = unsmuggle_url(url, {})
@@ -279,17 +277,16 @@ class ViuOTTIE(InfoExtractor):
query['area_id'] = area_id
product_data = self._download_json(
- 'http://www.viu.com/ott/%s/index.php' % country_code, video_id,
+ f'http://www.viu.com/ott/{country_code}/index.php', video_id,
'Downloading video info', query=query)['data']
video_data = product_data.get('current_product')
if not video_data:
- raise ExtractorError('This video is not available in your region.', expected=True)
+ self.raise_geo_restricted()
series_id = video_data.get('series_id')
- if not self.get_param('noplaylist') and not idata.get('force_noplaylist'):
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % series_id)
- series = product_data.get('series', {})
+ if self._yes_playlist(series_id, video_id, idata):
+ series = product_data.get('series') or {}
product = series.get('product')
if product:
entries = []
@@ -297,88 +294,78 @@ class ViuOTTIE(InfoExtractor):
item_id = entry.get('product_id')
if not item_id:
continue
- item_id = compat_str(item_id)
entries.append(self.url_result(
- smuggle_url(
- 'http://www.viu.com/ott/%s/%s/vod/%s/' % (country_code, lang_code, item_id),
- {'force_noplaylist': True}), # prevent infinite recursion
- 'ViuOTT',
- item_id,
- entry.get('synopsis', '').strip()))
+ smuggle_url(f'http://www.viu.com/ott/{country_code}/{lang_code}/vod/{item_id}/',
+ {'force_noplaylist': True}),
+ ViuOTTIE, str(item_id), entry.get('synopsis', '').strip()))
return self.playlist_result(entries, series_id, series.get('name'), series.get('description'))
- if self.get_param('noplaylist'):
- self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
-
duration_limit = False
query = {
'ccs_product_id': video_data['ccs_product_id'],
'language_flag_id': self._LANGUAGE_FLAG.get(lang_code.lower()) or '3',
}
- headers = {
- 'Referer': url,
- 'Origin': url,
- }
- try:
+
+ def download_playback():
stream_data = self._download_json(
- 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
- video_id, 'Downloading stream info', query=query, headers=headers)
- stream_data = self._detect_error(stream_data)['stream']
- except (ExtractorError, KeyError):
- stream_data = None
- if video_data.get('user_level', 0) > 0:
- user = self._login(country_code, video_id)
- if user:
- query['identity'] = user['identity']
- stream_data = self._download_json(
- 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
- video_id, 'Downloading stream info', query=query, headers=headers)
- stream_data = self._detect_error(stream_data).get('stream')
- else:
- # preview is limited to 3min for non-members
- # try to bypass the duration limit
- duration_limit = True
- query['duration'] = '180'
- stream_data = self._download_json(
- 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
- video_id, 'Downloading stream info', query=query, headers=headers)
- try:
- stream_data = self._detect_error(stream_data)['stream']
- except (ExtractorError, KeyError): # if still not working, give up
- self._raise_login_required()
+ 'https://api-gateway-global.viu.com/api/playback/distribute',
+ video_id=video_id, query=query, fatal=False, note='Downloading stream info',
+ headers={
+ 'Authorization': f'Bearer {self._auth_codes[country_code]}',
+ 'Referer': url,
+ 'Origin': url
+ })
+ return self._detect_error(stream_data).get('stream')
+
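+        # acquire an anonymous per-country bearer token before the first playback request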
+ if not self._auth_codes.get(country_code):
+ self._auth_codes[country_code] = self._get_token(country_code, video_id)
+ stream_data = None
+ try:
+ stream_data = download_playback()
+ except (ExtractorError, KeyError):
+ token = self._login(country_code, video_id)
+ if token is not None:
+ query['identity'] = token
+ else:
+                # The content is a preview or restricted to VIP members.
+                # Try to bypass the preview duration limit of 3 minutes.
+ duration_limit, query['duration'] = True, '180'
+ try:
+ stream_data = download_playback()
+ except (ExtractorError, KeyError):
+ if token is not None:
+ raise
+ self.raise_login_required(method='password')
if not stream_data:
raise ExtractorError('Cannot get stream info', expected=True)
- stream_sizes = stream_data.get('size', {})
formats = []
- for vid_format, stream_url in stream_data.get('url', {}).items():
- height = int_or_none(self._search_regex(
- r's(\d+)p', vid_format, 'height', default=None))
+ for vid_format, stream_url in (stream_data.get('url') or {}).items():
+            # int_or_none avoids a TypeError when the height pattern is absent from the format id
+            height = int_or_none(self._search_regex(r's(\d+)p', vid_format, 'height', default=None))
# bypass preview duration limit
if duration_limit:
- stream_url = compat_urlparse.urlparse(stream_url)
- query = dict(compat_urlparse.parse_qsl(stream_url.query, keep_blank_values=True))
- time_duration = int_or_none(video_data.get('time_duration'))
+ old_stream_url = urllib.parse.urlparse(stream_url)
+ query = dict(urllib.parse.parse_qsl(old_stream_url.query, keep_blank_values=True))
query.update({
- 'duration': time_duration if time_duration > 0 else '9999999',
+ 'duration': video_data.get('time_duration') or '9999999',
'duration_start': '0',
})
- stream_url = stream_url._replace(query=compat_urlparse.urlencode(query)).geturl()
+ stream_url = old_stream_url._replace(query=urllib.parse.urlencode(query)).geturl()
formats.append({
'format_id': vid_format,
'url': stream_url,
'height': height,
'ext': 'mp4',
- 'filesize': int_or_none(stream_sizes.get(vid_format))
+ 'filesize': try_get(stream_data, lambda x: x['size'][vid_format], int)
})
self._sort_formats(formats)
subtitles = {}
- for sub in video_data.get('subtitle', []):
+ for sub in video_data.get('subtitle') or []:
sub_url = sub.get('url')
if not sub_url:
continue
@@ -387,17 +374,16 @@ class ViuOTTIE(InfoExtractor):
'ext': 'srt',
})
- title = video_data['synopsis'].strip()
-
+ title = strip_or_none(video_data.get('synopsis'))
return {
'id': video_id,
'title': title,
'description': video_data.get('description'),
- 'series': product_data.get('series', {}).get('name'),
+ 'series': try_get(product_data, lambda x: x['series']['name']),
'episode': title,
'episode_number': int_or_none(video_data.get('number')),
'duration': int_or_none(stream_data.get('duration')),
- 'thumbnail': video_data.get('cover_image_url'),
+ 'thumbnail': url_or_none(video_data.get('cover_image_url')),
'formats': formats,
'subtitles': subtitles,
}
diff --git a/hypervideo_dl/extractor/vk.py b/hypervideo_dl/extractor/vk.py
index d8a9b9a..cbc3159 100644
--- a/hypervideo_dl/extractor/vk.py
+++ b/hypervideo_dl/extractor/vk.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import collections
-import functools
import re
from .common import InfoExtractor
@@ -12,7 +11,6 @@ from ..utils import (
ExtractorError,
get_element_by_class,
int_or_none,
- OnDemandPagedList,
orderedSet,
str_or_none,
str_to_int,
@@ -31,11 +29,7 @@ from .youtube import YoutubeIE
class VKBaseIE(InfoExtractor):
_NETRC_MACHINE = 'vk'
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
+ def _perform_login(self, username, password):
login_page, url_handle = self._download_webpage_handle(
'https://vk.com', None, 'Downloading login page')
@@ -51,7 +45,7 @@ class VKBaseIE(InfoExtractor):
self._apply_first_set_cookie_header(url_handle, 'remixlhk')
login_page = self._download_webpage(
- 'https://login.vk.com/?act=login', None,
+ 'https://vk.com/login', None,
note='Logging in',
data=urlencode_postdata(login_form))
@@ -59,9 +53,6 @@ class VKBaseIE(InfoExtractor):
raise ExtractorError(
'Unable to login, incorrect username and/or password', expected=True)
- def _real_initialize(self):
- self._login()
-
def _download_payload(self, path, video_id, data, fatal=True):
data['al'] = 1
code, payload = self._download_json(
@@ -87,10 +78,10 @@ class VKIE(VKBaseIE):
)
ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)|
(?:
- (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?video|
+ (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?(?:video|clip)|
(?:www\.)?daxab.com/embed/
)
- (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))?
+ (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>([\da-f]+)|(ln-[\da-zA-Z]+)))?
)
'''
_TESTS = [
@@ -182,6 +173,17 @@ class VKIE(VKBaseIE):
'skip': 'Removed',
},
{
+ 'url': 'https://vk.com/video-93049196_456239755?list=ln-cBjJ7S4jYYx3ADnmDT',
+ 'info_dict': {
+ 'id': '-93049196_456239755',
+ 'ext': 'mp4',
+ 'title': '8 серия (озвучка)',
+ 'duration': 8383,
+ 'upload_date': '20211222',
+ 'view_count': int,
+ },
+ },
+ {
# video (removed?) only available with list id
'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4',
'md5': '091287af5402239a1051c37ec7b92913',
@@ -298,6 +300,10 @@ class VKIE(VKBaseIE):
# The video is not available in your region.
'url': 'https://vk.com/video-51812607_171445436',
'only_matching': True,
+ },
+ {
+ 'url': 'https://vk.com/clip30014565_456240946',
+ 'only_matching': True,
}]
@staticmethod
@@ -434,8 +440,6 @@ class VKIE(VKBaseIE):
# 2 = live
# 3 = post live (finished live)
is_live = data.get('live') == 2
- if is_live:
- title = self._live_title(title)
timestamp = unified_timestamp(self._html_search_regex(
r'class=["\']mv_info_date[^>]+>([^<]+)(?:<|from)', info_page,
@@ -471,6 +475,13 @@ class VKIE(VKBaseIE):
})
self._sort_formats(formats)
+ subtitles = {}
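+        # each 'subs' entry provides a URL and a title whose extension determines the subtitle format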
+ for sub in data.get('subs') or {}:
+ subtitles.setdefault(sub.get('lang', 'en'), []).append({
+ 'ext': sub.get('title', '.srt').split('.')[-1],
+ 'url': url_or_none(sub.get('url')),
+ })
+
return {
'id': video_id,
'formats': formats,
@@ -484,69 +495,66 @@ class VKIE(VKBaseIE):
'like_count': int_or_none(mv_data.get('likes')),
'comment_count': int_or_none(mv_data.get('commcount')),
'is_live': is_live,
+ 'subtitles': subtitles,
}
class VKUserVideosIE(VKBaseIE):
IE_NAME = 'vk:uservideos'
IE_DESC = "VK - User's Videos"
- _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&](?:.*?\bsection=(?P<section>\w+))?|$)'
+ _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/video/@(?P<id>[^?$#/&]+)(?!\?.*\bz=video)(?:[/?#&](?:.*?\bsection=(?P<section>\w+))?|$)'
_TEMPLATE_URL = 'https://vk.com/videos'
_TESTS = [{
- 'url': 'https://vk.com/videos-767561',
+ 'url': 'https://vk.com/video/@mobidevices',
'info_dict': {
- 'id': '-767561_all',
+ 'id': '-17892518_all',
},
- 'playlist_mincount': 1150,
+ 'playlist_mincount': 1355,
}, {
- 'url': 'https://vk.com/videos-767561?section=uploaded',
+ 'url': 'https://vk.com/video/@mobidevices?section=uploaded',
'info_dict': {
- 'id': '-767561_uploaded',
+ 'id': '-17892518_uploaded',
},
- 'playlist_mincount': 425,
- }, {
- 'url': 'http://vk.com/videos205387401',
- 'only_matching': True,
- }, {
- 'url': 'http://vk.com/videos-77521',
- 'only_matching': True,
- }, {
- 'url': 'http://vk.com/videos-97664626?section=all',
- 'only_matching': True,
- }, {
- 'url': 'http://m.vk.com/videos205387401',
- 'only_matching': True,
- }, {
- 'url': 'http://new.vk.com/videos205387401',
- 'only_matching': True,
+ 'playlist_mincount': 182,
}]
- _PAGE_SIZE = 1000
_VIDEO = collections.namedtuple('Video', ['owner_id', 'id'])
- def _fetch_page(self, page_id, section, page):
- l = self._download_payload('al_video', page_id, {
+ def _entries(self, page_id, section):
+ video_list_json = self._download_payload('al_video', page_id, {
'act': 'load_videos_silent',
- 'offset': page * self._PAGE_SIZE,
+ 'offset': 0,
'oid': page_id,
'section': section,
- })[0][section]['list']
-
- for video in l:
- v = self._VIDEO._make(video[:2])
- video_id = '%d_%d' % (v.owner_id, v.id)
- yield self.url_result(
- 'http://vk.com/video' + video_id, VKIE.ie_key(), video_id)
+ })[0][section]
+ count = video_list_json['count']
+ total = video_list_json['total']
+ video_list = video_list_json['list']
+
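+        # page through the list via 'offset': 'count' accumulates the videos fetched so far and the loop stops once it reaches 'total'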
+ while True:
+ for video in video_list:
+ v = self._VIDEO._make(video[:2])
+ video_id = '%d_%d' % (v.owner_id, v.id)
+ yield self.url_result(
+ 'http://vk.com/video' + video_id, VKIE.ie_key(), video_id)
+ if count >= total:
+ break
+ video_list_json = self._download_payload('al_video', page_id, {
+ 'act': 'load_videos_silent',
+ 'offset': count,
+ 'oid': page_id,
+ 'section': section,
+ })[0][section]
+ count += video_list_json['count']
+ video_list = video_list_json['list']
def _real_extract(self, url):
- page_id, section = self._match_valid_url(url).groups()
+ u_id, section = self._match_valid_url(url).groups()
+ webpage = self._download_webpage(url, u_id)
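+        # the /video/@nickname URLs carry no numeric owner id, so scrape it from the page's data-owner-id attribute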
+ page_id = self._search_regex(r'data-owner-id\s?=\s?"([^"]+)"', webpage, 'page_id')
if not section:
section = 'all'
- entries = OnDemandPagedList(
- functools.partial(self._fetch_page, page_id, section),
- self._PAGE_SIZE)
-
- return self.playlist_result(entries, '%s_%s' % (page_id, section))
+ return self.playlist_result(self._entries(page_id, section), '%s_%s' % (page_id, section))
class VKWallPostIE(VKBaseIE):
@@ -673,7 +681,7 @@ class VKWallPostIE(VKBaseIE):
'artist': performer,
'track': title,
'ext': 'mp4',
- 'protocol': 'm3u8',
+ 'protocol': 'm3u8_native',
})
for video in re.finditer(
diff --git a/hypervideo_dl/extractor/vlive.py b/hypervideo_dl/extractor/vlive.py
index 84f51a5..ae35c97 100644
--- a/hypervideo_dl/extractor/vlive.py
+++ b/hypervideo_dl/extractor/vlive.py
@@ -12,22 +12,65 @@ from ..compat import (
from ..utils import (
ExtractorError,
int_or_none,
+ LazyList,
merge_dicts,
str_or_none,
strip_or_none,
try_get,
urlencode_postdata,
+ url_or_none,
)
class VLiveBaseIE(NaverBaseIE):
- _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'
+ _NETRC_MACHINE = 'vlive'
+ _logged_in = False
+
+ def _perform_login(self, username, password):
+ if self._logged_in:
+ return
+ LOGIN_URL = 'https://www.vlive.tv/auth/email/login'
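+        # an initial GET is required to obtain session cookies before posting the credentials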
+ self._request_webpage(
+ LOGIN_URL, None, note='Downloading login cookies')
+
+ self._download_webpage(
+ LOGIN_URL, None, note='Logging in',
+ data=urlencode_postdata({'email': username, 'pwd': password}),
+ headers={
+ 'Referer': LOGIN_URL,
+ 'Content-Type': 'application/x-www-form-urlencoded'
+ })
+
+ login_info = self._download_json(
+ 'https://www.vlive.tv/auth/loginInfo', None,
+ note='Checking login status',
+ headers={'Referer': 'https://www.vlive.tv/home'})
+
+ if not try_get(login_info, lambda x: x['message']['login'], bool):
+ raise ExtractorError('Unable to log in', expected=True)
+ VLiveBaseIE._logged_in = True
+
+ def _call_api(self, path_template, video_id, fields=None, query_add={}, note=None):
+ if note is None:
+ note = 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0]
+ query = {'appId': '8c6cc7b45d2568fb668be6e05b6e5a3b', 'gcc': 'KR', 'platformType': 'PC'}
+ if fields:
+ query['fields'] = fields
+ if query_add:
+ query.update(query_add)
+ try:
+ return self._download_json(
+ 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id,
+ note, headers={'Referer': 'https://www.vlive.tv/'}, query=query)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ self.raise_login_required(json.loads(e.cause.read().decode('utf-8'))['message'])
+ raise
class VLiveIE(VLiveBaseIE):
IE_NAME = 'vlive'
_VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|embed)/(?P<id>[0-9]+)'
- _NETRC_MACHINE = 'vlive'
_TESTS = [{
'url': 'http://www.vlive.tv/video/1326',
'md5': 'cc7314812855ce56de70a06a27314983',
@@ -38,6 +81,12 @@ class VLiveIE(VLiveBaseIE):
'creator': "Girl's Day",
'view_count': int,
'uploader_id': 'muploader_a',
+ 'upload_date': '20150817',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+ 'timestamp': 1439816449,
+ },
+ 'params': {
+ 'skip_download': True,
},
}, {
'url': 'http://www.vlive.tv/video/16937',
@@ -49,6 +98,9 @@ class VLiveIE(VLiveBaseIE):
'view_count': int,
'subtitles': 'mincount:12',
'uploader_id': 'muploader_j',
+ 'upload_date': '20161112',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+ 'timestamp': 1478923074,
},
'params': {
'skip_download': True,
@@ -81,53 +133,6 @@ class VLiveIE(VLiveBaseIE):
'playlist_mincount': 120
}]
- def _real_initialize(self):
- self._login()
-
- def _login(self):
- email, password = self._get_login_info()
- if None in (email, password):
- return
-
- def is_logged_in():
- login_info = self._download_json(
- 'https://www.vlive.tv/auth/loginInfo', None,
- note='Downloading login info',
- headers={'Referer': 'https://www.vlive.tv/home'})
- return try_get(
- login_info, lambda x: x['message']['login'], bool) or False
-
- LOGIN_URL = 'https://www.vlive.tv/auth/email/login'
- self._request_webpage(
- LOGIN_URL, None, note='Downloading login cookies')
-
- self._download_webpage(
- LOGIN_URL, None, note='Logging in',
- data=urlencode_postdata({'email': email, 'pwd': password}),
- headers={
- 'Referer': LOGIN_URL,
- 'Content-Type': 'application/x-www-form-urlencoded'
- })
-
- if not is_logged_in():
- raise ExtractorError('Unable to log in', expected=True)
-
- def _call_api(self, path_template, video_id, fields=None, limit=None):
- query = {'appId': self._APP_ID, 'gcc': 'KR', 'platformType': 'PC'}
- if fields:
- query['fields'] = fields
- if limit:
- query['limit'] = limit
- try:
- return self._download_json(
- 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id,
- 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0],
- headers={'Referer': 'https://www.vlive.tv/'}, query=query)
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- self.raise_login_required(json.loads(e.cause.read().decode('utf-8'))['message'])
- raise
-
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -135,30 +140,24 @@ class VLiveIE(VLiveBaseIE):
'post/v1.0/officialVideoPost-%s', video_id,
'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId},playlist{playlistSeq,totalCount,name}')
- playlist = post.get('playlist')
- if not playlist or self.get_param('noplaylist'):
- if playlist:
- self.to_screen(
- 'Downloading just video %s because of --no-playlist'
- % video_id)
-
+ playlist_id = str_or_none(try_get(post, lambda x: x['playlist']['playlistSeq']))
+ if not self._yes_playlist(playlist_id, video_id):
video = post['officialVideo']
return self._get_vlive_info(post, video, video_id)
- else:
- playlist_name = playlist.get('name')
- playlist_id = str_or_none(playlist.get('playlistSeq'))
- playlist_count = str_or_none(playlist.get('totalCount'))
- playlist = self._call_api(
- 'playlist/v1.0/playlist-%s/posts', playlist_id, 'data', limit=playlist_count)
+ playlist_name = str_or_none(try_get(post, lambda x: x['playlist']['name']))
+ playlist_count = str_or_none(try_get(post, lambda x: x['playlist']['totalCount']))
- entries = []
- for video_data in playlist['data']:
- video = video_data.get('officialVideo')
- video_id = str_or_none(video.get('videoSeq'))
- entries.append(self._get_vlive_info(video_data, video, video_id))
+ playlist = self._call_api(
+ 'playlist/v1.0/playlist-%s/posts', playlist_id, 'data', {'limit': playlist_count})
- return self.playlist_result(entries, playlist_id, playlist_name)
+ entries = []
+ for video_data in playlist['data']:
+ video = video_data.get('officialVideo')
+ video_id = str_or_none(video.get('videoSeq'))
+ entries.append(self._get_vlive_info(video_data, video, video_id))
+
+ return self.playlist_result(entries, playlist_id, playlist_name)
def _get_vlive_info(self, post, video, video_id):
def get_common_fields():
@@ -172,6 +171,8 @@ class VLiveIE(VLiveBaseIE):
'view_count': int_or_none(video.get('playCount')),
'like_count': int_or_none(video.get('likeCount')),
'comment_count': int_or_none(video.get('commentCount')),
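+                # 'createdAt' is in milliseconds; scale=1000 converts it to epoch seconds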
+ 'timestamp': int_or_none(video.get('createdAt'), scale=1000),
+ 'thumbnail': video.get('thumb'),
}
video_type = video.get('type')
@@ -197,7 +198,7 @@ class VLiveIE(VLiveBaseIE):
self._sort_formats(formats)
info = get_common_fields()
info.update({
- 'title': self._live_title(video['title']),
+ 'title': video['title'],
'id': video_id,
'formats': formats,
'is_live': True,
@@ -216,7 +217,7 @@ class VLiveIE(VLiveBaseIE):
raise ExtractorError('Unknown status ' + status)
-class VLivePostIE(VLiveIE):
+class VLivePostIE(VLiveBaseIE):
IE_NAME = 'vlive:post'
_VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/post/(?P<id>\d-\d+)'
_TESTS = [{
@@ -238,8 +239,6 @@ class VLivePostIE(VLiveIE):
'playlist_count': 1,
}]
_FVIDEO_TMPL = 'fvideo/v1.0/fvideo-%%s/%s'
- _SOS_TMPL = _FVIDEO_TMPL % 'sosPlayInfo'
- _INKEY_TMPL = _FVIDEO_TMPL % 'inKey'
def _real_extract(self, url):
post_id = self._match_id(url)
@@ -266,7 +265,7 @@ class VLivePostIE(VLiveIE):
entry = None
if upload_type == 'SOS':
download = self._call_api(
- self._SOS_TMPL, video_id)['videoUrl']['download']
+ self._FVIDEO_TMPL % 'sosPlayInfo', video_id)['videoUrl']['download']
formats = []
for f_id, f_url in download.items():
formats.append({
@@ -284,7 +283,7 @@ class VLivePostIE(VLiveIE):
vod_id = upload_info.get('videoId')
if not vod_id:
continue
- inkey = self._call_api(self._INKEY_TMPL, video_id)['inKey']
+ inkey = self._call_api(self._FVIDEO_TMPL % 'inKey', video_id)['inKey']
entry = self._extract_video_info(video_id, vod_id, inkey)
if entry:
entry['title'] = '%s_part%s' % (title, idx)
@@ -295,7 +294,7 @@ class VLivePostIE(VLiveIE):
class VLiveChannelIE(VLiveBaseIE):
IE_NAME = 'vlive:channel'
- _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<id>[0-9A-Z]+)'
+ _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<channel_id>[0-9A-Z]+)(?:/board/(?P<posts_id>\d+))?'
_TESTS = [{
'url': 'http://channels.vlive.tv/FCD4B',
'info_dict': {
@@ -306,78 +305,57 @@ class VLiveChannelIE(VLiveBaseIE):
}, {
'url': 'https://www.vlive.tv/channel/FCD4B',
'only_matching': True,
+ }, {
+ 'url': 'https://www.vlive.tv/channel/FCD4B/board/3546',
+ 'info_dict': {
+ 'id': 'FCD4B-3546',
+ 'title': 'MAMAMOO - Star Board',
+ },
+ 'playlist_mincount': 880
}]
- def _call_api(self, path, channel_key_suffix, channel_value, note, query):
- q = {
- 'app_id': self._APP_ID,
- 'channel' + channel_key_suffix: channel_value,
- }
- q.update(query)
- return self._download_json(
- 'http://api.vfan.vlive.tv/vproxy/channelplus/' + path,
- channel_value, note='Downloading ' + note, query=q)['result']
-
- def _real_extract(self, url):
- channel_code = self._match_id(url)
-
- channel_seq = self._call_api(
- 'decodeChannelCode', 'Code', channel_code,
- 'decode channel code', {})['channelSeq']
-
- channel_name = None
- entries = []
+ def _entries(self, posts_id, board_name):
+ if board_name:
+ posts_path = 'post/v1.0/board-%s/posts'
+ query_add = {'limit': 100, 'sortType': 'LATEST'}
+ else:
+ posts_path = 'post/v1.0/channel-%s/starPosts'
+ query_add = {'limit': 100}
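+            # without an explicit board id, fall back to the channel-wide star posts feed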
for page_num in itertools.count(1):
video_list = self._call_api(
- 'getChannelVideoList', 'Seq', channel_seq,
- 'channel list page #%d' % page_num, {
- # Large values of maxNumOfRows (~300 or above) may cause
- # empty responses (see [1]), e.g. this happens for [2] that
- # has more than 300 videos.
- # 1. https://github.com/ytdl-org/youtube-dl/issues/13830
- # 2. http://channels.vlive.tv/EDBF.
- 'maxNumOfRows': 100,
- 'pageNo': page_num
- }
- )
-
- if not channel_name:
- channel_name = try_get(
- video_list,
- lambda x: x['channelInfo']['channelName'],
- compat_str)
+ posts_path, posts_id, 'channel{channelName},contentType,postId,title,url', query_add,
+ note=f'Downloading playlist page {page_num}')
+
+ for video in try_get(video_list, lambda x: x['data'], list) or []:
+                # str_or_none keeps a missing postId falsy; str(None) would yield the truthy 'None'
+                video_id = str_or_none(video.get('postId'))
+ video_title = str_or_none(video.get('title'))
+ video_url = url_or_none(video.get('url'))
+ if not all((video_id, video_title, video_url)) or video.get('contentType') != 'VIDEO':
+ continue
+ channel_name = try_get(video, lambda x: x['channel']['channelName'], compat_str)
+ yield self.url_result(video_url, VLivePostIE.ie_key(), video_id, video_title, channel=channel_name)
- videos = try_get(
- video_list, lambda x: x['videoList'], list)
- if not videos:
+ after = try_get(video_list, lambda x: x['paging']['nextParams']['after'], compat_str)
+ if not after:
break
+ query_add['after'] = after
+
+ def _real_extract(self, url):
+ channel_id, posts_id = self._match_valid_url(url).groups()
- for video in videos:
- video_id = video.get('videoSeq')
- video_type = video.get('videoType')
+ board_name = None
+ if posts_id:
+ board = self._call_api(
+ 'board/v1.0/board-%s', posts_id, 'title,boardType')
+ board_name = board.get('title') or 'Unknown'
+ if board.get('boardType') not in ('STAR', 'VLIVE_PLUS'):
+ raise ExtractorError(f'Board {board_name!r} is not supported', expected=True)
- if not video_id or not video_type:
- continue
- video_id = compat_str(video_id)
-
- if video_type in ('PLAYLIST'):
- first_video_id = try_get(
- video,
- lambda x: x['videoPlaylist']['videoList'][0]['videoSeq'], int)
-
- if not first_video_id:
- continue
-
- entries.append(
- self.url_result(
- 'http://www.vlive.tv/video/%s' % first_video_id,
- ie=VLiveIE.ie_key(), video_id=first_video_id))
- else:
- entries.append(
- self.url_result(
- 'http://www.vlive.tv/video/%s' % video_id,
- ie=VLiveIE.ie_key(), video_id=video_id))
+ entries = LazyList(self._entries(posts_id or channel_id, board_name))
+ channel_name = entries[0]['channel']
return self.playlist_result(
- entries, channel_code, channel_name)
+ entries,
+ f'{channel_id}-{posts_id}' if posts_id else channel_id,
+ f'{channel_name} - {board_name}' if channel_name and board_name else channel_name)
diff --git a/hypervideo_dl/extractor/voicy.py b/hypervideo_dl/extractor/voicy.py
index 11ebe76..37c7d56 100644
--- a/hypervideo_dl/extractor/voicy.py
+++ b/hypervideo_dl/extractor/voicy.py
@@ -6,9 +6,10 @@ from ..compat import compat_str
from ..utils import (
ExtractorError,
smuggle_url,
+ str_or_none,
traverse_obj,
- unsmuggle_url,
unified_strdate,
+ unsmuggle_url,
)
import itertools
@@ -25,9 +26,9 @@ class VoicyBaseIE(InfoExtractor):
'id': voice_id,
'title': compat_str(value.get('PlaylistName')),
'uploader': value.get('SpeakerName'),
- 'uploader_id': compat_str(value.get('SpeakerId')),
+ 'uploader_id': str_or_none(value.get('SpeakerId')),
'channel': value.get('ChannelName'),
- 'channel_id': compat_str(value.get('ChannelId')),
+ 'channel_id': str_or_none(value.get('ChannelId')),
'upload_date': upload_date,
}
diff --git a/hypervideo_dl/extractor/voot.py b/hypervideo_dl/extractor/voot.py
index e2944ec..a9b66b9 100644
--- a/hypervideo_dl/extractor/voot.py
+++ b/hypervideo_dl/extractor/voot.py
@@ -15,7 +15,7 @@ class VootIE(InfoExtractor):
_VALID_URL = r'''(?x)
(?:
voot:|
- (?:https?://)(?:www\.)?voot\.com/?
+ https?://(?:www\.)?voot\.com/?
(?:
movies/[^/]+/|
(?:shows|kids)/(?:[^/]+/){4}
diff --git a/hypervideo_dl/extractor/vrv.py b/hypervideo_dl/extractor/vrv.py
index 4196021..00e1006 100644
--- a/hypervideo_dl/extractor/vrv.py
+++ b/hypervideo_dl/extractor/vrv.py
@@ -19,6 +19,7 @@ from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
+ join_nonempty,
traverse_obj,
)
@@ -84,7 +85,30 @@ class VRVBaseIE(InfoExtractor):
'resource_key': resource_key,
})['__links__']['cms_resource']['href']
- def _real_initialize(self):
+ def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang):
+ if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'):
+ return []
+ format_id = join_nonempty(
+ stream_format,
+ audio_lang and 'audio-%s' % audio_lang,
+ hardsub_lang and 'hardsub-%s' % hardsub_lang)
+ if 'hls' in stream_format:
+ adaptive_formats = self._extract_m3u8_formats(
+ url, video_id, 'mp4', m3u8_id=format_id,
+ note='Downloading %s information' % format_id,
+ fatal=False)
+ elif stream_format == 'dash':
+ adaptive_formats = self._extract_mpd_formats(
+ url, video_id, mpd_id=format_id,
+ note='Downloading %s information' % format_id,
+ fatal=False)
+ if audio_lang:
+ for f in adaptive_formats:
+ if f.get('acodec') != 'none':
+ f['language'] = audio_lang
+ return adaptive_formats
+
+ def _set_api_params(self):
webpage = self._download_webpage(
'https://vrv.co/', None, headers=self.geo_verification_headers())
self._API_PARAMS = self._parse_json(self._search_regex(
@@ -123,47 +147,17 @@ class VRVIE(VRVBaseIE):
}]
_NETRC_MACHINE = 'vrv'
- def _real_initialize(self):
- super(VRVIE, self)._real_initialize()
-
- email, password = self._get_login_info()
- if email is None:
- return
-
+ def _perform_login(self, username, password):
token_credentials = self._call_api(
'authenticate/by:credentials', None, 'Token Credentials', data={
- 'email': email,
+ 'email': username,
'password': password,
})
self._TOKEN = token_credentials['oauth_token']
self._TOKEN_SECRET = token_credentials['oauth_token_secret']
- def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang):
- if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'):
- return []
- stream_id_list = []
- if audio_lang:
- stream_id_list.append('audio-%s' % audio_lang)
- if hardsub_lang:
- stream_id_list.append('hardsub-%s' % hardsub_lang)
- format_id = stream_format
- if stream_id_list:
- format_id += '-' + '-'.join(stream_id_list)
- if 'hls' in stream_format:
- adaptive_formats = self._extract_m3u8_formats(
- url, video_id, 'mp4', m3u8_id=format_id,
- note='Downloading %s information' % format_id,
- fatal=False)
- elif stream_format == 'dash':
- adaptive_formats = self._extract_mpd_formats(
- url, video_id, mpd_id=format_id,
- note='Downloading %s information' % format_id,
- fatal=False)
- if audio_lang:
- for f in adaptive_formats:
- if f.get('acodec') != 'none':
- f['language'] = audio_lang
- return adaptive_formats
+ def _initialize_pre_login(self):
+ return self._set_api_params()
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -258,6 +252,9 @@ class VRVSeriesIE(VRVBaseIE):
'playlist_mincount': 11,
}
+ def _initialize_pre_login(self):
+ return self._set_api_params()
+
def _real_extract(self, url):
series_id = self._match_id(url)
diff --git a/hypervideo_dl/extractor/vshare.py b/hypervideo_dl/extractor/vshare.py
index c631ac1..b4874ac 100644
--- a/hypervideo_dl/extractor/vshare.py
+++ b/hypervideo_dl/extractor/vshare.py
@@ -50,8 +50,7 @@ class VShareIE(InfoExtractor):
'https://vshare.io/v/%s/width-650/height-430/1' % video_id,
video_id, headers={'Referer': url})
- title = self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
title = title.split(' - ')[0]
error = self._html_search_regex(
diff --git a/hypervideo_dl/extractor/vupload.py b/hypervideo_dl/extractor/vupload.py
index 9846aba..b561f63 100644
--- a/hypervideo_dl/extractor/vupload.py
+++ b/hypervideo_dl/extractor/vupload.py
@@ -7,6 +7,7 @@ from ..utils import (
parse_filesize,
extract_attributes,
int_or_none,
+ js_to_json
)
@@ -27,9 +28,12 @@ class VuploadIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
- video_e = self._html_search_regex(r'\|([a-z0-9]{60})\|', webpage, 'video')
- video_url = f'https://wurize.megaupload.to/{video_e}/v.mp4'
+ title = self._html_extract_title(webpage)
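+        # the page embeds a JavaScript 'sources' array; js_to_json turns it into strict JSON for parsing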
+ video_json = self._parse_json(self._html_search_regex(r'sources:\s*(.+?]),', webpage, 'video'), video_id, transform_source=js_to_json)
+ formats = []
+ for source in video_json:
+ if source['src'].endswith('.m3u8'):
+ formats.extend(self._extract_m3u8_formats(source['src'], video_id, m3u8_id='hls'))
duration = parse_duration(self._html_search_regex(
r'<i\s*class=["\']fad\s*fa-clock["\']></i>\s*([\d:]+)\s*</div>', webpage, 'duration', fatal=False))
filesize_approx = parse_filesize(self._html_search_regex(
@@ -40,7 +44,7 @@ class VuploadIE(InfoExtractor):
return {
'id': video_id,
- 'url': video_url,
+ 'formats': formats,
'duration': duration,
'filesize_approx': filesize_approx,
'width': int_or_none(extra_video_info.get('width')),
diff --git a/hypervideo_dl/extractor/vyborymos.py b/hypervideo_dl/extractor/vyborymos.py
index 9e703c4..4d93666 100644
--- a/hypervideo_dl/extractor/vyborymos.py
+++ b/hypervideo_dl/extractor/vyborymos.py
@@ -44,11 +44,11 @@ class VyboryMosIE(InfoExtractor):
info = self._download_json(
'http://vybory.mos.ru/json/voting_stations/%s/%s.json'
% (compat_str(station_id)[:3], station_id),
- station_id, 'Downloading station JSON', fatal=False)
+ station_id, 'Downloading station JSON', fatal=False) or {}
return {
'id': station_id,
- 'title': self._live_title(info['name'] if info else station_id),
+ 'title': info.get('name') or station_id,
'description': info.get('address'),
'is_live': True,
'formats': formats,
diff --git a/hypervideo_dl/extractor/wakanim.py b/hypervideo_dl/extractor/wakanim.py
index c956d61..a70a719 100644
--- a/hypervideo_dl/extractor/wakanim.py
+++ b/hypervideo_dl/extractor/wakanim.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+from urllib.parse import unquote
+
from .common import InfoExtractor
from ..utils import (
merge_dicts,
@@ -23,7 +25,6 @@ class WakanimIE(InfoExtractor):
'episode_number': 2,
},
'params': {
- 'format': 'bestvideo',
'skip_download': True,
},
}, {
@@ -31,26 +32,37 @@ class WakanimIE(InfoExtractor):
'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/7843/sword-art-online-alicization-omu-arc-2-folge-15-omu',
'only_matching': True,
}]
+ _GEO_BYPASS = False
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- m3u8_url = urljoin(url, self._search_regex(
- r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'm3u8 url',
+ if 'Geoblocking' in webpage:
+ if '/de/' in url:
+ self.raise_geo_restricted(countries=['DE', 'AT', 'CH'])
+ else:
+ self.raise_geo_restricted(countries=['RU'])
+
+ manifest_url = urljoin(url, self._search_regex(
+ r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'manifest url',
group='url'))
if not self.get_param('allow_unplayable_formats'):
# https://docs.microsoft.com/en-us/azure/media-services/previous/media-services-content-protection-overview#streaming-urls
encryption = self._search_regex(
r'encryption%3D(c(?:enc|bc(?:s-aapl)?))',
- m3u8_url, 'encryption', default=None)
+ manifest_url, 'encryption', default=None)
if encryption in ('cenc', 'cbcs-aapl'):
self.report_drm(video_id)
- formats = self._extract_m3u8_formats(
- m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls')
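+        # a 'format=mpd-time-cmaf' parameter in the (percent-encoded) manifest URL indicates a DASH manifest; otherwise assume HLS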
+ if 'format=mpd-time-cmaf' in unquote(manifest_url):
+ formats = self._extract_mpd_formats(
+ manifest_url, video_id, mpd_id='dash')
+ else:
+ formats = self._extract_m3u8_formats(
+ manifest_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls')
info = self._search_json_ld(webpage, video_id, default={})
diff --git a/hypervideo_dl/extractor/wasdtv.py b/hypervideo_dl/extractor/wasdtv.py
new file mode 100644
index 0000000..38c10dc
--- /dev/null
+++ b/hypervideo_dl/extractor/wasdtv.py
@@ -0,0 +1,161 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ parse_iso8601,
+ traverse_obj,
+ try_get,
+)
+
+
+class WASDTVBaseIE(InfoExtractor):
+
+ def _fetch(self, path, video_id, description, query={}):
+ response = self._download_json(
+ f'https://wasd.tv/api/{path}', video_id, query=query,
+ note=f'Downloading {description} metadata',
+ errnote=f'Unable to download {description} metadata')
+ error = response.get('error')
+ if error:
+ raise ExtractorError(f'{self.IE_NAME} returned error: {error}', expected=True)
+ return response.get('result')
+
+ def _extract_thumbnails(self, thumbnails_dict):
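+        # preference mirrors the small -> medium -> large ordering, so larger previews rank higher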
+ return [{
+ 'url': url,
+ 'preference': index,
+ } for index, url in enumerate(
+ traverse_obj(thumbnails_dict, (('small', 'medium', 'large'),))) if url]
+
+ def _real_extract(self, url):
+ container = self._get_container(url)
+ stream = traverse_obj(container, ('media_container_streams', 0))
+ media = try_get(stream, lambda x: x['stream_media'][0])
+ if not media:
+            raise ExtractorError('Cannot extract media data.', expected=True)
+ media_meta = media.get('media_meta')
+ media_url, is_live = self._get_media_url(media_meta)
+ video_id = media.get('media_id') or container.get('media_container_id')
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(media_url, video_id, 'mp4')
+ self._sort_formats(formats)
+ return {
+ 'id': str(video_id),
+ 'title': container.get('media_container_name') or self._og_search_title(self._download_webpage(url, video_id)),
+ 'description': container.get('media_container_description'),
+ 'thumbnails': self._extract_thumbnails(media_meta.get('media_preview_images')),
+ 'timestamp': parse_iso8601(container.get('created_at')),
+ 'view_count': int_or_none(stream.get('stream_current_viewers' if is_live else 'stream_total_viewers')),
+ 'is_live': is_live,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+ def _get_container(self, url):
+        raise NotImplementedError('Subclasses must implement this method to return the media container')
+
+ def _get_media_url(self, media_meta):
+        raise NotImplementedError('Subclasses must implement this method to return the media URL')
+
+
+class WASDTVStreamIE(WASDTVBaseIE):
+ IE_NAME = 'wasdtv:stream'
+ _VALID_URL = r'https?://wasd\.tv/(?P<id>[^/#?]+)$'
+ _TESTS = [{
+ 'url': 'https://wasd.tv/24_7',
+ 'info_dict': {
+ 'id': '559738',
+ 'ext': 'mp4',
+ 'title': 'Live 24/7 Music',
+ 'description': '24&#x2F;7 Music',
+ 'timestamp': int,
+ 'upload_date': r're:^\d{8}$',
+ 'is_live': True,
+ 'view_count': int,
+ },
+ }]
+
+ def _get_container(self, url):
+ nickname = self._match_id(url)
+ channel = self._fetch(f'channels/nicknames/{nickname}', video_id=nickname, description='channel')
+ channel_id = channel.get('channel_id')
+ containers = self._fetch(
+ 'v2/media-containers', channel_id, 'running media containers',
+ query={
+ 'channel_id': channel_id,
+ 'media_container_type': 'SINGLE',
+ 'media_container_status': 'RUNNING',
+ })
+ if not containers:
+ raise ExtractorError(f'{nickname} is offline', expected=True)
+ return containers[0]
+
+ def _get_media_url(self, media_meta):
+ return media_meta['media_url'], True
+
+
+class WASDTVRecordIE(WASDTVBaseIE):
+ IE_NAME = 'wasdtv:record'
+ _VALID_URL = r'https?://wasd\.tv/[^/#?]+/videos\?record=(?P<id>\d+)$'
+ _TESTS = [{
+ 'url': 'https://wasd.tv/spacemita/videos?record=907755',
+ 'md5': 'c9899dd85be4cc997816ff9f9ca516ce',
+ 'info_dict': {
+ 'id': '906825',
+ 'ext': 'mp4',
+ 'title': 'Музыкальный',
+ 'description': 'md5:f510388d929ff60ae61d4c3cab3137cc',
+ 'timestamp': 1645812079,
+ 'upload_date': '20220225',
+ 'thumbnail': r're:^https?://.+\.jpg',
+ 'is_live': False,
+ 'view_count': int,
+ },
+ }]
+
+ def _get_container(self, url):
+ container_id = self._match_id(url)
+ return self._fetch(
+ f'v2/media-containers/{container_id}', container_id, 'media container')
+
+ def _get_media_url(self, media_meta):
+ media_archive_url = media_meta.get('media_archive_url')
+ if media_archive_url:
+ return media_archive_url, False
+ return media_meta['media_url'], True
+
+
+class WASDTVClipIE(WASDTVBaseIE):
+ IE_NAME = 'wasdtv:clip'
+ _VALID_URL = r'https?://wasd\.tv/[^/#?]+/clips\?clip=(?P<id>\d+)$'
+ _TESTS = [{
+ 'url': 'https://wasd.tv/spacemita/clips?clip=26804',
+ 'md5': '818885e720143d7a4e776ff66fcff148',
+ 'info_dict': {
+ 'id': '26804',
+ 'ext': 'mp4',
+ 'title': 'Пуш флексит на голове стримера',
+ 'timestamp': 1646682908,
+ 'upload_date': '20220307',
+ 'thumbnail': r're:^https?://.+\.jpg',
+ 'view_count': int,
+ },
+ }]
+
+ def _real_extract(self, url):
+ clip_id = self._match_id(url)
+ clip = self._fetch(f'v2/clips/{clip_id}', video_id=clip_id, description='clip')
+ clip_data = clip.get('clip_data')
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(clip_data.get('url'), video_id=clip_id, ext='mp4')
+ self._sort_formats(formats)
+ return {
+ 'id': clip_id,
+ 'title': clip.get('clip_title') or self._og_search_title(self._download_webpage(url, clip_id, fatal=False)),
+ 'thumbnails': self._extract_thumbnails(clip_data.get('preview')),
+ 'timestamp': parse_iso8601(clip.get('created_at')),
+ 'view_count': int_or_none(clip.get('clip_views_count')),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
diff --git a/hypervideo_dl/extractor/washingtonpost.py b/hypervideo_dl/extractor/washingtonpost.py
index 8afb1af..9d6ae28 100644
--- a/hypervideo_dl/extractor/washingtonpost.py
+++ b/hypervideo_dl/extractor/washingtonpost.py
@@ -5,6 +5,8 @@ import re
from .common import InfoExtractor
+from ..utils import traverse_obj
+
class WashingtonPostIE(InfoExtractor):
IE_NAME = 'washingtonpost'
@@ -50,7 +52,7 @@ class WashingtonPostArticleIE(InfoExtractor):
'title': 'Sinkhole of bureaucracy',
},
'playlist': [{
- 'md5': 'b9be794ceb56c7267d410a13f99d801a',
+ 'md5': '7ccf53ea8cbb77de5f570242b3b21a59',
'info_dict': {
'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f',
'ext': 'mp4',
@@ -59,9 +61,10 @@ class WashingtonPostArticleIE(InfoExtractor):
'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.',
'timestamp': 1395440416,
'upload_date': '20140321',
+ 'thumbnail': r're:https://[^.]+\.cloudfront\.net/PAPERMINESplash\.jpg',
},
}, {
- 'md5': '1fff6a689d8770966df78c8cb6c8c17c',
+ 'md5': '7ccf53ea8cbb77de5f570242b3b21a59',
'info_dict': {
'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f',
'ext': 'mp4',
@@ -70,6 +73,7 @@ class WashingtonPostArticleIE(InfoExtractor):
'duration': 2220,
'timestamp': 1395441819,
'upload_date': '20140321',
+ 'thumbnail': r're:https://[^.]+\.cloudfront\.net/BoyersSplash\.jpeg',
},
}],
}, {
@@ -88,7 +92,11 @@ class WashingtonPostArticleIE(InfoExtractor):
'timestamp': 1419972442,
'title': 'Why black boxes don’t transmit data in real time',
}
- }]
+ }],
+ 'skip': "Doesn't have a video anymore",
+ }, {
+ 'url': 'https://www.washingtonpost.com/nation/2021/08/05/dixie-river-fire-california-climate/',
+ 'only_matching': True,
}]
@classmethod
@@ -106,6 +114,13 @@ class WashingtonPostArticleIE(InfoExtractor):
<div\s+class="posttv-video-embed[^>]*?data-uuid=|
data-video-uuid=
)"([^"]+)"''', webpage)
+
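+ # Newer article pages embed videos only in the Next.js payload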
+ if not uuids:
+ json_data = self._search_nextjs_data(webpage, page_id)
+ for content_element in traverse_obj(json_data, ('props', 'pageProps', 'globalContent', 'content_elements')):
+ if content_element.get('type') == 'video':
+ uuids.append(content_element.get('_id'))
+
entries = [self.url_result('washingtonpost:%s' % uuid, 'WashingtonPost', uuid) for uuid in uuids]
return {
diff --git a/hypervideo_dl/extractor/watchbox.py b/hypervideo_dl/extractor/watchbox.py
index 7469fe9..d19d801 100644
--- a/hypervideo_dl/extractor/watchbox.py
+++ b/hypervideo_dl/extractor/watchbox.py
@@ -30,7 +30,6 @@ class WatchBoxIE(InfoExtractor):
'release_year': 2009,
},
'params': {
- 'format': 'bestvideo',
'skip_download': True,
},
'expected_warnings': ['Failed to download m3u8 information'],
@@ -52,7 +51,6 @@ class WatchBoxIE(InfoExtractor):
'episode_number': 1,
},
'params': {
- 'format': 'bestvideo',
'skip_download': True,
},
'expected_warnings': ['Failed to download m3u8 information'],
diff --git a/hypervideo_dl/extractor/wdr.py b/hypervideo_dl/extractor/wdr.py
index f54aa6f..ef58a66 100644
--- a/hypervideo_dl/extractor/wdr.py
+++ b/hypervideo_dl/extractor/wdr.py
@@ -10,6 +10,7 @@ from ..compat import (
)
from ..utils import (
determine_ext,
+ dict_get,
ExtractorError,
js_to_json,
strip_jsonp,
@@ -22,9 +23,14 @@ from ..utils import (
class WDRIE(InfoExtractor):
- _VALID_URL = r'https?://deviceids-medp\.wdr\.de/ondemand/\d+/(?P<id>\d+)\.js'
+ __API_URL_TPL = '//deviceids-medp.wdr.de/ondemand/%s/%s'
+ _VALID_URL = r'''(?x)https?://
+ (?:deviceids-medp\.wdr\.de/ondemand/\d+/|
+ kinder\.wdr\.de/(?!mediathek/)[^#?]+-)
+ (?P<id>\d+)\.(?:js|assetjsonp)
+ '''
_GEO_COUNTRIES = ['DE']
- _TEST = {
+ _TESTS = [{
'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js',
'info_dict': {
'id': 'mdb-1557833',
@@ -32,11 +38,19 @@ class WDRIE(InfoExtractor):
'title': 'Biathlon-Staffel verpasst Podest bei Olympia-Generalprobe',
'upload_date': '20180112',
},
- }
+ }]
+
+ def _asset_url(self, wdr_id):
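+ # Asset URLs shard by the id's leading digits, e.g. '1557833' -> /ondemand/155/1557833.js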
+ id_len = max(len(wdr_id), 5)
+ return ''.join(('https:', self.__API_URL_TPL % (wdr_id[:id_len - 4], wdr_id, ), '.js'))
def _real_extract(self, url):
video_id = self._match_id(url)
+ if url.startswith('wdr:'):
+ video_id = url[4:]
+ url = self._asset_url(video_id)
+
metadata = self._download_json(
url, video_id, transform_source=strip_jsonp)
@@ -113,7 +127,7 @@ class WDRIE(InfoExtractor):
return {
'id': tracker_data.get('trackerClipId', video_id),
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'alt_title': tracker_data.get('trackerClipSubcategory'),
'formats': formats,
'subtitles': subtitles,
@@ -122,10 +136,10 @@ class WDRIE(InfoExtractor):
}
-class WDRPageIE(InfoExtractor):
- _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5'
+class WDRPageIE(WDRIE):
+ _MAUS_REGEX = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/)*?(?P<maus_id>[^/?#.]+)(?:/?|/index\.php5|\.php5)$'
_PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html'
- _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL
+ _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _MAUS_REGEX
_TESTS = [
{
@@ -166,11 +180,11 @@ class WDRPageIE(InfoExtractor):
{
'url': 'http://www1.wdr.de/mediathek/video/live/index.html',
'info_dict': {
- 'id': 'mdb-1406149',
+ 'id': 'mdb-2296252',
'ext': 'mp4',
- 'title': r're:^WDR Fernsehen im Livestream \(nur in Deutschland erreichbar\) [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'title': r're:^WDR Fernsehen im Livestream (?:\(nur in Deutschland erreichbar\) )?[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'alt_title': 'WDR Fernsehen Live',
- 'upload_date': '20150101',
+ 'upload_date': '20201112',
'is_live': True,
},
'params': {
@@ -179,7 +193,7 @@ class WDRPageIE(InfoExtractor):
},
{
'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html',
- 'playlist_mincount': 7,
+ 'playlist_mincount': 6,
'info_dict': {
'id': 'aktuelle-stunde-120',
},
@@ -187,10 +201,10 @@ class WDRPageIE(InfoExtractor):
{
'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
'info_dict': {
- 'id': 'mdb-1552552',
+ 'id': 'mdb-2627637',
'ext': 'mp4',
'upload_date': 're:^[0-9]{8}$',
- 'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$',
+ 'title': 're:^Die Sendung (?:mit der Maus )?vom [0-9.]{10}$',
},
'skip': 'The id changes from week to week because of the new episode'
},
@@ -203,6 +217,7 @@ class WDRPageIE(InfoExtractor):
'upload_date': '20130919',
'title': 'Sachgeschichte - Achterbahn ',
},
+ 'skip': 'HTTP Error 404: Not Found',
},
{
'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html',
@@ -228,6 +243,7 @@ class WDRPageIE(InfoExtractor):
'params': {
'skip_download': True,
},
+ 'skip': 'HTTP Error 404: Not Found',
},
{
'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html',
@@ -241,7 +257,7 @@ class WDRPageIE(InfoExtractor):
def _real_extract(self, url):
mobj = self._match_valid_url(url)
- display_id = mobj.group('display_id')
+ display_id = dict_get(mobj.groupdict(), ('display_id', 'maus_id'), 'wdrmaus')
webpage = self._download_webpage(url, display_id)
entries = []
@@ -267,6 +283,14 @@ class WDRPageIE(InfoExtractor):
jsonp_url = try_get(
media_link_obj, lambda x: x['mediaObj']['url'], compat_str)
if jsonp_url:
+ # jsonp_url may point to JSON metadata, to player JS whose ['ref'] holds the WDR id, or directly to media
+ clip_id = media_link_obj['mediaObj'].get('ref')
+ if jsonp_url.endswith('.assetjsonp'):
+ asset = self._download_json(
+ jsonp_url, display_id, fatal=False, transform_source=strip_jsonp)
+ clip_id = try_get(asset, lambda x: x['trackerData']['trackerClipId'], compat_str)
+ if clip_id:
+ jsonp_url = self._asset_url(clip_id[4:])
entries.append(self.url_result(jsonp_url, ie=WDRIE.ie_key()))
# Playlist (e.g. https://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html)
@@ -286,16 +310,14 @@ class WDRPageIE(InfoExtractor):
class WDRElefantIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)wdrmaus\.de/elefantenseite/#(?P<id>.+)'
_TEST = {
- 'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015',
+ 'url': 'http://www.wdrmaus.de/elefantenseite/#elefantenkino_wippe',
+ # adaptive stream: unstable file MD5
'info_dict': {
- 'title': 'Folge Oster-Spezial 2015',
- 'id': 'mdb-1088195',
+ 'title': 'Wippe',
+ 'id': 'mdb-1198320',
'ext': 'mp4',
'age_limit': None,
- 'upload_date': '20150406'
- },
- 'params': {
- 'skip_download': True,
+ 'upload_date': '20071003'
},
}
@@ -330,6 +352,7 @@ class WDRMobileIE(InfoExtractor):
/[0-9]+/[0-9]+/
(?P<id>[0-9]+)_(?P<title>[0-9]+)'''
IE_NAME = 'wdr:mobile'
+ _WORKING = False # no such domain
_TEST = {
'url': 'http://mobile-ondemand.wdr.de/CMS2010/mdb/ondemand/weltweit/fsk0/42/421735/421735_4283021.mp4',
'info_dict': {
diff --git a/hypervideo_dl/extractor/webcaster.py b/hypervideo_dl/extractor/webcaster.py
index e4b65f5..a858e99 100644
--- a/hypervideo_dl/extractor/webcaster.py
+++ b/hypervideo_dl/extractor/webcaster.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
+ join_nonempty,
xpath_text,
)
@@ -34,12 +35,9 @@ class WebcasterIE(InfoExtractor):
title = xpath_text(video, './/event_name', 'event name', fatal=True)
- def make_id(parts, separator):
- return separator.join(filter(None, parts))
-
formats = []
for format_id in (None, 'noise'):
- track_tag = make_id(('track', format_id), '_')
+ track_tag = join_nonempty('track', format_id, delim='_')
for track in video.findall('.//iphone/%s' % track_tag):
track_url = track.text
if not track_url:
@@ -48,7 +46,7 @@ class WebcasterIE(InfoExtractor):
m3u8_formats = self._extract_m3u8_formats(
track_url, video_id, 'mp4',
entry_protocol='m3u8_native',
- m3u8_id=make_id(('hls', format_id), '-'), fatal=False)
+ m3u8_id=join_nonempty('hls', format_id, delim='-'), fatal=False)
for f in m3u8_formats:
f.update({
'source_preference': 0 if format_id == 'noise' else 1,
diff --git a/hypervideo_dl/extractor/weibo.py b/hypervideo_dl/extractor/weibo.py
index 621df5b..dafa2af 100644
--- a/hypervideo_dl/extractor/weibo.py
+++ b/hypervideo_dl/extractor/weibo.py
@@ -73,8 +73,7 @@ class WeiboIE(InfoExtractor):
webpage = self._download_webpage(
url, video_id, note='Revisiting webpage')
- title = self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
video_formats = compat_parse_qs(self._search_regex(
r'video-sources=\\\"(.+?)\"', webpage, 'video_sources'))
diff --git a/hypervideo_dl/extractor/whowatch.py b/hypervideo_dl/extractor/whowatch.py
index f8bc2e7..e4b610d 100644
--- a/hypervideo_dl/extractor/whowatch.py
+++ b/hypervideo_dl/extractor/whowatch.py
@@ -5,6 +5,7 @@ from .common import InfoExtractor
from ..utils import (
int_or_none,
qualities,
+ try_call,
try_get,
ExtractorError,
)
@@ -26,10 +27,10 @@ class WhoWatchIE(InfoExtractor):
metadata = self._download_json('https://api.whowatch.tv/lives/%s' % video_id, video_id)
live_data = self._download_json('https://api.whowatch.tv/lives/%s/play' % video_id, video_id)
- title = try_get(None, (
- lambda x: live_data['share_info']['live_title'][1:-1],
- lambda x: metadata['live']['title'],
- ), compat_str)
+ title = try_call(
+ lambda: live_data['share_info']['live_title'][1:-1],
+ lambda: metadata['live']['title'],
+ expected_type=str)
hls_url = live_data.get('hls_url')
if not hls_url:
diff --git a/hypervideo_dl/extractor/willow.py b/hypervideo_dl/extractor/willow.py
new file mode 100644
index 0000000..4d3d62f
--- /dev/null
+++ b/hypervideo_dl/extractor/willow.py
@@ -0,0 +1,58 @@
+# coding: utf-8
+from ..utils import ExtractorError
+from .common import InfoExtractor
+
+
+class WillowIE(InfoExtractor):
+ _VALID_URL = r'https?://(www\.)?willow\.tv/videos/(?P<id>[0-9a-z_-]+)'
+ _GEO_COUNTRIES = ['US']
+
+ _TESTS = [{
+ 'url': 'http://willow.tv/videos/d5winning-moment-eng-vs-ind-streaming-online-4th-test-india-tour-of-england-2021',
+ 'info_dict': {
+ 'id': '169662',
+ 'display_id': 'd5winning-moment-eng-vs-ind-streaming-online-4th-test-india-tour-of-england-2021',
+ 'ext': 'mp4',
+ 'title': 'Winning Moment: 4th Test, England vs India',
+ 'thumbnail': 'https://aimages.willow.tv/ytThumbnails/6748_D5winning_moment.jpg',
+ 'duration': 233,
+ 'timestamp': 1630947954,
+ 'upload_date': '20210906',
+ 'location': 'Kennington Oval, London',
+ 'series': 'India tour of England 2021',
+ },
+ 'params': {
+ 'skip_download': True, # AES-encrypted m3u8
+ },
+ }, {
+ 'url': 'http://willow.tv/videos/highlights-short-ind-vs-nz-streaming-online-2nd-t20i-new-zealand-tour-of-india-2021',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
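+ # Video metadata is inlined in the page as a JSON.parse('...') expression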
+ video_data = self._parse_json(self._html_search_regex(
+ r'var\s+data_js\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage,
+ 'data_js'), video_id)
+
+ video = next((v for v in video_data.get('trending_videos') or []
+ if v.get('secureurl')), None)
+ if not video:
+ raise ExtractorError('No videos found')
+
+ formats = self._extract_m3u8_formats(video['secureurl'], video_id, 'mp4')
+ self._sort_formats(formats)
+
+ return {
+ 'id': str(video.get('content_id')),
+ 'display_id': video.get('video_slug'),
+ 'title': video.get('video_name') or self._html_search_meta('twitter:title', webpage),
+ 'formats': formats,
+ 'thumbnail': video.get('yt_thumb_url') or self._html_search_meta(
+ 'twitter:image', webpage, default=None),
+ 'duration': video.get('duration_seconds'),
+ 'timestamp': video.get('created_date'),
+ 'location': video.get('venue'),
+ 'series': video.get('series_name'),
+ }
diff --git a/hypervideo_dl/extractor/wppilot.py b/hypervideo_dl/extractor/wppilot.py
new file mode 100644
index 0000000..3003a0f
--- /dev/null
+++ b/hypervideo_dl/extractor/wppilot.py
@@ -0,0 +1,177 @@
+# coding: utf-8
+
+from .common import InfoExtractor
+from ..utils import (
+ try_get,
+ ExtractorError,
+)
+
+import json
+import random
+import re
+
+
+class WPPilotBaseIE(InfoExtractor):
+ _VIDEO_URL = 'https://pilot.wp.pl/api/v1/channel/%s'
+ _VIDEO_GUEST_URL = 'https://pilot.wp.pl/api/v1/guest/channel/%s'
+
+ _HEADERS_WEB = {
+ 'Content-Type': 'application/json; charset=UTF-8',
+ 'Referer': 'https://pilot.wp.pl/tv/',
+ }
+
+ def _get_channel_list(self, cache=True):
+ if cache is True:
+ cache_res = self._downloader.cache.load('wppilot', 'channel-list')
+ if cache_res:
+ return cache_res, True
+ webpage = self._download_webpage('https://pilot.wp.pl/tv/', None, 'Downloading webpage')
+ page_data_base_url = self._search_regex(
+ r'<script src="(https://wp-pilot-gatsby\.wpcdn\.pl/v[\d.-]+/desktop)',
+ webpage, 'gatsby build version') + '/page-data'
+ page_data = self._download_json(f'{page_data_base_url}/tv/page-data.json', None, 'Downloading page data')
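+ # The channel list hides behind one of the Gatsby static-query hashes; probe each until found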
+ for qhash in page_data['staticQueryHashes']:
+ qhash_content = self._download_json(
+ f'{page_data_base_url}/sq/d/{qhash}.json', None,
+ 'Searching for channel list')
+ channel_list = try_get(qhash_content, lambda x: x['data']['allChannels']['nodes'])
+ if channel_list is None:
+ continue
+ self._downloader.cache.store('wppilot', 'channel-list', channel_list)
+ return channel_list, False
+ raise ExtractorError('Unable to find the channel list')
+
+ def _parse_channel(self, chan):
+ return {
+ 'id': str(chan['id']),
+ 'title': chan['name'],
+ 'is_live': True,
+ 'thumbnails': [{
+ 'id': key,
+ 'url': chan[key],
+ } for key in ('thumbnail', 'thumbnail_mobile', 'icon') if chan.get(key)],
+ }
+
+
+class WPPilotIE(WPPilotBaseIE):
+ _VALID_URL = r'(?:https?://pilot\.wp\.pl/tv/?#|wppilot:)(?P<id>[a-z\d-]+)'
+ IE_NAME = 'wppilot'
+
+ _TESTS = [{
+ 'url': 'https://pilot.wp.pl/tv/#telewizja-wp-hd',
+ 'info_dict': {
+ 'id': '158',
+ 'ext': 'mp4',
+ 'title': 'Telewizja WP HD',
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ }, {
+ # audio only
+ 'url': 'https://pilot.wp.pl/tv/#radio-nowy-swiat',
+ 'info_dict': {
+ 'id': '238',
+ 'ext': 'm4a',
+ 'title': 'Radio Nowy Świat',
+ },
+ 'params': {
+ 'format': 'bestaudio',
+ },
+ }, {
+ 'url': 'wppilot:9',
+ 'only_matching': True,
+ }]
+
+ def _get_channel(self, id_or_slug):
+ video_list, is_cached = self._get_channel_list(cache=True)
+ key = 'id' if re.match(r'^\d+$', id_or_slug) else 'slug'
+ for video in video_list:
+ if video.get(key) == id_or_slug:
+ return self._parse_channel(video)
+ # if cached channel not found, download and retry
+ if is_cached:
+ video_list, _ = self._get_channel_list(cache=False)
+ for video in video_list:
+ if video.get(key) == id_or_slug:
+ return self._parse_channel(video)
+ raise ExtractorError('Channel not found')
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ channel = self._get_channel(video_id)
+ video_id = str(channel['id'])
+
+ is_authorized = next((c for c in self._downloader.cookiejar if c.name == 'netviapisessid'), None)
+ # cookies starting with "g:" are assigned to guests
+ is_authorized = is_authorized is not None and not is_authorized.value.startswith('g:')
+
+ video = self._download_json(
+ (self._VIDEO_URL if is_authorized else self._VIDEO_GUEST_URL) % video_id,
+ video_id, query={
+ 'device_type': 'web',
+ }, headers=self._HEADERS_WEB,
+ expected_status=(200, 422))
+
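+ # A 422 response carries a stream token for a session that is already open elsewhere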
+ stream_token = try_get(video, lambda x: x['_meta']['error']['info']['stream_token'])
+ if stream_token:
+ close = self._download_json(
+ 'https://pilot.wp.pl/api/v1/channels/close', video_id,
+ 'Invalidating previous stream session', headers=self._HEADERS_WEB,
+ data=json.dumps({
+ 'channelId': video_id,
+ 't': stream_token,
+ }).encode('utf-8'))
+ if try_get(close, lambda x: x['data']['status']) == 'ok':
+ return self.url_result(url, ie=WPPilotIE.ie_key())
+
+ formats = []
+
+ for fmt in video['data']['stream_channel']['streams']:
+ # live DASH does not work for now
+ # if fmt['type'] == 'dash@live:abr':
+ # formats.extend(
+ # self._extract_mpd_formats(
+ # random.choice(fmt['url']), video_id))
+ if fmt['type'] == 'hls@live:abr':
+ formats.extend(
+ self._extract_m3u8_formats(
+ random.choice(fmt['url']),
+ video_id, live=True))
+
+ self._sort_formats(formats)
+
+ channel['formats'] = formats
+ return channel
+
+
+class WPPilotChannelsIE(WPPilotBaseIE):
+ _VALID_URL = r'(?:https?://pilot\.wp\.pl/(?:tv/?)?(?:\?[^#]*)?#?|wppilot:)$'
+ IE_NAME = 'wppilot:channels'
+
+ _TESTS = [{
+ 'url': 'wppilot:',
+ 'info_dict': {
+ 'id': 'wppilot',
+ 'title': 'WP Pilot',
+ },
+ 'playlist_mincount': 100,
+ }, {
+ 'url': 'https://pilot.wp.pl/',
+ 'only_matching': True,
+ }]
+
+ def _entries(self):
+ channel_list, _ = self._get_channel_list()
+ for chan in channel_list:
+ entry = self._parse_channel(chan)
+ entry.update({
+ '_type': 'url_transparent',
+ 'url': f'wppilot:{chan["id"]}',
+ 'ie_key': WPPilotIE.ie_key(),
+ })
+ yield entry
+
+ def _real_extract(self, url):
+ return self.playlist_result(self._entries(), 'wppilot', 'WP Pilot')
diff --git a/hypervideo_dl/extractor/xinpianchang.py b/hypervideo_dl/extractor/xinpianchang.py
new file mode 100644
index 0000000..9832d23
--- /dev/null
+++ b/hypervideo_dl/extractor/xinpianchang.py
@@ -0,0 +1,95 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ try_get,
+ update_url_query,
+ url_or_none,
+)
+
+
+class XinpianchangIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.xinpianchang\.com/(?P<id>[^/]+?)(?:\D|$)'
+ IE_NAME = 'xinpianchang'
+ IE_DESC = 'xinpianchang.com'
+ _TESTS = [{
+ 'url': 'https://www.xinpianchang.com/a11766551',
+ 'info_dict': {
+ 'id': 'a11766551',
+ 'ext': 'mp4',
+ 'title': '北京2022冬奥会闭幕式再见短片-冰墩墩下班了',
+ 'description': 'md5:4a730c10639a82190fabe921c0fa4b87',
+ 'duration': 151,
+ 'thumbnail': r're:^https?://oss-xpc0\.xpccdn\.com.+/assets/',
+ 'uploader': '正时文创',
+ 'uploader_id': 10357277,
+ 'categories': ['宣传片', '国家城市', '广告', '其他'],
+ 'keywords': ['北京冬奥会', '冰墩墩', '再见', '告别', '冰墩墩哭了', '感动', '闭幕式', '熄火']
+ },
+ }, {
+ 'url': 'https://www.xinpianchang.com/a11762904',
+ 'info_dict': {
+ 'id': 'a11762904',
+ 'ext': 'mp4',
+ 'title': '冬奥会决胜时刻《法国派出三只鸡?》',
+ 'description': 'md5:55cb139ef8f48f0c877932d1f196df8b',
+ 'duration': 136,
+ 'thumbnail': r're:^https?://oss-xpc0\.xpccdn\.com.+/assets/',
+ 'uploader': '精品动画',
+ 'uploader_id': 10858927,
+ 'categories': ['动画', '三维CG'],
+ 'keywords': ['France Télévisions', '法国3台', '蠢萌', '冬奥会']
+ },
+ }, {
+ 'url': 'https://www.xinpianchang.com/a11779743?from=IndexPick&part=%E7%BC%96%E8%BE%91%E7%B2%BE%E9%80%89&index=2',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id=video_id)
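+ # The page inlines plain JS vars (CDN domain, video id, app key) used to build the media API URL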
+ domain = self.find_value_with_regex(var='requireNewDomain', webpage=webpage)
+ vid = self.find_value_with_regex(var='vid', webpage=webpage)
+ app_key = self.find_value_with_regex(var='modeServerAppKey', webpage=webpage)
+ api = update_url_query(f'{domain}/mod/api/v2/media/{vid}', {'appKey': app_key})
+ data = self._download_json(api, video_id=video_id)['data']
+ formats, subtitles = [], {}
+ for k, v in data.get('resource').items():
+ if k in ('dash', 'hls'):
+ v_url = v.get('url')
+ if not v_url:
+ continue
+ if k == 'dash':
+ fmts, subs = self._extract_mpd_formats_and_subtitles(v_url, video_id=video_id)
+ elif k == 'hls':
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(v_url, video_id=video_id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ elif k == 'progressive':
+ formats.extend([{
+ 'url': url_or_none(prog.get('url')),
+ 'width': int_or_none(prog.get('width')),
+ 'height': int_or_none(prog.get('height')),
+ 'ext': 'mp4',
+ } for prog in v if prog.get('url')])
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': data.get('title'),
+ 'description': data.get('description'),
+ 'duration': int_or_none(data.get('duration')),
+ 'categories': data.get('categories'),
+ 'keywords': data.get('keywords'),
+ 'thumbnail': data.get('cover'),
+ 'uploader': try_get(data, lambda x: x['owner']['username']),
+ 'uploader_id': try_get(data, lambda x: x['owner']['id']),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+ def find_value_with_regex(self, var, webpage):
+ return self._search_regex(rf'var\s+{var}\s*=\s*\"(?P<vid>[^\"]+)\"', webpage, name=var)
diff --git a/hypervideo_dl/extractor/xnxx.py b/hypervideo_dl/extractor/xnxx.py
index dd4fb54..27f9916 100644
--- a/hypervideo_dl/extractor/xnxx.py
+++ b/hypervideo_dl/extractor/xnxx.py
@@ -13,7 +13,7 @@ from ..utils import (
class XNXXIE(InfoExtractor):
- _VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/'
+ _VALID_URL = r'https?://(?:video|www)\.xnxx3?\.com/video-?(?P<id>[0-9a-z]+)/'
_TESTS = [{
'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video',
'md5': '7583e96c15c0f21e9da3453d9920fbba',
@@ -32,6 +32,9 @@ class XNXXIE(InfoExtractor):
}, {
'url': 'http://www.xnxx.com/video-55awb78/',
'only_matching': True,
+ }, {
+ 'url': 'http://www.xnxx3.com/video-55awb78/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/hypervideo_dl/extractor/xvideos.py b/hypervideo_dl/extractor/xvideos.py
index 8fc6491..d5261b6 100644
--- a/hypervideo_dl/extractor/xvideos.py
+++ b/hypervideo_dl/extractor/xvideos.py
@@ -19,25 +19,41 @@ class XVideosIE(InfoExtractor):
(?:
(?:[^/]+\.)?xvideos2?\.com/video|
(?:www\.)?xvideos\.es/video|
- flashservice\.xvideos\.com/embedframe/|
+ (?:www|flashservice)\.xvideos\.com/embedframe/|
static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video=
)
(?P<id>[0-9]+)
'''
_TESTS = [{
- 'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl',
+ 'url': 'https://www.xvideos.com/video4588838/motorcycle_guy_cucks_influencer_steals_his_gf',
'md5': '14cea69fcb84db54293b1e971466c2e1',
'info_dict': {
'id': '4588838',
'ext': 'mp4',
- 'title': 'Biker Takes his Girl',
+ 'title': 'Motorcycle Guy Cucks Influencer, Steals his GF',
'duration': 108,
'age_limit': 18,
+ 'thumbnail': r're:^https://img-hw.xvideos-cdn.com/.+\.jpg',
+ }
+ }, {
+ # Broken HLS formats
+ 'url': 'https://www.xvideos.com/video65982001/what_s_her_name',
+ 'md5': 'b82d7d7ef7d65a84b1fa6965f81f95a5',
+ 'info_dict': {
+ 'id': '65982001',
+ 'ext': 'mp4',
+ 'title': 'what\'s her name?',
+ 'duration': 120,
+ 'age_limit': 18,
+ 'thumbnail': r're:^https://img-hw.xvideos-cdn.com/.+\.jpg',
}
}, {
'url': 'https://flashservice.xvideos.com/embedframe/4588838',
'only_matching': True,
}, {
+ 'url': 'https://www.xvideos.com/embedframe/4588838',
+ 'only_matching': True,
+ }, {
'url': 'http://static-hw.xvideos.com/swf/xv-player.swf?id_video=4588838',
'only_matching': True,
}, {
@@ -80,9 +96,7 @@ class XVideosIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
-
- webpage = self._download_webpage(
- 'https://www.xvideos.com/video%s/' % video_id, video_id)
+ webpage = self._download_webpage(url, video_id)
mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage)
if mobj:
@@ -125,9 +139,11 @@ class XVideosIE(InfoExtractor):
r'setVideo([^(]+)\((["\'])(http.+?)\2\)', webpage):
format_id = kind.lower()
if format_id == 'hls':
- formats.extend(self._extract_m3u8_formats(
+ hls_formats = self._extract_m3u8_formats(
format_url, video_id, 'mp4',
- entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+ entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
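+ # Some HLS manifests are broken (see test above); drop formats that fail the check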
+ self._check_formats(hls_formats, video_id)
+ formats.extend(hls_formats)
elif format_id in ('urllow', 'urlhigh'):
formats.append({
'url': format_url,
diff --git a/hypervideo_dl/extractor/yahoo.py b/hypervideo_dl/extractor/yahoo.py
index 53556de..20504de 100644
--- a/hypervideo_dl/extractor/yahoo.py
+++ b/hypervideo_dl/extractor/yahoo.py
@@ -264,7 +264,7 @@ class YahooIE(InfoExtractor):
return {
'id': video_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
'formats': formats,
'thumbnails': thumbnails,
'description': clean_html(video.get('description')),
@@ -414,11 +414,14 @@ class YahooGyaOIE(InfoExtractor):
IE_NAME = 'yahoo:gyao'
_VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:p|title(?:/[^/]+)?)|streaming\.yahoo\.co\.jp/p/y)/(?P<id>\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
_TESTS = [{
- 'url': 'https://gyao.yahoo.co.jp/p/00449/v03102/',
+ 'url': 'https://gyao.yahoo.co.jp/title/%E3%82%BF%E3%82%A4%E3%83%A0%E3%83%9C%E3%82%AB%E3%83%B3%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA%20%E3%83%A4%E3%83%83%E3%82%BF%E3%83%BC%E3%83%9E%E3%83%B3/5f60ceb3-6e5e-40ef-ba40-d68b598d067f',
'info_dict': {
- 'id': '00449:v03102',
+ 'id': '5f60ceb3-6e5e-40ef-ba40-d68b598d067f',
},
- 'playlist_count': 2,
+ 'playlist_mincount': 80,
+ }, {
+ 'url': 'https://gyao.yahoo.co.jp/p/00449/v03102/',
+ 'only_matching': True,
}, {
'url': 'https://streaming.yahoo.co.jp/p/y/01034/v00133/',
'only_matching': True,
@@ -430,19 +433,30 @@ class YahooGyaOIE(InfoExtractor):
'only_matching': True,
}]
+ def _entries(self, program_id):
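+ # Page through the program's videos until the API reports the list has ended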
+ page = 1
+ while True:
+ playlist = self._download_json(
+ f'https://gyao.yahoo.co.jp/api/programs/{program_id}/videos?page={page}', program_id,
+ note=f'Downloading JSON metadata page {page}')
+ if not playlist:
+ break
+ for video in playlist['videos']:
+ video_id = video.get('id')
+ if not video_id:
+ continue
+ if video.get('streamingAvailability') == 'notYet':
+ continue
+ yield self.url_result(
+ 'https://gyao.yahoo.co.jp/player/%s/' % video_id.replace(':', '/'),
+ YahooGyaOPlayerIE.ie_key(), video_id)
+ if playlist.get('ended'):
+ break
+ page += 1
+
def _real_extract(self, url):
program_id = self._match_id(url).replace('/', ':')
- videos = self._download_json(
- 'https://gyao.yahoo.co.jp/api/programs/%s/videos' % program_id, program_id)['videos']
- entries = []
- for video in videos:
- video_id = video.get('id')
- if not video_id:
- continue
- entries.append(self.url_result(
- 'https://gyao.yahoo.co.jp/player/%s/' % video_id.replace(':', '/'),
- YahooGyaOPlayerIE.ie_key(), video_id))
- return self.playlist_result(entries, program_id)
+ return self.playlist_result(self._entries(program_id), program_id)
class YahooJapanNewsIE(InfoExtractor):
@@ -519,7 +533,7 @@ class YahooJapanNewsIE(InfoExtractor):
title = self._html_search_meta(
['og:title', 'twitter:title'], webpage, 'title', default=None
- ) or self._html_search_regex('<title>([^<]+)</title>', webpage, 'title')
+ ) or self._html_extract_title(webpage)
if display_id == host:
# Headline page (w/ multiple BC playlists) ('news.yahoo.co.jp', 'headlines.yahoo.co.jp/videonews/', ...)
diff --git a/hypervideo_dl/extractor/yandexvideo.py b/hypervideo_dl/extractor/yandexvideo.py
index 9974d65..7d3966b 100644
--- a/hypervideo_dl/extractor/yandexvideo.py
+++ b/hypervideo_dl/extractor/yandexvideo.py
@@ -7,9 +7,11 @@ import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
+ extract_attributes,
int_or_none,
try_get,
url_or_none,
+ lowercase_escape,
)
@@ -147,8 +149,46 @@ class YandexVideoIE(InfoExtractor):
}
+class YandexVideoPreviewIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?yandex\.ru/video/preview(?:/?\?.*?filmId=|/)(?P<id>\d+)'
+ _TESTS = [{ # Odnoklassniki
+ 'url': 'https://yandex.ru/video/preview/?filmId=10682852472978372885&text=summer',
+ 'info_dict': {
+ 'id': '1352565459459',
+ 'ext': 'mp4',
+ 'like_count': int,
+ 'upload_date': '20191202',
+ 'age_limit': 0,
+ 'duration': 196,
+ 'thumbnail': 'https://i.mycdn.me/videoPreview?id=544866765315&type=37&idx=13&tkn=TY5qjLYZHxpmcnK8U2LgzYkgmaU&fn=external_8',
+ 'uploader_id': '481054701571',
+ 'title': 'LOFT - summer, summer, summer HD',
+ 'uploader': 'АРТЁМ КУДРОВ',
+ },
+ }, { # youtube
+ 'url': 'https://yandex.ru/video/preview/?filmId=4479424425337895262&source=main_redirect&text=видео&utm_source=main_stripe_big',
+ 'only_matching': True,
+ }, { # YandexVideo
+ 'url': 'https://yandex.ru/video/preview/5275069442094787341',
+ 'only_matching': True,
+ }, { # youtube
+ 'url': 'https://yandex.ru/video/preview/?filmId=16658118429797832897&from=tabbar&p=1&text=%D0%BF%D1%80%D0%BE%D1%81%D0%BC%D0%BE%D1%82%D1%80+%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82%D0%B0+%D0%BC%D0%B0%D0%BB%D0%B5%D0%BD%D1%8C%D0%BA%D0%B8%D0%B9+%D0%BF%D1%80%D0%B8%D0%BD%D1%86+%D0%BC%D1%8B+%D0%B2+%D0%BE%D1%82%D0%B2%D0%B5%D1%82%D0%B5+%D0%B7%D0%B0+%D1%82%D0%B5%D1%85+%D0%BA%D0%BE%D0%B3%D0%BE+%D0%BF%D1%80%D0%B8%D1%80%D1%83%D1%87%D0%B8%D0%BB%D0%B8',
+ 'only_matching': True,
+ }, { # Odnoklassniki
+ 'url': 'https://yandex.ru/video/preview/?text=Francis%20Lai%20-%20Le%20Bon%20Et%20Les%20MC)chants&path=wizard&parent-reqid=1643208087979310-1481782809207673478-sas3-0931-2f9-sas-l7-balancer-8080-BAL-9380&wiz_type=vital&filmId=12508152936505397283',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
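+ # Inline params are JSON with \u0022-escaped quotes; lowercase_escape decodes them before parsing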
+ data_raw = self._search_regex(r'window.Ya.__inline_params__\s*=\s*JSON.parse\(\'([^"]+?\\u0022video\\u0022:[^"]+?})\'\);', webpage, 'data_raw')
+ data_json = self._parse_json(data_raw, id, transform_source=lowercase_escape)
+ return self.url_result(data_json['video']['url'])
+
+
class ZenYandexIE(InfoExtractor):
- _VALID_URL = r'https?://zen\.yandex\.ru/media/(?:id/[^/]+/|[^/]+/)(?:[a-z0-9-]+)-(?P<id>[a-z0-9-]+)'
+ _VALID_URL = r'https?://zen\.yandex\.ru(?:/video)?/(media|watch)/(?:(?:id/[^/]+/|[^/]+/)(?:[a-z0-9-]+)-)?(?P<id>[a-z0-9-]+)'
_TESTS = [{
'url': 'https://zen.yandex.ru/media/popmech/izverjenie-vulkana-iz-spichek-zreliscnyi-opyt-6002240ff8b1af50bb2da5e3',
'info_dict': {
@@ -156,19 +196,38 @@ class ZenYandexIE(InfoExtractor):
'ext': 'mp4',
'title': 'Извержение вулкана из спичек: зрелищный опыт',
'description': 'md5:053ad3c61b5596d510c9a199dc8ee633',
- 'thumbnail': 'https://avatars.mds.yandex.net/get-zen-pub-og/3558619/pub_6002240ff8b1af50bb2da5e3_600bad814d953e4132a30b5e/orig',
+ 'thumbnail': 're:^https://avatars.mds.yandex.net/',
'uploader': 'Популярная механика',
},
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
}, {
'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/vot-eto-focus-dedy-morozy-na-gidrociklah-60c7c443da18892ebfe85ed7',
'info_dict': {
'id': '60c7c443da18892ebfe85ed7',
'ext': 'mp4',
'title': 'ВОТ ЭТО Focus. Деды Морозы на гидроциклах',
- 'description': 'md5:8684912f6086f298f8078d4af0e8a600',
- 'thumbnail': 'https://avatars.mds.yandex.net/get-zen-pub-og/4410519/pub_60c7c443da18892ebfe85ed7_60c7c48e060a163121f42cc3/orig',
+ 'description': 'md5:f3db3d995763b9bbb7b56d4ccdedea89',
+ 'thumbnail': 're:^https://avatars.mds.yandex.net/',
'uploader': 'AcademeG DailyStream'
},
+ 'params': {
+ 'skip_download': 'm3u8',
+ 'format': 'bestvideo',
+ },
+ }, {
+ 'url': 'https://zen.yandex.ru/video/watch/6002240ff8b1af50bb2da5e3',
+ 'info_dict': {
+ 'id': '6002240ff8b1af50bb2da5e3',
+ 'ext': 'mp4',
+ 'title': 'Извержение вулкана из спичек: зрелищный опыт',
+ 'description': 'md5:053ad3c61b5596d510c9a199dc8ee633',
+ 'uploader': 'Популярная механика',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
}, {
'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/novyi-samsung-fold-3-moskvich-barahlit-612f93b7f8d48e7e945792a2?from=channel&rid=2286618386.482.1630817595976.42360',
'only_matching': True,
@@ -177,23 +236,37 @@ class ZenYandexIE(InfoExtractor):
def _real_extract(self, url):
id = self._match_id(url)
webpage = self._download_webpage(url, id)
- data_json = self._parse_json(self._search_regex(r'w\._data\s?=\s?({.+?});', webpage, 'metadata'), id)
- stream_json = try_get(data_json, lambda x: x['publication']['content']['gifContent'], dict)
- stream_url = stream_json.get('stream') or try_get(stream_json, lambda x: x['streams']['url'])
- formats = self._extract_m3u8_formats(stream_url, id)
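+ # Page state sits under a dynamic serverState_video-site_<hash> key; its 'Settings' twin holds exportData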
+ data_json = self._parse_json(
+ self._search_regex(r'data\s*=\s*({["\']_*serverState_*video.+?});', webpage, 'metadata'), id)
+ serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)',
+ webpage, 'server state').replace('State', 'Settings')
+ uploader = self._search_regex(r'(<a\s*class=["\']card-channel-link[^"\']+["\'][^>]+>)',
+ webpage, 'uploader', default='<a>')
+ uploader_name = extract_attributes(uploader).get('aria-label')
+ video_json = try_get(data_json, lambda x: x[serverstate]['exportData']['video'], dict)
+ stream_urls = try_get(video_json, lambda x: x['video']['streams'])
+ formats = []
+ for s_url in stream_urls:
+ ext = determine_ext(s_url)
+ if ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(s_url, id, mpd_id='dash'))
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(s_url, id, 'mp4'))
self._sort_formats(formats)
return {
'id': id,
- 'title': try_get(data_json, (lambda x: x['og']['title'], lambda x: x['publication']['content']['preview']['title'])),
- 'uploader': data_json.get('authorName') or try_get(data_json, lambda x: x['publisher']['name']),
- 'description': try_get(data_json, lambda x: x['og']['description']),
- 'thumbnail': try_get(data_json, lambda x: x['og']['imageUrl']),
+ 'title': video_json.get('title') or self._og_search_title(webpage),
'formats': formats,
+ 'duration': int_or_none(video_json.get('duration')),
+ 'view_count': int_or_none(video_json.get('views')),
+ 'uploader': uploader_name or data_json.get('authorName') or try_get(data_json, lambda x: x['publisher']['name']),
+ 'description': self._og_search_description(webpage) or try_get(data_json, lambda x: x['og']['description']),
+ 'thumbnail': self._og_search_thumbnail(webpage) or try_get(data_json, lambda x: x['og']['imageUrl']),
}
class ZenYandexChannelIE(InfoExtractor):
- _VALID_URL = r'https?://zen\.yandex\.ru/(?!media)(?:id/)?(?P<id>[a-z0-9-_]+)'
+ _VALID_URL = r'https?://zen\.yandex\.ru/(?!media|video)(?:id/)?(?P<id>[a-z0-9-_]+)'
_TESTS = [{
'url': 'https://zen.yandex.ru/tok_media',
'info_dict': {
diff --git a/hypervideo_dl/extractor/youjizz.py b/hypervideo_dl/extractor/youjizz.py
index 5f5fbf2..111623f 100644
--- a/hypervideo_dl/extractor/youjizz.py
+++ b/hypervideo_dl/extractor/youjizz.py
@@ -36,8 +36,7 @@ class YouJizzIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
formats = []
diff --git a/hypervideo_dl/extractor/younow.py b/hypervideo_dl/extractor/younow.py
index 04dbc87..583aea3 100644
--- a/hypervideo_dl/extractor/younow.py
+++ b/hypervideo_dl/extractor/younow.py
@@ -7,6 +7,7 @@ from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
+ format_field,
int_or_none,
try_get,
)
@@ -58,7 +59,7 @@ class YouNowLiveIE(InfoExtractor):
return {
'id': uploader,
'is_live': True,
- 'title': self._live_title(uploader),
+ 'title': uploader,
'thumbnail': data.get('awsUrl'),
'tags': data.get('tags'),
'categories': data.get('tags'),
@@ -93,7 +94,7 @@ def _extract_moment(item, fatal=True):
uploader = try_get(item, lambda x: x['owner']['name'], compat_str)
uploader_id = try_get(item, lambda x: x['owner']['userId'])
- uploader_url = 'https://www.younow.com/%s' % uploader if uploader else None
+ uploader_url = format_field(uploader, template='https://www.younow.com/%s')
entry = {
'extractor_key': 'YouNowMoment',
diff --git a/hypervideo_dl/extractor/youtube.py b/hypervideo_dl/extractor/youtube.py
index dc5ee63..dec3b14 100644
--- a/hypervideo_dl/extractor/youtube.py
+++ b/hypervideo_dl/extractor/youtube.py
@@ -2,18 +2,21 @@
from __future__ import unicode_literals
-import base64
import calendar
import copy
import datetime
+import functools
import hashlib
import itertools
import json
+import math
import os.path
import random
import re
+import sys
import time
import traceback
+import threading
from .common import InfoExtractor, SearchInfoExtractor
from ..compat import (
@@ -28,7 +31,7 @@ from ..compat import (
)
from ..jsinterp import JSInterpreter
from ..utils import (
- bytes_to_intlist,
+ bug_reports_message,
clean_html,
datetime_from_str,
dict_get,
@@ -36,11 +39,14 @@ from ..utils import (
ExtractorError,
float_or_none,
format_field,
+ get_first,
int_or_none,
- intlist_to_bytes,
is_html,
+ join_nonempty,
+ js_to_json,
mimetype2ext,
network_exceptions,
+ NO_DEFAULT,
orderedSet,
parse_codecs,
parse_count,
@@ -53,10 +59,12 @@ from ..utils import (
smuggle_url,
str_or_none,
str_to_int,
+ strftime_or_none,
traverse_obj,
try_get,
unescapeHTML,
unified_strdate,
+ unified_timestamp,
unsmuggle_url,
update_url_query,
url_or_none,
@@ -72,7 +80,7 @@ INNERTUBE_CLIENTS = {
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'WEB',
- 'clientVersion': '2.20210622.10.00',
+ 'clientVersion': '2.20211221.00.00',
}
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 1
@@ -82,7 +90,7 @@ INNERTUBE_CLIENTS = {
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'WEB_EMBEDDED_PLAYER',
- 'clientVersion': '1.20210620.0.1',
+ 'clientVersion': '1.20211215.00.01',
},
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 56
@@ -93,96 +101,96 @@ INNERTUBE_CLIENTS = {
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'WEB_REMIX',
- 'clientVersion': '1.20210621.00.00',
+ 'clientVersion': '1.20211213.00.00',
}
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 67,
},
'web_creator': {
- 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'INNERTUBE_API_KEY': 'AIzaSyBUPetSUmoZL-OhlxA7wSac5XinrygCqMo',
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'WEB_CREATOR',
- 'clientVersion': '1.20210621.00.00',
+ 'clientVersion': '1.20211220.02.00',
}
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 62,
},
'android': {
- 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w',
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'ANDROID',
- 'clientVersion': '16.20',
+ 'clientVersion': '16.49',
}
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
'REQUIRE_JS_PLAYER': False
},
'android_embedded': {
- 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'INNERTUBE_API_KEY': 'AIzaSyCjc_pVEDi4qsv5MtC2dMXzpIaDoRFLsxw',
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'ANDROID_EMBEDDED_PLAYER',
- 'clientVersion': '16.20',
+ 'clientVersion': '16.49',
},
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 55,
'REQUIRE_JS_PLAYER': False
},
'android_music': {
- 'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
- 'INNERTUBE_HOST': 'music.youtube.com',
+ 'INNERTUBE_API_KEY': 'AIzaSyAOghZGza2MQSZkY_zfZ370N-PUdXEo8AI',
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'ANDROID_MUSIC',
- 'clientVersion': '4.32',
+ 'clientVersion': '4.57',
}
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 21,
'REQUIRE_JS_PLAYER': False
},
'android_creator': {
+ 'INNERTUBE_API_KEY': 'AIzaSyD_qjV8zaaUMehtLkrKFgVeSX_Iqbtyws8',
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'ANDROID_CREATOR',
- 'clientVersion': '21.24.100',
+ 'clientVersion': '21.47',
},
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 14,
'REQUIRE_JS_PLAYER': False
},
- # ios has HLS live streams
- # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680
+ # iOS clients have HLS live streams. Setting device model to get 60fps formats.
+ # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680#issuecomment-1002724558
'ios': {
- 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'INNERTUBE_API_KEY': 'AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc',
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'IOS',
- 'clientVersion': '16.20',
+ 'clientVersion': '16.46',
+ 'deviceModel': 'iPhone14,3',
}
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 5,
'REQUIRE_JS_PLAYER': False
},
'ios_embedded': {
- 'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'IOS_MESSAGES_EXTENSION',
- 'clientVersion': '16.20',
+ 'clientVersion': '16.46',
+ 'deviceModel': 'iPhone14,3',
},
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 66,
'REQUIRE_JS_PLAYER': False
},
'ios_music': {
- 'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og',
- 'INNERTUBE_HOST': 'music.youtube.com',
+ 'INNERTUBE_API_KEY': 'AIzaSyBAETezhkwP0ZWA02RsqT1zu78Fpt0bC_s',
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'IOS_MUSIC',
- 'clientVersion': '4.32',
+ 'clientVersion': '4.57',
},
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 26,
@@ -192,7 +200,7 @@ INNERTUBE_CLIENTS = {
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'IOS_CREATOR',
- 'clientVersion': '21.24.100',
+ 'clientVersion': '21.47',
},
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 15,
@@ -201,39 +209,61 @@ INNERTUBE_CLIENTS = {
# mweb has 'ultralow' formats
# See: https://github.com/hypervideo/hypervideo/pull/557
'mweb': {
- 'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
+ 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'MWEB',
- 'clientVersion': '2.20210721.07.00',
+ 'clientVersion': '2.20211221.01.00',
}
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 2
},
+ # This client can access age restricted videos (unless the uploader has disabled the 'allow embedding' option)
+ # See: https://github.com/zerodytrash/YouTube-Internal-Clients
+ 'tv_embedded': {
+ 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER',
+ 'clientVersion': '2.0',
+ },
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 85
+ },
}
+def _split_innertube_client(client_name):
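+ # e.g. 'android_embedded' -> ('android_embedded', 'android', 'embedded'); 'web' -> ('web', 'web', None)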
+ variant, *base = client_name.rsplit('.', 1)
+ if base:
+ return variant, base[0], variant
+ base, *variant = client_name.split('_', 1)
+ return client_name, base, variant[0] if variant else None
+
+
def build_innertube_clients():
- third_party = {
- 'embedUrl': 'https://google.com', # Can be any valid URL
+ THIRD_PARTY = {
+ 'embedUrl': 'https://www.youtube.com/', # Can be any valid URL
}
- base_clients = ('android', 'web', 'ios', 'mweb')
- priority = qualities(base_clients[::-1])
+ BASE_CLIENTS = ('android', 'web', 'tv', 'ios', 'mweb')
+ priority = qualities(BASE_CLIENTS[::-1])
for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()):
ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8')
ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com')
ytcfg.setdefault('REQUIRE_JS_PLAYER', True)
ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')
- ytcfg['priority'] = 10 * priority(client.split('_', 1)[0])
-
- if client in base_clients:
- INNERTUBE_CLIENTS[f'{client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg)
- agegate_ytcfg['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED'
- agegate_ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party
- agegate_ytcfg['priority'] -= 1
- elif client.endswith('_embedded'):
- ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party
+
+ _, base_client, variant = _split_innertube_client(client)
+ ytcfg['priority'] = 10 * priority(base_client)
+
+ if not variant:
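+ # Base clients get a cloned '_embedscreen' variant with clientScreen=EMBED and a thirdParty context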
+ INNERTUBE_CLIENTS[f'{client}_embedscreen'] = embedscreen = copy.deepcopy(ytcfg)
+ embedscreen['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED'
+ embedscreen['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY
+ embedscreen['priority'] -= 3
+ elif variant == 'embedded':
+ ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY
ytcfg['priority'] -= 2
else:
ytcfg['priority'] -= 3
@@ -247,31 +277,82 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
_RESERVED_NAMES = (
r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|clip|'
- r'shorts|movies|results|shared|hashtag|trending|feed|feeds|'
+ r'shorts|movies|results|search|shared|hashtag|trending|explore|feed|feeds|'
r'browse|oembed|get_video_info|iframe_api|s/player|'
r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')
_PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
- _NETRC_MACHINE = 'youtube'
+ # _NETRC_MACHINE = 'youtube'
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
- def _login(self):
- """
- Attempt to log in to YouTube.
- If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
- """
-
- if (self._LOGIN_REQUIRED
- and self.get_param('cookiefile') is None
- and self.get_param('cookiesfrombrowser') is None):
- self.raise_login_required(
- 'Login details are needed to download this content', method='cookies')
- username, password = self._get_login_info()
- if username:
- self.report_warning(f'Cannot login to YouTube using username and password. {self._LOGIN_HINTS["cookies"]}')
+ _INVIDIOUS_SITES = (
+ # invidious-redirect websites
+ r'(?:www\.)?redirect\.invidious\.io',
+ r'(?:(?:www|dev)\.)?invidio\.us',
+ # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
+ r'(?:www\.)?invidious\.pussthecat\.org',
+ r'(?:www\.)?invidious\.zee\.li',
+ r'(?:www\.)?invidious\.ethibox\.fr',
+ r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
+ r'(?:www\.)?osbivz6guyeahrwp2lnwyjk2xos342h4ocsxyqrlaopqjuhwn2djiiyd\.onion',
+ r'(?:www\.)?u2cvlit75owumwpy4dj2hsmvkq7nvrclkpht7xgyye2pyoxhpmclkrad\.onion',
+ # youtube-dl invidious instances list
+ r'(?:(?:www|no)\.)?invidiou\.sh',
+ r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
+ r'(?:www\.)?invidious\.kabi\.tk',
+ r'(?:www\.)?invidious\.mastodon\.host',
+ r'(?:www\.)?invidious\.zapashcanon\.fr',
+ r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks',
+ r'(?:www\.)?invidious\.tinfoil-hat\.net',
+ r'(?:www\.)?invidious\.himiko\.cloud',
+ r'(?:www\.)?invidious\.reallyancient\.tech',
+ r'(?:www\.)?invidious\.tube',
+ r'(?:www\.)?invidiou\.site',
+ r'(?:www\.)?invidious\.site',
+ r'(?:www\.)?invidious\.xyz',
+ r'(?:www\.)?invidious\.nixnet\.xyz',
+ r'(?:www\.)?invidious\.048596\.xyz',
+ r'(?:www\.)?invidious\.drycat\.fr',
+ r'(?:www\.)?inv\.skyn3t\.in',
+ r'(?:www\.)?tube\.poal\.co',
+ r'(?:www\.)?tube\.connect\.cafe',
+ r'(?:www\.)?vid\.wxzm\.sx',
+ r'(?:www\.)?vid\.mint\.lgbt',
+ r'(?:www\.)?vid\.puffyan\.us',
+ r'(?:www\.)?yewtu\.be',
+ r'(?:www\.)?yt\.elukerio\.org',
+ r'(?:www\.)?yt\.lelux\.fi',
+ r'(?:www\.)?invidious\.ggc-project\.de',
+ r'(?:www\.)?yt\.maisputain\.ovh',
+ r'(?:www\.)?ytprivate\.com',
+ r'(?:www\.)?invidious\.13ad\.de',
+ r'(?:www\.)?invidious\.toot\.koeln',
+ r'(?:www\.)?invidious\.fdn\.fr',
+ r'(?:www\.)?watch\.nettohikari\.com',
+ r'(?:www\.)?invidious\.namazso\.eu',
+ r'(?:www\.)?invidious\.silkky\.cloud',
+ r'(?:www\.)?invidious\.exonip\.de',
+ r'(?:www\.)?invidious\.riverside\.rocks',
+ r'(?:www\.)?invidious\.blamefran\.net',
+ r'(?:www\.)?invidious\.moomoo\.de',
+ r'(?:www\.)?ytb\.trom\.tf',
+ r'(?:www\.)?yt\.cyberhost\.uk',
+ r'(?:www\.)?kgg2m7yk5aybusll\.onion',
+ r'(?:www\.)?qklhadlycap4cnod\.onion',
+ r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
+ r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
+ r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
+ r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
+ r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
+ r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
+ r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
+ r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
+ r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
+ r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
+ )
def _initialize_consent(self):
cookies = self._get_cookies('https://www.youtube.com/')
@@ -288,9 +369,25 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
consent_id = random.randint(100, 999)
self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id)
+ def _initialize_pref(self):
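+ # Force hl=en and tz=UTC in the PREF cookie so responses are locale-independent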
+ cookies = self._get_cookies('https://www.youtube.com/')
+ pref_cookie = cookies.get('PREF')
+ pref = {}
+ if pref_cookie:
+ try:
+ pref = dict(compat_urlparse.parse_qsl(pref_cookie.value))
+ except ValueError:
+ self.report_warning('Failed to parse user PREF cookie' + bug_reports_message())
+ pref.update({'hl': 'en', 'tz': 'UTC'})
+ self._set_cookie('.youtube.com', name='PREF', value=compat_urllib_parse_urlencode(pref))
+
def _real_initialize(self):
+ self._initialize_pref()
self._initialize_consent()
- self._login()
+ if (self._LOGIN_REQUIRED
+ and self.get_param('cookiefile') is None
+ and self.get_param('cookiesfrombrowser') is None):
+ self.raise_login_required('Login details are needed to download this content', method='cookies')
_YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
_YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
@@ -321,23 +418,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client)
def _extract_context(self, ytcfg=None, default_client='web'):
- _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict)
- context = _get_context(ytcfg)
- if context:
- return context
-
- context = _get_context(self._get_default_ytcfg(default_client))
- if not ytcfg:
- return context
-
- # Recreate the client context (required)
- context['client'].update({
- 'clientVersion': self._extract_client_version(ytcfg, default_client),
- 'clientName': self._extract_client_name(ytcfg, default_client),
- })
- visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str)
- if visitor_data:
- context['client']['visitorData'] = visitor_data
+ context = get_first(
+ (ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict)
+ # Enforce language and tz for extraction
+ client_context = traverse_obj(context, 'client', expected_type=dict, default={})
+ client_context.update({'hl': 'en', 'timeZone': 'UTC', 'utcOffsetMinutes': 0})
return context
_SAPISID = None
@@ -381,7 +466,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
'https://%s/youtubei/v1/%s' % (api_hostname or self._get_innertube_host(default_client), ep),
video_id=video_id, fatal=fatal, note=note, errnote=errnote,
data=json.dumps(data).encode('utf8'), headers=real_headers,
- query={'key': api_key or self._extract_api_key()})
+ query={'key': api_key or self._extract_api_key(), 'prettyPrint': 'false'})
def extract_yt_initial_data(self, item_id, webpage, fatal=True):
data = self._search_regex(
@@ -437,9 +522,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
Extracts visitorData from an API response or ytcfg
Appears to be used to track session state
"""
- return traverse_obj(
- args, (..., ('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))),
- expected_type=compat_str, get_all=False)
+ return get_first(
+ args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))],
+ expected_type=str)
@property
def is_authenticated(self):
@@ -594,6 +679,72 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
if text:
return text
+ def _get_count(self, data, *path_list):
+ count_text = self._get_text(data, *path_list) or ''
+ count = parse_count(count_text)
+ if count is None:
+ count = str_to_int(
+ self._search_regex(r'^([\d,]+)', re.sub(r'\s', '', count_text), 'count', default=None))
+ return count
+
+ @staticmethod
+ def _extract_thumbnails(data, *path_list):
+ """
+ Extract thumbnails from thumbnails dict
+ @param path_list: path list to level that contains 'thumbnails' key
+ """
+ thumbnails = []
+ for path in path_list or [()]:
+ for thumbnail in traverse_obj(data, (*variadic(path), 'thumbnails', ...), default=[]):
+ thumbnail_url = url_or_none(thumbnail.get('url'))
+ if not thumbnail_url:
+ continue
+ # Sometimes youtube gives a wrong thumbnail URL. See:
+ # https://github.com/hypervideo/hypervideo/issues/233
+ # https://github.com/ytdl-org/youtube-dl/issues/28023
+ if 'maxresdefault' in thumbnail_url:
+ thumbnail_url = thumbnail_url.split('?')[0]
+ thumbnails.append({
+ 'url': thumbnail_url,
+ 'height': int_or_none(thumbnail.get('height')),
+ 'width': int_or_none(thumbnail.get('width')),
+ })
+ return thumbnails
+
+ @staticmethod
+ def extract_relative_time(relative_time_text):
+ """
+ Extracts a relative time from string and converts to dt object
+ e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today'
+ """
+ mobj = re.search(r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text)
+ if mobj:
+ start = mobj.group('start')
+ if start:
+ return datetime_from_str(start)
+ try:
+ return datetime_from_str('now-%s%s' % (mobj.group('time'), mobj.group('unit')))
+ except ValueError:
+ return None
+
+ def _extract_time_text(self, renderer, *path_list):
+ text = self._get_text(renderer, *path_list) or ''
+ dt = self.extract_relative_time(text)
+ timestamp = None
+ if isinstance(dt, datetime.datetime):
+ timestamp = calendar.timegm(dt.timetuple())
+
+ if timestamp is None:
+ timestamp = (
+ unified_timestamp(text) or unified_timestamp(
+ self._search_regex(
+ (r'([a-z]+\s*\d{1,2},?\s*20\d{2})', r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*(?:on|for))?\s*(.+\d)'),
+ text.lower(), 'time text', default=None)))
+
+ if text and timestamp is None:
+ self.report_warning(f"Cannot parse localized time text '{text}'" + bug_reports_message(), only_once=True)
+ return timestamp, text
+
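Summarizing the fallback chain above with assumed example inputs (unified_timestamp is the absolute-date parser from utils):

    # '4 hours ago (edited)'         -> extract_relative_time handles it directly
    # 'Streamed live on Oct 2, 2012' -> relative parse fails; the date regex
    #                                   pulls 'oct 2, 2012' for unified_timestamp
    # timestamp still None           -> a one-time warning asks for a bug report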
def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None,
ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None,
default_client='web'):
@@ -617,13 +768,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
note='%s%s' % (note, ' (retry #%d)' % count if count else ''))
except ExtractorError as e:
if isinstance(e.cause, network_exceptions):
- if isinstance(e.cause, compat_HTTPError) and not is_html(e.cause.read(512)):
- e.cause.seek(0)
- yt_error = try_get(
- self._parse_json(e.cause.read().decode(), item_id, fatal=False),
- lambda x: x['error']['message'], compat_str)
- if yt_error:
- self._report_alerts([('ERROR', yt_error)], fatal=False)
+ if isinstance(e.cause, compat_HTTPError):
+ first_bytes = e.cause.read(512)
+ if not is_html(first_bytes):
+ yt_error = try_get(
+ self._parse_json(
+ self._webpage_read_content(e.cause, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False),
+ lambda x: x['error']['message'], compat_str)
+ if yt_error:
+ self._report_alerts([('ERROR', yt_error)], fatal=False)
# Downloading page may result in intermittent 5xx HTTP error
# Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
# We also want to catch all other network exceptions since errors in later pages can be troublesome
@@ -674,91 +827,58 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
description = self._get_text(renderer, 'descriptionSnippet')
duration = parse_duration(self._get_text(
renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
- view_count_text = self._get_text(renderer, 'viewCountText') or ''
- view_count = str_to_int(self._search_regex(
- r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
- 'view count', default=None))
+ if duration is None:
+ duration = parse_duration(self._search_regex(
+ r'(?i)(ago)(?!.*\1)\s+(?P<duration>[a-z0-9 ,]+?)(?:\s+[\d,]+\s+views)?(?:\s+-\s+play\s+short)?$',
+ traverse_obj(renderer, ('title', 'accessibility', 'accessibilityData', 'label'), default='', expected_type=str),
+ video_id, default=None, group='duration'))
+
+ view_count = self._get_count(renderer, 'viewCountText')
uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
+ channel_id = traverse_obj(
+ renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'),
+ expected_type=str, get_all=False)
+ timestamp, time_text = self._extract_time_text(renderer, 'publishedTimeText')
+ scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False))
+ overlay_style = traverse_obj(
+ renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'),
+ get_all=False, expected_type=str)
+ badges = self._extract_badges(renderer)
+ thumbnails = self._extract_thumbnails(renderer, 'thumbnail')
+ navigation_url = urljoin('https://www.youtube.com/', traverse_obj(
+ renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'),
+ expected_type=str)) or ''
+ url = f'https://www.youtube.com/watch?v={video_id}'
+ if overlay_style == 'SHORTS' or '/shorts/' in navigation_url:
+ url = f'https://www.youtube.com/shorts/{video_id}'
return {
'_type': 'url',
'ie_key': YoutubeIE.ie_key(),
'id': video_id,
- 'url': f'https://www.youtube.com/watch?v={video_id}',
+ 'url': url,
'title': title,
'description': description,
'duration': duration,
'view_count': view_count,
'uploader': uploader,
+ 'channel_id': channel_id,
+ 'thumbnails': thumbnails,
+ 'upload_date': (strftime_or_none(timestamp, '%Y%m%d')
+ if self._configuration_arg('approximate_date', ie_key='youtubetab')
+ else None),
+ 'live_status': ('is_upcoming' if scheduled_timestamp is not None
+ else 'was_live' if 'streamed' in time_text.lower()
+ else 'is_live' if overlay_style is not None and overlay_style == 'LIVE' or 'live now' in badges
+ else None),
+ 'release_timestamp': scheduled_timestamp,
+ 'availability': self._availability(needs_premium='premium' in badges, needs_subscription='members only' in badges)
}
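For reference, the live_status expression above maps renderer state to values roughly as follows:

    # upcomingEventData.startTime present        -> 'is_upcoming'
    # 'streamed' in the published-time text      -> 'was_live'
    # overlay style 'LIVE' or a 'live now' badge -> 'is_live'
    # none of the above                          -> None (unknown)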
class YoutubeIE(YoutubeBaseInfoExtractor):
- IE_DESC = 'YouTube.com'
- _INVIDIOUS_SITES = (
- # invidious-redirect websites
- r'(?:www\.)?redirect\.invidious\.io',
- r'(?:(?:www|dev)\.)?invidio\.us',
- # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
- r'(?:www\.)?invidious\.pussthecat\.org',
- r'(?:www\.)?invidious\.zee\.li',
- r'(?:www\.)?invidious\.ethibox\.fr',
- r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
- # youtube-dl invidious instances list
- r'(?:(?:www|no)\.)?invidiou\.sh',
- r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
- r'(?:www\.)?invidious\.kabi\.tk',
- r'(?:www\.)?invidious\.mastodon\.host',
- r'(?:www\.)?invidious\.zapashcanon\.fr',
- r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks',
- r'(?:www\.)?invidious\.tinfoil-hat\.net',
- r'(?:www\.)?invidious\.himiko\.cloud',
- r'(?:www\.)?invidious\.reallyancient\.tech',
- r'(?:www\.)?invidious\.tube',
- r'(?:www\.)?invidiou\.site',
- r'(?:www\.)?invidious\.site',
- r'(?:www\.)?invidious\.xyz',
- r'(?:www\.)?invidious\.nixnet\.xyz',
- r'(?:www\.)?invidious\.048596\.xyz',
- r'(?:www\.)?invidious\.drycat\.fr',
- r'(?:www\.)?inv\.skyn3t\.in',
- r'(?:www\.)?tube\.poal\.co',
- r'(?:www\.)?tube\.connect\.cafe',
- r'(?:www\.)?vid\.wxzm\.sx',
- r'(?:www\.)?vid\.mint\.lgbt',
- r'(?:www\.)?vid\.puffyan\.us',
- r'(?:www\.)?yewtu\.be',
- r'(?:www\.)?yt\.elukerio\.org',
- r'(?:www\.)?yt\.lelux\.fi',
- r'(?:www\.)?invidious\.ggc-project\.de',
- r'(?:www\.)?yt\.maisputain\.ovh',
- r'(?:www\.)?ytprivate\.com',
- r'(?:www\.)?invidious\.13ad\.de',
- r'(?:www\.)?invidious\.toot\.koeln',
- r'(?:www\.)?invidious\.fdn\.fr',
- r'(?:www\.)?watch\.nettohikari\.com',
- r'(?:www\.)?invidious\.namazso\.eu',
- r'(?:www\.)?invidious\.silkky\.cloud',
- r'(?:www\.)?invidious\.exonip\.de',
- r'(?:www\.)?invidious\.riverside\.rocks',
- r'(?:www\.)?invidious\.blamefran\.net',
- r'(?:www\.)?invidious\.moomoo\.de',
- r'(?:www\.)?ytb\.trom\.tf',
- r'(?:www\.)?yt\.cyberhost\.uk',
- r'(?:www\.)?kgg2m7yk5aybusll\.onion',
- r'(?:www\.)?qklhadlycap4cnod\.onion',
- r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
- r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
- r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
- r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
- r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
- r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
- r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
- r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
- r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
- r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
- )
+ IE_DESC = 'YouTube'
_VALID_URL = r"""(?x)^
(
(?:https?://|//) # http(s):// or protocol-independent URL
@@ -772,7 +892,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
- (?:(?:v|embed|e|shorts)/(?!videoseries)) # v/ or embed/ or e/ or shorts/
+ (?:(?:v|embed|e|shorts)/(?!videoseries|live_stream)) # v/ or embed/ or e/ or shorts/
|(?: # or the v= param in all its forms
(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
@@ -792,7 +912,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
(?(1).+)? # if we found the ID, everything can follow
(?:\#|$)""" % {
- 'invidious': '|'.join(_INVIDIOUS_SITES),
+ 'invidious': '|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES),
}
_PLAYER_INFO_RE = (
r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
@@ -923,18 +1043,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Philipp Hagemeister',
'uploader_id': 'phihag',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
+ 'channel': 'Philipp Hagemeister',
'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
'upload_date': '20121002',
- 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
+ 'description': 'md5:8fb536f4877b8a7455c2ec23794dbc22',
'categories': ['Science & Technology'],
'tags': ['youtube-dl'],
'duration': 10,
'view_count': int,
'like_count': int,
- 'dislike_count': int,
+ 'availability': 'public',
+ 'playable_in_embed': True,
+ 'thumbnail': 'https://i.ytimg.com/vi/BaW_jenozKc/maxresdefault.jpg',
+ 'live_status': 'not_live',
+ 'age_limit': 0,
'start_time': 1,
'end_time': 9,
+ 'channel_follower_count': int
}
},
{
@@ -963,14 +1089,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Philipp Hagemeister',
'uploader_id': 'phihag',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
+ 'channel': 'Philipp Hagemeister',
+ 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
+ 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
'upload_date': '20121002',
- 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
+ 'description': 'md5:8fb536f4877b8a7455c2ec23794dbc22',
'categories': ['Science & Technology'],
'tags': ['youtube-dl'],
'duration': 10,
'view_count': int,
'like_count': int,
- 'dislike_count': int,
+ 'availability': 'public',
+ 'playable_in_embed': True,
+ 'thumbnail': 'https://i.ytimg.com/vi/BaW_jenozKc/maxresdefault.jpg',
+ 'live_status': 'not_live',
+ 'age_limit': 0,
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1008,6 +1142,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': 'AfrojackVEVO',
'upload_date': '20131011',
'abr': 129.495,
+ 'like_count': int,
+ 'channel_id': 'UChuZAo1RKL85gev3Eal9_zg',
+ 'playable_in_embed': True,
+ 'channel_url': 'https://www.youtube.com/channel/UChuZAo1RKL85gev3Eal9_zg',
+ 'view_count': int,
+ 'track': 'The Spark',
+ 'live_status': 'not_live',
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/IB3lcPjvWLA/maxresdefault.webp',
+ 'channel': 'Afrojack',
+ 'uploader_url': 'http://www.youtube.com/user/AfrojackVEVO',
+ 'tags': 'count:19',
+ 'availability': 'public',
+ 'categories': ['Music'],
+ 'age_limit': 0,
+ 'alt_title': 'The Spark',
+ 'channel_follower_count': int
},
'params': {
'youtube_include_dash_manifest': True,
@@ -1029,6 +1179,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
'upload_date': '20140605',
'age_limit': 18,
+ 'categories': ['Gaming'],
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/HtVdAasjOgU/maxresdefault.webp',
+ 'availability': 'needs_auth',
+ 'channel_url': 'https://www.youtube.com/channel/UCzybXLxv08IApdjdN0mJhEg',
+ 'like_count': int,
+ 'channel': 'The Witcher',
+ 'live_status': 'not_live',
+ 'tags': 'count:17',
+ 'channel_id': 'UCzybXLxv08IApdjdN0mJhEg',
+ 'playable_in_embed': True,
+ 'view_count': int,
+ 'channel_follower_count': int
},
},
{
@@ -1043,6 +1205,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': 'FlyingKitty900',
'uploader': 'FlyingKitty',
'age_limit': 18,
+ 'availability': 'needs_auth',
+ 'channel_id': 'UCYQT13AtrJC0gsM1far_zJg',
+ 'uploader_url': 'http://www.youtube.com/user/FlyingKitty900',
+ 'channel': 'FlyingKitty',
+ 'channel_url': 'https://www.youtube.com/channel/UCYQT13AtrJC0gsM1far_zJg',
+ 'view_count': int,
+ 'categories': ['Entertainment'],
+ 'live_status': 'not_live',
+ 'tags': ['Flyingkitty', 'godzilla 2'],
+ 'thumbnail': 'https://i.ytimg.com/vi/HsUATh_Nc2U/maxresdefault.jpg',
+ 'like_count': int,
+ 'duration': 177,
+ 'playable_in_embed': True,
+ 'channel_follower_count': int
},
},
{
@@ -1052,11 +1228,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'Tq92D6wQ1mg',
'title': '[MMD] Adios - EVERGLOW [+Motion DL]',
'ext': 'mp4',
- 'upload_date': '20191227',
+ 'upload_date': '20191228',
'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
'uploader': 'Projekt Melody',
'description': 'md5:17eccca93a786d51bc67646756894066',
'age_limit': 18,
+ 'like_count': int,
+ 'availability': 'needs_auth',
+ 'uploader_url': 'http://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ',
+ 'channel_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
+ 'view_count': int,
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/Tq92D6wQ1mg/sddefault.webp',
+ 'channel': 'Projekt Melody',
+ 'live_status': 'not_live',
+ 'tags': ['mmd', 'dance', 'mikumikudance', 'kpop', 'vtuber'],
+ 'playable_in_embed': True,
+ 'categories': ['Entertainment'],
+ 'duration': 106,
+ 'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ',
+ 'channel_follower_count': int
},
},
{
@@ -1070,6 +1260,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': 'st3in234',
'description': 'Fan Video. Music & Lyrics by OOMPH!.',
'upload_date': '20130730',
+ 'track': 'Such mich find mich',
+ 'age_limit': 0,
+ 'tags': ['oomph', 'such mich find mich', 'lyrics', 'german industrial', 'musica industrial'],
+ 'like_count': int,
+ 'playable_in_embed': False,
+ 'creator': 'OOMPH!',
+ 'thumbnail': 'https://i.ytimg.com/vi/MeJVWBSsPAY/sddefault.jpg',
+ 'view_count': int,
+ 'alt_title': 'Such mich find mich',
+ 'duration': 210,
+ 'channel': 'Herr Lurik',
+ 'channel_id': 'UCdR3RSDPqub28LjZx0v9-aA',
+ 'categories': ['Music'],
+ 'availability': 'public',
+ 'uploader_url': 'http://www.youtube.com/user/st3in234',
+ 'channel_url': 'https://www.youtube.com/channel/UCdR3RSDPqub28LjZx0v9-aA',
+ 'live_status': 'not_live',
+ 'artist': 'OOMPH!',
+ 'channel_follower_count': int
},
},
{
@@ -1093,6 +1302,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'deadmau5',
'title': 'Deadmau5 - Some Chords (HD)',
'alt_title': 'Some Chords',
+ 'availability': 'public',
+ 'tags': 'count:14',
+ 'channel_id': 'UCYEK6xds6eo-3tr4xRdflmQ',
+ 'view_count': int,
+ 'live_status': 'not_live',
+ 'channel': 'deadmau5',
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/__2ABJjxzNo/maxresdefault.webp',
+ 'like_count': int,
+ 'track': 'Some Chords',
+ 'artist': 'deadmau5',
+ 'playable_in_embed': True,
+ 'age_limit': 0,
+ 'channel_url': 'https://www.youtube.com/channel/UCYEK6xds6eo-3tr4xRdflmQ',
+ 'categories': ['Music'],
+ 'album': 'Some Chords',
+ 'channel_follower_count': int
},
'expected_warnings': [
'DASH manifest missing',
@@ -1111,6 +1336,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
'uploader': 'Olympics',
'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
+ 'like_count': int,
+ 'release_timestamp': 1343767800,
+ 'playable_in_embed': True,
+ 'categories': ['Sports'],
+ 'release_date': '20120731',
+ 'channel': 'Olympics',
+ 'tags': ['Hockey', '2012-07-31', '31 July 2012', 'Riverbank Arena', 'Session', 'Olympics', 'Olympic Games', 'London 2012', '2012 Summer Olympics', 'Summer Games'],
+ 'channel_id': 'UCTl3QQTvqHFjurroKxexy2Q',
+ 'thumbnail': 'https://i.ytimg.com/vi/lqQg6PlCWgI/maxresdefault.jpg',
+ 'age_limit': 0,
+ 'availability': 'public',
+ 'live_status': 'was_live',
+ 'view_count': int,
+ 'channel_url': 'https://www.youtube.com/channel/UCTl3QQTvqHFjurroKxexy2Q',
+ 'channel_follower_count': int
},
'params': {
'skip_download': 'requires avconv',
@@ -1130,6 +1370,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
'uploader': '孫ᄋᄅ',
'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
+ 'playable_in_embed': True,
+ 'channel': '孫ᄋᄅ',
+ 'age_limit': 0,
+ 'tags': 'count:11',
+ 'channel_url': 'https://www.youtube.com/channel/UCS-xxCmRaA6BFdmgDPA_BIw',
+ 'channel_id': 'UCS-xxCmRaA6BFdmgDPA_BIw',
+ 'thumbnail': 'https://i.ytimg.com/vi/_b-2C3KPAM0/maxresdefault.jpg',
+ 'view_count': int,
+ 'categories': ['People & Blogs'],
+ 'like_count': int,
+ 'live_status': 'not_live',
+ 'availability': 'unlisted',
+ 'channel_follower_count': int
},
},
# url_encoded_fmt_stream_map is empty string
@@ -1286,6 +1539,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'track': 'Dark Walk',
'artist': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan',
'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/lsguqyKfVQg/maxresdefault.webp',
+ 'categories': ['Film & Animation'],
+ 'view_count': int,
+ 'live_status': 'not_live',
+ 'channel_url': 'https://www.youtube.com/channel/UCTSRgz5jylBvFt_S7wnsqLQ',
+ 'channel_id': 'UCTSRgz5jylBvFt_S7wnsqLQ',
+ 'tags': 'count:13',
+ 'availability': 'public',
+ 'channel': 'IronSoulElf',
+ 'playable_in_embed': True,
+ 'like_count': int,
+ 'age_limit': 0,
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1327,11 +1593,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'md5:e41008789470fc2533a3252216f1c1d1',
'description': 'md5:a677553cf0840649b731a3024aeff4cc',
'duration': 721,
- 'upload_date': '20150127',
+ 'upload_date': '20150128',
'uploader_id': 'BerkmanCenter',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
'uploader': 'The Berkman Klein Center for Internet & Society',
'license': 'Creative Commons Attribution license (reuse allowed)',
+ 'channel_id': 'UCuLGmD72gJDBwmLw06X58SA',
+ 'channel_url': 'https://www.youtube.com/channel/UCuLGmD72gJDBwmLw06X58SA',
+ 'like_count': int,
+ 'age_limit': 0,
+ 'tags': ['Copyright (Legal Subject)', 'Law (Industry)', 'William W. Fisher (Author)'],
+ 'channel': 'The Berkman Klein Center for Internet & Society',
+ 'availability': 'public',
+ 'view_count': int,
+ 'categories': ['Education'],
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/M4gD1WSo5mA/maxresdefault.webp',
+ 'live_status': 'not_live',
+ 'playable_in_embed': True,
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1346,11 +1625,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
'duration': 4060,
- 'upload_date': '20151119',
+ 'upload_date': '20151120',
'uploader': 'Bernie Sanders',
'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
'license': 'Creative Commons Attribution license (reuse allowed)',
+ 'playable_in_embed': True,
+ 'tags': 'count:12',
+ 'like_count': int,
+ 'channel_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
+ 'age_limit': 0,
+ 'availability': 'public',
+ 'categories': ['News & Politics'],
+ 'channel': 'Bernie Sanders',
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/eQcmzGIKrzg/maxresdefault.webp',
+ 'view_count': int,
+ 'live_status': 'not_live',
+ 'channel_url': 'https://www.youtube.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1400,6 +1692,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'series': 'Mind Field',
'season_number': 1,
'episode_number': 1,
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/iqKdEhx-dD4/maxresdefault.webp',
+ 'tags': 'count:12',
+ 'view_count': int,
+ 'availability': 'public',
+ 'age_limit': 0,
+ 'channel': 'Vsauce',
+ 'episode': 'Episode 1',
+ 'categories': ['Entertainment'],
+ 'season': 'Season 1',
+ 'channel_id': 'UC6nSFpj9HTCZ5t-N3Rm3-HA',
+ 'channel_url': 'https://www.youtube.com/channel/UC6nSFpj9HTCZ5t-N3Rm3-HA',
+ 'like_count': int,
+ 'playable_in_embed': True,
+ 'live_status': 'not_live',
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1493,6 +1800,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'album': 'it\'s too much love to know my dear',
'release_date': '20190313',
'release_year': 2019,
+ 'alt_title': 'Voyeur Girl',
+ 'view_count': int,
+ 'uploader_url': 'http://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA',
+ 'playable_in_embed': True,
+ 'like_count': int,
+ 'categories': ['Music'],
+ 'channel_url': 'https://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA',
+ 'channel': 'Stephen',
+ 'availability': 'public',
+ 'creator': 'Stephen',
+ 'duration': 169,
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/MgNrAu2pzNs/maxresdefault.webp',
+ 'age_limit': 0,
+ 'channel_id': 'UC-pWHpBjdGG69N9mM2auIAA',
+ 'tags': 'count:11',
+ 'live_status': 'not_live',
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1534,6 +1858,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20170613',
'uploader_id': 'ElevageOrVert',
'uploader': 'ElevageOrVert',
+ 'view_count': int,
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/x41yOUIvK2k/maxresdefault.webp',
+ 'uploader_url': 'http://www.youtube.com/user/ElevageOrVert',
+ 'like_count': int,
+ 'channel_id': 'UCo03ZQPBW5U4UC3regpt1nw',
+ 'tags': [],
+ 'channel_url': 'https://www.youtube.com/channel/UCo03ZQPBW5U4UC3regpt1nw',
+ 'availability': 'public',
+ 'age_limit': 0,
+ 'categories': ['Pets & Animals'],
+ 'duration': 7,
+ 'playable_in_embed': True,
+ 'live_status': 'not_live',
+ 'channel': 'ElevageOrVert',
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1553,6 +1892,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20130831',
'uploader_id': 'kudvenkat',
'uploader': 'kudvenkat',
+ 'channel_id': 'UCCTVrRB5KpIiK6V2GGVsR1Q',
+ 'like_count': int,
+ 'uploader_url': 'http://www.youtube.com/user/kudvenkat',
+ 'channel_url': 'https://www.youtube.com/channel/UCCTVrRB5KpIiK6V2GGVsR1Q',
+ 'live_status': 'not_live',
+ 'categories': ['Education'],
+ 'availability': 'public',
+ 'thumbnail': 'https://i.ytimg.com/vi/CHqg6qOn4no/sddefault.jpg',
+ 'tags': 'count:12',
+ 'playable_in_embed': True,
+ 'age_limit': 0,
+ 'view_count': int,
+ 'duration': 522,
+ 'channel': 'kudvenkat',
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1582,8 +1936,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'artist': 'The Cinematic Orchestra',
'track': 'Burn Out',
'album': 'Every Day',
- 'release_data': None,
- 'release_year': None,
+ 'like_count': int,
+ 'live_status': 'not_live',
+ 'alt_title': 'Burn Out',
+ 'duration': 614,
+ 'age_limit': 0,
+ 'view_count': int,
+ 'channel_url': 'https://www.youtube.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw',
+ 'creator': 'The Cinematic Orchestra',
+ 'channel': 'The Cinematic Orchestra',
+ 'tags': ['The Cinematic Orchestra', 'Every Day', 'Burn Out'],
+ 'channel_id': 'UCIzsJBIyo8hhpFm1NK0uLgw',
+ 'availability': 'public',
+ 'thumbnail': 'https://i.ytimg.com/vi/OtqTfy26tG0/maxresdefault.jpg',
+ 'categories': ['Music'],
+ 'playable_in_embed': True,
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1602,10 +1970,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'title': 'San Diego teen commits suicide after bullying over embarrassing video',
'channel_id': 'UC-SJ6nODDmufqBzPBwCvYvQ',
- 'uploader': 'CBS This Morning',
+ 'uploader': 'CBS Mornings',
'uploader_id': 'CBSThisMorning',
'upload_date': '20140716',
- 'description': 'md5:acde3a73d3f133fc97e837a9f76b53b7'
+ 'description': 'md5:acde3a73d3f133fc97e837a9f76b53b7',
+ 'duration': 170,
+ 'categories': ['News & Politics'],
+ 'uploader_url': 'http://www.youtube.com/user/CBSThisMorning',
+ 'view_count': int,
+ 'channel': 'CBS Mornings',
+ 'tags': ['suicide', 'bullying', 'video', 'cbs', 'news'],
+ 'thumbnail': 'https://i.ytimg.com/vi/SZJvDhaSDnc/hqdefault.jpg',
+ 'age_limit': 18,
+ 'availability': 'needs_auth',
+ 'channel_url': 'https://www.youtube.com/channel/UC-SJ6nODDmufqBzPBwCvYvQ',
+ 'like_count': int,
+ 'live_status': 'not_live',
+ 'playable_in_embed': True,
+ 'channel_follower_count': int
}
},
{
@@ -1620,6 +2002,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Walk around Japan',
'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
+ 'duration': 1456,
+ 'categories': ['Travel & Events'],
+ 'channel_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
+ 'view_count': int,
+ 'channel': 'Walk around Japan',
+ 'tags': ['Ueno Tokyo', 'Okachimachi Tokyo', 'Ameyoko Street', 'Tokyo attraction', 'Travel in Tokyo'],
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/cBvYw8_A0vQ/hqdefault.webp',
+ 'age_limit': 0,
+ 'availability': 'public',
+ 'channel_url': 'https://www.youtube.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
+ 'live_status': 'not_live',
+ 'playable_in_embed': True,
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1648,7 +2043,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'colinfurze',
'uploader_id': 'colinfurze',
'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCp68_FLety0O-n9QU6phsgw',
- 'description': 'md5:b5096f56af7ccd7a555c84db81738b22'
+ 'description': 'md5:5d5991195d599b56cd0c4148907eec50',
+ 'duration': 596,
+ 'categories': ['Entertainment'],
+ 'uploader_url': 'http://www.youtube.com/user/colinfurze',
+ 'view_count': int,
+ 'channel': 'colinfurze',
+ 'tags': ['Colin', 'furze', 'Terry', 'tunnel', 'underground', 'bunker'],
+ 'thumbnail': 'https://i.ytimg.com/vi/YOelRv7fMxY/maxresdefault.jpg',
+ 'age_limit': 0,
+ 'availability': 'public',
+ 'like_count': int,
+ 'live_status': 'not_live',
+ 'playable_in_embed': True,
+ 'channel_follower_count': int
},
'params': {
'format': '17', # 3gp format available on android
@@ -1666,6 +2074,120 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# shorts
'url': 'https://www.youtube.com/shorts/BGQWPY4IigY',
'only_matching': True,
+ }, {
+ 'note': 'Storyboards',
+ 'url': 'https://www.youtube.com/watch?v=5KLPxDtMqe8',
+ 'info_dict': {
+ 'id': '5KLPxDtMqe8',
+ 'ext': 'mhtml',
+ 'format_id': 'sb0',
+ 'title': 'Your Brain is Plastic',
+ 'uploader_id': 'scishow',
+ 'description': 'md5:89cd86034bdb5466cd87c6ba206cd2bc',
+ 'upload_date': '20140324',
+ 'uploader': 'SciShow',
+ 'like_count': int,
+ 'channel_id': 'UCZYTClx2T1of7BRZ86-8fow',
+ 'channel_url': 'https://www.youtube.com/channel/UCZYTClx2T1of7BRZ86-8fow',
+ 'view_count': int,
+ 'thumbnail': 'https://i.ytimg.com/vi/5KLPxDtMqe8/maxresdefault.jpg',
+ 'playable_in_embed': True,
+ 'tags': 'count:12',
+ 'uploader_url': 'http://www.youtube.com/user/scishow',
+ 'availability': 'public',
+ 'channel': 'SciShow',
+ 'live_status': 'not_live',
+ 'duration': 248,
+ 'categories': ['Education'],
+ 'age_limit': 0,
+ 'channel_follower_count': int
+ }, 'params': {'format': 'mhtml', 'skip_download': True}
+ }, {
+ # Ensure video upload_date is in UTC timezone (video was uploaded 1641170939)
+ 'url': 'https://www.youtube.com/watch?v=2NUZ8W2llS4',
+ 'info_dict': {
+ 'id': '2NUZ8W2llS4',
+ 'ext': 'mp4',
+ 'title': 'The NP that test your phone performance 🙂',
+ 'description': 'md5:144494b24d4f9dfacb97c1bbef5de84d',
+ 'uploader': 'Leon Nguyen',
+ 'uploader_id': 'VNSXIII',
+ 'uploader_url': 'http://www.youtube.com/user/VNSXIII',
+ 'channel_id': 'UCRqNBSOHgilHfAczlUmlWHA',
+ 'channel_url': 'https://www.youtube.com/channel/UCRqNBSOHgilHfAczlUmlWHA',
+ 'duration': 21,
+ 'view_count': int,
+ 'age_limit': 0,
+ 'categories': ['Gaming'],
+ 'tags': 'count:23',
+ 'playable_in_embed': True,
+ 'live_status': 'not_live',
+ 'upload_date': '20220103',
+ 'like_count': int,
+ 'availability': 'public',
+ 'channel': 'Leon Nguyen',
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/2NUZ8W2llS4/maxresdefault.webp',
+ 'channel_follower_count': int
+ }
+ }, {
+ # date text is for a premiered video; ensure upload date is in UTC (published 1641172509)
+ 'url': 'https://www.youtube.com/watch?v=mzZzzBU6lrM',
+ 'info_dict': {
+ 'id': 'mzZzzBU6lrM',
+ 'ext': 'mp4',
+ 'title': 'I Met GeorgeNotFound In Real Life...',
+ 'description': 'md5:cca98a355c7184e750f711f3a1b22c84',
+ 'uploader': 'Quackity',
+ 'uploader_id': 'QuackityHQ',
+ 'uploader_url': 'http://www.youtube.com/user/QuackityHQ',
+ 'channel_id': 'UC_8NknAFiyhOUaZqHR3lq3Q',
+ 'channel_url': 'https://www.youtube.com/channel/UC_8NknAFiyhOUaZqHR3lq3Q',
+ 'duration': 955,
+ 'view_count': int,
+ 'age_limit': 0,
+ 'categories': ['Entertainment'],
+ 'tags': 'count:26',
+ 'playable_in_embed': True,
+ 'live_status': 'not_live',
+ 'release_timestamp': 1641172509,
+ 'release_date': '20220103',
+ 'upload_date': '20220103',
+ 'like_count': int,
+ 'availability': 'public',
+ 'channel': 'Quackity',
+ 'thumbnail': 'https://i.ytimg.com/vi/mzZzzBU6lrM/maxresdefault.jpg',
+ 'channel_follower_count': int
+ }
+ },
+ { # continuous livestream. Microformat upload date should be preferred.
+ # Upload date was 2021-06-19 (not UTC), while stream start is 2021-11-27
+ 'url': 'https://www.youtube.com/watch?v=kgx4WGK0oNU',
+ 'info_dict': {
+ 'id': 'kgx4WGK0oNU',
+ 'title': r're:jazz\/lofi hip hop radio🌱chill beats to relax\/study to \[LIVE 24\/7\] \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
+ 'ext': 'mp4',
+ 'channel_id': 'UC84whx2xxsiA1gXHXXqKGOA',
+ 'availability': 'public',
+ 'age_limit': 0,
+ 'release_timestamp': 1637975704,
+ 'upload_date': '20210619',
+ 'channel_url': 'https://www.youtube.com/channel/UC84whx2xxsiA1gXHXXqKGOA',
+ 'live_status': 'is_live',
+ 'thumbnail': 'https://i.ytimg.com/vi/kgx4WGK0oNU/maxresdefault.jpg',
+ 'uploader': '阿鲍Abao',
+ 'uploader_url': 'http://www.youtube.com/channel/UC84whx2xxsiA1gXHXXqKGOA',
+ 'channel': 'Abao in Tokyo',
+ 'channel_follower_count': int,
+ 'release_date': '20211127',
+ 'tags': 'count:39',
+ 'categories': ['People & Blogs'],
+ 'like_count': int,
+ 'uploader_id': 'UC84whx2xxsiA1gXHXXqKGOA',
+ 'view_count': int,
+ 'playable_in_embed': True,
+ 'description': 'md5:2ef1d002cad520f65825346e2084e49d',
+ },
+ 'params': {'skip_download': True}
},
]
@@ -1683,18 +2205,158 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self._code_cache = {}
self._player_cache = {}
+ def _prepare_live_from_start_formats(self, formats, video_id, live_start_time, url, webpage_url, smuggled_data):
+ lock = threading.Lock()
+
+ is_live = True
+ start_time = time.time()
+ formats = [f for f in formats if f.get('is_from_start')]
+
+ def refetch_manifest(format_id, delay):
+ nonlocal formats, start_time, is_live
+ if time.time() <= start_time + delay:
+ return
+
+ _, _, prs, player_url = self._download_player_responses(url, smuggled_data, video_id, webpage_url)
+ video_details = traverse_obj(
+ prs, (..., 'videoDetails'), expected_type=dict, default=[])
+ microformats = traverse_obj(
+ prs, (..., 'microformat', 'playerMicroformatRenderer'),
+ expected_type=dict, default=[])
+ _, is_live, _, formats = self._list_formats(video_id, microformats, video_details, prs, player_url)
+ start_time = time.time()
+
+ def mpd_feed(format_id, delay):
+ """
+ @returns (manifest_url, manifest_stream_number, is_live) or None
+ """
+ with lock:
+ refetch_manifest(format_id, delay)
+
+ f = next((f for f in formats if f['format_id'] == format_id), None)
+ if not f:
+ if not is_live:
+ self.to_screen(f'{video_id}: Video is no longer live')
+ else:
+ self.report_warning(
+ f'Cannot find refreshed manifest for format {format_id}{bug_reports_message()}')
+ return None
+ return f['manifest_url'], f['manifest_stream_number'], is_live
+
+ for f in formats:
+ f['is_live'] = True
+ f['protocol'] = 'http_dash_segments_generator'
+ f['fragments'] = functools.partial(
+ self._live_dash_fragments, f['format_id'], live_start_time, mpd_feed)
+
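The 'fragments' value installed above is a callable rather than a precomputed list; a toy sketch of the assumed downloader-side contract (the fake generator and ctx shape are assumptions, not hypervideo's actual downloader API):

    import functools

    def fake_live_dash_fragments(format_id, live_start_time, mpd_feed, ctx):
        # Stand-in generator; the real method polls the manifest indefinitely.
        yield {'url': 'https://example.invalid/sq/%d' % ctx.get('start', 0)}

    fmt = {'format_id': '299'}
    fmt['fragments'] = functools.partial(
        fake_live_dash_fragments, fmt['format_id'], None, None)
    list(fmt['fragments']({'start': 42}))  # [{'url': 'https://example.invalid/sq/42'}]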
+ def _live_dash_fragments(self, format_id, live_start_time, mpd_feed, ctx):
+ FETCH_SPAN, MAX_DURATION = 5, 432000
+
+ mpd_url, stream_number, is_live = None, None, True
+
+ begin_index = 0
+ download_start_time = ctx.get('start') or time.time()
+
+ lack_early_segments = download_start_time - (live_start_time or download_start_time) > MAX_DURATION
+ if lack_early_segments:
+ self.report_warning(bug_reports_message(
+ 'Starting download from the last 120 hours of the live stream since '
+ 'YouTube does not have data before that. If you think this is wrong,'), only_once=True)
+ lack_early_segments = True
+
+ known_idx, no_fragment_score, last_segment_url = begin_index, 0, None
+ fragments, fragment_base_url = None, None
+
+ def _extract_sequence_from_mpd(refresh_sequence, immediate):
+ nonlocal mpd_url, stream_number, is_live, no_fragment_score, fragments, fragment_base_url
+ # Obtain from MPD's maximum seq value
+ old_mpd_url = mpd_url
+ last_error = ctx.pop('last_error', None)
+ expire_fast = immediate or last_error and isinstance(last_error, compat_HTTPError) and last_error.code == 403
+ mpd_url, stream_number, is_live = (mpd_feed(format_id, 5 if expire_fast else 18000)
+ or (mpd_url, stream_number, False))
+ if not refresh_sequence:
+ if expire_fast and not is_live:
+ return False, last_seq
+ elif old_mpd_url == mpd_url:
+ return True, last_seq
+ try:
+ fmts, _ = self._extract_mpd_formats_and_subtitles(
+ mpd_url, None, note=False, errnote=False, fatal=False)
+ except ExtractorError:
+ fmts = None
+ if not fmts:
+ no_fragment_score += 2
+ return False, last_seq
+ fmt_info = next(x for x in fmts if x['manifest_stream_number'] == stream_number)
+ fragments = fmt_info['fragments']
+ fragment_base_url = fmt_info['fragment_base_url']
+ assert fragment_base_url
+
+ _last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1))
+ return True, _last_seq
+
+ while is_live:
+ fetch_time = time.time()
+ if no_fragment_score > 30:
+ return
+ if last_segment_url:
+ # Obtain from "X-Head-Seqnum" header value from each segment
+ try:
+ urlh = self._request_webpage(
+ last_segment_url, None, note=False, errnote=False, fatal=False)
+ except ExtractorError:
+ urlh = None
+ last_seq = try_get(urlh, lambda x: int_or_none(x.headers['X-Head-Seqnum']))
+ if last_seq is None:
+ no_fragment_score += 2
+ last_segment_url = None
+ continue
+ else:
+ should_continue, last_seq = _extract_sequence_from_mpd(True, no_fragment_score > 15)
+ no_fragment_score += 2
+ if not should_continue:
+ continue
+
+ if known_idx > last_seq:
+ last_segment_url = None
+ continue
+
+ last_seq += 1
+
+ if begin_index < 0 and known_idx < 0:
+ # skip from the start when it's a negative value
+ known_idx = last_seq + begin_index
+ if lack_early_segments:
+ known_idx = max(known_idx, last_seq - int(MAX_DURATION // fragments[-1]['duration']))
+ try:
+ for idx in range(known_idx, last_seq):
+ # do not update the sequence here, or parts of the stream may be skipped
+ should_continue, _ = _extract_sequence_from_mpd(False, False)
+ if not should_continue:
+ known_idx = idx - 1
+ raise ExtractorError('breaking out of outer loop')
+ last_segment_url = urljoin(fragment_base_url, 'sq/%d' % idx)
+ yield {
+ 'url': last_segment_url,
+ }
+ if known_idx == last_seq:
+ no_fragment_score += 5
+ else:
+ no_fragment_score = 0
+ known_idx = last_seq
+ except ExtractorError:
+ continue
+
+ time.sleep(max(0, FETCH_SPAN + fetch_time - time.time()))
+
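The final sleep paces the polling loop so consecutive manifest probes stay roughly FETCH_SPAN seconds apart; in isolation:

    import time

    FETCH_SPAN = 5
    fetch_time = time.time()
    # ... fetch and yield fragments ...
    time.sleep(max(0, FETCH_SPAN + fetch_time - time.time()))
    # If the work took 2s, sleep ~3s; if it took longer than FETCH_SPAN, don't sleep.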
def _extract_player_url(self, *ytcfgs, webpage=None):
player_url = traverse_obj(
ytcfgs, (..., 'PLAYER_JS_URL'), (..., 'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'jsUrl'),
get_all=False, expected_type=compat_str)
if not player_url:
return
- if player_url.startswith('//'):
- player_url = 'https:' + player_url
- elif not re.match(r'https?://', player_url):
- player_url = compat_urlparse.urljoin(
- 'https://www.youtube.com', player_url)
- return player_url
+ return urljoin('https://www.youtube.com', player_url)
def _download_player_url(self, video_id, fatal=False):
res = self._download_webpage(
@@ -1720,7 +2382,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
raise ExtractorError('Cannot identify player %r' % player_url)
return id_m.group('id')
- def _load_player(self, video_id, player_url, fatal=True) -> bool:
+ def _load_player(self, video_id, player_url, fatal=True):
player_id = self._extract_player_info(player_url)
if player_id not in self._code_cache:
code = self._download_webpage(
@@ -1729,7 +2391,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
errnote='Download of %s failed' % player_url)
if code:
self._code_cache[player_id] = code
- return player_id in self._code_cache
+ return self._code_cache.get(player_id)
def _extract_signature_function(self, video_id, player_url, example_sig):
player_id = self._extract_player_info(player_url)
@@ -1743,8 +2405,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if cache_spec is not None:
return lambda s: ''.join(s[i] for i in cache_spec)
- if self._load_player(video_id, player_url):
- code = self._code_cache[player_id]
+ code = self._load_player(video_id, player_url)
+ if code:
res = self._parse_sig_js(code)
test_string = ''.join(map(compat_chr, range(len(example_sig))))
@@ -1755,6 +2417,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return res
def _print_sig_code(self, func, example_sig):
+ if not self.get_param('youtube_print_sig_code'):
+ return
+
def gen_sig_code(idxs):
def _genslice(start, end, step):
starts = '' if start == 0 else str(start)
@@ -1831,13 +2496,59 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
)
self._player_cache[player_id] = func
func = self._player_cache[player_id]
- if self.get_param('youtube_print_sig_code'):
- self._print_sig_code(func, s)
+ self._print_sig_code(func, s)
return func(s)
except Exception as e:
- tb = traceback.format_exc()
- raise ExtractorError(
- 'Signature extraction failed: ' + tb, cause=e)
+ raise ExtractorError('Signature extraction failed: ' + traceback.format_exc(), cause=e)
+
+ def _decrypt_nsig(self, s, video_id, player_url):
+ """Turn the encrypted n field into a working signature"""
+ if player_url is None:
+ raise ExtractorError('Cannot decrypt nsig without player_url')
+ player_url = urljoin('https://www.youtube.com', player_url)
+
+ sig_id = ('nsig_value', s)
+ if sig_id in self._player_cache:
+ return self._player_cache[sig_id]
+
+ try:
+ player_id = ('nsig', player_url)
+ if player_id not in self._player_cache:
+ self._player_cache[player_id] = self._extract_n_function(video_id, player_url)
+ func = self._player_cache[player_id]
+ self._player_cache[sig_id] = func(s)
+ self.write_debug(f'Decrypted nsig {s} => {self._player_cache[sig_id]}')
+ return self._player_cache[sig_id]
+ except Exception as e:
+ raise ExtractorError(traceback.format_exc(), cause=e, video_id=video_id)
+
+ def _extract_n_function_name(self, jscode):
+ nfunc, idx = self._search_regex(
+ r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)',
+ jscode, 'Initial JS player n function name', group=('nfunc', 'idx'))
+ if not idx:
+ return nfunc
+ return json.loads(js_to_json(self._search_regex(
+ rf'var {re.escape(nfunc)}\s*=\s*(\[.+?\]);', jscode,
+ f'Initial JS player n function list ({nfunc}.{idx})')))[int(idx)]
+
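Against a fabricated player-JS fragment (shaped like real player code, not copied from it), the name extraction above behaves like:

    import re

    jscode = 'a.get("n"))&&(b=kWa[0](b)'  # fabricated sample input
    mobj = re.search(
        r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)',
        jscode)
    mobj.group('nfunc'), mobj.group('idx')  # ('kWa', '0') -> resolved via the JS name array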
+ def _extract_n_function(self, video_id, player_url):
+ player_id = self._extract_player_info(player_url)
+ func_code = self._downloader.cache.load('youtube-nsig', player_id)
+
+ if func_code:
+ jsi = JSInterpreter(func_code)
+ else:
+ jscode = self._load_player(video_id, player_url)
+ funcname = self._extract_n_function_name(jscode)
+ jsi = JSInterpreter(jscode)
+ func_code = jsi.extract_function_code(funcname)
+ self._downloader.cache.store('youtube-nsig', player_id, func_code)
+
+ if self.get_param('youtube_print_sig_code'):
+ self.to_screen(f'Extracted nsig function from {player_id}:\n{func_code[1]}\n')
+
+ return lambda s: jsi.extract_function_from_code(*func_code)([s])
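A hedged usage sketch of how the returned transform gets applied to a throttled URL's 'n' query parameter (the helper below is illustrative; the actual update happens in _extract_formats):

    from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

    def unthrottle(url, decrypt_n):
        # Replace the throttling 'n' parameter with its decrypted form.
        parts = urlsplit(url)
        query = parse_qs(parts.query)
        if query.get('n'):
            query['n'] = [decrypt_n(query['n'][0])]
        return urlunsplit(parts._replace(query=urlencode(query, doseq=True)))

    unthrottle('https://example.invalid/videoplayback?n=abc123&itag=22',
               lambda s: s[::-1])  # stand-in transform, not the real nsig function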
def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False):
"""
@@ -1856,18 +2567,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
raise ExtractorError(error_msg)
self.report_warning(error_msg)
return
- if self._load_player(video_id, player_url, fatal=fatal):
- player_id = self._extract_player_info(player_url)
- code = self._code_cache[player_id]
+ code = self._load_player(video_id, player_url, fatal=fatal)
+ if code:
sts = int_or_none(self._search_regex(
r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code,
'JS player signature timestamp', group='sts', fatal=fatal))
return sts
def _mark_watched(self, video_id, player_responses):
- playback_url = traverse_obj(
- player_responses, (..., 'playbackTracking', 'videostatsPlaybackUrl', 'baseUrl'),
- expected_type=url_or_none, get_all=False)
+ playback_url = get_first(
+ player_responses, ('playbackTracking', 'videostatsPlaybackUrl', 'baseUrl'),
+ expected_type=url_or_none)
if not playback_url:
self.report_warning('Unable to mark watched')
return
@@ -1991,19 +2701,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
regex), webpage, name, default='{}'), video_id, fatal=False)
- @staticmethod
- def parse_time_text(time_text):
- """
- Parse the comment time text
- time_text is in the format 'X units ago (edited)'
- """
- time_text_split = time_text.split(' ')
- if len(time_text_split) >= 3:
- try:
- return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto')
- except ValueError:
- return None
-
def _extract_comment(self, comment_renderer, parent=None):
comment_id = comment_renderer.get('commentId')
if not comment_id:
@@ -2012,10 +2709,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
text = self._get_text(comment_renderer, 'contentText')
# note: timestamp is an estimate calculated from the current time and time_text
- time_text = self._get_text(comment_renderer, 'publishedTimeText') or ''
- time_text_dt = self.parse_time_text(time_text)
- if isinstance(time_text_dt, datetime.datetime):
- timestamp = calendar.timegm(time_text_dt.timetuple())
+ timestamp, time_text = self._extract_time_text(comment_renderer, 'publishedTimeText')
author = self._get_text(comment_renderer, 'authorText')
author_id = try_get(comment_renderer,
lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
@@ -2042,20 +2736,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'parent': parent or 'root'
}
- def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, comment_counts=None):
+ def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, tracker=None):
+
+ get_single_config_arg = lambda c: self._configuration_arg(c, [''])[0]
def extract_header(contents):
_continuation = None
for content in contents:
- comments_header_renderer = try_get(content, lambda x: x['commentsHeaderRenderer'])
- expected_comment_count = parse_count(self._get_text(
- comments_header_renderer, 'countText', 'commentsCount', max_runs=1))
+ comments_header_renderer = traverse_obj(content, 'commentsHeaderRenderer')
+ expected_comment_count = self._get_count(
+ comments_header_renderer, 'countText', 'commentsCount')
if expected_comment_count:
- comment_counts[1] = expected_comment_count
- self.to_screen('Downloading ~%d comments' % expected_comment_count)
- sort_mode_str = self._configuration_arg('comment_sort', [''])[0]
- comment_sort_index = int(sort_mode_str != 'top') # 1 = new, 0 = top
+ tracker['est_total'] = expected_comment_count
+ self.to_screen(f'Downloading ~{expected_comment_count} comments')
+ comment_sort_index = int(get_single_config_arg('comment_sort') != 'top') # 1 = new, 0 = top
sort_menu_item = try_get(
comments_header_renderer,
@@ -2066,76 +2761,84 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if not _continuation:
continue
- sort_text = sort_menu_item.get('title')
- if isinstance(sort_text, compat_str):
- sort_text = sort_text.lower()
- else:
+ sort_text = str_or_none(sort_menu_item.get('title'))
+ if not sort_text:
sort_text = 'top comments' if comment_sort_index == 0 else 'newest first'
- self.to_screen('Sorting comments by %s' % sort_text)
+ self.to_screen('Sorting comments by %s' % sort_text.lower())
break
return _continuation
def extract_thread(contents):
if not parent:
- comment_counts[2] = 0
+ tracker['current_page_thread'] = 0
for content in contents:
+ if not parent and tracker['total_parent_comments'] >= max_parents:
+ yield
comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
- comment_renderer = try_get(
- comment_thread_renderer, (lambda x: x['comment']['commentRenderer'], dict)) or try_get(
- content, (lambda x: x['commentRenderer'], dict))
+ comment_renderer = get_first(
+ (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
+ expected_type=dict, default={})
- if not comment_renderer:
- continue
comment = self._extract_comment(comment_renderer, parent)
if not comment:
continue
- comment_counts[0] += 1
+
+ tracker['running_total'] += 1
+ tracker['total_reply_comments' if parent else 'total_parent_comments'] += 1
yield comment
+
# Attempt to get the replies
comment_replies_renderer = try_get(
comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)
if comment_replies_renderer:
- comment_counts[2] += 1
+ tracker['current_page_thread'] += 1
comment_entries_iter = self._comment_entries(
comment_replies_renderer, ytcfg, video_id,
- parent=comment.get('id'), comment_counts=comment_counts)
-
- for reply_comment in comment_entries_iter:
+ parent=comment.get('id'), tracker=tracker)
+ for reply_comment in itertools.islice(comment_entries_iter, min(max_replies_per_thread, max(0, max_replies - tracker['total_reply_comments']))):
yield reply_comment
+ # Keeps track of counts across recursive calls
+ if not tracker:
+ tracker = dict(
+ running_total=0,
+ est_total=0,
+ current_page_thread=0,
+ total_parent_comments=0,
+ total_reply_comments=0)
+
+ # TODO: Deprecated
# YouTube comments have a max depth of 2
- max_depth = int_or_none(self._configuration_arg('max_comment_depth', [''])[0]) or float('inf')
+ max_depth = int_or_none(get_single_config_arg('max_comment_depth'))
+ if max_depth:
+ self._downloader.deprecation_warning(
+ '[youtube] max_comment_depth extractor argument is deprecated. Set max replies in the max-comments extractor argument instead.')
if max_depth == 1 and parent:
return
- if not comment_counts:
- # comment so far, est. total comments, current comment thread #
- comment_counts = [0, 0, 0]
- continuation = self._extract_continuation(root_continuation_data)
- if continuation and len(continuation['continuation']) < 27:
- self.write_debug('Detected old API continuation token. Generating new API compatible token.')
- continuation_token = self._generate_comment_continuation(video_id)
- continuation = self._build_api_continuation_query(continuation_token, None)
+ max_comments, max_parents, max_replies, max_replies_per_thread, *_ = map(
+ lambda p: int_or_none(p, default=sys.maxsize), self._configuration_arg('max_comments') + [''] * 4)
+ continuation = self._extract_continuation(root_continuation_data)
message = self._get_text(root_continuation_data, ('contents', ..., 'messageRenderer', 'text'), max_runs=1)
if message and not parent:
self.report_warning(message, video_id=video_id)
- visitor_data = None
+ response = None
is_first_continuation = parent is None
for page_num in itertools.count(0):
if not continuation:
break
- headers = self.generate_api_headers(ytcfg=ytcfg, visitor_data=visitor_data)
- comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
+ headers = self.generate_api_headers(ytcfg=ytcfg, visitor_data=self._extract_visitor_data(response))
+ comment_prog_str = f"({tracker['running_total']}/{tracker['est_total']})"
if page_num == 0:
if is_first_continuation:
note_prefix = 'Downloading comment section API JSON'
else:
note_prefix = ' Downloading comment API JSON reply thread %d %s' % (
- comment_counts[2], comment_prog_str)
+ tracker['current_page_thread'], comment_prog_str)
else:
note_prefix = '%sDownloading comment%s API JSON page %d %s' % (
' ' if parent else '', ' replies' if parent else '',
@@ -2144,83 +2847,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
response = self._extract_response(
item_id=None, query=continuation,
ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
- check_get_keys=('onResponseReceivedEndpoints', 'continuationContents'))
- if not response:
- break
- visitor_data = try_get(
- response,
- lambda x: x['responseContext']['webResponseContextExtensionData']['ytConfigData']['visitorData'],
- compat_str) or visitor_data
+ check_get_keys='onResponseReceivedEndpoints')
- continuation_contents = dict_get(response, ('onResponseReceivedEndpoints', 'continuationContents'))
+ continuation_contents = traverse_obj(
+ response, 'onResponseReceivedEndpoints', expected_type=list, default=[])
continuation = None
- if isinstance(continuation_contents, list):
- for continuation_section in continuation_contents:
- if not isinstance(continuation_section, dict):
- continue
- continuation_items = try_get(
- continuation_section,
- (lambda x: x['reloadContinuationItemsCommand']['continuationItems'],
- lambda x: x['appendContinuationItemsAction']['continuationItems']),
- list) or []
- if is_first_continuation:
- continuation = extract_header(continuation_items)
- is_first_continuation = False
- if continuation:
- break
- continue
- count = 0
- for count, entry in enumerate(extract_thread(continuation_items)):
- yield entry
- continuation = self._extract_continuation({'contents': continuation_items})
+ for continuation_section in continuation_contents:
+ continuation_items = traverse_obj(
+ continuation_section,
+ (('reloadContinuationItemsCommand', 'appendContinuationItemsAction'), 'continuationItems'),
+ get_all=False, expected_type=list) or []
+ if is_first_continuation:
+ continuation = extract_header(continuation_items)
+ is_first_continuation = False
if continuation:
- # Sometimes YouTube provides a continuation without any comments
- # In most cases we end up just downloading these with very little comments to come.
- if count == 0:
- if not parent:
- self.report_warning('No comments received - assuming end of comments')
- continuation = None
break
+ continue
- # Deprecated response structure
- elif isinstance(continuation_contents, dict):
- known_continuation_renderers = ('itemSectionContinuation', 'commentRepliesContinuation')
- for key, continuation_renderer in continuation_contents.items():
- if key not in known_continuation_renderers:
- continue
- if not isinstance(continuation_renderer, dict):
- continue
- if is_first_continuation:
- header_continuation_items = [continuation_renderer.get('header') or {}]
- continuation = extract_header(header_continuation_items)
- is_first_continuation = False
- if continuation:
- break
-
- # Sometimes YouTube provides a continuation without any comments
- # In most cases we end up just downloading these with very little comments to come.
- count = 0
- for count, entry in enumerate(extract_thread(continuation_renderer.get('contents') or {})):
- yield entry
- continuation = self._extract_continuation(continuation_renderer)
- if count == 0:
- if not parent:
- self.report_warning('No comments received - assuming end of comments')
- continuation = None
+ for entry in extract_thread(continuation_items):
+ if not entry:
+ return
+ yield entry
+ continuation = self._extract_continuation({'contents': continuation_items})
+ if continuation:
break
- @staticmethod
- def _generate_comment_continuation(video_id):
- """
- Generates initial comment section continuation token from given video id
- """
- b64_vid_id = base64.b64encode(bytes(video_id.encode('utf-8')))
- parts = ('Eg0SCw==', b64_vid_id, 'GAYyJyIRIgs=', b64_vid_id, 'MAB4AjAAQhBjb21tZW50cy1zZWN0aW9u')
- new_continuation_intlist = list(itertools.chain.from_iterable(
- [bytes_to_intlist(base64.b64decode(part)) for part in parts]))
- return base64.b64encode(intlist_to_bytes(new_continuation_intlist)).decode('utf-8')
-
def _get_comments(self, ytcfg, video_id, contents, webpage):
"""Entry for comment extraction"""
def _real_comment_extract(contents):
@@ -2230,11 +2882,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
yield from self._comment_entries(renderer, ytcfg, video_id)
max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0])
- # Force English regardless of account setting to prevent parsing issues
- # See: https://github.com/hypervideo/hypervideo/issues/532
- ytcfg = copy.deepcopy(ytcfg)
- traverse_obj(
- ytcfg, ('INNERTUBE_CONTEXT', 'client'), expected_type=dict, default={})['hl'] = 'en'
return itertools.islice(_real_comment_extract(contents), 0, max_comments)
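islice caps the comment generator lazily, so extraction stops as soon as max_comments entries have been yielded without exhausting the source; in miniature:

    import itertools

    def fake_comments():
        i = 0
        while True:
            i += 1
            yield {'id': i}

    list(itertools.islice(fake_comments(), 0, 3))  # [{'id': 1}, {'id': 2}, {'id': 3}]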
@staticmethod
@@ -2290,18 +2937,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _get_requested_clients(self, url, smuggled_data):
requested_clients = []
+ default = ['android', 'web']
allowed_clients = sorted(
[client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'],
key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True)
for client in self._configuration_arg('player_client'):
if client in allowed_clients:
requested_clients.append(client)
+ elif client == 'default':
+ requested_clients.extend(default)
elif client == 'all':
requested_clients.extend(allowed_clients)
else:
self.report_warning(f'Skipping unsupported client {client}')
if not requested_clients:
- requested_clients = ['android', 'web']
+ requested_clients = default
if smuggled_data.get('is_music_url') or self.is_music_url(url):
requested_clients.extend(
@@ -2316,7 +2966,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
}.get(client)
if not url:
return {}
- webpage = self._download_webpage(url, video_id, fatal=False, note=f'Downloading {client} config')
+ webpage = self._download_webpage(url, video_id, fatal=False, note='Downloading %s config' % client.replace('_', ' ').strip())
return self.extract_ytcfg(video_id, webpage) or {}
def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg):
@@ -2326,13 +2976,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
video_id, 'initial player response')
- original_clients = clients
+ all_clients = set(clients)
clients = clients[::-1]
prs = []
- def append_client(client_name):
- if client_name in INNERTUBE_CLIENTS and client_name not in original_clients:
- clients.append(client_name)
+ def append_client(*client_names):
+ """ Append the first client name that exists but not already used """
+ for client_name in client_names:
+ actual_client = _split_innertube_client(client_name)[0]
+ if actual_client in INNERTUBE_CLIENTS:
+ if actual_client not in all_clients:
+ clients.append(client_name)
+ all_clients.add(actual_client)
+ return
# Android player_response does not have microFormats which are needed for
# extraction of some data. So we return the initial_pr with formats
@@ -2347,7 +3003,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
tried_iframe_fallback = False
player_url = None
while clients:
- client = clients.pop()
+ client, base_client, variant = _split_innertube_client(clients.pop())
player_ytcfg = master_ytcfg if client == 'web' else {}
if 'configs' not in self._configuration_arg('player_skip'):
player_ytcfg = self._extract_player_ytcfg(client, video_id) or player_ytcfg
@@ -2375,10 +3031,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
prs.append(pr)
# creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in
- if client.endswith('_agegate') and self._is_unplayable(pr) and self.is_authenticated:
- append_client(client.replace('_agegate', '_creator'))
+ if variant == 'embedded' and self._is_unplayable(pr) and self.is_authenticated:
+ append_client(f'{base_client}_creator')
elif self._is_agegated(pr):
- append_client(f'{client}_agegate')
+ if variant == 'tv_embedded':
+ append_client(f'{base_client}_embedded')
+ elif not variant:
+ append_client(f'tv_embedded.{base_client}', f'{base_client}_embedded')
if last_error:
if not len(prs):
@@ -2386,8 +3045,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self.report_warning(last_error)
return prs, player_url
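Summarizing the client fallback above (client names assumed from INNERTUBE_CLIENTS; the first name in each append_client call that exists and is unused wins):

    # age-gated, bare client (e.g. 'web')       -> try 'tv_embedded.web',
    #                                              else 'web_embedded'
    # age-gated via the 'tv_embedded' variant   -> try 'web_embedded'
    # unplayable from an embedded variant,
    # while authenticated                       -> try 'web_creator'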
- def _extract_formats(self, streaming_data, video_id, player_url, is_live):
- itags, stream_ids = [], []
+ def _extract_formats(self, streaming_data, video_id, player_url, is_live, duration):
+ itags, stream_ids = {}, []
itag_qualities, res_qualities = {}, {}
q = qualities([
# Normally tiny is the smallest video-only formats. But
@@ -2399,7 +3058,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[])
for fmt in streaming_formats:
- if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
+ if fmt.get('targetDurationSec'):
continue
itag = str_or_none(fmt.get('itag'))
@@ -2440,28 +3099,56 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
fmt_url += '&' + sp + '=' + signature
+ query = parse_qs(fmt_url)
+ throttled = False
+ if query.get('n'):
+ try:
+ fmt_url = update_url_query(fmt_url, {
+ 'n': self._decrypt_nsig(query['n'][0], video_id, player_url)})
+ except ExtractorError as e:
+ self.report_warning(
+ f'nsig extraction failed: You may experience throttling for some formats\n'
+ f'n = {query["n"][0]} ; player = {player_url}\n{e}', only_once=True)
+ throttled = True
+
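# [Sketch, not part of the patch] The block above rewrites the 'n' query
# parameter with its decrypted value to avoid download throttling.
# update_url_query is the project's helper; a generic stand-in for that kind
# of query rewrite, using only the standard library, looks roughly like this:
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

def replace_query_param(url, key, value):
    # Parse the query string, overwrite one key, and rebuild the URL.
    parts = urlparse(url)
    query = parse_qs(parts.query)
    query[key] = [value]
    return urlunparse(parts._replace(query=urlencode(query, doseq=True)))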
if itag:
- itags.append(itag)
+ itags[itag] = 'https'
stream_ids.append(stream_id)
- tbr = float_or_none(
- fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
+ tbr = float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
+ language_preference = (
+ 10 if audio_track.get('audioIsDefault')
+ else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower()
+ else -1)
+ # Some formats may have a much smaller duration than others (possibly damaged during encoding)
+ # E.g. 2-nOtRESiUc; Ref: https://github.com/hypervideo/hypervideo/issues/2823
+ # Make sure to avoid false positives with small duration differences.
+ # E.g. __2ABJjxzNo, ySuUZEjARPY
+ is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) / duration < 500)
+ if is_damaged:
+ self.report_warning(f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True)
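
# [Worked example, not part of the patch] approxDurationMs is in milliseconds
# while duration is in seconds, so an intact format gives a ratio near 1000;
# a ratio below 500 means the format covers less than half the video:
assert 60_000 / 60 == 1000      # intact 60 s format of a 60 s video
assert (25_000 / 60) < 500      # 25 s format of a 60 s video -> damaged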
dct = {
'asr': int_or_none(fmt.get('audioSampleRate')),
'filesize': int_or_none(fmt.get('contentLength')),
'format_id': itag,
- 'format_note': ', '.join(filter(None, (
+ 'format_note': join_nonempty(
'%s%s' % (audio_track.get('displayName') or '',
- ' (default)' if audio_track.get('audioIsDefault') else ''),
- fmt.get('qualityLabel') or quality.replace('audio_quality_', '')))),
- 'fps': int_or_none(fmt.get('fps')),
+ ' (default)' if language_preference > 0 else ''),
+ fmt.get('qualityLabel') or quality.replace('audio_quality_', ''),
+ throttled and 'THROTTLED', is_damaged and 'DAMAGED', delim=', '),
+ 'source_preference': -10 if throttled else -1,
+ 'fps': int_or_none(fmt.get('fps')) or None,
'height': height,
'quality': q(quality),
+ 'has_drm': bool(fmt.get('drmFamilies')),
'tbr': tbr,
'url': fmt_url,
'width': int_or_none(fmt.get('width')),
- 'language': audio_track.get('id', '').split('.')[0],
- 'language_preference': 1 if audio_track.get('audioIsDefault') else -1,
+ 'language': join_nonempty(audio_track.get('id', '').split('.')[0],
+ 'desc' if language_preference < -1 else ''),
+ 'language_preference': language_preference,
+ # Strictly de-prioritize damaged and 3gp formats
+ 'preference': -10 if is_damaged else -2 if itag == '17' else None,
}
mime_mobj = re.match(
r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')
@@ -2483,59 +3170,84 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
dct['container'] = dct['ext'] + '_dash'
yield dct
+ live_from_start = is_live and self.get_param('live_from_start')
skip_manifests = self._configuration_arg('skip')
- get_dash = (
- (not is_live or self._configuration_arg('include_live_dash'))
- and 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True))
- get_hls = 'hls' not in skip_manifests and self.get_param('youtube_include_hls_manifest', True)
+ if not self.get_param('youtube_include_hls_manifest', True):
+ skip_manifests.append('hls')
+ get_dash = 'dash' not in skip_manifests and (
+ not is_live or live_from_start or self._configuration_arg('include_live_dash'))
+ get_hls = not live_from_start and 'hls' not in skip_manifests
+
+ def process_manifest_format(f, proto, itag):
+ if itag in itags:
+ if itags[itag] == proto or f'{itag}-{proto}' in itags:
+ return False
+ itag = f'{itag}-{proto}'
+ if itag:
+ f['format_id'] = itag
+ itags[itag] = proto
- def guess_quality(f):
- for val, qdict in ((f.get('format_id'), itag_qualities), (f.get('height'), res_qualities)):
- if val in qdict:
- return q(qdict[val])
- return -1
+ f['quality'] = next((
+ q(qdict[val])
+ for val, qdict in ((f.get('format_id', '').split('-')[0], itag_qualities), (f.get('height'), res_qualities))
+ if val in qdict), -1)
+ return True
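
# [Sketch, not part of the patch] The dedup rule above in isolation: the first
# protocol seen for an itag keeps the bare id, a second protocol gets a
# suffixed id, and an exact repeat is dropped.
def dedup_itag(itags, itag, proto):
    if itag in itags:
        if itags[itag] == proto or f'{itag}-{proto}' in itags:
            return None  # duplicate format: drop it
        itag = f'{itag}-{proto}'
    itags[itag] = proto
    return itag

itags = {}
assert dedup_itag(itags, '22', 'https') == '22'
assert dedup_itag(itags, '22', 'hls') == '22-hls'
assert dedup_itag(itags, '22', 'hls') is None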
for sd in streaming_data:
hls_manifest_url = get_hls and sd.get('hlsManifestUrl')
if hls_manifest_url:
for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False):
- itag = self._search_regex(
- r'/itag/(\d+)', f['url'], 'itag', default=None)
- if itag in itags:
- itag += '-hls'
- if itag in itags:
- continue
- if itag:
- f['format_id'] = itag
- itags.append(itag)
- f['quality'] = guess_quality(f)
- yield f
+ if process_manifest_format(f, 'hls', self._search_regex(
+ r'/itag/(\d+)', f['url'], 'itag', default=None)):
+ yield f
dash_manifest_url = get_dash and sd.get('dashManifestUrl')
if dash_manifest_url:
for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False):
- itag = f['format_id']
- if itag in itags:
- itag += '-dash'
- if itag in itags:
- continue
- if itag:
- f['format_id'] = itag
- itags.append(itag)
- f['quality'] = guess_quality(f)
- filesize = int_or_none(self._search_regex(
- r'/clen/(\d+)', f.get('fragment_base_url')
- or f['url'], 'file size', default=None))
- if filesize:
- f['filesize'] = filesize
- yield f
-
- def _real_extract(self, url):
- url, smuggled_data = unsmuggle_url(url, {})
- video_id = self._match_id(url)
+ if process_manifest_format(f, 'dash', f['format_id']):
+ f['filesize'] = int_or_none(self._search_regex(
+ r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None))
+ if live_from_start:
+ f['is_from_start'] = True
+
+ yield f
+
+ def _extract_storyboard(self, player_responses, duration):
+ spec = get_first(
+ player_responses, ('storyboards', 'playerStoryboardSpecRenderer', 'spec'), default='').split('|')[::-1]
+ base_url = url_or_none(urljoin('https://i.ytimg.com/', spec.pop() or None))
+ if not base_url:
+ return
+ L = len(spec) - 1
+ for i, args in enumerate(spec):
+ args = args.split('#')
+ counts = list(map(int_or_none, args[:5]))
+ if len(args) != 8 or not all(counts):
+ self.report_warning(f'Malformed storyboard {i}: {"#".join(args)}{bug_reports_message()}')
+ continue
+ width, height, frame_count, cols, rows = counts
+ N, sigh = args[6:]
+
+ url = base_url.replace('$L', str(L - i)).replace('$N', N) + f'&sigh={sigh}'
+ fragment_count = frame_count / (cols * rows)
+ fragment_duration = duration / fragment_count
+ yield {
+ 'format_id': f'sb{i}',
+ 'format_note': 'storyboard',
+ 'ext': 'mhtml',
+ 'protocol': 'mhtml',
+ 'acodec': 'none',
+ 'vcodec': 'none',
+ 'url': url,
+ 'width': width,
+ 'height': height,
+ 'fragments': [{
+ 'url': url.replace('$M', str(j)),
+ 'duration': min(fragment_duration, duration - (j * fragment_duration)),
+ } for j in range(math.ceil(fragment_count))],
+ }
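
# [Worked example, not part of the patch] With invented numbers: a storyboard
# level of 40 frames tiled 4x4 per mosaic image spans 2.5 images, so 3 images
# are fetched and the last one only covers the remaining duration.
import math
frame_count, cols, rows, duration = 40, 4, 4, 300
fragment_count = frame_count / (cols * rows)     # 2.5 mosaic images
fragment_duration = duration / fragment_count    # 120.0 s per full image
assert math.ceil(fragment_count) == 3            # 3 images are fetched
assert min(fragment_duration, duration - 2 * fragment_duration) == 60.0  # last image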
- base_url = self.http_scheme() + '//www.youtube.com/'
- webpage_url = base_url + 'watch?v=' + video_id
+ def _download_player_responses(self, url, smuggled_data, video_id, webpage_url):
webpage = None
if 'webpage' not in self._configuration_arg('player_skip'):
webpage = self._download_webpage(
@@ -2547,7 +3259,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self._get_requested_clients(url, smuggled_data),
video_id, webpage, master_ytcfg)
- get_first = lambda obj, keys, **kwargs: traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
+ return webpage, master_ytcfg, player_responses, player_url
+
+ def _list_formats(self, video_id, microformats, video_details, player_responses, player_url, duration=None):
+ live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails'))
+ is_live = get_first(video_details, 'isLive')
+ if is_live is None:
+ is_live = get_first(live_broadcast_details, 'isLiveNow')
+
+ streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
+ formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live, duration))
+
+ return live_broadcast_details, is_live, streaming_data, formats
+
+ def _real_extract(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
+ video_id = self._match_id(url)
+
+ base_url = self.http_scheme() + '//www.youtube.com/'
+ webpage_url = base_url + 'watch?v=' + video_id
+
+ webpage, master_ytcfg, player_responses, player_url = self._download_player_responses(url, smuggled_data, video_id, webpage_url)
playability_statuses = traverse_obj(
player_responses, (..., 'playabilityStatus'), expected_type=dict, default=[])
@@ -2574,57 +3306,56 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
or search_meta(['og:title', 'twitter:title', 'title']))
video_description = get_first(video_details, 'shortDescription')
- if not smuggled_data.get('force_singlefeed', False):
- if not self.get_param('noplaylist'):
- multifeed_metadata_list = get_first(
- player_responses,
- ('multicamera', 'playerLegacyMulticameraRenderer', 'metadataList'),
- expected_type=str)
- if multifeed_metadata_list:
- entries = []
- feed_ids = []
- for feed in multifeed_metadata_list.split(','):
- # Unquote should take place before split on comma (,) since textual
- # fields may contain comma as well (see
- # https://github.com/ytdl-org/youtube-dl/issues/8536)
- feed_data = compat_parse_qs(
- compat_urllib_parse_unquote_plus(feed))
-
- def feed_entry(name):
- return try_get(
- feed_data, lambda x: x[name][0], compat_str)
-
- feed_id = feed_entry('id')
- if not feed_id:
- continue
- feed_title = feed_entry('title')
- title = video_title
- if feed_title:
- title += ' (%s)' % feed_title
- entries.append({
- '_type': 'url_transparent',
- 'ie_key': 'Youtube',
- 'url': smuggle_url(
- '%swatch?v=%s' % (base_url, feed_data['id'][0]),
- {'force_singlefeed': True}),
- 'title': title,
- })
- feed_ids.append(feed_id)
- self.to_screen(
- 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
- % (', '.join(feed_ids), video_id))
- return self.playlist_result(
- entries, video_id, video_title, video_description)
- else:
+ multifeed_metadata_list = get_first(
+ player_responses,
+ ('multicamera', 'playerLegacyMulticameraRenderer', 'metadataList'),
+ expected_type=str)
+ if multifeed_metadata_list and not smuggled_data.get('force_singlefeed'):
+ if self.get_param('noplaylist'):
self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+ else:
+ entries = []
+ feed_ids = []
+ for feed in multifeed_metadata_list.split(','):
+ # Unquote should take place before split on comma (,) since textual
+ # fields may contain comma as well (see
+ # https://github.com/ytdl-org/youtube-dl/issues/8536)
+ feed_data = compat_parse_qs(
+ compat_urllib_parse_unquote_plus(feed))
+
+ def feed_entry(name):
+ return try_get(
+ feed_data, lambda x: x[name][0], compat_str)
+
+ feed_id = feed_entry('id')
+ if not feed_id:
+ continue
+ feed_title = feed_entry('title')
+ title = video_title
+ if feed_title:
+ title += ' (%s)' % feed_title
+ entries.append({
+ '_type': 'url_transparent',
+ 'ie_key': 'Youtube',
+ 'url': smuggle_url(
+ '%swatch?v=%s' % (base_url, feed_data['id'][0]),
+ {'force_singlefeed': True}),
+ 'title': title,
+ })
+ feed_ids.append(feed_id)
+ self.to_screen(
+ 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
+ % (', '.join(feed_ids), video_id))
+ return self.playlist_result(
+ entries, video_id, video_title, video_description)
- live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails'))
- is_live = get_first(video_details, 'isLive')
- if is_live is None:
- is_live = get_first(live_broadcast_details, 'isLiveNow')
+ duration = int_or_none(
+ get_first(video_details, 'lengthSeconds')
+ or get_first(microformats, 'lengthSeconds')
+ or parse_duration(search_meta('duration'))) or None
- streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
- formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live))
+ live_broadcast_details, is_live, streaming_data, formats = self._list_formats(
+ video_id, microformats, video_details, player_responses, player_url, duration)
if not formats:
if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
@@ -2645,16 +3376,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if reason:
self.raise_no_formats(reason, expected=True)
- for f in formats:
- if '&c=WEB&' in f['url'] and '&ratebypass=yes&' not in f['url']: # throttled
- f['source_preference'] = -10
- # TODO: this method is not reliable
- f['format_note'] = format_field(f, 'format_note', '%s ') + '(maybe throttled)'
-
- # Source is given priority since formats that throttle are given lower source_preference
- # When throttling issue is fully fixed, remove this
- self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'source', 'codec:vp9.2', 'lang'))
-
keywords = get_first(video_details, 'keywords', expected_type=list) or []
if not keywords and webpage:
keywords = [
@@ -2672,30 +3393,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if f.get('vcodec') != 'none':
f['stretched_ratio'] = ratio
break
-
- thumbnails = []
- thumbnail_dicts = traverse_obj(
- (video_details, microformats), (..., ..., 'thumbnail', 'thumbnails', ...),
- expected_type=dict, default=[])
- for thumbnail in thumbnail_dicts:
- thumbnail_url = thumbnail.get('url')
- if not thumbnail_url:
- continue
- # Sometimes youtube gives a wrong thumbnail URL. See:
- # https://github.com/hypervideo/hypervideo/issues/233
- # https://github.com/ytdl-org/youtube-dl/issues/28023
- if 'maxresdefault' in thumbnail_url:
- thumbnail_url = thumbnail_url.split('?')[0]
- thumbnails.append({
- 'url': thumbnail_url,
- 'height': int_or_none(thumbnail.get('height')),
- 'width': int_or_none(thumbnail.get('width')),
- })
+ thumbnails = self._extract_thumbnails((video_details, microformats), (..., ..., 'thumbnail'))
thumbnail_url = search_meta(['og:image', 'twitter:image'])
if thumbnail_url:
thumbnails.append({
'url': thumbnail_url,
})
+ original_thumbnails = thumbnails.copy()
+
# The best resolution thumbnails sometimes do not appear in the webpage
# See: https://github.com/ytdl-org/youtube-dl/issues/29049, https://github.com/hypervideo/hypervideo/issues/340
# List of possible thumbnails - Ref: <https://stackoverflow.com/a/20542029>
@@ -2706,7 +3411,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'default', '1', '2', '3'
]
n_thumbnail_names = len(thumbnail_names)
-
thumbnails.extend({
'url': 'https://i.ytimg.com/vi{webp}/{video_id}/{name}{live}.{ext}'.format(
video_id=video_id, name=name, ext=ext,
@@ -2716,16 +3420,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
i = next((i for i, t in enumerate(thumbnail_names) if f'/{video_id}/{t}' in thumb['url']), n_thumbnail_names)
thumb['preference'] = (0 if '.webp' in thumb['url'] else -1) - (2 * i)
self._remove_duplicate_formats(thumbnails)
+ self._downloader._sort_thumbnails(original_thumbnails)
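
# [Worked example, not part of the patch] The preference formula above ranks
# earlier names higher and webp above jpg for the same name:
def thumb_preference(url, i):
    return (0 if '.webp' in url else -1) - 2 * i

assert thumb_preference('/vi_webp/x/maxresdefault.webp', 0) > thumb_preference('/vi/x/maxresdefault.jpg', 0)
assert thumb_preference('/vi/x/maxresdefault.jpg', 0) > thumb_preference('/vi_webp/x/sddefault.webp', 1)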
category = get_first(microformats, 'category') or search_meta('genre')
channel_id = str_or_none(
get_first(video_details, 'channelId')
or get_first(microformats, 'externalChannelId')
or search_meta('channelId'))
- duration = int_or_none(
- get_first(video_details, 'lengthSeconds')
- or get_first(microformats, 'lengthSeconds')
- or parse_duration(search_meta('duration'))) or None
owner_profile_url = get_first(microformats, 'ownerProfileUrl')
live_content = get_first(video_details, 'isLiveContent')
@@ -2735,25 +3436,34 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
is_live = False
if is_upcoming is None and (live_content or is_live):
is_upcoming = False
- live_starttime = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp'))
- live_endtime = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp'))
- if not duration and live_endtime and live_starttime:
- duration = live_endtime - live_starttime
+ live_start_time = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp'))
+ live_end_time = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp'))
+ if not duration and live_end_time and live_start_time:
+ duration = live_end_time - live_start_time
+
+ if is_live and self.get_param('live_from_start'):
+ self._prepare_live_from_start_formats(formats, video_id, live_start_time, url, webpage_url, smuggled_data)
+
+ formats.extend(self._extract_storyboard(player_responses, duration))
+
+ # Source is given priority since formats that throttle are given lower source_preference
+ # When throttling issue is fully fixed, remove this
+ self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'source', 'codec:vp9.2', 'lang', 'proto'))
info = {
'id': video_id,
- 'title': self._live_title(video_title) if is_live else video_title,
+ 'title': video_title,
'formats': formats,
'thumbnails': thumbnails,
+ # The best thumbnail that we are sure exists. Prevents unnecessary
+ # URL checking if the user doesn't care about getting the best possible thumbnail
+ 'thumbnail': traverse_obj(original_thumbnails, (-1, 'url')),
'description': video_description,
- 'upload_date': unified_strdate(
- get_first(microformats, 'uploadDate')
- or search_meta('uploadDate')),
'uploader': get_first(video_details, 'author'),
'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
'uploader_url': owner_profile_url,
'channel_id': channel_id,
- 'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None,
+ 'channel_url': format_field(channel_id, template='https://www.youtube.com/channel/%s'),
'duration': duration,
'view_count': int_or_none(
get_first((video_details, microformats), (..., 'viewCount'))
@@ -2772,7 +3482,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else None if is_live is None or is_upcoming is None
else live_content),
'live_status': 'is_upcoming' if is_upcoming else None, # rest will be set by YoutubeDL
- 'release_timestamp': live_starttime,
+ 'release_timestamp': live_start_time,
}
pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict)
@@ -2797,13 +3507,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
})
lang_subs.append({
'ext': fmt,
- 'url': update_url_query(base_url, query),
+ 'url': urljoin('https://www.youtube.com', update_url_query(base_url, query)),
'name': sub_name,
})
subtitles, automatic_captions = {}, {}
for lang_code, caption_track in captions.items():
base_url = caption_track.get('baseUrl')
+ orig_lang = parse_qs(base_url).get('lang', [None])[-1]
if not base_url:
continue
lang_name = self._get_text(caption_track, 'name', max_runs=1)
@@ -2817,11 +3528,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
for trans_code, trans_name in translation_languages.items():
if not trans_code:
continue
+ orig_trans_code = trans_code
if caption_track.get('kind') != 'asr':
+ if 'translated_subs' in self._configuration_arg('skip'):
+ continue
trans_code += f'-{lang_code}'
trans_name += format_field(lang_name, template=' from %s')
- process_language(
- automatic_captions, base_url, trans_code, trans_name, {'tlang': trans_code})
+ # Add an "-orig" label to the original language so that it can be distinguished.
+ # The subs are returned without "-orig" as well for compatibility
+ if lang_code == f'a-{orig_trans_code}':
+ process_language(
+ automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {})
+ # Setting tlang=lang returns damaged subtitles.
+ process_language(automatic_captions, base_url, trans_code, trans_name,
+ {} if orig_lang == orig_trans_code else {'tlang': trans_code})
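
# [Sketch, not part of the patch] The tlang rule above in isolation: asking
# for a translation into the track's own language returns damaged subtitles,
# so the parameter is omitted in that case.
def tlang_query(orig_lang, trans_code):
    return {} if orig_lang == trans_code else {'tlang': trans_code}

assert tlang_query('en', 'en') == {}
assert tlang_query('en', 'de') == {'tlang': 'de'}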
info['automatic_captions'] = automatic_captions
info['subtitles'] = subtitles
@@ -2884,87 +3604,101 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
or self._extract_chapters_from_engagement_panel(initial_data, duration)
or None)
- contents = try_get(
- initial_data,
- lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
- list) or []
- for content in contents:
- vpir = content.get('videoPrimaryInfoRenderer')
- if vpir:
- stl = vpir.get('superTitleLink')
- if stl:
- stl = self._get_text(stl)
- if try_get(
- vpir,
- lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
- info['location'] = stl
- else:
- mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
- if mobj:
- info.update({
- 'series': mobj.group(1),
- 'season_number': int(mobj.group(2)),
- 'episode_number': int(mobj.group(3)),
- })
- for tlb in (try_get(
- vpir,
- lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
- list) or []):
- tbr = tlb.get('toggleButtonRenderer') or {}
- for getter, regex in [(
- lambda x: x['defaultText']['accessibility']['accessibilityData'],
- r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
- lambda x: x['accessibility'],
- lambda x: x['accessibilityData']['accessibilityData'],
- ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]:
- label = (try_get(tbr, getter, dict) or {}).get('label')
- if label:
- mobj = re.match(regex, label)
- if mobj:
- info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
- break
- sbr_tooltip = try_get(
- vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
- if sbr_tooltip:
- like_count, dislike_count = sbr_tooltip.split(' / ')
+ contents = traverse_obj(
+ initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents'),
+ expected_type=list, default=[])
+
+ vpir = get_first(contents, 'videoPrimaryInfoRenderer')
+ if vpir:
+ stl = vpir.get('superTitleLink')
+ if stl:
+ stl = self._get_text(stl)
+ if try_get(
+ vpir,
+ lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
+ info['location'] = stl
+ else:
+ mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
+ if mobj:
info.update({
- 'like_count': str_to_int(like_count),
- 'dislike_count': str_to_int(dislike_count),
+ 'series': mobj.group(1),
+ 'season_number': int(mobj.group(2)),
+ 'episode_number': int(mobj.group(3)),
})
- vsir = content.get('videoSecondaryInfoRenderer')
- if vsir:
- info['channel'] = self._get_text(vsir, ('owner', 'videoOwnerRenderer', 'title'))
- rows = try_get(
- vsir,
- lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
- list) or []
- multiple_songs = False
- for row in rows:
- if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
- multiple_songs = True
+ for tlb in (try_get(
+ vpir,
+ lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
+ list) or []):
+ tbr = tlb.get('toggleButtonRenderer') or {}
+ for getter, regex in [(
+ lambda x: x['defaultText']['accessibility']['accessibilityData'],
+ r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
+ lambda x: x['accessibility'],
+ lambda x: x['accessibilityData']['accessibilityData'],
+ ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]:
+ label = (try_get(tbr, getter, dict) or {}).get('label')
+ if label:
+ mobj = re.match(regex, label)
+ if mobj:
+ info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
break
- for row in rows:
- mrr = row.get('metadataRowRenderer') or {}
- mrr_title = mrr.get('title')
- if not mrr_title:
- continue
- mrr_title = self._get_text(mrr, 'title')
- mrr_contents_text = self._get_text(mrr, ('contents', 0))
- if mrr_title == 'License':
- info['license'] = mrr_contents_text
- elif not multiple_songs:
- if mrr_title == 'Album':
- info['album'] = mrr_contents_text
- elif mrr_title == 'Artist':
- info['artist'] = mrr_contents_text
- elif mrr_title == 'Song':
- info['track'] = mrr_contents_text
+ sbr_tooltip = try_get(
+ vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
+ if sbr_tooltip:
+ like_count, dislike_count = sbr_tooltip.split(' / ')
+ info.update({
+ 'like_count': str_to_int(like_count),
+ 'dislike_count': str_to_int(dislike_count),
+ })
+ vsir = get_first(contents, 'videoSecondaryInfoRenderer')
+ if vsir:
+ vor = traverse_obj(vsir, ('owner', 'videoOwnerRenderer'))
+ info.update({
+ 'channel': self._get_text(vor, 'title'),
+ 'channel_follower_count': self._get_count(vor, 'subscriberCountText')})
+
+ rows = try_get(
+ vsir,
+ lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
+ list) or []
+ multiple_songs = False
+ for row in rows:
+ if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
+ multiple_songs = True
+ break
+ for row in rows:
+ mrr = row.get('metadataRowRenderer') or {}
+ mrr_title = mrr.get('title')
+ if not mrr_title:
+ continue
+ mrr_title = self._get_text(mrr, 'title')
+ mrr_contents_text = self._get_text(mrr, ('contents', 0))
+ if mrr_title == 'License':
+ info['license'] = mrr_contents_text
+ elif not multiple_songs:
+ if mrr_title == 'Album':
+ info['album'] = mrr_contents_text
+ elif mrr_title == 'Artist':
+ info['artist'] = mrr_contents_text
+ elif mrr_title == 'Song':
+ info['track'] = mrr_contents_text
fallbacks = {
'channel': 'uploader',
'channel_id': 'uploader_id',
'channel_url': 'uploader_url',
}
+
+ # The upload date for scheduled, live and past live streams / premieres in microformats
+ # may be different from the stream date. Although not in UTC, we will prefer it in this case.
+ # See: https://github.com/hypervideo/hypervideo/pull/2223#issuecomment-1008485139
+ upload_date = (
+ unified_strdate(get_first(microformats, 'uploadDate'))
+ or unified_strdate(search_meta('uploadDate')))
+ if not upload_date or (not info.get('is_live') and not info.get('was_live') and info.get('live_status') != 'is_upcoming'):
+ upload_date = strftime_or_none(self._extract_time_text(vpir, 'dateText')[0], '%Y%m%d')
+ info['upload_date'] = upload_date
+
for to, frm in fallbacks.items():
if not info.get(to):
info[to] = info.get(frm)
@@ -3009,494 +3743,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return info
-class YoutubeTabIE(YoutubeBaseInfoExtractor):
- IE_DESC = 'YouTube.com tab'
- _VALID_URL = r'''(?x)
- https?://
- (?:\w+\.)?
- (?:
- youtube(?:kids)?\.com|
- invidio\.us
- )/
- (?:
- (?P<channel_type>channel|c|user|browse)/|
- (?P<not_channel>
- feed/|hashtag/|
- (?:playlist|watch)\?.*?\blist=
- )|
- (?!(?:%s)\b) # Direct URLs
- )
- (?P<id>[^/?\#&]+)
- ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
- IE_NAME = 'youtube:tab'
+class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
- _TESTS = [{
- 'note': 'playlists, multipage',
- 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
- 'playlist_mincount': 94,
- 'info_dict': {
- 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
- 'title': 'Игорь Клейнер - Playlists',
- 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
- 'uploader': 'Игорь Клейнер',
- 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
- },
- }, {
- 'note': 'playlists, multipage, different order',
- 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
- 'playlist_mincount': 94,
- 'info_dict': {
- 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
- 'title': 'Игорь Клейнер - Playlists',
- 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
- 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
- 'uploader': 'Игорь Клейнер',
- },
- }, {
- 'note': 'playlists, series',
- 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3',
- 'playlist_mincount': 5,
- 'info_dict': {
- 'id': 'UCYO_jab_esuFRV4b17AJtAw',
- 'title': '3Blue1Brown - Playlists',
- 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
- 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
- 'uploader': '3Blue1Brown',
- },
- }, {
- 'note': 'playlists, singlepage',
- 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
- 'playlist_mincount': 4,
- 'info_dict': {
- 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
- 'title': 'ThirstForScience - Playlists',
- 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
- 'uploader': 'ThirstForScience',
- 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
- }
- }, {
- 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
- 'only_matching': True,
- }, {
- 'note': 'basic, single video playlist',
- 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
- 'info_dict': {
- 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
- 'uploader': 'Sergey M.',
- 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
- 'title': 'youtube-dl public playlist',
- },
- 'playlist_count': 1,
- }, {
- 'note': 'empty playlist',
- 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
- 'info_dict': {
- 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
- 'uploader': 'Sergey M.',
- 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
- 'title': 'youtube-dl empty playlist',
- },
- 'playlist_count': 0,
- }, {
- 'note': 'Home tab',
- 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
- 'info_dict': {
- 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- 'title': 'lex will - Home',
- 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
- 'uploader': 'lex will',
- 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- },
- 'playlist_mincount': 2,
- }, {
- 'note': 'Videos tab',
- 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
- 'info_dict': {
- 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- 'title': 'lex will - Videos',
- 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
- 'uploader': 'lex will',
- 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- },
- 'playlist_mincount': 975,
- }, {
- 'note': 'Videos tab, sorted by popular',
- 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
- 'info_dict': {
- 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- 'title': 'lex will - Videos',
- 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
- 'uploader': 'lex will',
- 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- },
- 'playlist_mincount': 199,
- }, {
- 'note': 'Playlists tab',
- 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
- 'info_dict': {
- 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- 'title': 'lex will - Playlists',
- 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
- 'uploader': 'lex will',
- 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- },
- 'playlist_mincount': 17,
- }, {
- 'note': 'Community tab',
- 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
- 'info_dict': {
- 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- 'title': 'lex will - Community',
- 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
- 'uploader': 'lex will',
- 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- },
- 'playlist_mincount': 18,
- }, {
- 'note': 'Channels tab',
- 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
- 'info_dict': {
- 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- 'title': 'lex will - Channels',
- 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
- 'uploader': 'lex will',
- 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- },
- 'playlist_mincount': 12,
- }, {
- 'note': 'Search tab',
- 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra',
- 'playlist_mincount': 40,
- 'info_dict': {
- 'id': 'UCYO_jab_esuFRV4b17AJtAw',
- 'title': '3Blue1Brown - Search - linear algebra',
- 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
- 'uploader': '3Blue1Brown',
- 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
- },
- }, {
- 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
- 'only_matching': True,
- }, {
- 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
- 'only_matching': True,
- }, {
- 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
- 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
- 'info_dict': {
- 'title': '29C3: Not my department',
- 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
- 'uploader': 'Christiaan008',
- 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
- 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268',
- },
- 'playlist_count': 96,
- }, {
- 'note': 'Large playlist',
- 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
- 'info_dict': {
- 'title': 'Uploads from Cauchemar',
- 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
- 'uploader': 'Cauchemar',
- 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
- },
- 'playlist_mincount': 1123,
- }, {
- 'note': 'even larger playlist, 8832 videos',
- 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
- 'only_matching': True,
- }, {
- 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
- 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
- 'info_dict': {
- 'title': 'Uploads from Interstellar Movie',
- 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
- 'uploader': 'Interstellar Movie',
- 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
- },
- 'playlist_mincount': 21,
- }, {
- 'note': 'Playlist with "show unavailable videos" button',
- 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q',
- 'info_dict': {
- 'title': 'Uploads from Phim Siêu Nhân Nhật Bản',
- 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q',
- 'uploader': 'Phim Siêu Nhân Nhật Bản',
- 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q',
- },
- 'playlist_mincount': 200,
- }, {
- 'note': 'Playlist with unavailable videos in page 7',
- 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w',
- 'info_dict': {
- 'title': 'Uploads from BlankTV',
- 'id': 'UU8l9frL61Yl5KFOl87nIm2w',
- 'uploader': 'BlankTV',
- 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w',
- },
- 'playlist_mincount': 1000,
- }, {
- 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844',
- 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
- 'info_dict': {
- 'title': 'Data Analysis with Dr Mike Pound',
- 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
- 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
- 'uploader': 'Computerphile',
- 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487',
- },
- 'playlist_mincount': 11,
- }, {
- 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
- 'only_matching': True,
- }, {
- 'note': 'Playlist URL that does not actually serve a playlist',
- 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
- 'info_dict': {
- 'id': 'FqZTN594JQw',
- 'ext': 'webm',
- 'title': "Smiley's People 01 detective, Adventure Series, Action",
- 'uploader': 'STREEM',
- 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
- 'upload_date': '20150526',
- 'license': 'Standard YouTube License',
- 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
- 'categories': ['People & Blogs'],
- 'tags': list,
- 'view_count': int,
- 'like_count': int,
- 'dislike_count': int,
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'This video is not available.',
- 'add_ie': [YoutubeIE.ie_key()],
- }, {
- 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
- 'info_dict': {
- 'id': '3yImotZU3tw', # This will keep changing
- 'ext': 'mp4',
- 'title': compat_str,
- 'uploader': 'Sky News',
- 'uploader_id': 'skynews',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
- 'upload_date': r're:\d{8}',
- 'description': compat_str,
- 'categories': ['News & Politics'],
- 'tags': list,
- 'like_count': int,
- 'dislike_count': int,
- },
- 'params': {
- 'skip_download': True,
- },
- 'expected_warnings': ['Downloading just video ', 'Ignoring subtitle tracks found in '],
- }, {
- 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
- 'info_dict': {
- 'id': 'a48o2S1cPoo',
- 'ext': 'mp4',
- 'title': 'The Young Turks - Live Main Show',
- 'uploader': 'The Young Turks',
- 'uploader_id': 'TheYoungTurks',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
- 'upload_date': '20150715',
- 'license': 'Standard YouTube License',
- 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
- 'categories': ['News & Politics'],
- 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
- 'like_count': int,
- 'dislike_count': int,
- },
- 'params': {
- 'skip_download': True,
- },
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
- 'only_matching': True,
- }, {
- 'note': 'A channel that is not live. Should raise error',
- 'url': 'https://www.youtube.com/user/numberphile/live',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtube.com/feed/trending',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtube.com/feed/library',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtube.com/feed/history',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtube.com/feed/subscriptions',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtube.com/feed/watch_later',
- 'only_matching': True,
- }, {
- 'note': 'Recommended - redirects to home page.',
- 'url': 'https://www.youtube.com/feed/recommended',
- 'only_matching': True,
- }, {
- 'note': 'inline playlist with not always working continuations',
- 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtube.com/course',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtube.com/zsecurity',
- 'only_matching': True,
- }, {
- 'url': 'http://www.youtube.com/NASAgovVideo/videos',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtube.com/TheYoungTurks/live',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtube.com/hashtag/cctv9',
- 'info_dict': {
- 'id': 'cctv9',
- 'title': '#cctv9',
- },
- 'playlist_mincount': 350,
- }, {
- 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU',
- 'only_matching': True,
- }, {
- 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist',
- 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
- 'only_matching': True
- }, {
- 'note': '/browse/ should redirect to /channel/',
- 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng',
- 'only_matching': True
- }, {
- 'note': 'VLPL, should redirect to playlist?list=PL...',
- 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
- 'info_dict': {
- 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
- 'uploader': 'NoCopyrightSounds',
- 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!',
- 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
- 'title': 'NCS Releases',
- },
- 'playlist_mincount': 166,
- }, {
- 'note': 'Topic, should redirect to playlist?list=UU...',
- 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
- 'info_dict': {
- 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
- 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
- 'title': 'Uploads from Royalty Free Music - Topic',
- 'uploader': 'Royalty Free Music - Topic',
- },
- 'expected_warnings': [
- 'A channel/user page was given',
- 'The URL does not have a videos tab',
- ],
- 'playlist_mincount': 101,
- }, {
- 'note': 'Topic without a UU playlist',
- 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg',
- 'info_dict': {
- 'id': 'UCtFRv9O2AHqOZjjynzrv-xg',
- 'title': 'UCtFRv9O2AHqOZjjynzrv-xg',
- },
- 'expected_warnings': [
- 'A channel/user page was given',
- 'The URL does not have a videos tab',
- 'Falling back to channel URL',
- ],
- 'playlist_mincount': 9,
- }, {
- 'note': 'Youtube music Album',
- 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE',
- 'info_dict': {
- 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0',
- 'title': 'Album - Royalty Free Music Library V2 (50 Songs)',
- },
- 'playlist_count': 50,
- }, {
- 'note': 'unlisted single video playlist',
- 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
- 'info_dict': {
- 'uploader_id': 'UC9zHu_mHU96r19o-wV5Qs1Q',
- 'uploader': 'colethedj',
- 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
- 'title': 'hypervideo unlisted playlist test',
- 'availability': 'unlisted'
- },
- 'playlist_count': 1,
- }, {
- 'note': 'API Fallback: Recommended - redirects to home page. Requires visitorData',
- 'url': 'https://www.youtube.com/feed/recommended',
- 'info_dict': {
- 'id': 'recommended',
- 'title': 'recommended',
- },
- 'playlist_mincount': 50,
- 'params': {
- 'skip_download': True,
- 'extractor_args': {'youtubetab': {'skip': ['webpage']}}
- },
- }, {
- 'note': 'API Fallback: /videos tab, sorted by oldest first',
- 'url': 'https://www.youtube.com/user/theCodyReeder/videos?view=0&sort=da&flow=grid',
- 'info_dict': {
- 'id': 'UCu6mSoMNzHQiBIOCkHUa2Aw',
- 'title': 'Cody\'sLab - Videos',
- 'description': 'md5:d083b7c2f0c67ee7a6c74c3e9b4243fa',
- 'uploader': 'Cody\'sLab',
- 'uploader_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw',
- },
- 'playlist_mincount': 650,
- 'params': {
- 'skip_download': True,
- 'extractor_args': {'youtubetab': {'skip': ['webpage']}}
- },
- }, {
- 'note': 'API Fallback: Topic, should redirect to playlist?list=UU...',
- 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
- 'info_dict': {
- 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
- 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
- 'title': 'Uploads from Royalty Free Music - Topic',
- 'uploader': 'Royalty Free Music - Topic',
- },
- 'expected_warnings': [
- 'A channel/user page was given',
- 'The URL does not have a videos tab',
- ],
- 'playlist_mincount': 101,
- 'params': {
- 'skip_download': True,
- 'extractor_args': {'youtubetab': {'skip': ['webpage']}}
- },
- }]
+ @staticmethod
+ def passthrough_smuggled_data(func):
+ def _smuggle(entries, smuggled_data):
+ for entry in entries:
+ # TODO: Convert URL to music.youtube instead.
+ # Do we need to passthrough any other smuggled_data?
+ entry['url'] = smuggle_url(entry['url'], smuggled_data)
+ yield entry
- @classmethod
- def suitable(cls, url):
- return False if YoutubeIE.suitable(url) else super(
- YoutubeTabIE, cls).suitable(url)
+ @functools.wraps(func)
+ def wrapper(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
+ if self.is_music_url(url):
+ smuggled_data['is_music_url'] = True
+ info_dict = func(self, url, smuggled_data)
+ if smuggled_data and info_dict.get('entries'):
+ info_dict['entries'] = _smuggle(info_dict['entries'], smuggled_data)
+ return info_dict
+ return wrapper
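
# [Sketch, not part of the patch] smuggle_url/unsmuggle_url are the project's
# helpers for carrying extra data through a URL; conceptually they work
# roughly like this stand-in, which stores a JSON payload in a URL fragment
# (the '#__smuggle=' marker here is invented for illustration):
import json
from urllib.parse import quote, unquote

def smuggle(url, data):
    return url + '#__smuggle=' + quote(json.dumps(data))

def unsmuggle(url, default=None):
    if '#__smuggle=' not in url:
        return url, default
    url, _, payload = url.partition('#__smuggle=')
    return url, json.loads(unquote(payload))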
def _extract_channel_id(self, webpage):
channel_id = self._html_search_meta(
@@ -3515,7 +3782,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
def _extract_basic_item_renderer(item):
# Modified from _extract_grid_item_renderer
known_basic_renderers = (
- 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer'
+ 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer', 'reelItemRenderer'
)
for key, renderer in item.items():
if not isinstance(renderer, dict):
@@ -3565,6 +3832,24 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title)
break
+ def _music_responsive_list_entry(self, renderer):
+ video_id = traverse_obj(renderer, ('playlistItemData', 'videoId'))
+ if video_id:
+ return self.url_result(f'https://music.youtube.com/watch?v={video_id}',
+ ie=YoutubeIE.ie_key(), video_id=video_id)
+ playlist_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'playlistId'))
+ if playlist_id:
+ video_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'videoId'))
+ if video_id:
+ return self.url_result(f'https://music.youtube.com/watch?v={video_id}&list={playlist_id}',
+ ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
+ return self.url_result(f'https://music.youtube.com/playlist?list={playlist_id}',
+ ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
+ browse_id = traverse_obj(renderer, ('navigationEndpoint', 'browseEndpoint', 'browseId'))
+ if browse_id:
+ return self.url_result(f'https://music.youtube.com/browse/{browse_id}',
+ ie=YoutubeTabIE.ie_key(), video_id=browse_id)
+
def _shelf_entries_from_content(self, shelf_renderer):
content = shelf_renderer.get('content')
if not isinstance(content, dict):
@@ -3623,6 +3908,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
if video_id:
return self._extract_video(video_renderer)
+ def _hashtag_tile_entry(self, hashtag_tile_renderer):
+ url = urljoin('https://youtube.com', traverse_obj(
+ hashtag_tile_renderer, ('onTapCommand', 'commandMetadata', 'webCommandMetadata', 'url')))
+ if url:
+ return self.url_result(
+ url, ie=YoutubeTabIE.ie_key(), title=self._get_text(hashtag_tile_renderer, 'hashtag'))
+
def _post_thread_entries(self, post_thread_renderer):
post_renderer = try_get(
post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
@@ -3679,49 +3971,59 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
if entry:
yield entry
'''
- def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data):
- def extract_entries(parent_renderer): # this needs to called again for continuation to work with feeds
- contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
- for content in contents:
- if not isinstance(content, dict):
- continue
- is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
- if not is_renderer:
- renderer = content.get('richItemRenderer')
- if renderer:
- for entry in self._rich_entries(renderer):
- yield entry
- continuation_list[0] = self._extract_continuation(parent_renderer)
+ def _extract_entries(self, parent_renderer, continuation_list):
+ # continuation_list is modified in-place; it always holds [continuation_token]
+ continuation_list[:] = [None]
+ contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
+ for content in contents:
+ if not isinstance(content, dict):
+ continue
+ is_renderer = traverse_obj(
+ content, 'itemSectionRenderer', 'musicShelfRenderer', 'musicShelfContinuation',
+ expected_type=dict)
+ if not is_renderer:
+ renderer = content.get('richItemRenderer')
+ if renderer:
+ for entry in self._rich_entries(renderer):
+ yield entry
+ continuation_list[0] = self._extract_continuation(parent_renderer)
+ continue
+ isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
+ for isr_content in isr_contents:
+ if not isinstance(isr_content, dict):
continue
- isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
- for isr_content in isr_contents:
- if not isinstance(isr_content, dict):
- continue
-
- known_renderers = {
- 'playlistVideoListRenderer': self._playlist_entries,
- 'gridRenderer': self._grid_entries,
- 'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'),
- 'backstagePostThreadRenderer': self._post_thread_entries,
- 'videoRenderer': lambda x: [self._video_entry(x)],
- }
- for key, renderer in isr_content.items():
- if key not in known_renderers:
- continue
- for entry in known_renderers[key](renderer):
- if entry:
- yield entry
- continuation_list[0] = self._extract_continuation(renderer)
- break
- if not continuation_list[0]:
- continuation_list[0] = self._extract_continuation(is_renderer)
+ known_renderers = {
+ 'playlistVideoListRenderer': self._playlist_entries,
+ 'gridRenderer': self._grid_entries,
+ 'reelShelfRenderer': self._grid_entries,
+ 'shelfRenderer': self._shelf_entries,
+ 'musicResponsiveListItemRenderer': lambda x: [self._music_responsive_list_entry(x)],
+ 'backstagePostThreadRenderer': self._post_thread_entries,
+ 'videoRenderer': lambda x: [self._video_entry(x)],
+ 'playlistRenderer': lambda x: self._grid_entries({'items': [{'playlistRenderer': x}]}),
+ 'channelRenderer': lambda x: self._grid_entries({'items': [{'channelRenderer': x}]}),
+ 'hashtagTileRenderer': lambda x: [self._hashtag_tile_entry(x)]
+ }
+ for key, renderer in isr_content.items():
+ if key not in known_renderers:
+ continue
+ for entry in known_renderers[key](renderer):
+ if entry:
+ yield entry
+ continuation_list[0] = self._extract_continuation(renderer)
+ break
if not continuation_list[0]:
- continuation_list[0] = self._extract_continuation(parent_renderer)
+ continuation_list[0] = self._extract_continuation(is_renderer)
+
+ if not continuation_list[0]:
+ continuation_list[0] = self._extract_continuation(parent_renderer)
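
# [Sketch, not part of the patch] The dispatch pattern above in miniature:
# renderer keys map to generator functions and the first recognised key wins.
def dispatch(item, known_renderers):
    for key, renderer in item.items():
        handler = known_renderers.get(key)
        if handler:
            return list(handler(renderer))
    return []

assert dispatch({'videoRenderer': {'id': 'abc'}},
                {'videoRenderer': lambda x: [x['id']]}) == ['abc']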
- continuation_list = [None] # Python 2 does not support nonlocal
+ def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data):
+ continuation_list = [None]
+ extract_entries = lambda x: self._extract_entries(x, continuation_list)
tab_content = try_get(tab, lambda x: x['content'], dict)
if not tab_content:
return
@@ -3770,6 +4072,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
continue
known_renderers = {
+ 'videoRenderer': (self._grid_entries, 'items'), # for membership tab
'gridPlaylistRenderer': (self._grid_entries, 'items'),
'gridVideoRenderer': (self._grid_entries, 'items'),
'gridChannelRenderer': (self._grid_entries, 'items'),
@@ -3797,13 +4100,14 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
break
@staticmethod
- def _extract_selected_tab(tabs):
+ def _extract_selected_tab(tabs, fatal=True):
for tab in tabs:
renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {}
if renderer.get('selected') is True:
return renderer
else:
- raise ExtractorError('Unable to find selected tab')
+ if fatal:
+ raise ExtractorError('Unable to find selected tab')
@classmethod
def _extract_uploader(cls, data):
@@ -3822,10 +4126,10 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
def _extract_from_tabs(self, item_id, ytcfg, data, tabs):
playlist_id = title = description = channel_url = channel_name = channel_id = None
- thumbnails_list = []
tags = []
selected_tab = self._extract_selected_tab(tabs)
+ primary_sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer')
renderer = try_get(
data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
if renderer:
@@ -3841,34 +4145,49 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
description = renderer.get('description', '')
playlist_id = channel_id
tags = renderer.get('keywords', '').split()
- thumbnails_list = (
- try_get(renderer, lambda x: x['avatar']['thumbnails'], list)
- or try_get(
- self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer'),
- lambda x: x['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'],
- list)
- or [])
- thumbnails = []
- for t in thumbnails_list:
- if not isinstance(t, dict):
- continue
- thumbnail_url = url_or_none(t.get('url'))
- if not thumbnail_url:
- continue
- thumbnails.append({
- 'url': thumbnail_url,
- 'width': int_or_none(t.get('width')),
- 'height': int_or_none(t.get('height')),
- })
+ # We can get the uncropped banner/avatar by replacing the crop params with '=s0'
+ # See: https://github.com/hypervideo/hypervideo/issues/2237#issuecomment-1013694714
+ def _get_uncropped(url):
+ return url_or_none((url or '').split('=')[0] + '=s0')
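
# [Worked example, not part of the patch] The '=s0' rewrite strips the crop
# parameters from a thumbnail URL (the URL shape here is illustrative only):
url = 'https://yt3.ggpht.com/ytc/AKedOLQ=s176-c-k-c0x00ffffff-no-rj'
assert url.split('=')[0] + '=s0' == 'https://yt3.ggpht.com/ytc/AKedOLQ=s0'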
+
+ avatar_thumbnails = self._extract_thumbnails(renderer, 'avatar')
+ if avatar_thumbnails:
+ uncropped_avatar = _get_uncropped(avatar_thumbnails[0]['url'])
+ if uncropped_avatar:
+ avatar_thumbnails.append({
+ 'url': uncropped_avatar,
+ 'id': 'avatar_uncropped',
+ 'preference': 1
+ })
+
+ channel_banners = self._extract_thumbnails(
+ data, ('header', ..., ['banner', 'mobileBanner', 'tvBanner']))
+ for banner in channel_banners:
+ banner['preference'] = -10
+
+ if channel_banners:
+ uncropped_banner = _get_uncropped(channel_banners[0]['url'])
+ if uncropped_banner:
+ channel_banners.append({
+ 'url': uncropped_banner,
+ 'id': 'banner_uncropped',
+ 'preference': -5
+ })
+
+ primary_thumbnails = self._extract_thumbnails(
+ primary_sidebar_renderer, ('thumbnailRenderer', ('playlistVideoThumbnailRenderer', 'playlistCustomThumbnailRenderer'), 'thumbnail'))
+
if playlist_id is None:
playlist_id = item_id
+
+ playlist_stats = traverse_obj(primary_sidebar_renderer, 'stats')
+ last_updated_unix, _ = self._extract_time_text(playlist_stats, 2)
if title is None:
- title = (
- try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText'])
- or playlist_id)
+ title = self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) or playlist_id
title += format_field(selected_tab, 'title', ' - %s')
title += format_field(selected_tab, 'expandedText', ' - %s')
+
metadata = {
'playlist_id': playlist_id,
'playlist_title': title,
@@ -3876,12 +4195,14 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'uploader': channel_name,
'uploader_id': channel_id,
'uploader_url': channel_url,
- 'thumbnails': thumbnails,
+ 'thumbnails': primary_thumbnails + avatar_thumbnails + channel_banners,
'tags': tags,
+ 'view_count': self._get_count(playlist_stats, 1),
+ 'availability': self._extract_availability(data),
+ 'modified_date': strftime_or_none(last_updated_unix, '%Y%m%d'),
+ 'playlist_count': self._get_count(playlist_stats, 0),
+ 'channel_follower_count': self._get_count(data, ('header', ..., 'subscriberCountText')),
}
- availability = self._extract_availability(data)
- if availability:
- metadata['availability'] = availability
if not channel_id:
metadata.update(self._extract_uploader(data))
metadata.update({
@@ -4059,7 +4380,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
self.report_warning(error_to_compat_str(e))
break
- if dict_get(data, ('contents', 'currentVideoEndpoint')):
+ if dict_get(data, ('contents', 'currentVideoEndpoint', 'onResponseReceivedActions')):
break
last_error = 'Incomplete yt initial data received'
@@ -4076,6 +4397,16 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
if 'webpage' not in self._configuration_arg('skip'):
webpage, data = self._extract_webpage(url, item_id, fatal=webpage_fatal)
ytcfg = ytcfg or self.extract_ytcfg(item_id, webpage)
+ # Reject webpage data if we were redirected to the home page without explicitly requesting it
+ selected_tab = self._extract_selected_tab(traverse_obj(
+ data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list, default=[]), fatal=False) or {}
+ if (url != 'https://www.youtube.com/feed/recommended'
+ and selected_tab.get('tabIdentifier') == 'FEwhat_to_watch' # Home page
+ and 'no-youtube-channel-redirect' not in self.get_param('compat_opts', [])):
+ msg = 'The channel/playlist does not exist and the URL redirected to youtube.com home page'
+ if fatal:
+ raise ExtractorError(msg, expected=True)
+ self.report_warning(msg, only_once=True)
if not data:
if not ytcfg and self.is_authenticated:
msg = 'Playlists that require authentication may not extract correctly without a successful webpage download.'
@@ -4100,67 +4431,756 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
return self._extract_response(
item_id=item_id, query=params, ep=ep, headers=headers,
ytcfg=ytcfg, fatal=fatal, default_client=default_client,
- check_get_keys=('contents', 'currentVideoEndpoint'))
+ check_get_keys=('contents', 'currentVideoEndpoint', 'onResponseReceivedActions'))
err_note = 'Failed to resolve url (does the playlist exist?)'
if fatal:
raise ExtractorError(err_note, expected=True)
self.report_warning(err_note, item_id)
- @staticmethod
- def _smuggle_data(entries, data):
- for entry in entries:
- if data:
- entry['url'] = smuggle_url(entry['url'], data)
- yield entry
+ _SEARCH_PARAMS = None
+
+ def _search_results(self, query, params=NO_DEFAULT, default_client='web'):
+ data = {'query': query}
+ if params is NO_DEFAULT:
+ params = self._SEARCH_PARAMS
+ if params:
+ data['params'] = params
+
+ content_keys = (
+ ('contents', 'twoColumnSearchResultsRenderer', 'primaryContents', 'sectionListRenderer', 'contents'),
+ ('onResponseReceivedCommands', 0, 'appendContinuationItemsAction', 'continuationItems'),
+ # ytmusic search
+ ('contents', 'tabbedSearchResultsRenderer', 'tabs', 0, 'tabRenderer', 'content', 'sectionListRenderer', 'contents'),
+ ('continuationContents', ),
+ )
+ check_get_keys = tuple(set(keys[0] for keys in content_keys))
+
+ continuation_list = [None]
+ for page_num in itertools.count(1):
+ data.update(continuation_list[0] or {})
+ search = self._extract_response(
+ item_id='query "%s" page %s' % (query, page_num), ep='search', query=data,
+ default_client=default_client, check_get_keys=check_get_keys)
+ slr_contents = traverse_obj(search, *content_keys)
+ yield from self._extract_entries({'contents': list(variadic(slr_contents))}, continuation_list)
+ if not continuation_list[0]:
+ break
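
# [Sketch, not part of the patch] The pagination loop above in generic form:
# each page yields entries and produces the continuation for the next page,
# and iteration stops once no continuation is returned.
import itertools

def paginate(fetch_page):
    token = None
    for page_num in itertools.count(1):
        entries, token = fetch_page(page_num, token)
        yield from entries
        if not token:
            break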
- def _real_extract(self, url):
- url, smuggled_data = unsmuggle_url(url, {})
- if self.is_music_url(url):
- smuggled_data['is_music_url'] = True
- info_dict = self.__real_extract(url, smuggled_data)
- if info_dict.get('entries'):
- info_dict['entries'] = self._smuggle_data(info_dict['entries'], smuggled_data)
- return info_dict
- _url_re = re.compile(r'(?P<pre>%s)(?(channel_type)(?P<tab>/\w+))?(?P<post>.*)$' % _VALID_URL)
+class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
+ IE_DESC = 'YouTube Tabs'
+ _VALID_URL = r'''(?x:
+ https?://
+ (?:\w+\.)?
+ (?:
+ youtube(?:kids)?\.com|
+ %(invidious)s
+ )/
+ (?:
+ (?P<channel_type>channel|c|user|browse)/|
+ (?P<not_channel>
+ feed/|hashtag/|
+ (?:playlist|watch)\?.*?\blist=
+ )|
+ (?!(?:%(reserved_names)s)\b) # Direct URLs
+ )
+ (?P<id>[^/?\#&]+)
+ )''' % {
+ 'reserved_names': YoutubeBaseInfoExtractor._RESERVED_NAMES,
+ 'invidious': '|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES),
+ }
+ IE_NAME = 'youtube:tab'
- def __real_extract(self, url, smuggled_data):
+ _TESTS = [{
+ 'note': 'playlists, multipage',
+ 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
+ 'playlist_mincount': 94,
+ 'info_dict': {
+ 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
+ 'title': 'Igor Kleiner - Playlists',
+ 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
+ 'uploader': 'Igor Kleiner',
+ 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
+ 'channel': 'Igor Kleiner',
+ 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg',
+ 'tags': ['"критическое', 'мышление"', '"наука', 'просто"', 'математика', '"анализ', 'данных"'],
+ 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg',
+ 'uploader_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg',
+ 'channel_follower_count': int
+ },
+ }, {
+ 'note': 'playlists, multipage, different order',
+ 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
+ 'playlist_mincount': 94,
+ 'info_dict': {
+ 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
+ 'title': 'Igor Kleiner - Playlists',
+ 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
+ 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
+ 'uploader': 'Igor Kleiner',
+ 'uploader_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg',
+ 'tags': ['"критическое', 'мышление"', '"наука', 'просто"', 'математика', '"анализ', 'данных"'],
+ 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg',
+ 'channel': 'Igor Kleiner',
+ 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg',
+ 'channel_follower_count': int
+ },
+ }, {
+ 'note': 'playlists, series',
+ 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'id': 'UCYO_jab_esuFRV4b17AJtAw',
+ 'title': '3Blue1Brown - Playlists',
+ 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
+ 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
+ 'uploader': '3Blue1Brown',
+ 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw',
+ 'uploader_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw',
+ 'channel': '3Blue1Brown',
+ 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw',
+ 'tags': ['Mathematics'],
+ 'channel_follower_count': int
+ },
+ }, {
+ 'note': 'playlists, singlepage',
+ 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
+ 'playlist_mincount': 4,
+ 'info_dict': {
+ 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
+ 'title': 'ThirstForScience - Playlists',
+ 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
+ 'uploader': 'ThirstForScience',
+ 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
+ 'uploader_url': 'https://www.youtube.com/channel/UCAEtajcuhQ6an9WEzY9LEMQ',
+ 'channel_url': 'https://www.youtube.com/channel/UCAEtajcuhQ6an9WEzY9LEMQ',
+ 'channel_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
+ 'tags': 'count:13',
+ 'channel': 'ThirstForScience',
+ 'channel_follower_count': int
+ }
+ }, {
+ 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
+ 'only_matching': True,
+ }, {
+ 'note': 'basic, single video playlist',
+ 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
+ 'info_dict': {
+ 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
+ 'uploader': 'Sergey M.',
+ 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
+ 'title': 'youtube-dl public playlist',
+ 'description': '',
+ 'tags': [],
+ 'view_count': int,
+ 'modified_date': '20201130',
+ 'channel': 'Sergey M.',
+ 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
+ 'uploader_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
+ 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
+ },
+ 'playlist_count': 1,
+ }, {
+ 'note': 'empty playlist',
+ 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
+ 'info_dict': {
+ 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
+ 'uploader': 'Sergey M.',
+ 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
+ 'title': 'youtube-dl empty playlist',
+ 'tags': [],
+ 'channel': 'Sergey M.',
+ 'description': '',
+ 'modified_date': '20160902',
+ 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
+ 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
+ 'uploader_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
+ },
+ 'playlist_count': 0,
+ }, {
+ 'note': 'Home tab',
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
+ 'info_dict': {
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'lex will - Home',
+ 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
+ 'uploader': 'lex will',
+ 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel': 'lex will',
+ 'tags': ['bible', 'history', 'prophesy'],
+ 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_follower_count': int
+ },
+ 'playlist_mincount': 2,
+ }, {
+ 'note': 'Videos tab',
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
+ 'info_dict': {
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'lex will - Videos',
+ 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
+ 'uploader': 'lex will',
+ 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'tags': ['bible', 'history', 'prophesy'],
+ 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel': 'lex will',
+ 'channel_follower_count': int
+ },
+ 'playlist_mincount': 975,
+ }, {
+ 'note': 'Videos tab, sorted by popular',
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
+ 'info_dict': {
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'lex will - Videos',
+ 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
+ 'uploader': 'lex will',
+ 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel': 'lex will',
+ 'tags': ['bible', 'history', 'prophesy'],
+ 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_follower_count': int
+ },
+ 'playlist_mincount': 199,
+ }, {
+ 'note': 'Playlists tab',
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
+ 'info_dict': {
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'lex will - Playlists',
+ 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
+ 'uploader': 'lex will',
+ 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel': 'lex will',
+ 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'tags': ['bible', 'history', 'prophesy'],
+ 'channel_follower_count': int
+ },
+ 'playlist_mincount': 17,
+ }, {
+ 'note': 'Community tab',
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
+ 'info_dict': {
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'lex will - Community',
+ 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
+ 'uploader': 'lex will',
+ 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel': 'lex will',
+ 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'tags': ['bible', 'history', 'prophesy'],
+ 'channel_follower_count': int
+ },
+ 'playlist_mincount': 18,
+ }, {
+ 'note': 'Channels tab',
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
+ 'info_dict': {
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'lex will - Channels',
+ 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
+ 'uploader': 'lex will',
+ 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel': 'lex will',
+ 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'tags': ['bible', 'history', 'prophesy'],
+ 'channel_follower_count': int
+ },
+ 'playlist_mincount': 12,
+ }, {
+ 'note': 'Search tab',
+ 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra',
+ 'playlist_mincount': 40,
+ 'info_dict': {
+ 'id': 'UCYO_jab_esuFRV4b17AJtAw',
+ 'title': '3Blue1Brown - Search - linear algebra',
+ 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
+ 'uploader': '3Blue1Brown',
+ 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
+ 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw',
+ 'uploader_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw',
+ 'tags': ['Mathematics'],
+ 'channel': '3Blue1Brown',
+ 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw',
+ 'channel_follower_count': int
+ },
+ }, {
+ 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
+ 'only_matching': True,
+ }, {
+        'note': 'Playlist with deleted videos (#651). As a bonus, video #51 also appears twice in this list.',
+ 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
+ 'info_dict': {
+ 'title': '29C3: Not my department',
+ 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
+ 'uploader': 'Christiaan008',
+ 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
+ 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268',
+ 'tags': [],
+ 'uploader_url': 'https://www.youtube.com/c/ChRiStIaAn008',
+ 'view_count': int,
+ 'modified_date': '20150605',
+ 'channel_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
+ 'channel_url': 'https://www.youtube.com/c/ChRiStIaAn008',
+ 'channel': 'Christiaan008',
+ },
+ 'playlist_count': 96,
+ }, {
+ 'note': 'Large playlist',
+ 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
+ 'info_dict': {
+ 'title': 'Uploads from Cauchemar',
+ 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
+ 'uploader': 'Cauchemar',
+ 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
+ 'channel_url': 'https://www.youtube.com/c/Cauchemar89',
+ 'tags': [],
+ 'modified_date': r're:\d{8}',
+ 'channel': 'Cauchemar',
+ 'uploader_url': 'https://www.youtube.com/c/Cauchemar89',
+ 'view_count': int,
+ 'description': '',
+ 'channel_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
+ },
+ 'playlist_mincount': 1123,
+ 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
+ }, {
+ 'note': 'even larger playlist, 8832 videos',
+ 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
+ 'only_matching': True,
+ }, {
+ 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
+ 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
+ 'info_dict': {
+ 'title': 'Uploads from Interstellar Movie',
+ 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
+ 'uploader': 'Interstellar Movie',
+ 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
+ 'uploader_url': 'https://www.youtube.com/c/InterstellarMovie',
+ 'tags': [],
+ 'view_count': int,
+ 'channel_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
+ 'channel_url': 'https://www.youtube.com/c/InterstellarMovie',
+ 'channel': 'Interstellar Movie',
+ 'description': '',
+ 'modified_date': r're:\d{8}',
+ },
+ 'playlist_mincount': 21,
+ }, {
+ 'note': 'Playlist with "show unavailable videos" button',
+ 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q',
+ 'info_dict': {
+ 'title': 'Uploads from Phim Siêu Nhân Nhật Bản',
+ 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q',
+ 'uploader': 'Phim Siêu Nhân Nhật Bản',
+ 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q',
+ 'view_count': int,
+ 'channel': 'Phim Siêu Nhân Nhật Bản',
+ 'tags': [],
+ 'uploader_url': 'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q',
+ 'description': '',
+ 'channel_url': 'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q',
+ 'channel_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q',
+ 'modified_date': r're:\d{8}',
+ },
+ 'playlist_mincount': 200,
+ 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
+ }, {
+ 'note': 'Playlist with unavailable videos in page 7',
+ 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w',
+ 'info_dict': {
+ 'title': 'Uploads from BlankTV',
+ 'id': 'UU8l9frL61Yl5KFOl87nIm2w',
+ 'uploader': 'BlankTV',
+ 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w',
+ 'channel': 'BlankTV',
+ 'channel_url': 'https://www.youtube.com/c/blanktv',
+ 'channel_id': 'UC8l9frL61Yl5KFOl87nIm2w',
+ 'view_count': int,
+ 'tags': [],
+ 'uploader_url': 'https://www.youtube.com/c/blanktv',
+ 'modified_date': r're:\d{8}',
+ 'description': '',
+ },
+ 'playlist_mincount': 1000,
+ 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
+ }, {
+ 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844',
+ 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
+ 'info_dict': {
+ 'title': 'Data Analysis with Dr Mike Pound',
+ 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
+ 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
+ 'uploader': 'Computerphile',
+ 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487',
+ 'uploader_url': 'https://www.youtube.com/user/Computerphile',
+ 'tags': [],
+ 'view_count': int,
+ 'channel_id': 'UC9-y-6csu5WGm29I7JiwpnA',
+ 'channel_url': 'https://www.youtube.com/user/Computerphile',
+ 'channel': 'Computerphile',
+ },
+ 'playlist_mincount': 11,
+ }, {
+ 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
+ 'only_matching': True,
+ }, {
+ 'note': 'Playlist URL that does not actually serve a playlist',
+ 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
+ 'info_dict': {
+ 'id': 'FqZTN594JQw',
+ 'ext': 'webm',
+ 'title': "Smiley's People 01 detective, Adventure Series, Action",
+ 'uploader': 'STREEM',
+ 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
+ 'upload_date': '20150526',
+ 'license': 'Standard YouTube License',
+ 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
+ 'categories': ['People & Blogs'],
+ 'tags': list,
+ 'view_count': int,
+ 'like_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'This video is not available.',
+ 'add_ie': [YoutubeIE.ie_key()],
+ }, {
+ 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
+ 'info_dict': {
+ 'id': 'GgL890LIznQ', # This will keep changing
+ 'ext': 'mp4',
+ 'title': str,
+ 'uploader': 'Sky News',
+ 'uploader_id': 'skynews',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
+ 'upload_date': r're:\d{8}',
+ 'description': str,
+ 'categories': ['News & Politics'],
+ 'tags': list,
+ 'like_count': int,
+ 'release_timestamp': 1642502819,
+ 'channel': 'Sky News',
+ 'channel_id': 'UCoMdktPbSTixAyNGwb-UYkQ',
+ 'age_limit': 0,
+ 'view_count': int,
+ 'thumbnail': 'https://i.ytimg.com/vi/GgL890LIznQ/maxresdefault_live.jpg',
+ 'playable_in_embed': True,
+ 'release_date': '20220118',
+ 'availability': 'public',
+ 'live_status': 'is_live',
+ 'channel_url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ',
+ 'channel_follower_count': int
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Ignoring subtitle tracks found in '],
+ }, {
+ 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
+ 'info_dict': {
+ 'id': 'a48o2S1cPoo',
+ 'ext': 'mp4',
+ 'title': 'The Young Turks - Live Main Show',
+ 'uploader': 'The Young Turks',
+ 'uploader_id': 'TheYoungTurks',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
+ 'upload_date': '20150715',
+ 'license': 'Standard YouTube License',
+ 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
+ 'categories': ['News & Politics'],
+ 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
+ 'like_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
+ 'only_matching': True,
+ }, {
+        'note': 'A channel that is not live; should raise an error',
+ 'url': 'https://www.youtube.com/user/numberphile/live',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/feed/trending',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/feed/library',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/feed/history',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/feed/subscriptions',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/feed/watch_later',
+ 'only_matching': True,
+ }, {
+ 'note': 'Recommended - redirects to home page.',
+ 'url': 'https://www.youtube.com/feed/recommended',
+ 'only_matching': True,
+ }, {
+        'note': 'inline playlist whose continuations do not always work',
+ 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/course',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/zsecurity',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.youtube.com/NASAgovVideo/videos',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/TheYoungTurks/live',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/hashtag/cctv9',
+ 'info_dict': {
+ 'id': 'cctv9',
+ 'title': '#cctv9',
+ 'tags': [],
+ },
+ 'playlist_mincount': 350,
+ }, {
+ 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU',
+ 'only_matching': True,
+ }, {
+ 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist',
+ 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
+ 'only_matching': True
+ }, {
+ 'note': '/browse/ should redirect to /channel/',
+ 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng',
+ 'only_matching': True
+ }, {
+ 'note': 'VLPL, should redirect to playlist?list=PL...',
+ 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
+ 'info_dict': {
+ 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
+ 'uploader': 'NoCopyrightSounds',
+ 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!',
+ 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
+ 'title': 'NCS Releases',
+ 'uploader_url': 'https://www.youtube.com/c/NoCopyrightSounds',
+ 'channel_url': 'https://www.youtube.com/c/NoCopyrightSounds',
+ 'modified_date': r're:\d{8}',
+ 'view_count': int,
+ 'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
+ 'tags': [],
+ 'channel': 'NoCopyrightSounds',
+ },
+ 'playlist_mincount': 166,
+ 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
+ }, {
+ 'note': 'Topic, should redirect to playlist?list=UU...',
+ 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
+ 'info_dict': {
+ 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
+ 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
+ 'title': 'Uploads from Royalty Free Music - Topic',
+ 'uploader': 'Royalty Free Music - Topic',
+ 'tags': [],
+ 'channel_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
+ 'channel': 'Royalty Free Music - Topic',
+ 'view_count': int,
+            'channel_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw',
+ 'modified_date': r're:\d{8}',
+ 'uploader_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw',
+ 'description': '',
+ },
+ 'expected_warnings': [
+ 'The URL does not have a videos tab',
+ r'[Uu]navailable videos (are|will be) hidden',
+ ],
+ 'playlist_mincount': 101,
+ }, {
+ 'note': 'Topic without a UU playlist',
+ 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg',
+ 'info_dict': {
+ 'id': 'UCtFRv9O2AHqOZjjynzrv-xg',
+ 'title': 'UCtFRv9O2AHqOZjjynzrv-xg',
+ 'tags': [],
+ },
+ 'expected_warnings': [
+ 'the playlist redirect gave error',
+ ],
+ 'playlist_mincount': 9,
+ }, {
+        'note': 'YouTube Music album',
+ 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE',
+ 'info_dict': {
+ 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0',
+ 'title': 'Album - Royalty Free Music Library V2 (50 Songs)',
+ 'tags': [],
+ 'view_count': int,
+ 'description': '',
+ 'availability': 'unlisted',
+ 'modified_date': r're:\d{8}',
+ },
+ 'playlist_count': 50,
+ }, {
+ 'note': 'unlisted single video playlist',
+ 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
+ 'info_dict': {
+ 'uploader_id': 'UC9zHu_mHU96r19o-wV5Qs1Q',
+ 'uploader': 'colethedj',
+ 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
+ 'title': 'hypervideo unlisted playlist test',
+ 'availability': 'unlisted',
+ 'tags': [],
+ 'modified_date': '20211208',
+ 'channel': 'colethedj',
+ 'view_count': int,
+ 'description': '',
+ 'uploader_url': 'https://www.youtube.com/channel/UC9zHu_mHU96r19o-wV5Qs1Q',
+ 'channel_id': 'UC9zHu_mHU96r19o-wV5Qs1Q',
+ 'channel_url': 'https://www.youtube.com/channel/UC9zHu_mHU96r19o-wV5Qs1Q',
+ },
+ 'playlist_count': 1,
+ }, {
+ 'note': 'API Fallback: Recommended - redirects to home page. Requires visitorData',
+ 'url': 'https://www.youtube.com/feed/recommended',
+ 'info_dict': {
+ 'id': 'recommended',
+ 'title': 'recommended',
+ 'tags': [],
+ },
+ 'playlist_mincount': 50,
+ 'params': {
+ 'skip_download': True,
+ 'extractor_args': {'youtubetab': {'skip': ['webpage']}}
+ },
+ }, {
+ 'note': 'API Fallback: /videos tab, sorted by oldest first',
+ 'url': 'https://www.youtube.com/user/theCodyReeder/videos?view=0&sort=da&flow=grid',
+ 'info_dict': {
+ 'id': 'UCu6mSoMNzHQiBIOCkHUa2Aw',
+ 'title': 'Cody\'sLab - Videos',
+ 'description': 'md5:d083b7c2f0c67ee7a6c74c3e9b4243fa',
+ 'uploader': 'Cody\'sLab',
+ 'uploader_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw',
+ 'channel': 'Cody\'sLab',
+ 'channel_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw',
+ 'tags': [],
+ 'channel_url': 'https://www.youtube.com/channel/UCu6mSoMNzHQiBIOCkHUa2Aw',
+ 'uploader_url': 'https://www.youtube.com/channel/UCu6mSoMNzHQiBIOCkHUa2Aw',
+ 'channel_follower_count': int
+ },
+ 'playlist_mincount': 650,
+ 'params': {
+ 'skip_download': True,
+ 'extractor_args': {'youtubetab': {'skip': ['webpage']}}
+ },
+ }, {
+ 'note': 'API Fallback: Topic, should redirect to playlist?list=UU...',
+ 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
+ 'info_dict': {
+ 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
+ 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
+ 'title': 'Uploads from Royalty Free Music - Topic',
+ 'uploader': 'Royalty Free Music - Topic',
+ 'modified_date': r're:\d{8}',
+ 'channel_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
+ 'description': '',
+ 'channel_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw',
+ 'tags': [],
+ 'channel': 'Royalty Free Music - Topic',
+ 'view_count': int,
+ 'uploader_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw',
+ },
+ 'expected_warnings': [
+ 'does not have a videos tab',
+ r'[Uu]navailable videos (are|will be) hidden',
+ ],
+ 'playlist_mincount': 101,
+ 'params': {
+ 'skip_download': True,
+ 'extractor_args': {'youtubetab': {'skip': ['webpage']}}
+ },
+ }, {
+ 'note': 'non-standard redirect to regional channel',
+ 'url': 'https://www.youtube.com/channel/UCwVVpHQ2Cs9iGJfpdFngePQ',
+ 'only_matching': True
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if YoutubeIE.suitable(url) else super(
+ YoutubeTabIE, cls).suitable(url)
+
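+    # conditional group: a /tab suffix is only attempted for channel-type URLs
+    # (i.e. when the 'not_channel' group did not participate in the match)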
+ _URL_RE = re.compile(rf'(?P<pre>{_VALID_URL})(?(not_channel)|(?P<tab>/\w+))?(?P<post>.*)$')
+
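+    # replaces the removed _smuggle_data/_real_extract pair above: unsmuggles
+    # the URL, then re-smuggles the data into any resulting entries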
+ @YoutubeTabBaseInfoExtractor.passthrough_smuggled_data
+ def _real_extract(self, url, smuggled_data):
item_id = self._match_id(url)
url = compat_urlparse.urlunparse(
compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
compat_opts = self.get_param('compat_opts', [])
def get_mobj(url):
- mobj = self._url_re.match(url).groupdict()
+ mobj = self._URL_RE.match(url).groupdict()
mobj.update((k, '') for k, v in mobj.items() if v is None)
return mobj
- mobj = get_mobj(url)
+ mobj, redirect_warning = get_mobj(url), None
# Youtube returns incomplete data if tabname is not lower case
pre, tab, post, is_channel = mobj['pre'], mobj['tab'].lower(), mobj['post'], not mobj['not_channel']
if is_channel:
if smuggled_data.get('is_music_url'):
- if item_id[:2] == 'VL':
- # Youtube music VL channels have an equivalent playlist
+ if item_id[:2] == 'VL': # Youtube music VL channels have an equivalent playlist
item_id = item_id[2:]
- pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
- elif item_id[:2] == 'MP':
- # Resolve albums (/[channel/browse]/MP...) to their equivalent playlist
+ pre, tab, post, is_channel = f'https://www.youtube.com/playlist?list={item_id}', '', '', False
+ elif item_id[:2] == 'MP': # Resolve albums (/[channel/browse]/MP...) to their equivalent playlist
mdata = self._extract_tab_endpoint(
- 'https://music.youtube.com/channel/%s' % item_id, item_id, default_client='web_music')
- murl = traverse_obj(
- mdata, ('microformat', 'microformatDataRenderer', 'urlCanonical'), get_all=False, expected_type=compat_str)
+ f'https://music.youtube.com/channel/{item_id}', item_id, default_client='web_music')
+ murl = traverse_obj(mdata, ('microformat', 'microformatDataRenderer', 'urlCanonical'),
+ get_all=False, expected_type=compat_str)
if not murl:
- raise ExtractorError('Failed to resolve album to playlist.')
+ raise ExtractorError('Failed to resolve album to playlist')
return self.url_result(murl, ie=YoutubeTabIE.ie_key())
- elif mobj['channel_type'] == 'browse':
- # Youtube music /browse/ should be changed to /channel/
- pre = 'https://www.youtube.com/channel/%s' % item_id
+ elif mobj['channel_type'] == 'browse': # Youtube music /browse/ should be changed to /channel/
+ pre = f'https://www.youtube.com/channel/{item_id}'
+
+ original_tab_name = tab
if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts:
# Home URLs should redirect to /videos/
- self.report_warning(
- 'A channel/user page was given. All the channel\'s videos will be downloaded. '
- 'To download only the videos in the home page, add a "/featured" to the URL')
+ redirect_warning = ('A channel/user page was given. All the channel\'s videos will be downloaded. '
+ 'To download only the videos in the home page, add a "/featured" to the URL')
tab = '/videos'
url = ''.join((pre, tab, post))
@@ -4168,89 +5188,111 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
# Handle both video/playlist URLs
qs = parse_qs(url)
- video_id = qs.get('v', [None])[0]
- playlist_id = qs.get('list', [None])[0]
+ video_id, playlist_id = [qs.get(key, [None])[0] for key in ('v', 'list')]
if not video_id and mobj['not_channel'].startswith('watch'):
if not playlist_id:
                # If there is neither a video nor a playlist id, youtube redirects to the home page, which is undesirable
raise ExtractorError('Unable to recognize tab page')
# Common mistake: https://www.youtube.com/watch?list=playlist_id
- self.report_warning('A video URL was given without video ID. Trying to download playlist %s' % playlist_id)
- url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
+ self.report_warning(f'A video URL was given without video ID. Trying to download playlist {playlist_id}')
+ url = f'https://www.youtube.com/playlist?list={playlist_id}'
mobj = get_mobj(url)
if video_id and playlist_id:
if self.get_param('noplaylist'):
- self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
- return self.url_result(f'https://www.youtube.com/watch?v={video_id}', ie=YoutubeIE.ie_key(), video_id=video_id)
- self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id))
+ self.to_screen(f'Downloading just video {video_id} because of --no-playlist')
+ return self.url_result(f'https://www.youtube.com/watch?v={video_id}',
+ ie=YoutubeIE.ie_key(), video_id=video_id)
+ self.to_screen(f'Downloading playlist {playlist_id}; add --no-playlist to just download video {video_id}')
data, ytcfg = self._extract_data(url, item_id)
- tabs = try_get(
- data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
+ # YouTube may provide a non-standard redirect to the regional channel
+ # See: https://github.com/hypervideo/hypervideo/issues/2694
+ redirect_url = traverse_obj(
+ data, ('onResponseReceivedActions', ..., 'navigateAction', 'endpoint', 'commandMetadata', 'webCommandMetadata', 'url'), get_all=False)
+ if redirect_url and 'no-youtube-channel-redirect' not in compat_opts:
+ redirect_url = ''.join((
+ urljoin('https://www.youtube.com', redirect_url), mobj['tab'], mobj['post']))
+ self.to_screen(f'This playlist is likely not available in your region. Following redirect to regional playlist {redirect_url}')
+ return self.url_result(redirect_url, ie=YoutubeTabIE.ie_key())
+
+ tabs = traverse_obj(data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list)
if tabs:
selected_tab = self._extract_selected_tab(tabs)
- tab_name = selected_tab.get('title', '')
+ selected_tab_name = selected_tab.get('title', '').lower()
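+            # YouTube titles the /featured tab 'Home'; normalize for comparison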
+ if selected_tab_name == 'home':
+ selected_tab_name = 'featured'
+ requested_tab_name = mobj['tab'][1:]
if 'no-youtube-channel-redirect' not in compat_opts:
- if mobj['tab'] == '/live':
+ if requested_tab_name == 'live':
# Live tab should have redirected to the video
raise ExtractorError('The channel is not currently live', expected=True)
- if mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]:
- if not mobj['not_channel'] and item_id[:2] == 'UC':
- # Topic channels don't have /videos. Use the equivalent playlist instead
- self.report_warning('The URL does not have a %s tab. Trying to redirect to playlist UU%s instead' % (mobj['tab'][1:], item_id[2:]))
- pl_id = 'UU%s' % item_id[2:]
- pl_url = 'https://www.youtube.com/playlist?list=%s%s' % (pl_id, mobj['post'])
- try:
- data, ytcfg, item_id, url = *self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True), pl_id, pl_url
- except ExtractorError:
- self.report_warning('The playlist gave error. Falling back to channel URL')
+ if requested_tab_name not in ('', selected_tab_name):
+ redirect_warning = f'The channel does not have a {requested_tab_name} tab'
+ if not original_tab_name:
+ if item_id[:2] == 'UC':
+ # Topic channels don't have /videos. Use the equivalent playlist instead
+ pl_id = f'UU{item_id[2:]}'
+ pl_url = f'https://www.youtube.com/playlist?list={pl_id}'
+ try:
+ data, ytcfg = self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True, webpage_fatal=True)
+ except ExtractorError:
+ redirect_warning += ' and the playlist redirect gave error'
+ else:
+ item_id, url, selected_tab_name = pl_id, pl_url, requested_tab_name
+ redirect_warning += f'. Redirecting to playlist {pl_id} instead'
+ if selected_tab_name and selected_tab_name != requested_tab_name:
+ redirect_warning += f'. {selected_tab_name} tab is being downloaded instead'
else:
- self.report_warning('The URL does not have a %s tab. %s is being downloaded instead' % (mobj['tab'][1:], tab_name))
+ raise ExtractorError(redirect_warning, expected=True)
- self.write_debug('Final URL: %s' % url)
+ if redirect_warning:
+ self.to_screen(redirect_warning)
+ self.write_debug(f'Final URL: {url}')
# YouTube sometimes provides a button to reload playlist with unavailable videos.
if 'no-youtube-unavailable-videos' not in compat_opts:
data = self._reload_with_unavailable_videos(item_id, data, ytcfg) or data
self._extract_and_report_alerts(data, only_once=True)
- tabs = try_get(
- data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
+ tabs = traverse_obj(data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list)
if tabs:
return self._extract_from_tabs(item_id, ytcfg, data, tabs)
- playlist = try_get(
- data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
+ playlist = traverse_obj(
+ data, ('contents', 'twoColumnWatchNextResults', 'playlist', 'playlist'), expected_type=dict)
if playlist:
return self._extract_from_playlist(item_id, url, data, playlist, ytcfg)
- video_id = try_get(
- data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
- compat_str) or video_id
+ video_id = traverse_obj(
+ data, ('currentVideoEndpoint', 'watchEndpoint', 'videoId'), expected_type=str) or video_id
if video_id:
if mobj['tab'] != '/live': # live tab is expected to redirect to video
- self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
- return self.url_result(f'https://www.youtube.com/watch?v={video_id}', ie=YoutubeIE.ie_key(), video_id=video_id)
+ self.report_warning(f'Unable to recognize playlist. Downloading just video {video_id}')
+ return self.url_result(f'https://www.youtube.com/watch?v={video_id}',
+ ie=YoutubeIE.ie_key(), video_id=video_id)
raise ExtractorError('Unable to recognize tab page')
class YoutubePlaylistIE(InfoExtractor):
- IE_DESC = 'YouTube.com playlists'
+ IE_DESC = 'YouTube playlists'
_VALID_URL = r'''(?x)(?:
(?:https?://)?
(?:\w+\.)?
(?:
(?:
youtube(?:kids)?\.com|
- invidio\.us
+ %(invidious)s
)
/.*?\?.*?\blist=
)?
(?P<id>%(playlist_id)s)
- )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
+ )''' % {
+ 'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE,
+ 'invidious': '|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES),
+ }
IE_NAME = 'youtube:playlist'
_TESTS = [{
'note': 'issue #673',
@@ -4258,9 +5300,16 @@ class YoutubePlaylistIE(InfoExtractor):
'info_dict': {
'title': '[OLD]Team Fortress 2 (Class-based LP)',
'id': 'PLBB231211A4F62143',
- 'uploader': 'Wickydoo',
+ 'uploader': 'Wickman',
'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
'description': 'md5:8fa6f52abb47a9552002fa3ddfc57fc2',
+ 'view_count': int,
+ 'uploader_url': 'https://www.youtube.com/user/Wickydoo',
+ 'modified_date': r're:\d{8}',
+ 'channel_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
+ 'channel': 'Wickman',
+ 'tags': [],
+ 'channel_url': 'https://www.youtube.com/user/Wickydoo',
},
'playlist_mincount': 29,
}, {
@@ -4280,7 +5329,16 @@ class YoutubePlaylistIE(InfoExtractor):
'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
'uploader': 'milan',
'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
- }
+ 'description': '',
+ 'channel_url': 'https://www.youtube.com/channel/UCEI1-PVPcYXjB73Hfelbmaw',
+ 'tags': [],
+ 'modified_date': '20140919',
+ 'view_count': int,
+ 'channel': 'milan',
+ 'channel_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
+ 'uploader_url': 'https://www.youtube.com/channel/UCEI1-PVPcYXjB73Hfelbmaw',
+ },
+ 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
}, {
'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
'playlist_mincount': 654,
@@ -4290,7 +5348,15 @@ class YoutubePlaylistIE(InfoExtractor):
'uploader': 'LBK',
'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
'description': 'md5:da521864744d60a198e3a88af4db0d9d',
- }
+ 'channel': 'LBK',
+ 'view_count': int,
+ 'channel_url': 'https://www.youtube.com/c/愛低音的國王',
+ 'tags': [],
+ 'uploader_url': 'https://www.youtube.com/c/愛低音的國王',
+ 'channel_id': 'UC21nz3_MesPLqtDqwdvnoxA',
+ 'modified_date': r're:\d{8}',
+ },
+ 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
}, {
'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
'only_matching': True,
@@ -4304,9 +5370,7 @@ class YoutubePlaylistIE(InfoExtractor):
def suitable(cls, url):
if YoutubeTabIE.suitable(url):
return False
- # Hack for lazy extractors until more generic solution is implemented
- # (see #28780)
- from .youtube import parse_qs
+ from ..utils import parse_qs
qs = parse_qs(url)
if qs.get('v', [None])[0]:
return False
@@ -4340,7 +5404,16 @@ class YoutubeYtBeIE(InfoExtractor):
'categories': ['Nonprofits & Activism'],
'tags': list,
'like_count': int,
- 'dislike_count': int,
+ 'age_limit': 0,
+ 'playable_in_embed': True,
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/yeWKywCrFtk/maxresdefault.webp',
+ 'channel': 'Backus-Page House Museum',
+ 'channel_id': 'UCEfMCQ9bs3tjvjy1s451zaw',
+ 'live_status': 'not_live',
+ 'view_count': int,
+ 'channel_url': 'https://www.youtube.com/channel/UCEfMCQ9bs3tjvjy1s451zaw',
+ 'availability': 'public',
+ 'duration': 59,
},
'params': {
'noplaylist': True,
@@ -4363,8 +5436,24 @@ class YoutubeYtBeIE(InfoExtractor):
}), ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
+class YoutubeLivestreamEmbedIE(InfoExtractor):
+ IE_DESC = 'YouTube livestream embeds'
+ _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/embed/live_stream/?\?(?:[^#]+&)?channel=(?P<id>[^&#]+)'
+ _TESTS = [{
+ 'url': 'https://www.youtube.com/embed/live_stream?channel=UC2_KI6RB__jGdlnK6dvFEZA',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+ return self.url_result(
+ f'https://www.youtube.com/channel/{channel_id}/live',
+ ie=YoutubeTabIE.ie_key(), video_id=channel_id)
+
+
class YoutubeYtUserIE(InfoExtractor):
- IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
+ IE_DESC = 'YouTube user videos; "ytuser:" prefix'
+ IE_NAME = 'youtube:user'
_VALID_URL = r'ytuser:(?P<id>.+)'
_TESTS = [{
'url': 'ytuser:phihag',
@@ -4374,13 +5463,13 @@ class YoutubeYtUserIE(InfoExtractor):
def _real_extract(self, url):
user_id = self._match_id(url)
return self.url_result(
- 'https://www.youtube.com/user/%s' % user_id,
+ 'https://www.youtube.com/user/%s/videos' % user_id,
ie=YoutubeTabIE.ie_key(), video_id=user_id)
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
IE_NAME = 'youtube:favorites'
- IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
+ IE_DESC = 'YouTube liked videos; ":ytfav" keyword (requires cookies)'
_VALID_URL = r':ytfav(?:ou?rite)?s?'
_LOGIN_REQUIRED = True
_TESTS = [{
@@ -4397,79 +5486,40 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
ie=YoutubeTabIE.ie_key())
-class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE):
- IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
- # there doesn't appear to be a real limit, for example if you search for
- # 'python' you get more than 8.000.000 results
- _MAX_RESULTS = float('inf')
+class YoutubeSearchIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor):
+ IE_DESC = 'YouTube search'
IE_NAME = 'youtube:search'
_SEARCH_KEY = 'ytsearch'
- _SEARCH_PARAMS = None
- _TESTS = []
-
- def _search_results(self, query):
- data = {'query': query}
- if self._SEARCH_PARAMS:
- data['params'] = self._SEARCH_PARAMS
- continuation = {}
- for page_num in itertools.count(1):
- data.update(continuation)
- search = self._extract_response(
- item_id='query "%s" page %s' % (query, page_num), ep='search', query=data,
- check_get_keys=('contents', 'onResponseReceivedCommands')
- )
- if not search:
- break
- slr_contents = try_get(
- search,
- (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
- lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
- list)
- if not slr_contents:
- break
-
- # Youtube sometimes adds promoted content to searches,
- # changing the index location of videos and token.
- # So we search through all entries till we find them.
- continuation = None
- for slr_content in slr_contents:
- if not continuation:
- continuation = self._extract_continuation({'contents': [slr_content]})
-
- isr_contents = try_get(
- slr_content,
- lambda x: x['itemSectionRenderer']['contents'],
- list)
- if not isr_contents:
- continue
- for content in isr_contents:
- if not isinstance(content, dict):
- continue
- video = content.get('videoRenderer')
- if not isinstance(video, dict):
- continue
- video_id = video.get('videoId')
- if not video_id:
- continue
-
- yield self._extract_video(video)
-
- if not continuation:
- break
+ _SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only
+ _TESTS = [{
+ 'url': 'ytsearch5:youtube-dl test video',
+ 'playlist_count': 5,
+ 'info_dict': {
+ 'id': 'youtube-dl test video',
+ 'title': 'youtube-dl test video',
+ }
+ }]
-class YoutubeSearchDateIE(YoutubeSearchIE):
+class YoutubeSearchDateIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor):
IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
_SEARCH_KEY = 'ytsearchdate'
- IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
- _SEARCH_PARAMS = 'CAI%3D'
+ IE_DESC = 'YouTube search, newest videos first'
+ _SEARCH_PARAMS = 'CAISAhAB' # Videos only, sorted by date
+ _TESTS = [{
+ 'url': 'ytsearchdate5:youtube-dl test video',
+ 'playlist_count': 5,
+ 'info_dict': {
+ 'id': 'youtube-dl test video',
+ 'title': 'youtube-dl test video',
+ }
+ }]
-class YoutubeSearchURLIE(YoutubeSearchIE):
- IE_DESC = 'YouTube.com search URLs'
+class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor):
+ IE_DESC = 'YouTube search URLs with sorting and filter support'
IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
- _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
- # _MAX_RESULTS = 100
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:results|search)\?([^#]+&)?(?:search_query|q)=(?:[^&]+)(?:[&#]|$)'
_TESTS = [{
'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
'playlist_mincount': 5,
@@ -4478,22 +5528,88 @@ class YoutubeSearchURLIE(YoutubeSearchIE):
'title': 'youtube-dl test video',
}
}, {
+ 'url': 'https://www.youtube.com/results?search_query=python&sp=EgIQAg%253D%253D',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'id': 'python',
+ 'title': 'python',
+ }
+ }, {
+ 'url': 'https://www.youtube.com/results?search_query=%23cats',
+ 'playlist_mincount': 1,
+ 'info_dict': {
+ 'id': '#cats',
+ 'title': '#cats',
+ 'entries': [{
+ 'url': r're:https://(www\.)?youtube\.com/hashtag/cats',
+ 'title': '#cats',
+ }],
+ },
+ }, {
'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
'only_matching': True,
}]
- @classmethod
- def _make_valid_url(cls):
- return cls._VALID_URL
+ def _real_extract(self, url):
+ qs = parse_qs(url)
+ query = (qs.get('search_query') or qs.get('q'))[0]
+ return self.playlist_result(self._search_results(query, qs.get('sp', (None,))[0]), query, query)
+
+
+class YoutubeMusicSearchURLIE(YoutubeTabBaseInfoExtractor):
+    IE_DESC = 'YouTube music search URLs with selectable sections (e.g. #songs)'
+ IE_NAME = 'youtube:music:search_url'
+ _VALID_URL = r'https?://music\.youtube\.com/search\?([^#]+&)?(?:search_query|q)=(?:[^&]+)(?:[&#]|$)'
+ _TESTS = [{
+ 'url': 'https://music.youtube.com/search?q=royalty+free+music',
+ 'playlist_count': 16,
+ 'info_dict': {
+ 'id': 'royalty free music',
+ 'title': 'royalty free music',
+ }
+ }, {
+ 'url': 'https://music.youtube.com/search?q=royalty+free+music&sp=EgWKAQIIAWoKEAoQAxAEEAkQBQ%3D%3D',
+ 'playlist_mincount': 30,
+ 'info_dict': {
+ 'id': 'royalty free music - songs',
+ 'title': 'royalty free music - songs',
+ },
+ 'params': {'extract_flat': 'in_playlist'}
+ }, {
+ 'url': 'https://music.youtube.com/search?q=royalty+free+music#community+playlists',
+ 'playlist_mincount': 30,
+ 'info_dict': {
+ 'id': 'royalty free music - community playlists',
+ 'title': 'royalty free music - community playlists',
+ },
+ 'params': {'extract_flat': 'in_playlist'}
+ }]
+
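+    # 'sp' filter values that YouTube Music uses for each result section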
+ _SECTIONS = {
+ 'albums': 'EgWKAQIYAWoKEAoQAxAEEAkQBQ==',
+ 'artists': 'EgWKAQIgAWoKEAoQAxAEEAkQBQ==',
+ 'community playlists': 'EgeKAQQoAEABagoQChADEAQQCRAF',
+ 'featured playlists': 'EgeKAQQoADgBagwQAxAJEAQQDhAKEAU==',
+ 'songs': 'EgWKAQIIAWoKEAoQAxAEEAkQBQ==',
+ 'videos': 'EgWKAQIQAWoKEAoQAxAEEAkQBQ==',
+ }
def _real_extract(self, url):
qs = parse_qs(url)
query = (qs.get('search_query') or qs.get('q'))[0]
- self._SEARCH_PARAMS = qs.get('sp', ('',))[0]
- return self._get_n_results(query, self._MAX_RESULTS)
+ params = qs.get('sp', (None,))[0]
+ if params:
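+            # map a known filter value back to its section name for the title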
+ section = next((k for k, v in self._SECTIONS.items() if v == params), params)
+ else:
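+            # the section may instead be selected via the URL fragment, e.g. #songs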
+ section = compat_urllib_parse_unquote_plus((url.split('#') + [''])[1]).lower()
+ params = self._SECTIONS.get(section)
+ if not params:
+ section = None
+ title = join_nonempty(query, section, delim=' - ')
+ return self.playlist_result(self._search_results(query, params, default_client='web_music'), title, title)
-class YoutubeFeedsInfoExtractor(YoutubeTabIE):
+class YoutubeFeedsInfoExtractor(InfoExtractor):
"""
Base class for feed extractors
Subclasses must define the _FEED_NAME property.
@@ -4507,13 +5623,12 @@ class YoutubeFeedsInfoExtractor(YoutubeTabIE):
def _real_extract(self, url):
return self.url_result(
- 'https://www.youtube.com/feed/%s' % self._FEED_NAME,
- ie=YoutubeTabIE.ie_key())
+ f'https://www.youtube.com/feed/{self._FEED_NAME}', ie=YoutubeTabIE.ie_key())
class YoutubeWatchLaterIE(InfoExtractor):
IE_NAME = 'youtube:watchlater'
- IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
+    IE_DESC = 'YouTube watch later list; ":ytwatchlater" keyword (requires cookies)'
_VALID_URL = r':ytwatchlater'
_TESTS = [{
'url': ':ytwatchlater',
@@ -4526,7 +5641,7 @@ class YoutubeWatchLaterIE(InfoExtractor):
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
- IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
+ IE_DESC = 'YouTube recommended videos; ":ytrec" keyword'
_VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
_FEED_NAME = 'recommended'
_LOGIN_REQUIRED = False
@@ -4543,7 +5658,7 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
- IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
+ IE_DESC = 'YouTube subscriptions feed; ":ytsubs" keyword (requires cookies)'
_VALID_URL = r':ytsub(?:scription)?s?'
_FEED_NAME = 'subscriptions'
_TESTS = [{
@@ -4556,7 +5671,7 @@ class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
- IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
+    IE_DESC = 'YouTube watch history; ":ythis" keyword (requires cookies)'
_VALID_URL = r':ythis(?:tory)?'
_FEED_NAME = 'history'
_TESTS = [{
diff --git a/hypervideo_dl/extractor/zattoo.py b/hypervideo_dl/extractor/zattoo.py
index a13d124..c02b4ca 100644
--- a/hypervideo_dl/extractor/zattoo.py
+++ b/hypervideo_dl/extractor/zattoo.py
@@ -12,6 +12,7 @@ from ..compat import (
from ..utils import (
ExtractorError,
int_or_none,
+ join_nonempty,
try_get,
url_or_none,
urlencode_postdata,
@@ -24,13 +25,11 @@ class ZattooPlatformBaseIE(InfoExtractor):
def _host_url(self):
return 'https://%s' % (self._API_HOST if hasattr(self, '_API_HOST') else self._HOST)
- def _login(self):
- username, password = self._get_login_info()
- if not username or not password:
- self.raise_login_required(
- 'A valid %s account is needed to access this media.'
- % self._NETRC_MACHINE)
+ def _real_initialize(self):
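+        # _power_guide_hash is only set by a successful login (see _perform_login)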
+ if not self._power_guide_hash:
+ self.raise_login_required('An account is needed to access this media', method='password')
+ def _perform_login(self, username, password):
try:
data = self._download_json(
'%s/zapi/v2/account/login' % self._host_url(), None, 'Logging in',
@@ -51,7 +50,7 @@ class ZattooPlatformBaseIE(InfoExtractor):
self._power_guide_hash = data['session']['power_guide_hash']
- def _real_initialize(self):
+ def _initialize_pre_login(self):
webpage = self._download_webpage(
self._host_url(), None, 'Downloading app token')
app_token = self._html_search_regex(
@@ -71,8 +70,6 @@ class ZattooPlatformBaseIE(InfoExtractor):
'format': 'json',
}))
- self._login()
-
def _extract_cid(self, video_id, channel_name):
channel_groups = self._download_json(
'%s/zapi/v2/cached/channels/%s' % (self._host_url(),
@@ -156,15 +153,9 @@ class ZattooPlatformBaseIE(InfoExtractor):
watch_url = url_or_none(watch.get('url'))
if not watch_url:
continue
- format_id_list = [stream_type]
- maxrate = watch.get('maxrate')
- if maxrate:
- format_id_list.append(compat_str(maxrate))
audio_channel = watch.get('audio_channel')
- if audio_channel:
- format_id_list.append(compat_str(audio_channel))
preference = 1 if audio_channel == 'A' else None
- format_id = '-'.join(format_id_list)
+ format_id = join_nonempty(stream_type, watch.get('maxrate'), audio_channel)
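+                # join_nonempty skips None/empty values, matching the old manual list building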
if stream_type in ('dash', 'dash_widevine', 'dash_playready'):
this_formats = self._extract_mpd_formats(
watch_url, video_id, mpd_id=format_id, fatal=False)
@@ -192,7 +183,7 @@ class ZattooPlatformBaseIE(InfoExtractor):
cid = self._extract_cid(video_id, channel_name)
info_dict = {
'id': channel_name,
- 'title': self._live_title(channel_name),
+ 'title': channel_name,
'is_live': True,
}
else:
diff --git a/hypervideo_dl/extractor/zdf.py b/hypervideo_dl/extractor/zdf.py
index 8c279c5..5f4d266 100644
--- a/hypervideo_dl/extractor/zdf.py
+++ b/hypervideo_dl/extractor/zdf.py
@@ -9,12 +9,13 @@ from ..utils import (
determine_ext,
float_or_none,
int_or_none,
+ join_nonempty,
merge_dicts,
NO_DEFAULT,
orderedSet,
parse_codecs,
qualities,
- str_or_none,
+ traverse_obj,
try_get,
unified_timestamp,
update_url_query,
@@ -70,11 +71,11 @@ class ZDFBaseIE(InfoExtractor):
f = {'vcodec': data[0], 'acodec': data[1]}
f.update({
'url': format_url,
- 'format_id': '-'.join(filter(str_or_none, ('http', meta.get('type'), meta.get('quality')))),
+ 'format_id': join_nonempty('http', meta.get('type'), meta.get('quality')),
})
new_formats = [f]
formats.extend(merge_dicts(f, {
- 'format_note': ', '.join(filter(None, (meta.get('quality'), meta.get('class')))),
+ 'format_note': join_nonempty('quality', 'class', from_dict=meta, delim=', '),
'language': meta.get('language'),
'language_preference': 10 if meta.get('class') == 'main' else -10 if meta.get('class') == 'ad' else -1,
'quality': qualities(self._QUALITIES)(meta.get('quality')),
@@ -147,6 +148,7 @@ class ZDFIE(ZDFBaseIE):
'timestamp': 1613948400,
'upload_date': '20210221',
},
+ 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"',
}, {
# Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html
'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html',
@@ -160,6 +162,20 @@ class ZDFIE(ZDFBaseIE):
'timestamp': 1608604200,
'upload_date': '20201222',
},
+ 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"',
+ }, {
+ 'url': 'https://www.zdf.de/nachrichten/heute-journal/heute-journal-vom-30-12-2021-100.html',
+ 'info_dict': {
+ 'id': '211230_sendung_hjo',
+ 'ext': 'mp4',
+ 'description': 'md5:47dff85977bde9fb8cba9e9c9b929839',
+ 'duration': 1890.0,
+ 'upload_date': '20211230',
+ 'chapters': list,
+ 'thumbnail': 'md5:e65f459f741be5455c952cd820eb188e',
+ 'title': 'heute journal vom 30.12.2021',
+ 'timestamp': 1640897100,
+ }
}, {
'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html',
'info_dict': {
@@ -170,6 +186,20 @@ class ZDFIE(ZDFBaseIE):
'duration': 2615,
'timestamp': 1465021200,
'upload_date': '20160604',
+ 'thumbnail': 'https://www.zdf.de/assets/mauve-im-labor-100~768x432?cb=1464909117806',
+ },
+ }, {
+ 'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html',
+ 'md5': '3d6f1049e9682178a11c54b91f3dd065',
+ 'info_dict': {
+ 'ext': 'mp4',
+ 'id': 'video_funk_1770473',
+ 'duration': 1278,
+ 'description': 'Die Neue an der Schule verdreht Ismail den Kopf.',
+ 'title': 'Alles ist verzaubert',
+ 'timestamp': 1635520560,
+ 'upload_date': '20211029',
+ 'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-100~1920x1080?cb=1636466431799',
},
}, {
# Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche
@@ -192,6 +222,17 @@ class ZDFIE(ZDFBaseIE):
}, {
'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html',
'only_matching': True,
+ }, {
+ 'url': 'https://www.zdf.de/arte/todliche-flucht/page-video-artede-toedliche-flucht-16-100.html',
+ 'info_dict': {
+ 'id': 'video_artede_083871-001-A',
+ 'ext': 'mp4',
+ 'title': 'Tödliche Flucht (1/6)',
+ 'description': 'md5:e34f96a9a5f8abd839ccfcebad3d5315',
+ 'duration': 3193.0,
+ 'timestamp': 1641355200,
+ 'upload_date': '20220105',
+ },
}]
def _extract_entry(self, url, player, content, video_id):
@@ -202,8 +243,9 @@ class ZDFIE(ZDFBaseIE):
ptmd_path = t.get('http://zdf.de/rels/streams/ptmd')
if not ptmd_path:
- ptmd_path = t[
- 'http://zdf.de/rels/streams/ptmd-template'].replace(
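+            # newer responses may nest the template under streams.default;
+            # fall back to the top-level key otherwise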
+ ptmd_path = traverse_obj(
+ t, ('streams', 'default', 'http://zdf.de/rels/streams/ptmd-template'),
+ 'http://zdf.de/rels/streams/ptmd-template').replace(
'{playerId}', 'ngplayer_2_4')
info = self._extract_ptmd(
@@ -229,12 +271,21 @@ class ZDFIE(ZDFBaseIE):
})
thumbnails.append(thumbnail)
+ chapter_marks = t.get('streamAnchorTag') or []
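+        # append the total duration as a sentinel so that zip() yields an
+        # end_time for the final chapter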
+ chapter_marks.append({'anchorOffset': int_or_none(t.get('duration'))})
+ chapters = [{
+ 'start_time': chap.get('anchorOffset'),
+ 'end_time': next_chap.get('anchorOffset'),
+ 'title': chap.get('anchorLabel')
+ } for chap, next_chap in zip(chapter_marks, chapter_marks[1:])]
+
return merge_dicts(info, {
'title': title,
'description': content.get('leadParagraph') or content.get('teasertext'),
'duration': int_or_none(t.get('duration')),
'timestamp': unified_timestamp(content.get('editorialDate')),
'thumbnails': thumbnails,
+ 'chapters': chapters or None
})
def _extract_regular(self, url, player, video_id):
diff --git a/hypervideo_dl/extractor/zee5.py b/hypervideo_dl/extractor/zee5.py
index 5366041..3e3f11b 100644
--- a/hypervideo_dl/extractor/zee5.py
+++ b/hypervideo_dl/extractor/zee5.py
@@ -21,9 +21,9 @@ class Zee5IE(InfoExtractor):
_VALID_URL = r'''(?x)
(?:
zee5:|
- (?:https?://)(?:www\.)?zee5\.com/(?:[^#?]+/)?
+ https?://(?:www\.)?zee5\.com/(?:[^#?]+/)?
(?:
- (?:tvshows|kids|zee5originals)(?:/[^#/?]+){3}
+ (?:tv-shows|kids|web-series|zee5originals)(?:/[^#/?]+){3}
|movies/[^#/?]+
)/(?P<display_id>[^#/?]+)/
)
@@ -37,48 +37,53 @@ class Zee5IE(InfoExtractor):
'display_id': 'krishna-the-birth',
'title': 'Krishna - The Birth',
'duration': 4368,
- 'average_rating': 4,
'description': compat_str,
'alt_title': 'Krishna - The Birth',
'uploader': 'Zee Entertainment Enterprises Ltd',
'release_date': '20060101',
'upload_date': '20060101',
'timestamp': 1136073600,
- 'thumbnail': 'https://akamaividz.zee5.com/resources/0-0-63098/list/270x152/0063098_list_80888170.jpg',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'episode_number': 0,
+ 'episode': 'Episode 0',
'tags': list
},
'params': {
'format': 'bv',
},
}, {
- 'url': 'https://zee5.com/tvshows/details/krishna-balram/0-6-1871/episode-1-the-test-of-bramha/0-1-233402',
+ 'url': 'https://www.zee5.com/kids/kids-shows/bandbudh-aur-budbak/0-6-1899/yoga-se-hoga-bandbudh-aur-budbak/0-1-239839',
'info_dict': {
- 'id': '0-1-233402',
+ 'id': '0-1-239839',
'ext': 'mp4',
- 'display_id': 'episode-1-the-test-of-bramha',
- 'title': 'Episode 1 - The Test Of Bramha',
- 'duration': 1336,
- 'average_rating': 4,
+ 'display_id': 'yoga-se-hoga-bandbudh-aur-budbak',
+ 'title': 'Yoga Se Hoga-Bandbudh aur Budbak',
+ 'duration': 659,
'description': compat_str,
- 'alt_title': 'Episode 1 - The Test Of Bramha',
+ 'alt_title': 'Yoga Se Hoga-Bandbudh aur Budbak',
'uploader': 'Zee Entertainment Enterprises Ltd',
- 'release_date': '20090101',
- 'upload_date': '20090101',
- 'timestamp': 1230768000,
- 'thumbnail': 'https://akamaividz.zee5.com/resources/0-1-233402/list/270x152/01233402_list.jpg',
- 'series': 'Krishna Balram',
+ 'release_date': '20150101',
+ 'upload_date': '20150101',
+ 'timestamp': 1420070400,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'series': 'Bandbudh Aur Budbak',
'season_number': 1,
'episode_number': 1,
+ 'episode': 'Episode 1',
+ 'season': 'Season 1',
'tags': list,
},
'params': {
'format': 'bv',
},
}, {
- 'url': 'https://www.zee5.com/hi/tvshows/details/kundali-bhagya/0-6-366/kundali-bhagya-march-08-2021/0-1-manual_7g9jv1os7730?country=IN',
+ 'url': 'https://www.zee5.com/hi/tv-shows/details/kundali-bhagya/0-6-366/kundali-bhagya-march-08-2021/0-1-manual_7g9jv1os7730?country=IN',
'only_matching': True
}, {
- 'url': 'https://www.zee5.com/global/hi/tvshows/details/kundali-bhagya/0-6-366/kundali-bhagya-march-08-2021/0-1-manual_7g9jv1os7730',
+ 'url': 'https://www.zee5.com/global/hi/tv-shows/details/kundali-bhagya/0-6-366/kundali-bhagya-march-08-2021/0-1-manual_7g9jv1os7730',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.zee5.com/web-series/details/mithya/0-6-4z587408/maine-dekhi-hai-uski-mrityu/0-1-6z587412',
'only_matching': True
}]
_DETAIL_API_URL = 'https://spapi.zee5.com/singlePlayback/getDetails?content_id={}&device_id={}&platform_name=desktop_web&country=IN&check_parental_control=false'
@@ -86,31 +91,29 @@ class Zee5IE(InfoExtractor):
_USER_TOKEN = None
     _LOGIN_HINT = 'Use "--username <mobile_number>" to log in using OTP, or "--username token" and "--password <user_token>" to log in using a user token.'
_NETRC_MACHINE = 'zee5'
+ _GEO_COUNTRIES = ['IN']
- def _login(self):
- username, password = self._get_login_info()
- if username:
- if len(username) == 10 and username.isdigit() and self._USER_TOKEN is None:
- self.report_login()
- otp_request_json = self._download_json('https://b2bapi.zee5.com/device/sendotp_v1.php?phoneno=91{}'.format(username),
- None, note='Sending OTP')
- if otp_request_json['code'] == 0:
- self.to_screen(otp_request_json['message'])
- else:
- raise ExtractorError(otp_request_json['message'], expected=True)
- otp_code = self._get_tfa_info('OTP')
- otp_verify_json = self._download_json('https://b2bapi.zee5.com/device/verifyotp_v1.php?phoneno=91{}&otp={}&guest_token={}&platform=web'.format(username, otp_code, self._DEVICE_ID),
- None, note='Verifying OTP', fatal=False)
- if not otp_verify_json:
- raise ExtractorError('Unable to verify OTP.', expected=True)
- self._USER_TOKEN = otp_verify_json.get('token')
- if not self._USER_TOKEN:
- raise ExtractorError(otp_request_json['message'], expected=True)
- elif username.lower() == 'token' and len(password) > 1198:
- self._USER_TOKEN = password
-
- def _real_initialize(self):
- self._login()
+ def _perform_login(self, username, password):
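+        # A 10-digit numeric username is treated as an Indian mobile number
+        # (the requests below prefix it with 91) and verified via OTP; otherwise
+        # the literal username "token" with the user token as the password is
+        # accepted (see _LOGIN_HINT)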
+ if len(username) == 10 and username.isdigit() and self._USER_TOKEN is None:
+ self.report_login()
+ otp_request_json = self._download_json('https://b2bapi.zee5.com/device/sendotp_v1.php?phoneno=91{}'.format(username),
+ None, note='Sending OTP')
+ if otp_request_json['code'] == 0:
+ self.to_screen(otp_request_json['message'])
+ else:
+ raise ExtractorError(otp_request_json['message'], expected=True)
+ otp_code = self._get_tfa_info('OTP')
+ otp_verify_json = self._download_json('https://b2bapi.zee5.com/device/verifyotp_v1.php?phoneno=91{}&otp={}&guest_token={}&platform=web'.format(username, otp_code, self._DEVICE_ID),
+ None, note='Verifying OTP', fatal=False)
+ if not otp_verify_json:
+ raise ExtractorError('Unable to verify OTP.', expected=True)
+ self._USER_TOKEN = otp_verify_json.get('token')
+ if not self._USER_TOKEN:
+ raise ExtractorError(otp_request_json['message'], expected=True)
+ elif username.lower() == 'token' and len(password) > 1198:
+ self._USER_TOKEN = password
+ else:
+ raise ExtractorError(self._LOGIN_HINT, expected=True)
def _real_extract(self, url):
video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
@@ -153,7 +156,6 @@ class Zee5IE(InfoExtractor):
'formats': formats,
'subtitles': subtitles,
'duration': int_or_none(asset_data.get('duration')),
- 'average_rating': int_or_none(asset_data.get('rating')),
'description': str_or_none(asset_data.get('description')),
'alt_title': str_or_none(asset_data.get('original_title')),
'uploader': str_or_none(asset_data.get('content_owner')),
@@ -174,43 +176,48 @@ class Zee5SeriesIE(InfoExtractor):
_VALID_URL = r'''(?x)
(?:
zee5:series:|
- (?:https?://)(?:www\.)?zee5\.com/(?:[^#?]+/)?
- (?:tvshows|kids|zee5originals)(?:/[^#/?]+){2}/
+ https?://(?:www\.)?zee5\.com/(?:[^#?]+/)?
+ (?:tv-shows|web-series|kids|zee5originals)(?:/[^#/?]+){2}/
)
- (?P<id>[^#/?]+)/?(?:$|[?#])
+ (?P<id>[^#/?]+)(?:/episodes)?/?(?:$|[?#])
'''
_TESTS = [{
- 'url': 'https://www.zee5.com/kids/kids-shows/krishna-balram/0-6-1871',
- 'playlist_mincount': 43,
+ 'url': 'https://www.zee5.com/kids/kids-shows/bandbudh-aur-budbak/0-6-1899',
+ 'playlist_mincount': 156,
'info_dict': {
- 'id': '0-6-1871',
+ 'id': '0-6-1899',
},
}, {
- 'url': 'https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199',
+ 'url': 'https://www.zee5.com/tv-shows/details/bhabi-ji-ghar-par-hai/0-6-199',
'playlist_mincount': 1500,
'info_dict': {
'id': '0-6-199',
},
}, {
- 'url': 'https://www.zee5.com/tvshows/details/agent-raghav-crime-branch/0-6-965',
+ 'url': 'https://www.zee5.com/tv-shows/details/agent-raghav-crime-branch/0-6-965',
'playlist_mincount': 24,
'info_dict': {
'id': '0-6-965',
},
}, {
- 'url': 'https://www.zee5.com/ta/tvshows/details/nagabhairavi/0-6-3201',
+ 'url': 'https://www.zee5.com/ta/tv-shows/details/nagabhairavi/0-6-3201',
'playlist_mincount': 3,
'info_dict': {
'id': '0-6-3201',
},
}, {
- 'url': 'https://www.zee5.com/global/hi/tvshows/details/khwaabon-ki-zamin-par/0-6-270',
+ 'url': 'https://www.zee5.com/global/hi/tv-shows/details/khwaabon-ki-zamin-par/0-6-270',
'playlist_mincount': 150,
'info_dict': {
'id': '0-6-270',
},
- }
- ]
+ }, {
+ 'url': 'https://www.zee5.com/tv-shows/details/chala-hawa-yeu-dya-ladies-zindabaad/0-6-2943/episodes',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.zee5.com/web-series/details/mithya/0-6-4z587408',
+ 'only_matching': True,
+ }]
def _entries(self, show_id):
access_token_request = self._download_json(
diff --git a/hypervideo_dl/extractor/zhihu.py b/hypervideo_dl/extractor/zhihu.py
index d1ed55b..278a943 100644
--- a/hypervideo_dl/extractor/zhihu.py
+++ b/hypervideo_dl/extractor/zhihu.py
@@ -2,7 +2,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import float_or_none, int_or_none
+from ..utils import format_field, float_or_none, int_or_none
class ZhihuIE(InfoExtractor):
@@ -61,7 +61,7 @@ class ZhihuIE(InfoExtractor):
'uploader': author.get('name'),
'timestamp': int_or_none(zvideo.get('published_at')),
'uploader_id': author.get('id'),
- 'uploader_url': 'https://www.zhihu.com/people/' + url_token if url_token else None,
+ 'uploader_url': format_field(url_token, template='https://www.zhihu.com/people/%s'),
'duration': float_or_none(video.get('duration')),
'view_count': int_or_none(zvideo.get('play_count')),
'like_count': int_or_none(zvideo.get('liked_count')),
diff --git a/hypervideo_dl/extractor/zingmp3.py b/hypervideo_dl/extractor/zingmp3.py
index a3edc15..419bf30 100644
--- a/hypervideo_dl/extractor/zingmp3.py
+++ b/hypervideo_dl/extractor/zingmp3.py
@@ -1,22 +1,46 @@
# coding: utf-8
from __future__ import unicode_literals
+import hashlib
+import hmac
+import urllib.parse
+
from .common import InfoExtractor
from ..utils import (
int_or_none,
+ traverse_obj,
)
class ZingMp3BaseIE(InfoExtractor):
- _VALID_URL_TMPL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?:%s)/[^/]+/(?P<id>\w+)\.html'
+ _VALID_URL_TMPL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?P<type>(?:%s))/[^/]+/(?P<id>\w+)(?:\.html|\?)'
_GEO_COUNTRIES = ['VN']
+ _DOMAIN = 'https://zingmp3.vn'
+ _SLUG_API = {
+ 'bai-hat': '/api/v2/page/get/song',
+ 'embed': '/api/v2/page/get/song',
+ 'video-clip': '/api/v2/page/get/video',
+ 'playlist': '/api/v2/page/get/playlist',
+ 'album': '/api/v2/page/get/playlist',
+ 'lyric': '/api/v2/lyric/get/lyric',
+ 'song_streaming': '/api/v2/song/get/streaming',
+ }
+
+ _API_KEY = '88265e23d4284f25963e6eedac8fbfa3'
+ _SECRET_KEY = b'2aa2d1c561e809b267f3638c4a307aab'
+
+ def _extract_item(self, item, song_id, type_url, fatal):
+ item_id = item.get('encodeId') or song_id
+ title = item.get('title') or item.get('alias')
- def _extract_item(self, item, fatal):
- item_id = item['id']
- title = item.get('name') or item['title']
+ if type_url == 'video-clip':
+ source = item.get('streaming')
+ else:
+ api = self.get_api_with_signature(name_api=self._SLUG_API.get('song_streaming'), param={'id': item_id})
+ source = self._download_json(api, video_id=item_id).get('data')
formats = []
- for k, v in (item.get('source') or {}).items():
+ for k, v in (source or {}).items():
if not v:
continue
if k in ('mp4', 'hls'):
@@ -34,31 +58,35 @@ class ZingMp3BaseIE(InfoExtractor):
'height': int_or_none(self._search_regex(
r'^(\d+)p', res, 'resolution', default=None)),
})
- else:
- formats.append({
- 'ext': 'mp3',
- 'format_id': k,
- 'tbr': int_or_none(k),
- 'url': self._proto_relative_url(v),
- 'vcodec': 'none',
- })
+ continue
+ elif v == 'VIP':
+ continue
+ formats.append({
+ 'ext': 'mp3',
+ 'format_id': k,
+ 'tbr': int_or_none(k),
+ 'url': self._proto_relative_url(v),
+ 'vcodec': 'none',
+ })
if not formats:
if not fatal:
return
- msg = item['msg']
+ msg = item.get('msg')
if msg == 'Sorry, this content is not available in your country.':
self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
self.raise_no_formats(msg, expected=True)
self._sort_formats(formats)
- subtitles = None
lyric = item.get('lyric')
- if lyric:
- subtitles = {
- 'origin': [{
- 'url': lyric,
- }],
- }
+ if not lyric:
+            api = self.get_api_with_signature(name_api=self._SLUG_API.get('lyric'), param={'id': item_id})
+ info_lyric = self._download_json(api, video_id=item_id)
+ lyric = traverse_obj(info_lyric, ('data', 'file'))
+ subtitles = {
+ 'origin': [{
+ 'url': lyric,
+ }],
+ } if lyric else None
album = item.get('album') or {}
@@ -66,30 +94,40 @@ class ZingMp3BaseIE(InfoExtractor):
'id': item_id,
'title': title,
'formats': formats,
- 'thumbnail': item.get('thumbnail'),
+ 'thumbnail': traverse_obj(item, 'thumbnail', 'thumbnailM'),
'subtitles': subtitles,
'duration': int_or_none(item.get('duration')),
'track': title,
- 'artist': item.get('artists_names'),
- 'album': album.get('name') or album.get('title'),
- 'album_artist': album.get('artists_names'),
+ 'artist': traverse_obj(item, 'artistsNames', 'artists_names'),
+ 'album': traverse_obj(album, 'name', 'title'),
+ 'album_artist': traverse_obj(album, 'artistsNames', 'artists_names'),
}
+ def _real_initialize(self):
+ if not self.get_param('cookiefile') and not self.get_param('cookiesfrombrowser'):
+ self._request_webpage(self.get_api_with_signature(name_api=self._SLUG_API['bai-hat'], param={'id': ''}),
+ None, note='Updating cookies')
+
def _real_extract(self, url):
- page_id = self._match_id(url)
- webpage = self._download_webpage(
- url.replace('://zingmp3.vn/', '://mp3.zing.vn/'),
- page_id, query={'play_song': 1})
- data_path = self._search_regex(
- r'data-xml="([^"]+)', webpage, 'data path')
- return self._process_data(self._download_json(
- 'https://mp3.zing.vn/xhr' + data_path, page_id)['data'])
+ song_id, type_url = self._match_valid_url(url).group('id', 'type')
+ api = self.get_api_with_signature(name_api=self._SLUG_API[type_url], param={'id': song_id})
+ return self._process_data(self._download_json(api, song_id)['data'], song_id, type_url)
+
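+    # Request signing, as implemented below: the query parameters are serialized
+    # sorted by key (e.g. {'id': 'X', 'ctime': '1'} hashes as 'ctime=1id=X') and
+    # SHA-256 hashed; 'sig' is then the HMAC-SHA512 of the API path concatenated
+    # with that digest, keyed with _SECRET_KEY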
+ def get_api_with_signature(self, name_api, param):
+ param.update({'ctime': '1'})
+ sha256 = hashlib.sha256(''.join(f'{i}={param[i]}' for i in sorted(param)).encode('utf-8')).hexdigest()
+ data = {
+ 'apiKey': self._API_KEY,
+ 'sig': hmac.new(self._SECRET_KEY, f'{name_api}{sha256}'.encode('utf-8'), hashlib.sha512).hexdigest(),
+ **param,
+ }
+ return f'{self._DOMAIN}{name_api}?{urllib.parse.urlencode(data)}'
class ZingMp3IE(ZingMp3BaseIE):
- _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip'
+ _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip|embed'
_TESTS = [{
- 'url': 'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html',
+ 'url': 'https://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html',
'md5': 'ead7ae13693b3205cbc89536a077daed',
'info_dict': {
'id': 'ZWZB9WAB',
@@ -108,8 +146,8 @@ class ZingMp3IE(ZingMp3BaseIE):
'album_artist': 'Bảo Thy',
},
}, {
- 'url': 'https://mp3.zing.vn/video-clip/Suong-Hoa-Dua-Loi-K-ICM-RYO/ZO8ZF7C7.html',
- 'md5': 'e9c972b693aa88301ef981c8151c4343',
+ 'url': 'https://zingmp3.vn/video-clip/Suong-Hoa-Dua-Loi-K-ICM-RYO/ZO8ZF7C7.html',
+ 'md5': 'c7f23d971ac1a4f675456ed13c9b9612',
'info_dict': {
'id': 'ZO8ZF7C7',
'title': 'Sương Hoa Đưa Lối',
@@ -118,16 +156,35 @@ class ZingMp3IE(ZingMp3BaseIE):
'duration': 207,
'track': 'Sương Hoa Đưa Lối',
'artist': 'K-ICM, RYO',
+ 'album': 'Sương Hoa Đưa Lối (Single)',
+ 'album_artist': 'K-ICM, RYO',
},
}, {
+ 'url': 'https://zingmp3.vn/bai-hat/Nguoi-Yeu-Toi-Lanh-Lung-Sat-Da-Mr-Siro/ZZ6IW7OU.html',
+ 'md5': '3e9f7a9bd0d965573dbff8d7c68b629d',
+ 'info_dict': {
+ 'id': 'ZZ6IW7OU',
+ 'title': 'Người Yêu Tôi Lạnh Lùng Sắt Đá',
+ 'ext': 'mp3',
+ 'thumbnail': r're:^https?://.+\.jpg',
+ 'duration': 303,
+ 'track': 'Người Yêu Tôi Lạnh Lùng Sắt Đá',
+ 'artist': 'Mr. Siro',
+ 'album': 'Người Yêu Tôi Lạnh Lùng Sắt Đá (Single)',
+ 'album_artist': 'Mr. Siro',
+ },
+ }, {
+ 'url': 'https://zingmp3.vn/embed/song/ZWZEI76B?start=false',
+ 'only_matching': True,
+ }, {
'url': 'https://zingmp3.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html',
'only_matching': True,
}]
IE_NAME = 'zingmp3'
- IE_DESC = 'mp3.zing.vn'
+ IE_DESC = 'zingmp3.vn'
- def _process_data(self, data):
- return self._extract_item(data, True)
+ def _process_data(self, data, song_id, type_url):
+ return self._extract_item(data, song_id, type_url, True)
class ZingMp3AlbumIE(ZingMp3BaseIE):
@@ -139,7 +196,15 @@ class ZingMp3AlbumIE(ZingMp3BaseIE):
'id': 'ZWZBWDAF',
'title': 'Lâu Đài Tình Ái',
},
- 'playlist_count': 10,
+ 'playlist_count': 9,
+ }, {
+ 'url': 'https://zingmp3.vn/album/Nhung-Bai-Hat-Hay-Nhat-Cua-Mr-Siro-Mr-Siro/ZWZAEZZD.html',
+ 'info_dict': {
+ '_type': 'playlist',
+ 'id': 'ZWZAEZZD',
+ 'title': 'Những Bài Hát Hay Nhất Của Mr. Siro',
+ },
+ 'playlist_count': 49,
}, {
'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html',
'only_matching': True,
@@ -149,12 +214,12 @@ class ZingMp3AlbumIE(ZingMp3BaseIE):
}]
IE_NAME = 'zingmp3:album'
- def _process_data(self, data):
+ def _process_data(self, data, song_id, type_url):
def entries():
- for item in (data.get('items') or []):
- entry = self._extract_item(item, False)
+ for item in traverse_obj(data, ('song', 'items')) or []:
+ entry = self._extract_item(item, song_id, type_url, False)
if entry:
yield entry
- info = data.get('info') or {}
- return self.playlist_result(
- entries(), info.get('id'), info.get('name') or info.get('title'))
+
+ return self.playlist_result(entries(), traverse_obj(data, 'id', 'encodeId'),
+ traverse_obj(data, 'name', 'title'))
diff --git a/hypervideo_dl/extractor/zoom.py b/hypervideo_dl/extractor/zoom.py
index 25a0902..c005488 100644
--- a/hypervideo_dl/extractor/zoom.py
+++ b/hypervideo_dl/extractor/zoom.py
@@ -6,6 +6,7 @@ from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
+ str_or_none,
js_to_json,
parse_filesize,
urlencode_postdata,
@@ -23,7 +24,8 @@ class ZoomIE(InfoExtractor):
'id': 'dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5',
'ext': 'mp4',
'title': 'China\'s "two sessions" and the new five-year plan',
- }
+ },
+ 'skip': 'Recording requires email authentication to access',
}
def _real_extract(self, url):
@@ -56,22 +58,46 @@ class ZoomIE(InfoExtractor):
webpage, 'data'), play_id, js_to_json)
subtitles = {}
- for _type in ('transcript', 'cc'):
+ for _type in ('transcript', 'cc', 'chapter'):
if data.get('%sUrl' % _type):
subtitles[_type] = [{
'url': urljoin(base_url, data['%sUrl' % _type]),
'ext': 'vtt',
}]
+ formats = []
+
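+        # A Zoom recording may expose two separate MP4 streams: the camera feed
+        # and the screen share; the camera stream is preferred (preference 0 > -1)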
+ if data.get('viewMp4Url'):
+ formats.append({
+ 'format_note': 'Camera stream',
+ 'url': str_or_none(data.get('viewMp4Url')),
+ 'width': int_or_none(data.get('viewResolvtionsWidth')),
+ 'height': int_or_none(data.get('viewResolvtionsHeight')),
+ 'format_id': str_or_none(data.get('recordingId')),
+ 'ext': 'mp4',
+ 'filesize_approx': parse_filesize(data.get('fileSize')),
+ 'preference': 0
+ })
+
+ if data.get('shareMp4Url'):
+ formats.append({
+ 'format_note': 'Screen share stream',
+ 'url': str_or_none(data.get('shareMp4Url')),
+ 'width': int_or_none(data.get('shareResolvtionsWidth')),
+ 'height': int_or_none(data.get('shareResolvtionsHeight')),
+ 'format_id': str_or_none(data.get('shareVideoId')),
+ 'ext': 'mp4',
+ 'preference': -1
+ })
+
+ self._sort_formats(formats)
+
return {
'id': play_id,
- 'title': data['topic'],
- 'url': data['viewMp4Url'],
+ 'title': data.get('topic'),
'subtitles': subtitles,
- 'width': int_or_none(data.get('viewResolvtionsWidth')),
- 'height': int_or_none(data.get('viewResolvtionsHeight')),
+ 'formats': formats,
'http_headers': {
'Referer': base_url,
},
- 'filesize_approx': parse_filesize(data.get('fileSize')),
}
diff --git a/hypervideo_dl/jsinterp.py b/hypervideo_dl/jsinterp.py
index 7bda596..46834f8 100644
--- a/hypervideo_dl/jsinterp.py
+++ b/hypervideo_dl/jsinterp.py
@@ -1,5 +1,4 @@
-from __future__ import unicode_literals
-
+from collections.abc import MutableMapping
import json
import operator
import re
@@ -22,10 +21,55 @@ _OPERATORS = [
('*', operator.mul),
]
_ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in _OPERATORS]
-_ASSIGN_OPERATORS.append(('=', lambda cur, right: right))
+_ASSIGN_OPERATORS.append(('=', (lambda cur, right: right)))
_NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*'
+_MATCHING_PARENS = dict(zip('({[', ')}]'))
+
+
+class JS_Break(ExtractorError):
+ def __init__(self):
+ ExtractorError.__init__(self, 'Invalid break')
+
+
+class JS_Continue(ExtractorError):
+ def __init__(self):
+ ExtractorError.__init__(self, 'Invalid continue')
+
+
+class LocalNameSpace(MutableMapping):
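+    # A chained mapping over the interpreter's scope stack: lookups walk the
+    # scopes in order, assignments update the scope that already defines the
+    # name, and previously unseen names are created in the innermost scope
+    # (self.stack[0])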
+ def __init__(self, *stack):
+ self.stack = tuple(stack)
+
+ def __getitem__(self, key):
+ for scope in self.stack:
+ if key in scope:
+ return scope[key]
+ raise KeyError(key)
+
+ def __setitem__(self, key, value):
+ for scope in self.stack:
+ if key in scope:
+ scope[key] = value
+ break
+ else:
+ self.stack[0][key] = value
+ return value
+
+ def __delitem__(self, key):
+ raise NotImplementedError('Deleting is not supported')
+
+ def __iter__(self):
+ for scope in self.stack:
+ yield from scope
+
+    def __len__(self):
+        return len(set(self))
+
+ def __repr__(self):
+ return f'LocalNameSpace{self.stack}'
+
class JSInterpreter(object):
def __init__(self, code, objects=None):
@@ -34,11 +78,56 @@ class JSInterpreter(object):
self.code = code
self._functions = {}
self._objects = objects
+ self.__named_object_counter = 0
+
+ def _named_object(self, namespace, obj):
+ self.__named_object_counter += 1
+ name = f'__hypervideo_dl_jsinterp_obj{self.__named_object_counter}'
+ namespace[name] = obj
+ return name
+
+ @staticmethod
+ def _separate(expr, delim=',', max_split=None):
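+        # Yield the chunks of expr between top-level occurrences of delim;
+        # delimiters nested inside (), {} or [] are skipped by keeping a
+        # depth counter per closing bracket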
+ if not expr:
+ return
+ counters = {k: 0 for k in _MATCHING_PARENS.values()}
+ start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1
+ for idx, char in enumerate(expr):
+ if char in _MATCHING_PARENS:
+ counters[_MATCHING_PARENS[char]] += 1
+ elif char in counters:
+ counters[char] -= 1
+ if char != delim[pos] or any(counters.values()):
+ pos = 0
+ continue
+ elif pos != delim_len:
+ pos += 1
+ continue
+ yield expr[start: idx - delim_len]
+ start, pos = idx + 1, 0
+ splits += 1
+ if max_split and splits >= max_split:
+ break
+ yield expr[start:]
+
+ @staticmethod
+ def _separate_at_paren(expr, delim):
+ separated = list(JSInterpreter._separate(expr, delim, 1))
+ if len(separated) < 2:
+ raise ExtractorError(f'No terminating paren {delim} in {expr}')
+ return separated[0][1:].strip(), separated[1].strip()
def interpret_statement(self, stmt, local_vars, allow_recursion=100):
if allow_recursion < 0:
raise ExtractorError('Recursion limit reached')
+ sub_statements = list(self._separate(stmt, ';'))
+ stmt = (sub_statements or ['']).pop()
+ for sub_stmt in sub_statements:
+ ret, should_abort = self.interpret_statement(sub_stmt, local_vars, allow_recursion - 1)
+ if should_abort:
+ return ret
+
should_abort = False
stmt = stmt.lstrip()
stmt_m = re.match(r'var\s', stmt)
@@ -61,25 +150,122 @@ class JSInterpreter(object):
if expr == '': # Empty expression
return None
+ if expr.startswith('{'):
+ inner, outer = self._separate_at_paren(expr, '}')
+ inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion - 1)
+ if not outer or should_abort:
+ return inner
+ else:
+ expr = json.dumps(inner) + outer
+
if expr.startswith('('):
- parens_count = 0
- for m in re.finditer(r'[()]', expr):
- if m.group(0) == '(':
- parens_count += 1
+ inner, outer = self._separate_at_paren(expr, ')')
+ inner = self.interpret_expression(inner, local_vars, allow_recursion)
+ if not outer:
+ return inner
+ else:
+ expr = json.dumps(inner) + outer
+
+ if expr.startswith('['):
+ inner, outer = self._separate_at_paren(expr, ']')
+ name = self._named_object(local_vars, [
+ self.interpret_expression(item, local_vars, allow_recursion)
+ for item in self._separate(inner)])
+ expr = name + outer
+
+ m = re.match(r'try\s*', expr)
+ if m:
+ if expr[m.end()] == '{':
+ try_expr, expr = self._separate_at_paren(expr[m.end():], '}')
+ else:
+ try_expr, expr = expr[m.end() - 1:], ''
+ ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion - 1)
+ if should_abort:
+ return ret
+ return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0]
+
+ m = re.match(r'catch\s*\(', expr)
+ if m:
+ # We ignore the catch block
+ _, expr = self._separate_at_paren(expr, '}')
+ return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0]
+
+ m = re.match(r'for\s*\(', expr)
+ if m:
+ constructor, remaining = self._separate_at_paren(expr[m.end() - 1:], ')')
+ if remaining.startswith('{'):
+ body, expr = self._separate_at_paren(remaining, '}')
+ else:
+ m = re.match(r'switch\s*\(', remaining) # FIXME
+ if m:
+ switch_val, remaining = self._separate_at_paren(remaining[m.end() - 1:], ')')
+ body, expr = self._separate_at_paren(remaining, '}')
+ body = 'switch(%s){%s}' % (switch_val, body)
else:
- parens_count -= 1
- if parens_count == 0:
- sub_expr = expr[1:m.start()]
- sub_result = self.interpret_expression(
- sub_expr, local_vars, allow_recursion)
- remaining_expr = expr[m.end():].strip()
- if not remaining_expr:
- return sub_result
- else:
- expr = json.dumps(sub_result) + remaining_expr
+ body, expr = remaining, ''
+ start, cndn, increment = self._separate(constructor, ';')
+ if self.interpret_statement(start, local_vars, allow_recursion - 1)[1]:
+ raise ExtractorError(
+ f'Premature return in the initialization of a for loop in {constructor!r}')
+ while True:
+ if not self.interpret_expression(cndn, local_vars, allow_recursion):
+ break
+ try:
+ ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion - 1)
+ if should_abort:
+ return ret
+ except JS_Break:
+ break
+ except JS_Continue:
+ pass
+ if self.interpret_statement(increment, local_vars, allow_recursion - 1)[1]:
+ raise ExtractorError(
+                        f'Premature return in the increment of a for loop in {constructor!r}')
+ return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0]
+
+ m = re.match(r'switch\s*\(', expr)
+ if m:
+ switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:], ')')
+ switch_val = self.interpret_expression(switch_val, local_vars, allow_recursion)
+ body, expr = self._separate_at_paren(remaining, '}')
+ items = body.replace('default:', 'case default:').split('case ')[1:]
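+            # Two passes over the case list: first try to match a literal case,
+            # then fall back to 'default'; `matched` stays set to emulate
+            # fall-through until a JS break is raised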
+ for default in (False, True):
+ matched = False
+ for item in items:
+ case, stmt = [i.strip() for i in self._separate(item, ':', 1)]
+ if default:
+ matched = matched or case == 'default'
+ elif not matched:
+ matched = case != 'default' and switch_val == self.interpret_expression(case, local_vars, allow_recursion)
+ if not matched:
+ continue
+ try:
+ ret, should_abort = self.interpret_statement(stmt, local_vars, allow_recursion - 1)
+ if should_abort:
+ return ret
+ except JS_Break:
break
- else:
- raise ExtractorError('Premature end of parens in %r' % expr)
+ if matched:
+ break
+ return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0]
+
+ # Comma separated statements
+ sub_expressions = list(self._separate(expr))
+ expr = sub_expressions.pop().strip() if sub_expressions else ''
+ for sub_expr in sub_expressions:
+ self.interpret_expression(sub_expr, local_vars, allow_recursion)
+
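+        # Rewrite ++/-- in place: the operator is applied to the variable and
+        # the matched span is substituted with the old value (post-increment)
+        # or the new one (pre-increment)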
+ for m in re.finditer(rf'''(?x)
+ (?P<pre_sign>\+\+|--)(?P<var1>{_NAME_RE})|
+ (?P<var2>{_NAME_RE})(?P<post_sign>\+\+|--)''', expr):
+ var = m.group('var1') or m.group('var2')
+ start, end = m.span()
+ sign = m.group('pre_sign') or m.group('post_sign')
+ ret = local_vars[var]
+ local_vars[var] += 1 if sign[0] == '+' else -1
+ if m.group('pre_sign'):
+ ret = local_vars[var]
+ expr = expr[:start] + json.dumps(ret) + expr[end:]
for op, opfunc in _ASSIGN_OPERATORS:
m = re.match(r'''(?x)
@@ -88,14 +274,13 @@ class JSInterpreter(object):
(?P<expr>.*)$''' % (_NAME_RE, re.escape(op)), expr)
if not m:
continue
- right_val = self.interpret_expression(
- m.group('expr'), local_vars, allow_recursion - 1)
+ right_val = self.interpret_expression(m.group('expr'), local_vars, allow_recursion)
if m.groupdict().get('index'):
lvar = local_vars[m.group('out')]
- idx = self.interpret_expression(
- m.group('index'), local_vars, allow_recursion)
- assert isinstance(idx, int)
+ idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion)
+ if not isinstance(idx, int):
+ raise ExtractorError(f'List indices must be integers: {idx}')
cur = lvar[idx]
val = opfunc(cur, right_val)
lvar[idx] = val
@@ -109,8 +294,13 @@ class JSInterpreter(object):
if expr.isdigit():
return int(expr)
+ if expr == 'break':
+ raise JS_Break()
+ elif expr == 'continue':
+ raise JS_Continue()
+
var_m = re.match(
- r'(?!if|return|true|false)(?P<name>%s)$' % _NAME_RE,
+ r'(?!if|return|true|false|null)(?P<name>%s)$' % _NAME_RE,
expr)
if var_m:
return local_vars[var_m.group('name')]
@@ -124,91 +314,154 @@ class JSInterpreter(object):
r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr)
if m:
val = local_vars[m.group('in')]
- idx = self.interpret_expression(
- m.group('idx'), local_vars, allow_recursion - 1)
+ idx = self.interpret_expression(m.group('idx'), local_vars, allow_recursion)
return val[idx]
+ for op, opfunc in _OPERATORS:
+ separated = list(self._separate(expr, op))
+ if len(separated) < 2:
+ continue
+ right_val = separated.pop()
+ left_val = op.join(separated)
+ left_val, should_abort = self.interpret_statement(
+ left_val, local_vars, allow_recursion - 1)
+ if should_abort:
+ raise ExtractorError(f'Premature left-side return of {op} in {expr!r}')
+ right_val, should_abort = self.interpret_statement(
+ right_val, local_vars, allow_recursion - 1)
+ if should_abort:
+ raise ExtractorError(f'Premature right-side return of {op} in {expr!r}')
+ return opfunc(left_val or 0, right_val)
+
m = re.match(
- r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE,
+ r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*' % _NAME_RE,
expr)
if m:
variable = m.group('var')
member = remove_quotes(m.group('member') or m.group('member2'))
- arg_str = m.group('args')
-
- if variable in local_vars:
- obj = local_vars[variable]
- else:
- if variable not in self._objects:
- self._objects[variable] = self.extract_object(variable)
- obj = self._objects[variable]
-
- if arg_str is None:
- # Member access
- if member == 'length':
- return len(obj)
- return obj[member]
-
- assert expr.endswith(')')
- # Function call
- if arg_str == '':
- argvals = tuple()
+ arg_str = expr[m.end():]
+ if arg_str.startswith('('):
+ arg_str, remaining = self._separate_at_paren(arg_str, ')')
else:
- argvals = tuple([
+ arg_str, remaining = None, arg_str
+
+ def assertion(cndn, msg):
+ """ assert, but without risk of getting optimized out """
+ if not cndn:
+ raise ExtractorError(f'{member} {msg}: {expr}')
+
+ def eval_method():
+ nonlocal member
+ if variable == 'String':
+ obj = str
+ elif variable in local_vars:
+ obj = local_vars[variable]
+ else:
+ if variable not in self._objects:
+ self._objects[variable] = self.extract_object(variable)
+ obj = self._objects[variable]
+
+ if arg_str is None:
+ # Member access
+ if member == 'length':
+ return len(obj)
+ return obj[member]
+
+ # Function call
+ argvals = [
self.interpret_expression(v, local_vars, allow_recursion)
- for v in arg_str.split(',')])
-
- if member == 'split':
- assert argvals == ('',)
- return list(obj)
- if member == 'join':
- assert len(argvals) == 1
- return argvals[0].join(obj)
- if member == 'reverse':
- assert len(argvals) == 0
- obj.reverse()
- return obj
- if member == 'slice':
- assert len(argvals) == 1
- return obj[argvals[0]:]
- if member == 'splice':
- assert isinstance(obj, list)
- index, howMany = argvals
- res = []
- for i in range(index, min(index + howMany, len(obj))):
- res.append(obj.pop(index))
- return res
-
- return obj[member](argvals)
-
- for op, opfunc in _OPERATORS:
- m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr)
- if not m:
- continue
- x, abort = self.interpret_statement(
- m.group('x'), local_vars, allow_recursion - 1)
- if abort:
- raise ExtractorError(
- 'Premature left-side return of %s in %r' % (op, expr))
- y, abort = self.interpret_statement(
- m.group('y'), local_vars, allow_recursion - 1)
- if abort:
- raise ExtractorError(
- 'Premature right-side return of %s in %r' % (op, expr))
- return opfunc(x, y)
+ for v in self._separate(arg_str)]
+
+ if obj == str:
+ if member == 'fromCharCode':
+ assertion(argvals, 'takes one or more arguments')
+ return ''.join(map(chr, argvals))
+ raise ExtractorError(f'Unsupported string method {member}')
+
+ if member == 'split':
+ assertion(argvals, 'takes one or more arguments')
+ assertion(argvals == [''], 'with arguments is not implemented')
+ return list(obj)
+ elif member == 'join':
+ assertion(isinstance(obj, list), 'must be applied on a list')
+ assertion(len(argvals) == 1, 'takes exactly one argument')
+ return argvals[0].join(obj)
+ elif member == 'reverse':
+ assertion(not argvals, 'does not take any arguments')
+ obj.reverse()
+ return obj
+ elif member == 'slice':
+ assertion(isinstance(obj, list), 'must be applied on a list')
+ assertion(len(argvals) == 1, 'takes exactly one argument')
+ return obj[argvals[0]:]
+ elif member == 'splice':
+ assertion(isinstance(obj, list), 'must be applied on a list')
+ assertion(argvals, 'takes one or more arguments')
+ index, howMany = map(int, (argvals + [len(obj)])[:2])
+ if index < 0:
+ index += len(obj)
+ add_items = argvals[2:]
+ res = []
+ for i in range(index, min(index + howMany, len(obj))):
+ res.append(obj.pop(index))
+ for i, item in enumerate(add_items):
+ obj.insert(index + i, item)
+ return res
+ elif member == 'unshift':
+ assertion(isinstance(obj, list), 'must be applied on a list')
+ assertion(argvals, 'takes one or more arguments')
+ for item in reversed(argvals):
+ obj.insert(0, item)
+ return obj
+ elif member == 'pop':
+ assertion(isinstance(obj, list), 'must be applied on a list')
+ assertion(not argvals, 'does not take any arguments')
+ if not obj:
+ return
+ return obj.pop()
+ elif member == 'push':
+ assertion(argvals, 'takes one or more arguments')
+ obj.extend(argvals)
+ return obj
+ elif member == 'forEach':
+ assertion(argvals, 'takes one or more arguments')
+                assertion(len(argvals) <= 2, 'takes at most 2 arguments')
+ f, this = (argvals + [''])[:2]
+ return [f((item, idx, obj), this=this) for idx, item in enumerate(obj)]
+ elif member == 'indexOf':
+ assertion(argvals, 'takes one or more arguments')
+                assertion(len(argvals) <= 2, 'takes at most 2 arguments')
+ idx, start = (argvals + [0])[:2]
+ try:
+ return obj.index(idx, start)
+ except ValueError:
+ return -1
+
+ if isinstance(obj, list):
+ member = int(member)
+ return obj[member](argvals)
+
+ if remaining:
+ return self.interpret_expression(
+ self._named_object(local_vars, eval_method()) + remaining,
+ local_vars, allow_recursion)
+ else:
+ return eval_method()
- m = re.match(
- r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr)
+ m = re.match(r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr)
if m:
fname = m.group('func')
argvals = tuple([
int(v) if v.isdigit() else local_vars[v]
- for v in m.group('args').split(',')]) if len(m.group('args')) > 0 else tuple()
- if fname not in self._functions:
+ for v in self._separate(m.group('args'))])
+ if fname in local_vars:
+ return local_vars[fname](argvals)
+ elif fname not in self._functions:
self._functions[fname] = self.extract_function(fname)
return self._functions[fname](argvals)
- raise ExtractorError('Unsupported JS expression %r' % expr)
+ if expr:
+ raise ExtractorError('Unsupported JS expression %r' % expr)
def extract_object(self, objname):
_FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')'''
@@ -233,30 +486,55 @@ class JSInterpreter(object):
return obj
- def extract_function(self, funcname):
+ def extract_function_code(self, funcname):
+ """ @returns argnames, code """
func_m = re.search(
r'''(?x)
(?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s*
\((?P<args>[^)]*)\)\s*
- \{(?P<code>[^}]+)\}''' % (
+ (?P<code>\{(?:(?!};)[^"]|"([^"]|\\")*")+\})''' % (
re.escape(funcname), re.escape(funcname), re.escape(funcname)),
self.code)
         if func_m is None:
             raise ExtractorError('Could not find JS function %r' % funcname)
+        code, _ = self._separate_at_paren(func_m.group('code'), '}')  # refine the match
- argnames = func_m.group('args').split(',')
+ return func_m.group('args').split(','), code
- return self.build_function(argnames, func_m.group('code'))
+ def extract_function(self, funcname):
+ return self.extract_function_from_code(*self.extract_function_code(funcname))
+
+ def extract_function_from_code(self, argnames, code, *global_stack):
+ local_vars = {}
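+        # Hoist nested function literals: each inline `function(...){...}` is
+        # compiled recursively and replaced in the code by a generated object
+        # name bound in local_vars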
+ while True:
+ mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code)
+ if mobj is None:
+ break
+ start, body_start = mobj.span()
+ body, remaining = self._separate_at_paren(code[body_start - 1:], '}')
+ name = self._named_object(
+ local_vars,
+ self.extract_function_from_code(
+ [str.strip(x) for x in mobj.group('args').split(',')],
+ body, local_vars, *global_stack))
+ code = code[:start] + name + remaining
+ return self.build_function(argnames, code, local_vars, *global_stack)
def call_function(self, funcname, *args):
- f = self.extract_function(funcname)
- return f(args)
-
- def build_function(self, argnames, code):
- def resf(args):
- local_vars = dict(zip(argnames, args))
- for stmt in code.split(';'):
- res, abort = self.interpret_statement(stmt, local_vars)
- if abort:
+ return self.extract_function(funcname)(args)
+
+ def build_function(self, argnames, code, *global_stack):
+ global_stack = list(global_stack) or [{}]
+ local_vars = global_stack.pop(0)
+
+ def resf(args, **kwargs):
+ local_vars.update({
+ **dict(zip(argnames, args)),
+ **kwargs
+ })
+ var_stack = LocalNameSpace(local_vars, *global_stack)
+ for stmt in self._separate(code.replace('\n', ''), ';'):
+ ret, should_abort = self.interpret_statement(stmt, var_stack)
+ if should_abort:
break
- return res
+ return ret
return resf
diff --git a/hypervideo_dl/minicurses.py b/hypervideo_dl/minicurses.py
index a6e159a..f9f99e3 100644
--- a/hypervideo_dl/minicurses.py
+++ b/hypervideo_dl/minicurses.py
@@ -1,12 +1,84 @@
import functools
from threading import Lock
-from .utils import supports_terminal_sequences, TERMINAL_SEQUENCES, write_string
+from .utils import supports_terminal_sequences, write_string
+
+
+CONTROL_SEQUENCES = {
+ 'DOWN': '\n',
+ 'UP': '\033[A',
+ 'ERASE_LINE': '\033[K',
+ 'RESET': '\033[0m',
+}
+
+
+_COLORS = {
+ 'BLACK': '0',
+ 'RED': '1',
+ 'GREEN': '2',
+ 'YELLOW': '3',
+ 'BLUE': '4',
+ 'PURPLE': '5',
+ 'CYAN': '6',
+ 'WHITE': '7',
+}
+
+
+_TEXT_STYLES = {
+ 'NORMAL': '0',
+ 'BOLD': '1',
+ 'UNDERLINED': '4',
+}
+
+
+def format_text(text, f):
+ '''
+ @param f String representation of formatting to apply in the form:
+ [style] [light] font_color [on [light] bg_color]
+ Eg: "red", "bold green on light blue"
+ '''
+ f = f.upper()
+ tokens = f.strip().split()
+
+ bg_color = ''
+ if 'ON' in tokens:
+ if tokens[-1] == 'ON':
+ raise SyntaxError(f'Empty background format specified in {f!r}')
+ if tokens[-1] not in _COLORS:
+ raise SyntaxError(f'{tokens[-1]} in {f!r} must be a color')
+ bg_color = f'4{_COLORS[tokens.pop()]}'
+ if tokens[-1] == 'LIGHT':
+ bg_color = f'0;10{bg_color[1:]}'
+ tokens.pop()
+ if tokens[-1] != 'ON':
+ raise SyntaxError(f'Invalid format {f.split(" ON ", 1)[1]!r} in {f!r}')
+ bg_color = f'\033[{bg_color}m'
+ tokens.pop()
+
+ if not tokens:
+ fg_color = ''
+ elif tokens[-1] not in _COLORS:
+ raise SyntaxError(f'{tokens[-1]} in {f!r} must be a color')
+ else:
+ fg_color = f'3{_COLORS[tokens.pop()]}'
+ if tokens and tokens[-1] == 'LIGHT':
+ fg_color = f'9{fg_color[1:]}'
+ tokens.pop()
+ fg_style = tokens.pop() if tokens and tokens[-1] in _TEXT_STYLES else 'NORMAL'
+ fg_color = f'\033[{_TEXT_STYLES[fg_style]};{fg_color}m'
+ if tokens:
+ raise SyntaxError(f'Invalid format {" ".join(tokens)!r} in {f!r}')
+
+ if fg_color or bg_color:
+ return f'{fg_color}{bg_color}{text}{CONTROL_SEQUENCES["RESET"]}'
+ else:
+ return text
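+
+
+# For example, under the rules above, format_text('error', 'bold red') returns
+# '\033[1;31merror\033[0m' and format_text('ok', 'green on light blue') returns
+# '\033[0;32m\033[0;104mok\033[0m'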
class MultilinePrinterBase:
def __init__(self, stream=None, lines=1):
self.stream = stream
self.maximum = lines - 1
+ self._HAVE_FULLCAP = supports_terminal_sequences(stream)
def __enter__(self):
return self
@@ -53,7 +125,6 @@ class MultilinePrinter(MultilinePrinterBase):
self.preserve_output = preserve_output
self._lastline = self._lastlength = 0
self._movelock = Lock()
- self._HAVE_FULLCAP = supports_terminal_sequences(self.stream)
def lock(func):
@functools.wraps(func)
@@ -67,15 +138,16 @@ class MultilinePrinter(MultilinePrinterBase):
yield '\r'
distance = dest - current
if distance < 0:
- yield TERMINAL_SEQUENCES['UP'] * -distance
+ yield CONTROL_SEQUENCES['UP'] * -distance
elif distance > 0:
- yield TERMINAL_SEQUENCES['DOWN'] * distance
+ yield CONTROL_SEQUENCES['DOWN'] * distance
self._lastline = dest
@lock
def print_at_line(self, text, pos):
if self._HAVE_FULLCAP:
- self.write(*self._move_cursor(pos), TERMINAL_SEQUENCES['ERASE_LINE'], text)
+ self.write(*self._move_cursor(pos), CONTROL_SEQUENCES['ERASE_LINE'], text)
+ return
text = self._add_line_number(text, pos)
textlen = len(text)
@@ -103,7 +175,7 @@ class MultilinePrinter(MultilinePrinterBase):
if self._HAVE_FULLCAP:
self.write(
- *text, TERMINAL_SEQUENCES['ERASE_LINE'],
- f'{TERMINAL_SEQUENCES["UP"]}{TERMINAL_SEQUENCES["ERASE_LINE"]}' * self.maximum)
+ *text, CONTROL_SEQUENCES['ERASE_LINE'],
+ f'{CONTROL_SEQUENCES["UP"]}{CONTROL_SEQUENCES["ERASE_LINE"]}' * self.maximum)
else:
self.write(*text, ' ' * self._lastlength)
diff --git a/hypervideo_dl/options.py b/hypervideo_dl/options.py
index 578fb86..b91193a 100644
--- a/hypervideo_dl/options.py
+++ b/hypervideo_dl/options.py
@@ -13,14 +13,15 @@ from .compat import (
compat_shlex_split,
)
from .utils import (
+ Config,
expand_path,
get_executable_path,
OUTTMPL_TYPES,
- preferredencoding,
+ POSTPROCESS_WHEN,
remove_end,
write_string,
)
-from .cookies import SUPPORTED_BROWSERS
+from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS
from .version import __version__
from .downloader.external import list_external_downloaders
@@ -34,39 +35,16 @@ from .postprocessor import (
from .postprocessor.modify_chapters import DEFAULT_SPONSORBLOCK_CHAPTER_TITLE
-def _hide_login_info(opts):
- PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
- eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
-
- def _scrub_eq(o):
- m = eqre.match(o)
- if m:
- return m.group('key') + '=PRIVATE'
- else:
- return o
+def parseOpts(overrideArguments=None, ignore_config_files='if_override'):
+ parser = create_parser()
+ root = Config(parser)
- opts = list(map(_scrub_eq, opts))
- for idx, opt in enumerate(opts):
- if opt in PRIVATE_OPTS and idx + 1 < len(opts):
- opts[idx + 1] = 'PRIVATE'
- return opts
-
-
-def parseOpts(overrideArguments=None):
- def _readOptions(filename_bytes, default=[]):
- try:
- optionf = open(filename_bytes)
- except IOError:
- return default # silently skip if file is not present
- try:
- # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
- contents = optionf.read()
- if sys.version_info < (3,):
- contents = contents.decode(preferredencoding())
- res = compat_shlex_split(contents, comments=True)
- finally:
- optionf.close()
- return res
+ if ignore_config_files == 'if_override':
+ ignore_config_files = overrideArguments is not None
+ if overrideArguments:
+ root.append_config(overrideArguments, label='Override')
+ else:
+ root.append_config(sys.argv[1:], label='Command-line')
def _readUserConf(package_name, default=[]):
# .config
@@ -74,7 +52,7 @@ def parseOpts(overrideArguments=None):
userConfFile = os.path.join(xdg_config_home, package_name, 'config')
if not os.path.isfile(userConfFile):
userConfFile = os.path.join(xdg_config_home, '%s.conf' % package_name)
- userConf = _readOptions(userConfFile, default=None)
+ userConf = Config.read_file(userConfFile, default=None)
if userConf is not None:
return userConf, userConfFile
@@ -82,24 +60,77 @@ def parseOpts(overrideArguments=None):
appdata_dir = compat_getenv('appdata')
if appdata_dir:
userConfFile = os.path.join(appdata_dir, package_name, 'config')
- userConf = _readOptions(userConfFile, default=None)
+ userConf = Config.read_file(userConfFile, default=None)
if userConf is None:
userConfFile += '.txt'
- userConf = _readOptions(userConfFile, default=None)
+ userConf = Config.read_file(userConfFile, default=None)
if userConf is not None:
return userConf, userConfFile
# home
userConfFile = os.path.join(compat_expanduser('~'), '%s.conf' % package_name)
- userConf = _readOptions(userConfFile, default=None)
+ userConf = Config.read_file(userConfFile, default=None)
if userConf is None:
userConfFile += '.txt'
- userConf = _readOptions(userConfFile, default=None)
+ userConf = Config.read_file(userConfFile, default=None)
if userConf is not None:
return userConf, userConfFile
return default, None
+ def add_config(label, path, user=False):
+ """ Adds config and returns whether to continue """
+ if root.parse_args()[0].ignoreconfig:
+ return False
+ # Multiple package names can be given here
+ # Eg: ('hypervideo', 'youtube-dlc', 'youtube-dl') will look for
+ # the configuration file of any of these three packages
+ for package in ('hypervideo',):
+ if user:
+ args, current_path = _readUserConf(package, default=None)
+ else:
+ current_path = os.path.join(path, '%s.conf' % package)
+ args = Config.read_file(current_path, default=None)
+ if args is not None:
+ root.append_config(args, current_path, label=label)
+ return True
+ return True
+
+ def load_configs():
+ yield not ignore_config_files
+ yield add_config('Portable', get_executable_path())
+ yield add_config('Home', expand_path(root.parse_args()[0].paths.get('home', '')).strip())
+ yield add_config('User', None, user=True)
+ yield add_config('System', '/etc')
+
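+    # all() consumes the generator lazily, so config loading stops at the first
+    # level that returns False (i.e. once --ignore-config has been seen)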
+ if all(load_configs()):
+ # If ignoreconfig is found inside the system configuration file,
+ # the user configuration is removed
+ if root.parse_args()[0].ignoreconfig:
+ user_conf = next((i for i, conf in enumerate(root.configs) if conf.label == 'User'), None)
+ if user_conf is not None:
+ root.configs.pop(user_conf)
+
+ opts, args = root.parse_args()
+ if opts.verbose:
+ write_string(f'\n{root}'.replace('\n| ', '\n[debug] ')[1:] + '\n')
+ return parser, opts, args
+
+
+class _YoutubeDLOptionParser(optparse.OptionParser):
+ # optparse is deprecated since python 3.2. So assume a stable interface even for private methods
+
+ def _match_long_opt(self, opt):
+ """Improve ambigious argument resolution by comparing option objects instead of argument strings"""
+ try:
+ return super()._match_long_opt(opt)
+ except optparse.AmbiguousOptionError as e:
+ if len(set(self._long_opt[p] for p in e.possibilities)) == 1:
+ return e.possibilities[0]
+ raise
+
+
+def create_parser():
def _format_option_string(option):
''' ('-o', '--option') -> -o, --format METAVAR'''
@@ -119,7 +150,7 @@ def parseOpts(overrideArguments=None):
def _list_from_options_callback(option, opt_str, value, parser, append=True, delim=',', process=str.strip):
# append can be True, False or -1 (prepend)
- current = getattr(parser.values, option.dest) if append else []
+ current = list(getattr(parser.values, option.dest)) if append else []
value = list(filter(None, [process(value)] if delim is None else map(process, value.split(delim))))
setattr(
parser.values, option.dest,
@@ -128,10 +159,12 @@ def parseOpts(overrideArguments=None):
def _set_from_options_callback(
option, opt_str, value, parser, delim=',', allowed_values=None, aliases={},
process=lambda x: x.lower().strip()):
- current = getattr(parser.values, option.dest)
+ current = set(getattr(parser.values, option.dest))
values = [process(value)] if delim is None else list(map(process, value.split(delim)[::-1]))
while values:
actual_val = val = values.pop()
+ if not val:
+ raise optparse.OptionValueError(f'Invalid {option.metavar} for {opt_str}: {value}')
if val == 'all':
current.update(allowed_values)
elif val == '-all':
@@ -151,27 +184,33 @@ def parseOpts(overrideArguments=None):
def _dict_from_options_callback(
option, opt_str, value, parser,
- allowed_keys=r'[\w-]+', delimiter=':', default_key=None, process=None, multiple_keys=True):
+ allowed_keys=r'[\w-]+', delimiter=':', default_key=None, process=None, multiple_keys=True,
+ process_key=str.lower, append=False):
- out_dict = getattr(parser.values, option.dest)
+ out_dict = dict(getattr(parser.values, option.dest))
+ multiple_args = not isinstance(value, str)
if multiple_keys:
allowed_keys = r'(%s)(,(%s))*' % (allowed_keys, allowed_keys)
- mobj = re.match(r'(?i)(?P<keys>%s)%s(?P<val>.*)$' % (allowed_keys, delimiter), value)
+ mobj = re.match(
+ r'(?i)(?P<keys>%s)%s(?P<val>.*)$' % (allowed_keys, delimiter),
+ value[0] if multiple_args else value)
if mobj is not None:
- keys = [k.strip() for k in mobj.group('keys').lower().split(',')]
- val = mobj.group('val')
+ keys, val = mobj.group('keys').split(','), mobj.group('val')
+ if multiple_args:
+ val = [val, *value[1:]]
elif default_key is not None:
keys, val = [default_key], value
else:
raise optparse.OptionValueError(
'wrong %s formatting; it should be %s, not "%s"' % (opt_str, option.metavar, value))
try:
+ keys = map(process_key, keys) if process_key else keys
val = process(val) if process else val
except Exception as err:
- raise optparse.OptionValueError(
- 'wrong %s formatting; %s' % (opt_str, err))
+ raise optparse.OptionValueError(f'wrong {opt_str} formatting; {err}')
for key in keys:
- out_dict[key] = val
+ out_dict[key] = out_dict.get(key, []) + [val] if append else val
+ setattr(parser.values, option.dest, out_dict)
# No need to wrap help messages if we're on a wide console
columns = compat_get_terminal_size().columns
@@ -191,7 +230,7 @@ def parseOpts(overrideArguments=None):
'conflict_handler': 'resolve',
}
- parser = optparse.OptionParser(**compat_kwargs(kw))
+ parser = _YoutubeDLOptionParser(**compat_kwargs(kw))
general = optparse.OptionGroup(parser, 'General Options')
general.add_option(
@@ -205,7 +244,7 @@ def parseOpts(overrideArguments=None):
general.add_option(
'-i', '--ignore-errors',
action='store_true', dest='ignoreerrors',
- help='Ignore download and postprocessing errors. The download will be considered successfull even if the postprocessing fails')
+ help='Ignore download and postprocessing errors. The download will be considered successful even if the postprocessing fails')
general.add_option(
'--no-abort-on-error',
action='store_const', dest='ignoreerrors', const='only_download',
@@ -238,14 +277,21 @@ def parseOpts(overrideArguments=None):
'--ignore-config', '--no-config',
action='store_true', dest='ignoreconfig',
help=(
- 'Disable loading any configuration files except the one provided by --config-location. '
- 'When given inside a configuration file, no further configuration files are loaded. '
- 'Additionally, (for backward compatibility) if this option is found inside the '
- 'system configuration file, the user configuration is not loaded'))
+ 'Don\'t load any more configuration files except those given by --config-locations. '
+ 'For backward compatibility, if this option is found inside the system configuration file, the user configuration is not loaded. '
+ '(Alias: --no-config)'))
general.add_option(
- '--config-location',
- dest='config_location', metavar='PATH',
- help='Location of the main configuration file; either the path to the config or its containing directory')
+ '--no-config-locations',
+ action='store_const', dest='config_locations', const=[],
+ help=(
+ 'Do not load any custom configuration files (default). When given inside a '
+ 'configuration file, ignore all previous --config-locations defined in the current file'))
+ general.add_option(
+ '--config-locations',
+ dest='config_locations', metavar='PATH', action='append',
+ help=(
+ 'Location of the main configuration file; either the path to the config or its containing directory. '
+ 'Can be used multiple times and inside other configuration files'))
general.add_option(
'--flat-playlist',
action='store_const', dest='extract_flat', const='in_playlist', default=False,
@@ -255,9 +301,27 @@ def parseOpts(overrideArguments=None):
action='store_false', dest='extract_flat',
help='Extract the videos of a playlist')
general.add_option(
+ '--live-from-start',
+ action='store_true', dest='live_from_start',
+ help='Download livestreams from the start. Currently only supported for YouTube (Experimental)')
+ general.add_option(
+ '--no-live-from-start',
+ action='store_false', dest='live_from_start',
+ help='Download livestreams from the current time (default)')
+ general.add_option(
+ '--wait-for-video',
+ dest='wait_for_video', metavar='MIN[-MAX]', default=None,
+ help=(
+ 'Wait for scheduled streams to become available. '
+ 'Pass the minimum number of seconds (or range) to wait between retries'))
+ general.add_option(
+ '--no-wait-for-video',
+ dest='wait_for_video', action='store_const', const=None,
+ help='Do not wait for scheduled streams (default)')
+ general.add_option(
'--mark-watched',
action='store_true', dest='mark_watched', default=False,
- help='Mark videos watched (even with --simulate). Currently only supported for YouTube')
+ help='Mark videos watched (even with --simulate)')
general.add_option(
'--no-mark-watched',
action='store_false', dest='mark_watched',
@@ -272,10 +336,10 @@ def parseOpts(overrideArguments=None):
action='callback', callback=_set_from_options_callback,
callback_kwargs={
'allowed_values': {
- 'filename', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles',
+ 'filename', 'filename-sanitization', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles',
'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge',
- 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-attach-info-json',
- 'embed-thumbnail-atomicparsley', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs',
+ 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-attach-info-json', 'embed-metadata',
+ 'embed-thumbnail-atomicparsley', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi',
}, 'aliases': {
'youtube-dl': ['-multistreams', 'all'],
'youtube-dlc': ['-no-youtube-channel-redirect', '-no-live-chat', 'all'],
@@ -292,7 +356,7 @@ def parseOpts(overrideArguments=None):
help=(
'Use the specified HTTP/HTTPS/SOCKS proxy. To enable '
'SOCKS proxy, specify a proper scheme. For example '
- 'socks5://127.0.0.1:1080/. Pass in an empty string (--proxy "") '
+ 'socks5://user:pass@127.0.0.1:1080/. Pass in an empty string (--proxy "") '
'for direct connection'))
network.add_option(
'--socket-timeout',
@@ -328,10 +392,10 @@ def parseOpts(overrideArguments=None):
geo.add_option(
'--geo-bypass',
action='store_true', dest='geo_bypass', default=True,
- help='Bypass geographic restriction via faking X-Forwarded-For HTTP header')
+ help='Bypass geographic restriction via faking X-Forwarded-For HTTP header (default)')
geo.add_option(
'--no-geo-bypass',
- action='store_false', dest='geo_bypass', default=True,
+ action='store_false', dest='geo_bypass',
help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header')
geo.add_option(
'--geo-bypass-country', metavar='CODE',
@@ -364,10 +428,6 @@ def parseOpts(overrideArguments=None):
dest='rejecttitle', metavar='REGEX',
help=optparse.SUPPRESS_HELP)
selection.add_option(
- '--max-downloads',
- dest='max_downloads', metavar='NUMBER', type=int, default=None,
- help='Abort after downloading NUMBER files')
- selection.add_option(
'--min-filesize',
metavar='SIZE', dest='min_filesize', default=None,
help='Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)')
@@ -379,7 +439,7 @@ def parseOpts(overrideArguments=None):
'--date',
metavar='DATE', dest='date', default=None,
help=(
- 'Download only videos uploaded in this date. '
+ 'Download only videos uploaded on this date. '
'The date can be "YYYYMMDD" or in the format '
'"(now|today)[+-][0-9](day|week|month|year)(s)?"'))
selection.add_option(
@@ -403,19 +463,18 @@ def parseOpts(overrideArguments=None):
metavar='COUNT', dest='max_views', default=None, type=int,
help=optparse.SUPPRESS_HELP)
selection.add_option(
- '--match-filter',
- metavar='FILTER', dest='match_filter', default=None,
+ '--match-filters',
+ metavar='FILTER', dest='match_filter', action='append',
help=(
'Generic video filter. Any field (see "OUTPUT TEMPLATE") can be compared with a '
'number or a string using the operators defined in "Filtering formats". '
- 'You can also simply specify a field to match if the field is present '
- 'and "!field" to check if the field is not present. In addition, '
- 'Python style regular expression matching can be done using "~=", '
- 'and multiple filters can be checked with "&". '
- 'Use a "\\" to escape "&" or quotes if needed. Eg: --match-filter '
- '"!is_live & like_count>?100 & description~=\'(?i)\\bcats \\& dogs\\b\'" '
- 'matches only videos that are not live, has a like count more than 100 '
- '(or the like field is not available), and also has a description '
+ 'You can also simply specify a field to match if the field is present, '
+ 'use "!field" to check if the field is not present, and "&" to check multiple conditions. '
+ 'Use a "\\" to escape "&" or quotes if needed. If used multiple times, '
+            'the filter matches if at least one of the conditions is met. Eg: --match-filter '
+ '!is_live --match-filter "like_count>?100 & description~=\'(?i)\\bcats \\& dogs\\b\'" '
+ 'matches only videos that are not live OR those that have a like count more than 100 '
+            '(or the like field is not available) and also have a description '
'that contains the phrase "cats & dogs" (ignoring case)'))
selection.add_option(
'--no-match-filter',
@@ -438,6 +497,14 @@ def parseOpts(overrideArguments=None):
dest='download_archive',
help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it')
selection.add_option(
+ '--no-download-archive',
+ dest='download_archive', action="store_const", const=None,
+ help='Do not use archive file (default)')
+ selection.add_option(
+ '--max-downloads',
+ dest='max_downloads', metavar='NUMBER', type=int, default=None,
+ help='Abort after downloading NUMBER files')
+ selection.add_option(
'--break-on-existing',
action='store_true', dest='break_on_existing', default=False,
help='Stop the download process when encountering a file that is in the archive')
@@ -446,14 +513,18 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='break_on_reject', default=False,
help='Stop the download process when encountering a file that has been filtered out')
selection.add_option(
+ '--break-per-input',
+ action='store_true', dest='break_per_url', default=False,
+ help='Make --break-on-existing and --break-on-reject act only on the current input URL')
+ selection.add_option(
+ '--no-break-per-input',
+ action='store_false', dest='break_per_url',
+ help='--break-on-existing and --break-on-reject terminates the entire download queue')
+ selection.add_option(
'--skip-playlist-after-errors', metavar='N',
dest='skip_playlist_after_errors', default=None, type=int,
help='Number of allowed failures until the rest of the playlist is skipped')
selection.add_option(
- '--no-download-archive',
- dest='download_archive', action="store_const", const=None,
- help='Do not use archive file (default)')
- selection.add_option(
'--include-ads',
dest='include_ads', action='store_true',
help=optparse.SUPPRESS_HELP)
@@ -558,12 +629,16 @@ def parseOpts(overrideArguments=None):
help="Don't give any special preference to free containers (default)")
video_format.add_option(
'--check-formats',
- action='store_true', dest='check_formats', default=None,
- help='Check that the formats selected are actually downloadable')
+ action='store_const', const='selected', dest='check_formats', default=None,
+ help='Check that the selected formats are actually downloadable')
+ video_format.add_option(
+ '--check-all-formats',
+ action='store_true', dest='check_formats',
+ help='Check all formats for whether they are actually downloadable')
video_format.add_option(
'--no-check-formats',
action='store_false', dest='check_formats',
- help='Do not check that the formats selected are actually downloadable')
+ help='Do not check that the formats are actually downloadable')
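check_formats is now effectively a tri-state: None (unset), 'selected' and True. A standalone sketch of how the three option actions produce it:

    import optparse

    parser = optparse.OptionParser()
    parser.add_option('--check-formats', action='store_const', const='selected',
                      dest='check_formats', default=None)
    parser.add_option('--check-all-formats', action='store_true', dest='check_formats')
    parser.add_option('--no-check-formats', action='store_false', dest='check_formats')

    for argv in ([], ['--check-formats'], ['--check-all-formats'], ['--no-check-formats']):
        print(argv, parser.parse_args(argv)[0].check_formats)
    # []                        -> None (unset)
    # ['--check-formats']       -> 'selected' (check only the selected formats)
    # ['--check-all-formats']   -> True (check every format)
    # ['--no-check-formats']    -> False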
video_format.add_option(
'-F', '--list-formats',
action='store_true', dest='listformats',
@@ -626,7 +701,7 @@ def parseOpts(overrideArguments=None):
action='callback', dest='subtitleslangs', metavar='LANGS', type='str',
default=[], callback=_list_from_options_callback,
help=(
- 'Languages of the subtitles to download (can be regex) or "all" separated by commas. (Eg: --sub-langs en.*,ja) '
+ 'Languages of the subtitles to download (can be regex) or "all" separated by commas. (Eg: --sub-langs "en.*,ja") '
'You can prefix the language code with a "-" to exempt it from the requested languages. (Eg: --sub-langs all,-live_chat) '
'Use --list-subs for a list of available language tags'))
@@ -634,7 +709,7 @@ def parseOpts(overrideArguments=None):
downloader.add_option(
'-N', '--concurrent-fragments',
dest='concurrent_fragment_downloads', metavar='N', default=1, type=int,
- help='Number of fragments of a dash/hlsnative video that should be download concurrently (default is %default)')
+ help='Number of fragments of a dash/hlsnative video that should be downloaded concurrently (default is %default)')
downloader.add_option(
'-r', '--limit-rate', '--rate-limit',
dest='ratelimit', metavar='RATE',
@@ -648,6 +723,10 @@ def parseOpts(overrideArguments=None):
dest='retries', metavar='RETRIES', default=10,
help='Number of retries (default is %default), or "infinite"')
downloader.add_option(
+ '--file-access-retries',
+ dest='file_access_retries', metavar='RETRIES', default=3,
+ help='Number of times to retry on file access error (default is %default), or "infinite"')
+ downloader.add_option(
'--fragment-retries',
dest='fragment_retries', metavar='RETRIES', default=10,
help='Number of retries for a fragment (default is %default), or "infinite" (DASH, hlsnative and ISM)')
@@ -748,7 +827,7 @@ def parseOpts(overrideArguments=None):
metavar='NAME:ARGS', dest='external_downloader_args', default={}, type='str',
action='callback', callback=_dict_from_options_callback,
callback_kwargs={
- 'allowed_keys': r'ffmpeg_[io]\d*|%s' % '|'.join(list_external_downloaders()),
+ 'allowed_keys': r'ffmpeg_[io]\d*|%s' % '|'.join(map(re.escape, list_external_downloaders())),
'default_key': 'default',
'process': compat_shlex_split
}, help=(
@@ -764,6 +843,10 @@ def parseOpts(overrideArguments=None):
dest='encoding', metavar='ENCODING',
help='Force the specified encoding (experimental)')
workarounds.add_option(
+ '--legacy-server-connect',
+ action='store_true', dest='legacy_server_connect', default=False,
+ help='Explicitly allow HTTPS connection to servers that do not support RFC 5746 secure renegotiation')
+ workarounds.add_option(
'--no-check-certificates',
action='store_true', dest='no_check_certificate', default=False,
help='Suppress HTTPS certificate validation')
@@ -774,12 +857,11 @@ def parseOpts(overrideArguments=None):
workarounds.add_option(
'--user-agent',
metavar='UA', dest='user_agent',
- help='Specify a custom user agent')
+ help=optparse.SUPPRESS_HELP)
workarounds.add_option(
'--referer',
metavar='URL', dest='referer', default=None,
- help='Specify a custom referer, use if the video access is restricted to one domain',
- )
+ help=optparse.SUPPRESS_HELP)
workarounds.add_option(
'--add-header',
metavar='FIELD:VALUE', dest='headers', default={}, type='str',
@@ -832,7 +914,7 @@ def parseOpts(overrideArguments=None):
'--ignore-no-formats-error',
action='store_true', dest='ignore_no_formats_error', default=False,
help=(
- 'Ignore "No video formats" error. Usefull for extracting metadata '
+ 'Ignore "No video formats" error. Useful for extracting metadata '
'even if the videos are not actually available for download (experimental)'))
verbosity.add_option(
'--no-ignore-no-formats-error',
@@ -844,10 +926,29 @@ def parseOpts(overrideArguments=None):
help='Do not download the video but write all related files (Alias: --no-download)')
verbosity.add_option(
'-O', '--print',
- metavar='TEMPLATE', action='append', dest='forceprint',
- help=(
- 'Quiet, but print the given fields for each video. Simulate unless --no-simulate is used. '
- 'Either a field name or same syntax as the output template can be used'))
+ metavar='[WHEN:]TEMPLATE', dest='forceprint', default={}, type='str',
+ action='callback', callback=_dict_from_options_callback,
+ callback_kwargs={
+ 'allowed_keys': 'video|' + '|'.join(map(re.escape, POSTPROCESS_WHEN)),
+ 'default_key': 'video',
+ 'multiple_keys': False,
+ 'append': True,
+ }, help=(
+ 'Field name or output template to print to screen, optionally prefixed with when to print it, separated by a ":". '
+ 'Supported values of "WHEN" are the same as those of --use-postprocessor, and "video" (default). '
+ 'Implies --quiet and --simulate (unless --no-simulate is used). This option can be used multiple times'))
+ verbosity.add_option(
+ '--print-to-file',
+ metavar='[WHEN:]TEMPLATE FILE', dest='print_to_file', default={}, type='str', nargs=2,
+ action='callback', callback=_dict_from_options_callback,
+ callback_kwargs={
+ 'allowed_keys': 'video|' + '|'.join(map(re.escape, POSTPROCESS_WHEN)),
+ 'default_key': 'video',
+ 'multiple_keys': False,
+ 'append': True,
+ }, help=(
+ 'Append given template to the file. The values of WHEN and TEMPLATE are the same as those of --print. '
+ 'FILE uses the same syntax as the output template. This option can be used multiple times'))
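Both options route through _dict_from_options_callback with append=True, so repeated uses accumulate a {WHEN: [templates]} mapping. A simplified stand-in (not the real helper; the WHEN values are taken from the --use-postprocessor help further below):

    POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process',
                        'after_move', 'after_video', 'playlist')

    def collect_print(values, forceprint=None):
        forceprint = {} if forceprint is None else forceprint
        allowed = ('video',) + POSTPROCESS_WHEN
        for value in values:
            when, sep, tmpl = value.partition(':')
            if not sep or when not in allowed:
                when, tmpl = 'video', value          # default_key
            forceprint.setdefault(when, []).append(tmpl)
        return forceprint

    print(collect_print(['title', 'after_move:filepath']))
    # {'video': ['title'], 'after_move': ['filepath']}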
verbosity.add_option(
'-g', '--get-url',
action='store_true', dest='geturl', default=False,
@@ -927,7 +1028,7 @@ def parseOpts(overrideArguments=None):
'Template for progress outputs, optionally prefixed with one of "download:" (default), '
'"download-title:" (the console title), "postprocess:", or "postprocess-title:". '
'The video\'s fields are accessible under the "info" key and '
- 'the progress attributes are accessible under "progress" key. Eg: '
+ 'the progress attributes are accessible under "progress" key. E.g.: '
# TODO: Document the fields inside "progress"
'--console-title --progress-template "download-title:%(info.id)s-%(progress.eta)s"'))
verbosity.add_option(
@@ -965,19 +1066,27 @@ def parseOpts(overrideArguments=None):
filesystem.add_option(
'-a', '--batch-file',
dest='batchfile', metavar='FILE',
- help="File containing URLs to download ('-' for stdin), one URL per line. "
- "Lines starting with '#', ';' or ']' are considered as comments and ignored")
+ help=(
+ 'File containing URLs to download ("-" for stdin), one URL per line. '
+ 'Lines starting with "#", ";" or "]" are considered as comments and ignored'))
+ filesystem.add_option(
+ '--no-batch-file',
+ dest='batchfile', action='store_const', const=None,
+ help='Do not read URLs from batch file (default)')
+ filesystem.add_option(
+ '--id', default=False,
+ action='store_true', dest='useid', help=optparse.SUPPRESS_HELP)
filesystem.add_option(
'-P', '--paths',
metavar='[TYPES:]PATH', dest='paths', default={}, type='str',
action='callback', callback=_dict_from_options_callback,
callback_kwargs={
- 'allowed_keys': 'home|temp|%s' % '|'.join(OUTTMPL_TYPES.keys()),
+ 'allowed_keys': 'home|temp|%s' % '|'.join(map(re.escape, OUTTMPL_TYPES.keys())),
'default_key': 'home'
}, help=(
'The paths where the files should be downloaded. '
'Specify the type of file and the path separated by a colon ":". '
- 'All the same types as --output are supported. '
+ 'All the same TYPES as --output are supported. '
'Additionally, you can also provide "home" (default) and "temp" paths. '
'All intermediary files are first downloaded to the temp path and '
'then the final files are moved over to the home path after download is finished. '
@@ -987,7 +1096,7 @@ def parseOpts(overrideArguments=None):
metavar='[TYPES:]TEMPLATE', dest='outtmpl', default={}, type='str',
action='callback', callback=_dict_from_options_callback,
callback_kwargs={
- 'allowed_keys': '|'.join(OUTTMPL_TYPES.keys()),
+ 'allowed_keys': '|'.join(map(re.escape, OUTTMPL_TYPES.keys())),
'default_key': 'default'
}, help='Output filename template; see "OUTPUT TEMPLATE" for details')
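The map(re.escape, ...) changes in this hunk and the ones above exist because allowed_keys is compiled as a regex alternation; any key containing a metacharacter would silently change the pattern. A small illustration:

    import re

    keys = ['default', 'a+b']   # 'a+b' is a hypothetical key with a regex metacharacter

    unescaped = '|'.join(keys)
    escaped = '|'.join(map(re.escape, keys))

    print(bool(re.fullmatch(unescaped, 'a+b')))  # False: '+' read as 'one or more'
    print(bool(re.fullmatch(escaped, 'a+b')))    # True: matched literally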
filesystem.add_option(
@@ -1013,11 +1122,11 @@ def parseOpts(overrideArguments=None):
filesystem.add_option(
'--windows-filenames',
action='store_true', dest='windowsfilenames', default=False,
- help='Force filenames to be windows compatible')
+ help='Force filenames to be Windows-compatible')
filesystem.add_option(
'--no-windows-filenames',
action='store_false', dest='windowsfilenames',
- help='Make filenames windows compatible only if using windows (default)')
+ help='Make filenames Windows-compatible only if using Windows (default)')
filesystem.add_option(
'--trim-filenames', '--trim-file-names', metavar='LENGTH',
dest='trim_file_name', default=0, type=int,
@@ -1070,7 +1179,7 @@ def parseOpts(overrideArguments=None):
help='Do not write video description (default)')
filesystem.add_option(
'--write-info-json',
- action='store_true', dest='writeinfojson', default=False,
+ action='store_true', dest='writeinfojson', default=None,
help='Write video metadata to a .info.json file (this may contain personal information)')
filesystem.add_option(
'--no-write-info-json',
@@ -1095,13 +1204,13 @@ def parseOpts(overrideArguments=None):
action='store_false', dest='allow_playlist_files',
help='Do not write playlist metadata when using --write-info-json, --write-description etc.')
filesystem.add_option(
- '--clean-infojson',
+ '--clean-info-json', '--clean-infojson',
action='store_true', dest='clean_infojson', default=None,
help=(
'Remove some private fields such as filenames from the infojson. '
'Note that it could still contain some personal information (default)'))
filesystem.add_option(
- '--no-clean-infojson',
+ '--no-clean-info-json', '--no-clean-infojson',
action='store_false', dest='clean_infojson',
help='Write all fields to the infojson')
filesystem.add_option(
@@ -1121,21 +1230,22 @@ def parseOpts(overrideArguments=None):
filesystem.add_option(
'--cookies',
dest='cookiefile', metavar='FILE',
- help='File to read cookies from and dump cookie jar in')
+ help='Netscape formatted file to read cookies from and dump cookie jar in')
filesystem.add_option(
'--no-cookies',
action='store_const', const=None, dest='cookiefile', metavar='FILE',
help='Do not read/dump cookies from/to file (default)')
filesystem.add_option(
'--cookies-from-browser',
- dest='cookiesfrombrowser', metavar='BROWSER[:PROFILE]',
+ dest='cookiesfrombrowser', metavar='BROWSER[+KEYRING][:PROFILE]',
help=(
- 'Load cookies from a user profile of the given web browser. '
- 'Currently supported browsers are: {}. '
- 'You can specify the user profile name or directory using '
- '"BROWSER:PROFILE_NAME" or "BROWSER:PROFILE_PATH". '
- 'If no profile is given, the most recently accessed one is used'.format(
- ', '.join(sorted(SUPPORTED_BROWSERS)))))
+ 'The name of the browser and (optionally) the name/path of '
+ 'the profile to load cookies from, separated by a ":". '
+ f'Currently supported browsers are: {", ".join(sorted(SUPPORTED_BROWSERS))}. '
+ 'By default, the most recently accessed profile is used. '
+ 'The keyring used for decrypting Chromium cookies on Linux can be '
+ '(optionally) specified after the browser name separated by a "+". '
+ f'Currently supported keyrings are: {", ".join(map(str.lower, sorted(SUPPORTED_KEYRINGS)))}'))
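A rough sketch of how a BROWSER[+KEYRING][:PROFILE] value splits (illustrative only; the real parsing lives in the cookies module and is more careful, e.g. with profile paths that themselves contain ':'):

    def parse_browser_spec(spec):
        container, _, profile = spec.partition(':')
        browser, _, keyring = container.partition('+')
        return browser, keyring or None, profile or None

    print(parse_browser_spec('firefox'))
    # ('firefox', None, None)
    print(parse_browser_spec('chromium+basictext:Profile 1'))
    # ('chromium', 'basictext', 'Profile 1')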
filesystem.add_option(
'--no-cookies-from-browser',
action='store_const', const=None, dest='cookiesfrombrowser',
@@ -1154,7 +1264,10 @@ def parseOpts(overrideArguments=None):
thumbnail = optparse.OptionGroup(parser, 'Thumbnail Options')
thumbnail.add_option(
'--write-thumbnail',
- action='store_true', dest='writethumbnail', default=False,
+ action='callback', dest='writethumbnail', default=False,
+ # Should override --no-write-thumbnail, but not --write-all-thumbnail
+ callback=lambda option, _, __, parser: setattr(
+ parser.values, option.dest, getattr(parser.values, option.dest) or True),
help='Write thumbnail image to disk')
thumbnail.add_option(
'--no-write-thumbnail',
@@ -1162,7 +1275,7 @@ def parseOpts(overrideArguments=None):
help='Do not write thumbnail image to disk (default)')
thumbnail.add_option(
'--write-all-thumbnails',
- action='store_true', dest='write_all_thumbnails', default=False,
+ action='store_const', dest='writethumbnail', const='all',
help='Write all thumbnail image formats to disk')
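writethumbnail therefore becomes a tri-state (False / True / 'all'), with the callback ensuring --write-thumbnail does not downgrade a previously stored 'all'. Standalone check:

    import optparse

    parser = optparse.OptionParser()
    parser.add_option(
        '--write-thumbnail', dest='writethumbnail', default=False, action='callback',
        callback=lambda option, _, __, parser: setattr(
            parser.values, option.dest, getattr(parser.values, option.dest) or True))
    parser.add_option('--no-write-thumbnail', dest='writethumbnail', action='store_false')
    parser.add_option('--write-all-thumbnails', dest='writethumbnail',
                      action='store_const', const='all')

    print(parser.parse_args(['--write-all-thumbnails', '--write-thumbnail'])[0].writethumbnail)
    # all   ('all' or True == 'all')
    print(parser.parse_args(['--no-write-thumbnail', '--write-thumbnail'])[0].writethumbnail)
    # True  (False or True == True)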
thumbnail.add_option(
'--list-thumbnails',
@@ -1196,11 +1309,11 @@ def parseOpts(overrideArguments=None):
'--audio-format', metavar='FORMAT', dest='audioformat', default='best',
help=(
'Specify audio format to convert the audio to when -x is used. Currently supported formats are: '
- 'best (default) or one of %s' % '|'.join(FFmpegExtractAudioPP.SUPPORTED_EXTS)))
+ 'best (default) or one of %s' % ', '.join(FFmpegExtractAudioPP.SUPPORTED_EXTS)))
postproc.add_option(
'--audio-quality', metavar='QUALITY',
dest='audioquality', default='5',
- help='Specify ffmpeg audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default %default)')
+ help='Specify ffmpeg audio quality to use when converting the audio with -x. Insert a value between 0 (best) and 10 (worst) for VBR or a specific bitrate like 128K (default %default)')
postproc.add_option(
'--remux-video',
metavar='FORMAT', dest='remuxvideo', default=None,
@@ -1208,7 +1321,7 @@ def parseOpts(overrideArguments=None):
'Remux the video into another container if necessary (currently supported: %s). '
'If target container does not support the video/audio codec, remuxing will fail. '
'You can specify multiple rules; Eg. "aac>m4a/mov>mp4/mkv" will remux aac to m4a, mov to mp4 '
- 'and anything else to mkv.' % '|'.join(FFmpegVideoRemuxerPP.SUPPORTED_EXTS)))
+ 'and anything else to mkv.' % ', '.join(FFmpegVideoRemuxerPP.SUPPORTED_EXTS)))
postproc.add_option(
'--recode-video',
metavar='FORMAT', dest='recodevideo', default=None,
@@ -1220,7 +1333,8 @@ def parseOpts(overrideArguments=None):
metavar='NAME:ARGS', dest='postprocessor_args', default={}, type='str',
action='callback', callback=_dict_from_options_callback,
callback_kwargs={
- 'allowed_keys': r'\w+(?:\+\w+)?', 'default_key': 'default-compat',
+ 'allowed_keys': r'\w+(?:\+\w+)?',
+ 'default_key': 'default-compat',
'process': compat_shlex_split,
'multiple_keys': False
}, help=(
@@ -1272,7 +1386,9 @@ def parseOpts(overrideArguments=None):
postproc.add_option(
'--embed-metadata', '--add-metadata',
action='store_true', dest='addmetadata', default=False,
- help='Embed metadata to the video file. Also adds chapters to file unless --no-add-chapters is used (Alias: --add-metadata)')
+ help=(
+ 'Embed metadata to the video file. Also embeds chapters/infojson if present '
+ 'unless --no-embed-chapters/--no-embed-info-json are used (Alias: --add-metadata)'))
postproc.add_option(
'--no-embed-metadata', '--no-add-metadata',
action='store_false', dest='addmetadata',
@@ -1286,6 +1402,14 @@ def parseOpts(overrideArguments=None):
action='store_false', dest='addchapters',
help='Do not add chapter markers (default) (Alias: --no-add-chapters)')
postproc.add_option(
+ '--embed-info-json',
+ action='store_true', dest='embed_infojson', default=None,
+ help='Embed the infojson as an attachment to mkv/mka video files')
+ postproc.add_option(
+ '--no-embed-info-json',
+ action='store_false', dest='embed_infojson',
+ help='Do not embed the infojson as an attachment to the video file')
+ postproc.add_option(
'--metadata-from-title',
metavar='FORMAT', dest='metafromtitle',
help=optparse.SUPPRESS_HELP)
@@ -1304,6 +1428,16 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='xattrs', default=False,
help='Write metadata to the video file\'s xattrs (using dublin core and xdg standards)')
postproc.add_option(
+ '--concat-playlist',
+ metavar='POLICY', dest='concat_playlist', default='multi_video',
+ choices=('never', 'always', 'multi_video'),
+ help=(
+ 'Concatenate videos in a playlist. One of "never", "always", or '
+ '"multi_video" (default; only when the videos form a single show). '
+ 'All the video files must have the same codecs and number of streams to be concatenable. '
+ 'The "pl_video:" prefix can be used with "--paths" and "--output" to '
+ 'set the output filename for the concatenated files. See "OUTPUT TEMPLATE" for details'))
+ postproc.add_option(
'--fixup',
metavar='POLICY', dest='fixup', default=None,
choices=('never', 'ignore', 'warn', 'detect_or_warn', 'force'),
@@ -1311,7 +1445,7 @@ def parseOpts(overrideArguments=None):
'Automatically correct known faults of the file. '
'One of never (do nothing), warn (only emit a warning), '
'detect_or_warn (the default; fix file if we can, warn otherwise), '
- 'force (try fixing even if file already exists'))
+ 'force (try fixing even if file already exists)'))
postproc.add_option(
'--prefer-avconv', '--no-prefer-ffmpeg',
action='store_false', dest='prefer_ffmpeg',
@@ -1325,41 +1459,45 @@ def parseOpts(overrideArguments=None):
dest='ffmpeg_location',
help='Location of the ffmpeg binary; either the path to the binary or its containing directory')
postproc.add_option(
- '--exec', metavar='CMD',
- action='append', dest='exec_cmd',
- help=(
- 'Execute a command on the file after downloading and post-processing. '
+ '--exec',
+ metavar='[WHEN:]CMD', dest='exec_cmd', default={}, type='str',
+ action='callback', callback=_dict_from_options_callback,
+ callback_kwargs={
+ 'allowed_keys': '|'.join(map(re.escape, POSTPROCESS_WHEN)),
+ 'default_key': 'after_move',
+ 'multiple_keys': False,
+ 'append': True,
+ }, help=(
+ 'Execute a command, optionally prefixed with when to execute it (after_move if unspecified), separated by a ":". '
+ 'Supported values of "WHEN" are the same as those of --use-postprocessor. '
'Same syntax as the output template can be used to pass any field as arguments to the command. '
- 'An additional field "filepath" that contains the final path of the downloaded file is also available. '
- 'If no fields are passed, %(filepath)q is appended to the end of the command. '
+ 'After download, an additional field "filepath" that contains the final path of the downloaded file '
+ 'is also available, and if no fields are passed, %(filepath)q is appended to the end of the command. '
'This option can be used multiple times'))
postproc.add_option(
'--no-exec',
- action='store_const', dest='exec_cmd', const=[],
+ action='store_const', dest='exec_cmd', const={},
help='Remove any previously defined --exec')
postproc.add_option(
'--exec-before-download', metavar='CMD',
action='append', dest='exec_before_dl_cmd',
- help=(
- 'Execute a command before the actual download. '
- 'The syntax is the same as --exec but "filepath" is not available. '
- 'This option can be used multiple times'))
+ help=optparse.SUPPRESS_HELP)
postproc.add_option(
'--no-exec-before-download',
- action='store_const', dest='exec_before_dl_cmd', const=[],
- help='Remove any previously defined --exec-before-download')
+ action='store_const', dest='exec_before_dl_cmd', const=None,
+ help=optparse.SUPPRESS_HELP)
postproc.add_option(
'--convert-subs', '--convert-sub', '--convert-subtitles',
metavar='FORMAT', dest='convertsubtitles', default=None,
help=(
'Convert the subtitles to another format (currently supported: %s) '
- '(Alias: --convert-subtitles)' % '|'.join(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS)))
+ '(Alias: --convert-subtitles)' % ', '.join(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS)))
postproc.add_option(
'--convert-thumbnails',
metavar='FORMAT', dest='convertthumbnails', default=None,
help=(
'Convert the thumbnails to another format '
- '(currently supported: %s) ' % '|'.join(FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS)))
+ '(currently supported: %s) ' % ', '.join(FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS)))
postproc.add_option(
'--split-chapters', '--split-tracks',
dest='split_chapters', action='store_true', default=False,
@@ -1387,7 +1525,7 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='force_keyframes_at_cuts', default=False,
help=(
'Force keyframes around the chapters before removing/splitting them. '
- 'Requires a reencode and thus is very slow, but the resulting video '
+ 'Requires a re-encode and thus is very slow, but the resulting video '
'may have fewer artifacts around the cuts'))
postproc.add_option(
'--no-force-keyframes-at-cuts',
@@ -1405,12 +1543,14 @@ def parseOpts(overrideArguments=None):
'process': lambda val: dict(_postprocessor_opts_parser(*val.split(':', 1)))
}, help=(
'The (case sensitive) name of plugin postprocessors to be enabled, '
- 'and (optionally) arguments to be passed to it, seperated by a colon ":". '
+ 'and (optionally) arguments to be passed to it, separated by a colon ":". '
'ARGS are a semicolon ";" delimited list of NAME=VALUE. '
'The "when" argument determines when the postprocessor is invoked. '
- 'It can be one of "pre_process" (after extraction), '
- '"before_dl" (before video download), "post_process" (after video download; default) '
- 'or "after_move" (after moving file to their final locations). '
+ 'It can be one of "pre_process" (after video extraction), "after_filter" (after video passes filter), '
+ '"before_dl" (before each video download), "post_process" (after each video download; default), '
+ '"after_move" (after moving video file to it\'s final locations), '
+ '"after_video" (after downloading and processing all formats of a video), '
+ 'or "playlist" (at end of playlist). '
'This option can be used multiple times to add different postprocessors'))
sponsorblock = optparse.OptionGroup(parser, 'SponsorBlock Options', description=(
@@ -1419,20 +1559,29 @@ def parseOpts(overrideArguments=None):
sponsorblock.add_option(
'--sponsorblock-mark', metavar='CATS',
dest='sponsorblock_mark', default=set(), action='callback', type='str',
- callback=_set_from_options_callback, callback_kwargs={'allowed_values': SponsorBlockPP.CATEGORIES.keys()},
- help=(
+ callback=_set_from_options_callback, callback_kwargs={
+ 'allowed_values': SponsorBlockPP.CATEGORIES.keys(),
+ 'aliases': {'default': ['all']}
+ }, help=(
'SponsorBlock categories to create chapters for, separated by commas. '
- 'Available categories are all, %s. You can prefix the category with a "-" to exempt it. '
- 'See https://wiki.sponsor.ajay.app/index.php/Segment_Categories for description of the categories. '
- 'Eg: --sponsorblock-mark all,-preview' % ', '.join(SponsorBlockPP.CATEGORIES.keys())))
+ f'Available categories are all, default(=all), {", ".join(SponsorBlockPP.CATEGORIES.keys())}. '
+ 'You can prefix the category with a "-" to exempt it. See [1] for description of the categories. '
+ 'Eg: --sponsorblock-mark all,-preview [1] https://wiki.sponsor.ajay.app/w/Segment_Categories'))
sponsorblock.add_option(
'--sponsorblock-remove', metavar='CATS',
dest='sponsorblock_remove', default=set(), action='callback', type='str',
- callback=_set_from_options_callback, callback_kwargs={'allowed_values': SponsorBlockPP.CATEGORIES.keys()},
- help=(
+ callback=_set_from_options_callback, callback_kwargs={
+ 'allowed_values': set(SponsorBlockPP.CATEGORIES.keys()) - set(SponsorBlockPP.POI_CATEGORIES.keys()),
+ # Note: From https://wiki.sponsor.ajay.app/w/Types:
+ # The filler category is very aggressive.
+ # It is strongly recommended to not use this in a client by default.
+ 'aliases': {'default': ['all', '-filler']}
+ }, help=(
'SponsorBlock categories to be removed from the video file, separated by commas. '
'If a category is present in both mark and remove, remove takes precedence. '
- 'The syntax and available categories are the same as for --sponsorblock-mark'))
+ 'The syntax and available categories are the same as for --sponsorblock-mark '
+ 'except that "default" refers to "all,-filler" '
+ f'and {", ".join(SponsorBlockPP.POI_CATEGORIES.keys())} is not available'))
sponsorblock.add_option(
'--sponsorblock-chapter-title', metavar='TEMPLATE',
default=DEFAULT_SPONSORBLOCK_CHAPTER_TITLE, dest='sponsorblock_chapter_title',
@@ -1503,7 +1652,8 @@ def parseOpts(overrideArguments=None):
'--no-hls-split-discontinuity',
dest='hls_split_discontinuity', action='store_false',
help='Do not split HLS playlists to different formats at discontinuities such as ad breaks (default)')
- _extractor_arg_parser = lambda key, vals='': (key.strip().lower().replace('-', '_'), [val.strip() for val in vals.split(',')])
+ _extractor_arg_parser = lambda key, vals='': (key.strip().lower().replace('-', '_'), [
+ val.replace(r'\,', ',').strip() for val in re.split(r'(?<!\\),', vals)])
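The reworked lambda lets extractor-arg values contain literal commas via backslash escaping. The split it relies on, runnable standalone:

    import re

    def parse_arg(key, vals=''):
        # mirrors _extractor_arg_parser: split on ',' unless preceded by '\'
        return key.strip().lower().replace('-', '_'), [
            val.replace(r'\,', ',').strip() for val in re.split(r'(?<!\\),', vals)]

    print(parse_arg('Some-Key', r'a\,b, c'))
    # ('some_key', ['a,b', 'c'])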
extractor.add_option(
'--extractor-args',
metavar='KEY:ARGS', dest='extractor_args', default={}, type='str',
@@ -1549,72 +1699,11 @@ def parseOpts(overrideArguments=None):
parser.add_option_group(sponsorblock)
parser.add_option_group(extractor)
- if overrideArguments is not None:
- opts, args = parser.parse_args(overrideArguments)
- if opts.verbose:
- write_string('[debug] Override config: ' + repr(overrideArguments) + '\n')
- else:
- def compat_conf(conf):
- if sys.version_info < (3,):
- return [a.decode(preferredencoding(), 'replace') for a in conf]
- return conf
-
- configs = {
- 'command-line': compat_conf(sys.argv[1:]),
- 'custom': [], 'home': [], 'portable': [], 'user': [], 'system': []}
- paths = {'command-line': False}
+ return parser
- def read_options(name, path, user=False):
- ''' loads config files and returns ignoreconfig '''
- # Multiple package names can be given here
- # Eg: ('hypervideo', 'youtube-dlc', 'youtube-dl') will look for
- # the configuration file of any of these three packages
- for package in ('hypervideo',):
- if user:
- config, current_path = _readUserConf(package, default=None)
- else:
- current_path = os.path.join(path, '%s.conf' % package)
- config = _readOptions(current_path, default=None)
- if config is not None:
- configs[name], paths[name] = config, current_path
- return parser.parse_args(config)[0].ignoreconfig
- return False
- def get_configs():
- opts, _ = parser.parse_args(configs['command-line'])
- if opts.config_location is not None:
- location = compat_expanduser(opts.config_location)
- if os.path.isdir(location):
- location = os.path.join(location, 'hypervideo.conf')
- if not os.path.exists(location):
- parser.error('config-location %s does not exist.' % location)
- config = _readOptions(location, default=None)
- if config:
- configs['custom'], paths['custom'] = config, location
-
- if opts.ignoreconfig:
- return
- if parser.parse_args(configs['custom'])[0].ignoreconfig:
- return
- if read_options('portable', get_executable_path()):
- return
- opts, _ = parser.parse_args(configs['portable'] + configs['custom'] + configs['command-line'])
- if read_options('home', expand_path(opts.paths.get('home', '')).strip()):
- return
- if read_options('system', '/etc'):
- return
- if read_options('user', None, user=True):
- configs['system'], paths['system'] = [], None
-
- get_configs()
- argv = configs['system'] + configs['user'] + configs['home'] + configs['portable'] + configs['custom'] + configs['command-line']
- opts, args = parser.parse_args(argv)
- if opts.verbose:
- for label in ('Command-line', 'Custom', 'Portable', 'Home', 'User', 'System'):
- key = label.lower()
- if paths.get(key):
- write_string(f'[debug] {label} config file: {paths[key]}\n')
- if paths.get(key) is not None:
- write_string(f'[debug] {label} config: {_hide_login_info(configs[key])!r}\n')
-
- return parser, opts, args
+def _hide_login_info(opts):
+ write_string(
+ 'DeprecationWarning: "hypervideo_dl.options._hide_login_info" is deprecated and may be removed in a future version. '
+ 'Use "hypervideo_dl.utils.Config.hide_login_info" instead\n')
+ return Config.hide_login_info(opts)
diff --git a/hypervideo_dl/postprocessor/__init__.py b/hypervideo_dl/postprocessor/__init__.py
index 07c87b7..e47631e 100644
--- a/hypervideo_dl/postprocessor/__init__.py
+++ b/hypervideo_dl/postprocessor/__init__.py
@@ -2,12 +2,16 @@
from ..utils import load_plugins
+from .common import PostProcessor
from .embedthumbnail import EmbedThumbnailPP
from .exec import ExecPP, ExecAfterDownloadPP
from .ffmpeg import (
FFmpegPostProcessor,
+ FFmpegCopyStreamPP,
+ FFmpegConcatPP,
FFmpegEmbedSubtitlePP,
FFmpegExtractAudioPP,
+ FFmpegFixupDuplicateMoovPP,
FFmpegFixupDurationPP,
FFmpegFixupStretchedPP,
FFmpegFixupTimestampPP,
@@ -39,5 +43,5 @@ def get_postprocessor(key):
return globals()[key + 'PP']
-__all__ = [name for name in globals().keys() if name.endswith('IE')]
-__all__.append('FFmpegPostProcessor')
+__all__ = [name for name in globals().keys() if name.endswith('PP')]
+__all__.extend(('PostProcessor', 'FFmpegPostProcessor'))
diff --git a/hypervideo_dl/postprocessor/common.py b/hypervideo_dl/postprocessor/common.py
index b491afb..3899646 100644
--- a/hypervideo_dl/postprocessor/common.py
+++ b/hypervideo_dl/postprocessor/common.py
@@ -1,14 +1,19 @@
from __future__ import unicode_literals
-import copy
import functools
+import itertools
+import json
import os
+import time
+import urllib.error
-from ..compat import compat_str
from ..utils import (
_configuration_args,
encodeFilename,
+ network_exceptions,
PostProcessingError,
+ sanitized_Request,
+ write_string,
)
@@ -17,7 +22,7 @@ class PostProcessorMetaClass(type):
def run_wrapper(func):
@functools.wraps(func)
def run(self, info, *args, **kwargs):
- info_copy = copy.deepcopy(self._copy_infodict(info))
+ info_copy = self._copy_infodict(info)
self._hook_progress({'status': 'started'}, info_copy)
ret = func(self, info, *args, **kwargs)
if ret is not None:
@@ -63,7 +68,7 @@ class PostProcessor(metaclass=PostProcessorMetaClass):
@classmethod
def pp_key(cls):
name = cls.__name__[:-2]
- return compat_str(name[6:]) if name[:6].lower() == 'ffmpeg' else name
+ return name[6:] if name[:6].lower() == 'ffmpeg' else name
def to_screen(self, text, prefix=True, *args, **kwargs):
tag = '[%s] ' % self.PP_NAME if prefix else ''
@@ -74,6 +79,11 @@ class PostProcessor(metaclass=PostProcessorMetaClass):
if self._downloader:
return self._downloader.report_warning(text, *args, **kwargs)
+ def deprecation_warning(self, text):
+ if self._downloader:
+ return self._downloader.deprecation_warning(text)
+ write_string(f'DeprecationWarning: {text}')
+
def report_error(self, text, *args, **kwargs):
# Exists only for compatibility. Do not use
if self._downloader:
@@ -98,12 +108,14 @@ class PostProcessor(metaclass=PostProcessorMetaClass):
return getattr(self._downloader, '_copy_infodict', dict)(info_dict)
@staticmethod
- def _restrict_to(*, video=True, audio=True, images=True):
+ def _restrict_to(*, video=True, audio=True, images=True, simulated=True):
allowed = {'video': video, 'audio': audio, 'images': images}
def decorator(func):
@functools.wraps(func)
def wrapper(self, info):
+ if not simulated and (self.get_param('simulate') or self.get_param('skip_download')):
+ return [], info
format_type = (
'video' if info.get('vcodec') != 'none'
else 'audio' if info.get('acodec') != 'none'
@@ -173,6 +185,28 @@ class PostProcessor(metaclass=PostProcessorMetaClass):
progress_template.get('postprocess-title') or 'hypervideo %(progress._default_template)s',
progress_dict))
+ def _download_json(self, url, *, expected_http_errors=(404,)):
+ # While this is not an extractor, it behaves similarly to one and
+ # so obeys extractor_retries and sleep_interval_requests
+ max_retries = self.get_param('extractor_retries', 3)
+ sleep_interval = self.get_param('sleep_interval_requests') or 0
+
+ self.write_debug(f'{self.PP_NAME} query: {url}')
+ for retries in itertools.count():
+ try:
+ rsp = self._downloader.urlopen(sanitized_Request(url))
+ return json.loads(rsp.read().decode(rsp.info().get_param('charset') or 'utf-8'))
+ except network_exceptions as e:
+ if isinstance(e, urllib.error.HTTPError) and e.code in expected_http_errors:
+ return None
+ if retries < max_retries:
+ self.report_warning(f'{e}. Retrying...')
+ if sleep_interval > 0:
+ self.to_screen(f'Sleeping {sleep_interval} seconds ...')
+ time.sleep(sleep_interval)
+ continue
+ raise PostProcessingError(f'Unable to communicate with {self.PP_NAME} API: {e}')
+
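The retry/sleep discipline in _download_json mirrors what extractors do. A generic, standalone sketch of the same pattern (flaky_fetch and the exception type are placeholders):

    import itertools
    import time

    def fetch_with_retries(fetch, max_retries=3, sleep_interval=0):
        for retries in itertools.count():
            try:
                return fetch()
            except OSError as e:                 # stand-in for network_exceptions
                if retries < max_retries:
                    print(f'{e}. Retrying...')
                    if sleep_interval > 0:
                        time.sleep(sleep_interval)
                    continue
                raise RuntimeError(f'Unable to communicate with API: {e}')

    attempts = [OSError('timed out'), {'ok': True}]

    def flaky_fetch():
        result = attempts.pop(0)
        if isinstance(result, Exception):
            raise result
        return result

    print(fetch_with_retries(flaky_fetch))       # retries once, then returns {'ok': True}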
class AudioConversionError(PostProcessingError):
pass
diff --git a/hypervideo_dl/postprocessor/embedthumbnail.py b/hypervideo_dl/postprocessor/embedthumbnail.py
index 3139a63..815221d 100644
--- a/hypervideo_dl/postprocessor/embedthumbnail.py
+++ b/hypervideo_dl/postprocessor/embedthumbnail.py
@@ -26,9 +26,9 @@ from ..utils import (
encodeArgument,
encodeFilename,
error_to_compat_str,
+ Popen,
PostProcessingError,
prepend_extension,
- process_communicate_or_kill,
shell_quote,
)
@@ -108,7 +108,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
self.run_ffmpeg_multiple_files([filename, thumbnail_filename], temp_filename, options)
elif info['ext'] in ['mkv', 'mka']:
- options = ['-c', 'copy', '-map', '0', '-dn']
+ options = list(self.stream_copy_opts())
mimetype = 'image/%s' % ('png' if thumbnail_ext == 'png' else 'jpeg')
old_stream, new_stream = self.get_stream_number(
@@ -145,11 +145,46 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
self.report_warning('unable to embed using mutagen; %s' % error_to_compat_str(err))
success = False
- # Method 2: Use ffmpeg+ffprobe
- if not success and not prefer_atomicparsley:
+ # Method 2: Use AtomicParsley
+ if not success:
+ success = True
+ atomicparsley = next((
+ x for x in ['AtomicParsley', 'atomicparsley']
+ if check_executable(x, ['-v'])), None)
+ if atomicparsley is None:
+ self.to_screen('Neither mutagen nor AtomicParsley was found. Falling back to ffmpeg')
+ success = False
+ else:
+ if not prefer_atomicparsley:
+ self.to_screen('mutagen was not found. Falling back to AtomicParsley')
+ cmd = [encodeFilename(atomicparsley, True),
+ encodeFilename(filename, True),
+ encodeArgument('--artwork'),
+ encodeFilename(thumbnail_filename, True),
+ encodeArgument('-o'),
+ encodeFilename(temp_filename, True)]
+ cmd += [encodeArgument(o) for o in self._configuration_args('AtomicParsley')]
+
+ self._report_run('atomicparsley', filename)
+ self.write_debug('AtomicParsley command line: %s' % shell_quote(cmd))
+ p = Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ stdout, stderr = p.communicate_or_kill()
+ if p.returncode != 0:
+ msg = stderr.decode('utf-8', 'replace').strip()
+ self.report_warning(f'Unable to embed thumbnails using AtomicParsley; {msg}')
+ # for formats that don't support thumbnails (like 3gp) AtomicParsley
+ # won't create the temporary file
+ if b'No changes' in stdout:
+ self.report_warning('The file format doesn\'t support embedding a thumbnail')
+ success = False
+
+ # Method 3: Use ffmpeg+ffprobe
+ # Thumbnails attached using this method don't show up as cover in some cases
+ # See https://github.com/hypervideo/hypervideo/issues/2125, https://github.com/hypervideo/hypervideo/issues/411
+ if not success:
success = True
try:
- options = ['-c', 'copy', '-map', '0', '-dn', '-map', '1']
+ options = [*self.stream_copy_opts(), '-map', '1']
old_stream, new_stream = self.get_stream_number(
filename, ('disposition', 'attached_pic'), 1)
@@ -161,38 +196,8 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
self._report_run('ffmpeg', filename)
self.run_ffmpeg_multiple_files([filename, thumbnail_filename], temp_filename, options)
except PostProcessingError as err:
- self.report_warning('unable to embed using ffprobe & ffmpeg; %s' % error_to_compat_str(err))
- success = False
-
- # Method 3: Use AtomicParsley
- if not success:
- success = True
- atomicparsley = next((
- x for x in ['AtomicParsley', 'atomicparsley']
- if check_executable(x, ['-v'])), None)
- if atomicparsley is None:
- raise EmbedThumbnailPPError('AtomicParsley was not found. Please install')
-
- cmd = [encodeFilename(atomicparsley, True),
- encodeFilename(filename, True),
- encodeArgument('--artwork'),
- encodeFilename(thumbnail_filename, True),
- encodeArgument('-o'),
- encodeFilename(temp_filename, True)]
- cmd += [encodeArgument(o) for o in self._configuration_args('AtomicParsley')]
-
- self._report_run('atomicparsley', filename)
- self.write_debug('AtomicParsley command line: %s' % shell_quote(cmd))
- p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- stdout, stderr = process_communicate_or_kill(p)
- if p.returncode != 0:
- msg = stderr.decode('utf-8', 'replace').strip()
- raise EmbedThumbnailPPError(msg)
- # for formats that don't support thumbnails (like 3gp) AtomicParsley
- # won't create to the temporary file
- if b'No changes' in stdout:
- self.report_warning('The file format doesn\'t support embedding a thumbnail')
success = False
+ raise EmbedThumbnailPPError(f'Unable to embed using ffprobe & ffmpeg; {err}')
elif info['ext'] in ['ogg', 'opus', 'flac']:
if not has_mutagen:
diff --git a/hypervideo_dl/postprocessor/exec.py b/hypervideo_dl/postprocessor/exec.py
index 7a3cb49..c0bd6df 100644
--- a/hypervideo_dl/postprocessor/exec.py
+++ b/hypervideo_dl/postprocessor/exec.py
@@ -22,11 +22,13 @@ class ExecPP(PostProcessor):
if tmpl_dict: # if there are no replacements, tmpl_dict = {}
return self._downloader.escape_outtmpl(tmpl) % tmpl_dict
- # If no replacements are found, replace {} for backard compatibility
- if '{}' not in cmd:
- cmd += ' {}'
- return cmd.replace('{}', compat_shlex_quote(
- info.get('filepath') or info['_filename']))
+ filepath = info.get('filepath', info.get('_filename'))
+ # If video, and no replacements are found, replace {} for backward compatibility
+ if filepath:
+ if '{}' not in cmd:
+ cmd += ' {}'
+ cmd = cmd.replace('{}', compat_shlex_quote(filepath))
+ return cmd
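The substitution rule above, runnable standalone (shlex.quote stands in for compat_shlex_quote):

    import shlex

    def substitute_filepath(cmd, info):
        filepath = info.get('filepath', info.get('_filename'))
        if filepath:                  # only for actual video downloads
            if '{}' not in cmd:
                cmd += ' {}'          # backward compatibility: append the path
            cmd = cmd.replace('{}', shlex.quote(filepath))
        return cmd

    print(substitute_filepath('echo', {'filepath': '/tmp/video file.mp4'}))
    # echo '/tmp/video file.mp4'
    print(substitute_filepath('echo done', {}))
    # echo done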
def run(self, info):
for tmpl in self.exec_cmd:
@@ -38,5 +40,10 @@ class ExecPP(PostProcessor):
return [], info
-class ExecAfterDownloadPP(ExecPP): # for backward compatibility
- pass
+# Deprecated
+class ExecAfterDownloadPP(ExecPP):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.deprecation_warning(
+ 'hypervideo_dl.postprocessor.ExecAfterDownloadPP is deprecated '
+ 'and may be removed in a future version. Use hypervideo_dl.postprocessor.ExecPP instead')
diff --git a/hypervideo_dl/postprocessor/ffmpeg.py b/hypervideo_dl/postprocessor/ffmpeg.py
index a6d6d78..3e6edcf 100644
--- a/hypervideo_dl/postprocessor/ffmpeg.py
+++ b/hypervideo_dl/postprocessor/ffmpeg.py
@@ -1,5 +1,6 @@
from __future__ import unicode_literals
+import collections
import io
import itertools
import os
@@ -12,21 +13,24 @@ from .common import AudioConversionError, PostProcessor
from ..compat import compat_str
from ..utils import (
+ determine_ext,
dfxp2srt,
encodeArgument,
encodeFilename,
float_or_none,
- get_exe_version,
+ _get_exe_version_output,
+ detect_exe_version,
is_outdated_version,
ISO639Utils,
orderedSet,
+ Popen,
PostProcessingError,
prepend_extension,
- process_communicate_or_kill,
replace_extension,
shell_quote,
traverse_obj,
variadic,
+ write_json_file,
)
@@ -41,6 +45,7 @@ EXT_TO_OUT_FORMATS = {
'ts': 'mpegts',
'wma': 'asf',
'wmv': 'asf',
+ 'vtt': 'webvtt',
}
ACODECS = {
'mp3': 'libmp3lame',
@@ -50,6 +55,7 @@ ACODECS = {
'opus': 'libopus',
'vorbis': 'libvorbis',
'wav': None,
+ 'alac': None,
}
@@ -74,15 +80,25 @@ class FFmpegPostProcessor(PostProcessor):
self.report_warning(warning)
@staticmethod
+ def get_versions_and_features(downloader=None):
+ pp = FFmpegPostProcessor(downloader)
+ return pp._versions, pp._features
+
+ @staticmethod
def get_versions(downloader=None):
- return FFmpegPostProcessor(downloader)._versions
+ return FFmpegPostProcessor.get_versions_and_features(downloader)[0]
+
+ _version_cache, _features_cache = {}, {}
def _determine_executables(self):
programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe']
- prefer_ffmpeg = True
- def get_ffmpeg_version(path):
- ver = get_exe_version(path, args=['-version'])
+ def get_ffmpeg_version(path, prog):
+ if path in self._version_cache:
+ self._versions[prog], self._features = self._version_cache[path], self._features_cache.get(path, {})
+ return
+ out = _get_exe_version_output(path, ['-bsfs'], to_screen=self.write_debug)
+ ver = detect_exe_version(out) if out else False
if ver:
regexs = [
r'(?:\d+:)?([0-9.]+)-[0-9]+ubuntu[0-9.]+$', # Ubuntu, see [1]
@@ -93,60 +109,66 @@ class FFmpegPostProcessor(PostProcessor):
mobj = re.match(regex, ver)
if mobj:
ver = mobj.group(1)
- return ver
+ self._versions[prog] = self._version_cache[path] = ver
+ if prog != 'ffmpeg' or not out:
+ return
+
+ mobj = re.search(r'(?m)^\s+libavformat\s+(?:[0-9. ]+)\s+/\s+(?P<runtime>[0-9. ]+)', out)
+ lavf_runtime_version = mobj.group('runtime').replace(' ', '') if mobj else None
+ self._features = self._features_cache[path] = {
+ 'fdk': '--enable-libfdk-aac' in out,
+ 'setts': 'setts' in out.splitlines(),
+ 'needs_adtstoasc': is_outdated_version(lavf_runtime_version, '57.56.100', False),
+ }
self.basename = None
self.probe_basename = None
-
self._paths = None
self._versions = None
- if self._downloader:
- prefer_ffmpeg = self.get_param('prefer_ffmpeg', True)
- location = self.get_param('ffmpeg_location')
- if location is not None:
- if not os.path.exists(location):
- self.report_warning(
- 'ffmpeg-location %s does not exist! '
- 'Continuing without ffmpeg.' % (location))
- self._versions = {}
- return
- elif os.path.isdir(location):
- dirname, basename = location, None
- else:
- basename = os.path.splitext(os.path.basename(location))[0]
- basename = next((p for p in programs if basename.startswith(p)), 'ffmpeg')
- dirname = os.path.dirname(os.path.abspath(location))
- if basename in ('ffmpeg', 'ffprobe'):
- prefer_ffmpeg = True
-
- self._paths = dict(
- (p, os.path.join(dirname, p)) for p in programs)
- if basename:
- self._paths[basename] = location
- self._versions = dict(
- (p, get_ffmpeg_version(self._paths[p])) for p in programs)
- if self._versions is None:
- self._versions = dict(
- (p, get_ffmpeg_version(p)) for p in programs)
- self._paths = dict((p, p) for p in programs)
+ self._features = {}
- if prefer_ffmpeg is False:
- prefs = ('avconv', 'ffmpeg')
+ prefer_ffmpeg = self.get_param('prefer_ffmpeg', True)
+ location = self.get_param('ffmpeg_location')
+ if location is None:
+ self._paths = {p: p for p in programs}
else:
- prefs = ('ffmpeg', 'avconv')
- for p in prefs:
- if self._versions[p]:
- self.basename = p
- break
-
+ if not os.path.exists(location):
+ self.report_warning(
+ 'ffmpeg-location %s does not exist! '
+ 'Continuing without ffmpeg.' % (location))
+ self._versions = {}
+ return
+ elif os.path.isdir(location):
+ dirname, basename = location, None
+ else:
+ basename = os.path.splitext(os.path.basename(location))[0]
+ basename = next((p for p in programs if basename.startswith(p)), 'ffmpeg')
+ dirname = os.path.dirname(os.path.abspath(location))
+ if basename in ('ffmpeg', 'ffprobe'):
+ prefer_ffmpeg = True
+
+ self._paths = dict(
+ (p, os.path.join(dirname, p)) for p in programs)
+ if basename:
+ self._paths[basename] = location
+
+ self._versions = {}
+ executables = {'basename': ('ffmpeg', 'avconv'), 'probe_basename': ('ffprobe', 'avprobe')}
if prefer_ffmpeg is False:
- prefs = ('avprobe', 'ffprobe')
- else:
- prefs = ('ffprobe', 'avprobe')
- for p in prefs:
- if self._versions[p]:
- self.probe_basename = p
- break
+ executables = {k: v[::-1] for k, v in executables.items()}
+ for var, prefs in executables.items():
+ for p in prefs:
+ get_ffmpeg_version(self._paths[p], p)
+ if self._versions[p]:
+ setattr(self, var, p)
+ break
+
+ if self.basename == 'avconv':
+ self.deprecation_warning(
+ 'Support for avconv is deprecated and may be removed in a future version. Use ffmpeg instead')
+ if self.probe_basename == 'avprobe':
+ self.deprecation_warning(
+ 'Support for avprobe is deprecated and may be removed in a future version. Use ffprobe instead')
@property
def available(self):
@@ -164,6 +186,18 @@ class FFmpegPostProcessor(PostProcessor):
def probe_executable(self):
return self._paths[self.probe_basename]
+ @staticmethod
+ def stream_copy_opts(copy=True, *, ext=None):
+ yield from ('-map', '0')
+ # Don't copy Apple TV chapters track, bin_data
+ # See https://github.com/hypervideo/hypervideo/issues/2, #19042, #19024, https://trac.ffmpeg.org/ticket/6016
+ yield from ('-dn', '-ignore_unknown')
+ if copy:
+ yield from ('-c', 'copy')
+ # For some reason, '-c copy -map 0' is not enough to copy subtitles
+ if ext in ('mp4', 'mov'):
+ yield from ('-c:s', 'mov_text')
+
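What the helper yields for a couple of containers (the generator reproduced standalone):

    def stream_copy_opts(copy=True, *, ext=None):
        yield from ('-map', '0')
        yield from ('-dn', '-ignore_unknown')    # drop data/unknown streams
        if copy:
            yield from ('-c', 'copy')
        if ext in ('mp4', 'mov'):
            yield from ('-c:s', 'mov_text')      # subtitles need an explicit codec here

    print(list(stream_copy_opts(ext='mkv')))
    # ['-map', '0', '-dn', '-ignore_unknown', '-c', 'copy']
    print(list(stream_copy_opts(ext='mp4')))
    # ['-map', '0', '-dn', '-ignore_unknown', '-c', 'copy', '-c:s', 'mov_text']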
def get_audio_codec(self, path):
if not self.probe_available and not self.available:
raise PostProcessingError('ffprobe and ffmpeg not found. Please install or provide the path using --ffmpeg-location')
@@ -178,10 +212,8 @@ class FFmpegPostProcessor(PostProcessor):
encodeArgument('-i')]
cmd.append(encodeFilename(self._ffmpeg_filename_argument(path), True))
self.write_debug('%s command line: %s' % (self.basename, shell_quote(cmd)))
- handle = subprocess.Popen(
- cmd, stderr=subprocess.PIPE,
- stdout=subprocess.PIPE, stdin=subprocess.PIPE)
- stdout_data, stderr_data = process_communicate_or_kill(handle)
+ handle = Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ stdout_data, stderr_data = handle.communicate_or_kill()
expected_ret = 0 if self.probe_available else 1
if handle.wait() != expected_ret:
return None
@@ -223,7 +255,7 @@ class FFmpegPostProcessor(PostProcessor):
cmd += opts
cmd.append(encodeFilename(self._ffmpeg_filename_argument(path), True))
self.write_debug('ffprobe command line: %s' % shell_quote(cmd))
- p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
+ p = Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
stdout, stderr = p.communicate()
return json.loads(stdout.decode('utf-8', 'replace'))
@@ -234,22 +266,23 @@ class FFmpegPostProcessor(PostProcessor):
None)
return num, len(streams)
- def _get_real_video_duration(self, info, fatal=True):
+ def _get_real_video_duration(self, filepath, fatal=True):
try:
- if '_real_duration' not in info:
- info['_real_duration'] = float_or_none(
- traverse_obj(self.get_metadata_object(info['filepath']), ('format', 'duration')))
- if not info['_real_duration']:
+ duration = float_or_none(
+ traverse_obj(self.get_metadata_object(filepath), ('format', 'duration')))
+ if not duration:
raise PostProcessingError('ffprobe returned empty duration')
+ return duration
except PostProcessingError as e:
if fatal:
- raise PostProcessingError(f'Unable to determine video duration; {e}')
- return info.setdefault('_real_duration', None)
+ raise PostProcessingError(f'Unable to determine video duration: {e.msg}')
def _duration_mismatch(self, d1, d2):
if not d1 or not d2:
return None
- return abs(d1 - d2) > 1
+ # The duration is often only known to the nearest second, so there can naturally be a <1sec disparity.
+ # Further allow an additional <1sec difference on top of that.
+ return abs(d1 - d2) > 2
def run_ffmpeg_multiple_files(self, input_paths, out_path, opts, **kwargs):
return self.real_run_ffmpeg(
@@ -269,8 +302,10 @@ class FFmpegPostProcessor(PostProcessor):
def make_args(file, args, name, number):
keys = ['_%s%d' % (name, number), '_%s' % name]
- if name == 'o' and number == 1:
- keys.append('')
+ if name == 'o':
+ args += ['-movflags', '+faststart']
+ if number == 1:
+ keys.append('')
args += self._configuration_args(self.basename, keys)
if name == 'i':
args.append('-i')
@@ -284,8 +319,8 @@ class FFmpegPostProcessor(PostProcessor):
for i, (path, opts) in enumerate(path_opts) if path)
self.write_debug('ffmpeg command line: %s' % shell_quote(cmd))
- p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
- stdout, stderr = process_communicate_or_kill(p)
+ p = Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
+ stdout, stderr = p.communicate_or_kill()
if p.returncode not in variadic(expected_retcodes):
stderr = stderr.decode('utf-8', 'replace').strip()
self.write_debug(stderr)
@@ -324,8 +359,9 @@ class FFmpegPostProcessor(PostProcessor):
timestamps = timestamps[1:]
keyframe_file = prepend_extension(filename, 'keyframes.temp')
self.to_screen(f'Re-encoding "{filename}" with appropriate keyframes')
- self.run_ffmpeg(filename, keyframe_file, ['-force_key_frames', ','.join(
- f'{t:.6f}' for t in timestamps)])
+ self.run_ffmpeg(filename, keyframe_file, [
+ *self.stream_copy_opts(False, ext=determine_ext(filename)),
+ '-force_key_frames', ','.join(f'{t:.6f}' for t in timestamps)])
return keyframe_file
def concat_files(self, in_files, out_file, concat_opts=None):
@@ -340,17 +376,12 @@ class FFmpegPostProcessor(PostProcessor):
with open(concat_file, 'wt', encoding='utf-8') as f:
f.writelines(self._concat_spec(in_files, concat_opts))
- out_flags = ['-c', 'copy']
- if out_file.rpartition('.')[-1] in ('mp4', 'mov'):
- # For some reason, '-c copy' is not enough to copy subtitles
- out_flags.extend(['-c:s', 'mov_text', '-movflags', '+faststart'])
+ out_flags = list(self.stream_copy_opts(ext=determine_ext(out_file)))
- try:
- self.real_run_ffmpeg(
- [(concat_file, ['-hide_banner', '-nostdin', '-f', 'concat', '-safe', '0'])],
- [(out_file, out_flags)])
- finally:
- os.remove(concat_file)
+ self.real_run_ffmpeg(
+ [(concat_file, ['-hide_banner', '-nostdin', '-f', 'concat', '-safe', '0'])],
+ [(out_file, out_flags)])
+ os.remove(concat_file)
@classmethod
def _concat_spec(cls, in_files, concat_opts=None):
@@ -367,14 +398,36 @@ class FFmpegPostProcessor(PostProcessor):
class FFmpegExtractAudioPP(FFmpegPostProcessor):
COMMON_AUDIO_EXTS = ('wav', 'flac', 'm4a', 'aiff', 'mp3', 'ogg', 'mka', 'opus', 'wma')
- SUPPORTED_EXTS = ('best', 'aac', 'flac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav')
+ SUPPORTED_EXTS = ('aac', 'flac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav', 'alac')
def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, nopostoverwrites=False):
FFmpegPostProcessor.__init__(self, downloader)
self._preferredcodec = preferredcodec or 'best'
- self._preferredquality = preferredquality
+ self._preferredquality = float_or_none(preferredquality)
self._nopostoverwrites = nopostoverwrites
+ def _quality_args(self, codec):
+ if self._preferredquality is None:
+ return []
+ elif self._preferredquality > 10:
+ return ['-b:a', f'{self._preferredquality}k']
+
+ limits = {
+ 'libmp3lame': (10, 0),
+ 'libvorbis': (0, 10),
+ # FFmpeg's AAC encoder does not have an upper limit for the value of -q:a.
+ # Experimentally, with values over 4, bitrate changes were minimal or non-existent
+ 'aac': (0.1, 4),
+ 'libfdk_aac': (1, 5),
+ }.get(codec)
+ if not limits:
+ return []
+
+ q = limits[1] + (limits[0] - limits[1]) * (self._preferredquality / 10)
+ if codec == 'libfdk_aac':
+ return ['-vbr', f'{int(q)}']
+ return ['-q:a', f'{q}']
+
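A worked example of the new mapping: the 0-10 --audio-quality scale is interpolated into each encoder's native range. With quality 5 and libvorbis, whose limits are (0, 10), q = 10 + (0 - 10) * (5 / 10) = 5.0; for libmp3lame the (10, 0) limits make the scale pass through unchanged. The method reproduced standalone:

    def quality_args(codec, preferredquality):
        if preferredquality is None:
            return []
        elif preferredquality > 10:
            return ['-b:a', f'{preferredquality}k']    # values above 10 are bitrates
        limits = {'libmp3lame': (10, 0), 'libvorbis': (0, 10),
                  'aac': (0.1, 4), 'libfdk_aac': (1, 5)}.get(codec)
        if not limits:
            return []
        q = limits[1] + (limits[0] - limits[1]) * (preferredquality / 10)
        if codec == 'libfdk_aac':
            return ['-vbr', f'{int(q)}']
        return ['-q:a', f'{q}']

    print(quality_args('libvorbis', 5.0))    # ['-q:a', '5.0']
    print(quality_args('libmp3lame', 0.0))   # ['-q:a', '0.0']  (0 = best for lame)
    print(quality_args('mp3', 192.0))        # ['-b:a', '192.0k']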
def run_ffmpeg(self, path, out_path, codec, more_opts):
if codec is None:
acodec_opts = []
@@ -388,7 +441,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
@PostProcessor._restrict_to(images=False)
def run(self, information):
- path = information['filepath']
+ orig_path = path = information['filepath']
orig_ext = information['ext']
if self._preferredcodec == 'best' and orig_ext in self.COMMON_AUDIO_EXTS:
@@ -414,69 +467,74 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
more_opts = ['-f', 'adts']
if filecodec == 'vorbis':
extension = 'ogg'
+ elif filecodec == 'alac':
+ acodec = None
+ extension = 'm4a'
+ more_opts += ['-acodec', 'alac']
else:
# MP3 otherwise.
acodec = 'libmp3lame'
extension = 'mp3'
- more_opts = []
- if self._preferredquality is not None:
- if int(self._preferredquality) < 10:
- more_opts += ['-q:a', self._preferredquality]
- else:
- more_opts += ['-b:a', self._preferredquality + 'k']
+ more_opts = self._quality_args(acodec)
else:
# We convert the audio (lossy if codec is lossy)
acodec = ACODECS[self._preferredcodec]
+ if acodec == 'aac' and self._features.get('fdk'):
+ acodec = 'libfdk_aac'
extension = self._preferredcodec
- more_opts = []
- if self._preferredquality is not None:
- # The opus codec doesn't support the -aq option
- if int(self._preferredquality) < 10 and extension != 'opus':
- more_opts += ['-q:a', self._preferredquality]
- else:
- more_opts += ['-b:a', self._preferredquality + 'k']
+ more_opts = self._quality_args(acodec)
if self._preferredcodec == 'aac':
more_opts += ['-f', 'adts']
- if self._preferredcodec == 'm4a':
+ elif self._preferredcodec == 'm4a':
more_opts += ['-bsf:a', 'aac_adtstoasc']
- if self._preferredcodec == 'vorbis':
+ elif self._preferredcodec == 'vorbis':
extension = 'ogg'
- if self._preferredcodec == 'wav':
+ elif self._preferredcodec == 'wav':
extension = 'wav'
more_opts += ['-f', 'wav']
+ elif self._preferredcodec == 'alac':
+ extension = 'm4a'
+ more_opts += ['-acodec', 'alac']
prefix, sep, ext = path.rpartition('.') # not os.path.splitext, since the latter does not work on unicode in all setups
- new_path = prefix + sep + extension
-
- information['filepath'] = new_path
- information['ext'] = extension
-
- # If we download foo.mp3 and convert it to... foo.mp3, then don't delete foo.mp3, silly.
- if (new_path == path
- or (self._nopostoverwrites and os.path.exists(encodeFilename(new_path)))):
+ temp_path = new_path = prefix + sep + extension
+
+ if new_path == path:
+ if acodec == 'copy':
+ self.to_screen(f'File is already in target format {self._preferredcodec}, skipping')
+ return [], information
+ orig_path = prepend_extension(path, 'orig')
+ temp_path = prepend_extension(path, 'temp')
+ if (self._nopostoverwrites and os.path.exists(encodeFilename(new_path))
+ and os.path.exists(encodeFilename(orig_path))):
self.to_screen('Post-process file %s exists, skipping' % new_path)
return [], information
try:
- self.to_screen('Destination: ' + new_path)
- self.run_ffmpeg(path, new_path, acodec, more_opts)
+ self.to_screen(f'Destination: {new_path}')
+ self.run_ffmpeg(path, temp_path, acodec, more_opts)
except AudioConversionError as e:
raise PostProcessingError(
'audio conversion failed: ' + e.msg)
except Exception:
raise PostProcessingError('error running ' + self.basename)
+ os.replace(path, orig_path)
+ os.replace(temp_path, new_path)
+ information['filepath'] = new_path
+ information['ext'] = extension
+
# Try to update the date time for extracted audio file.
if information.get('filetime') is not None:
self.try_utime(
new_path, time.time(), information['filetime'],
errnote='Cannot update utime of audio file')
- return [path], information
+ return [orig_path], information
class FFmpegVideoConvertorPP(FFmpegPostProcessor):
- SUPPORTED_EXTS = ('mp4', 'mkv', 'flv', 'webm', 'mov', 'avi', 'mp3', 'mka', 'm4a', 'ogg', 'opus')
+ SUPPORTED_EXTS = ('mp4', 'mkv', 'flv', 'webm', 'mov', 'avi', 'mka', 'ogg', *FFmpegExtractAudioPP.SUPPORTED_EXTS)
FORMAT_RE = re.compile(r'{0}(?:/{0})*$'.format(r'(?:\w+>)?(?:%s)' % '|'.join(SUPPORTED_EXTS)))
_ACTION = 'converting'
@@ -492,9 +550,9 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):
@staticmethod
def _options(target_ext):
+ yield from FFmpegPostProcessor.stream_copy_opts(False)
if target_ext == 'avi':
- return ['-c:v', 'libxvid', '-vtag', 'XVID']
- return []
+ yield from ('-c:v', 'libxvid', '-vtag', 'XVID')
@PostProcessor._restrict_to(images=False)
def run(self, info):
@@ -505,7 +563,7 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):
else f'already is in target format {source_ext}' if source_ext == target_ext
else None)
if _skip_msg:
- self.to_screen(f'Not {self._ACTION} media file {filename!r}; {_skip_msg}')
+ self.to_screen(f'Not {self._ACTION} media file "{filename}"; {_skip_msg}')
return [], info
outpath = replace_extension(filename, target_ext, source_ext)
@@ -522,10 +580,7 @@ class FFmpegVideoRemuxerPP(FFmpegVideoConvertorPP):
@staticmethod
def _options(target_ext):
- options = ['-c', 'copy', '-map', '0', '-dn']
- if target_ext in ['mp4', 'm4a', 'mov']:
- options.extend(['-movflags', '+faststart'])
- return options
+ return FFmpegPostProcessor.stream_copy_opts()
class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
@@ -534,22 +589,28 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
self._already_have_subtitle = already_have_subtitle
@PostProcessor._restrict_to(images=False)
- def run(self, information):
- if information['ext'] not in ('mp4', 'webm', 'mkv'):
+ def run(self, info):
+ if info['ext'] not in ('mp4', 'webm', 'mkv'):
self.to_screen('Subtitles can only be embedded in mp4, webm or mkv files')
- return [], information
- subtitles = information.get('requested_subtitles')
+ return [], info
+ subtitles = info.get('requested_subtitles')
if not subtitles:
self.to_screen('There aren\'t any subtitles to embed')
- return [], information
+ return [], info
+
+ filename = info['filepath']
- filename = information['filepath']
- if information.get('duration') and self._duration_mismatch(
- self._get_real_video_duration(information, False), information['duration']):
+        # Disabled temporarily. There needs to be a way to override this
+        # in case the duration reported by the extractor is actually wrong
+ # See: https://github.com/hypervideo/hypervideo/issues/1870, https://github.com/hypervideo/hypervideo/issues/1385
+ '''
+ if info.get('duration') and not info.get('__real_download') and self._duration_mismatch(
+ self._get_real_video_duration(filename, False), info['duration']):
self.to_screen(f'Skipping {self.pp_key()} since the real and expected durations mismatch')
- return [], information
+ return [], info
+ '''
- ext = information['ext']
+ ext = info['ext']
sub_langs, sub_names, sub_filenames = [], [], []
webm_vtt_warn = False
mp4_ass_warn = False
@@ -574,21 +635,16 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
self.report_warning('ASS subtitles cannot be properly embedded in mp4 files; expect issues')
if not sub_langs:
- return [], information
+ return [], info
input_files = [filename] + sub_filenames
opts = [
- '-c', 'copy', '-map', '0', '-dn',
+ *self.stream_copy_opts(ext=info['ext']),
            # Don't copy the existing subtitles, because we may be running the
            # postprocessor a second time
'-map', '-0:s',
- # Don't copy Apple TV chapters track, bin_data (see #19042, #19024,
- # https://trac.ffmpeg.org/ticket/6016)
- '-map', '-0:d',
]
- if information['ext'] == 'mp4':
- opts += ['-c:s', 'mov_text']
for i, (lang, name) in enumerate(zip(sub_langs, sub_names)):
opts.extend(['-map', '%d:0' % (i + 1)])
lang_code = ISO639Utils.short2long(lang) or lang
@@ -603,34 +659,44 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
os.replace(temp_filename, filename)
files_to_delete = [] if self._already_have_subtitle else sub_filenames
- return files_to_delete, information
+ return files_to_delete, info
class FFmpegMetadataPP(FFmpegPostProcessor):
- def __init__(self, downloader, add_metadata=True, add_chapters=True):
+ def __init__(self, downloader, add_metadata=True, add_chapters=True, add_infojson='if_exists'):
FFmpegPostProcessor.__init__(self, downloader)
self._add_metadata = add_metadata
self._add_chapters = add_chapters
+ self._add_infojson = add_infojson
@staticmethod
def _options(target_ext):
- yield from ('-map', '0', '-dn')
- if target_ext == 'm4a':
+ audio_only = target_ext == 'm4a'
+ yield from FFmpegPostProcessor.stream_copy_opts(not audio_only)
+ if audio_only:
yield from ('-vn', '-acodec', 'copy')
- else:
- yield from ('-c', 'copy')
@PostProcessor._restrict_to(images=False)
def run(self, info):
filename, metadata_filename = info['filepath'], None
- options = []
+ files_to_delete, options = [], []
if self._add_chapters and info.get('chapters'):
metadata_filename = replace_extension(filename, 'meta')
options.extend(self._get_chapter_opts(info['chapters'], metadata_filename))
+ files_to_delete.append(metadata_filename)
if self._add_metadata:
options.extend(self._get_metadata_opts(info))
+ if self._add_infojson:
+ if info['ext'] in ('mkv', 'mka'):
+ infojson_filename = info.get('infojson_filename')
+ options.extend(self._get_infojson_opts(info, infojson_filename))
+ if not infojson_filename:
+ files_to_delete.append(info.get('infojson_filename'))
+ elif self._add_infojson is True:
+ self.to_screen('The info-json can only be attached to mkv/mka files')
+
if not options:
self.to_screen('There isn\'t any metadata to add')
return [], info
@@ -640,8 +706,8 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
self.run_ffmpeg_multiple_files(
(filename, metadata_filename), temp_filename,
itertools.chain(self._options(info['ext']), *options))
- if metadata_filename:
- os.remove(metadata_filename)
+ for file in filter(None, files_to_delete):
+ os.remove(file) # Don't obey --keep-files
os.replace(temp_filename, filename)
return [], info
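
The run() above passes self._options(...) plus a list of option tuples through itertools.chain, which flattens everything into one argument vector for ffmpeg. A small illustration with assumed values:

    import itertools

    base = ['-map', '0', '-dn', '-c', 'copy']                     # from _options()
    options = [('-map_metadata', '1'), ('-metadata', 'title=x')]  # collected opts
    print(list(itertools.chain(base, *options)))
    # ['-map', '0', '-dn', '-c', 'copy', '-map_metadata', '1', '-metadata', 'title=x']
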
@@ -663,15 +729,15 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
yield ('-map_metadata', '1')
def _get_metadata_opts(self, info):
- metadata = {}
- meta_prefix = 'meta_'
+ meta_prefix = 'meta'
+ metadata = collections.defaultdict(dict)
def add(meta_list, info_list=None):
value = next((
- str(info[key]) for key in [meta_prefix] + list(variadic(info_list or meta_list))
+ str(info[key]) for key in [f'{meta_prefix}_'] + list(variadic(info_list or meta_list))
if info.get(key) is not None), None)
if value not in ('', None):
- metadata.update({meta_f: value for meta_f in variadic(meta_list)})
+ metadata['common'].update({meta_f: value for meta_f in variadic(meta_list)})
# See [1-4] for some info on media metadata/metadata supported
# by ffmpeg.
@@ -693,32 +759,50 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
add('season_number')
add('episode_id', ('episode', 'episode_id'))
add('episode_sort', 'episode_number')
+ if 'embed-metadata' in self.get_param('compat_opts', []):
+ add('comment', 'description')
+ metadata['common'].pop('synopsis', None)
+ meta_regex = rf'{re.escape(meta_prefix)}(?P<i>\d+)?_(?P<key>.+)'
for key, value in info.items():
- if value is not None and key != meta_prefix and key.startswith(meta_prefix):
- metadata[key[len(meta_prefix):]] = value
+ mobj = re.fullmatch(meta_regex, key)
+ if value is not None and mobj:
+ metadata[mobj.group('i') or 'common'][mobj.group('key')] = value
- for name, value in metadata.items():
+ for name, value in metadata['common'].items():
yield ('-metadata', f'{name}={value}')
stream_idx = 0
for fmt in info.get('requested_formats') or []:
stream_count = 2 if 'none' not in (fmt.get('vcodec'), fmt.get('acodec')) else 1
- if fmt.get('language'):
- lang = ISO639Utils.short2long(fmt['language']) or fmt['language']
- for i in range(stream_count):
- yield ('-metadata:s:%d' % (stream_idx + i), 'language=%s' % lang)
+ lang = ISO639Utils.short2long(fmt.get('language') or '') or fmt.get('language')
+ for i in range(stream_idx, stream_idx + stream_count):
+ if lang:
+ metadata[str(i)].setdefault('language', lang)
+ for name, value in metadata[str(i)].items():
+ yield (f'-metadata:s:{i}', f'{name}={value}')
stream_idx += stream_count
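
The meta_regex above routes user-supplied keys either to the common tags or to a numbered stream: meta_<key> is common, meta<N>_<key> targets stream N. A self-contained sketch of that routing (the info dict is made up):

    import collections
    import re

    meta_regex = r'meta(?P<i>\d+)?_(?P<key>.+)'
    metadata = collections.defaultdict(dict)
    info = {'meta_title': 'My Title', 'meta1_language': 'eng', 'id': 'x'}
    for key, value in info.items():
        mobj = re.fullmatch(meta_regex, key)
        if value is not None and mobj:
            metadata[mobj.group('i') or 'common'][mobj.group('key')] = value
    # metadata == {'common': {'title': 'My Title'}, '1': {'language': 'eng'}}
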
- if ('no-attach-info-json' not in self.get_param('compat_opts', [])
- and '__infojson_filename' in info and info['ext'] in ('mkv', 'mka')):
- old_stream, new_stream = self.get_stream_number(info['filepath'], ('tags', 'mimetype'), 'application/json')
- if old_stream is not None:
- yield ('-map', '-0:%d' % old_stream)
- new_stream -= 1
+ def _get_infojson_opts(self, info, infofn):
+ if not infofn or not os.path.exists(infofn):
+ if self._add_infojson is not True:
+ return
+ infofn = infofn or '%s.temp' % (
+ self._downloader.prepare_filename(info, 'infojson')
+ or replace_extension(self._downloader.prepare_filename(info), 'info.json', info['ext']))
+ if not self._downloader._ensure_dir_exists(infofn):
+ return
+ self.write_debug(f'Writing info-json to: {infofn}')
+ write_json_file(self._downloader.sanitize_info(info, self.get_param('clean_infojson', True)), infofn)
+ info['infojson_filename'] = infofn
+
+ old_stream, new_stream = self.get_stream_number(info['filepath'], ('tags', 'mimetype'), 'application/json')
+ if old_stream is not None:
+ yield ('-map', '-0:%d' % old_stream)
+ new_stream -= 1
- yield ('-attach', info['__infojson_filename'],
- '-metadata:s:%d' % new_stream, 'mimetype=application/json')
+ yield ('-attach', infofn,
+ '-metadata:s:%d' % new_stream, 'mimetype=application/json')
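
For orientation, the generator above ends up yielding arguments of roughly this shape when attaching an info-json to an mkv (the stream number 2 is an assumption for the example):

    # ('-map', '-0:2')   only if a json attachment already exists, so a rerun
    #                    replaces it instead of adding a duplicate
    # ('-attach', 'video.info.json',
    #  '-metadata:s:2', 'mimetype=application/json')
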
class FFmpegMergerPP(FFmpegPostProcessor):
@@ -775,7 +859,7 @@ class FFmpegFixupStretchedPP(FFmpegFixupPostProcessor):
stretched_ratio = info.get('stretched_ratio')
if stretched_ratio not in (None, 1):
self._fixup('Fixing aspect ratio', info['filepath'], [
- '-c', 'copy', '-map', '0', '-dn', '-aspect', '%f' % stretched_ratio])
+ *self.stream_copy_opts(), '-aspect', '%f' % stretched_ratio])
return [], info
@@ -783,17 +867,27 @@ class FFmpegFixupM4aPP(FFmpegFixupPostProcessor):
@PostProcessor._restrict_to(images=False, video=False)
def run(self, info):
if info.get('container') == 'm4a_dash':
- self._fixup('Correcting container', info['filepath'], [
- '-c', 'copy', '-map', '0', '-dn', '-f', 'mp4'])
+ self._fixup('Correcting container', info['filepath'], [*self.stream_copy_opts(), '-f', 'mp4'])
return [], info
class FFmpegFixupM3u8PP(FFmpegFixupPostProcessor):
+ def _needs_fixup(self, info):
+ yield info['ext'] in ('mp4', 'm4a')
+ yield info['protocol'].startswith('m3u8')
+ try:
+ metadata = self.get_metadata_object(info['filepath'])
+ except PostProcessingError as e:
+ self.report_warning(f'Unable to extract metadata: {e.msg}')
+ yield True
+ else:
+ yield traverse_obj(metadata, ('format', 'format_name'), casesense=False) == 'mpegts'
+
@PostProcessor._restrict_to(images=False)
def run(self, info):
- if self.get_audio_codec(info['filepath']) == 'aac':
- self._fixup('Fixing malformed AAC bitstream', info['filepath'], [
- '-c', 'copy', '-map', '0', '-dn', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'])
+ if all(self._needs_fixup(info)):
+ self._fixup('Fixing MPEG-TS in MP4 container', info['filepath'], [
+ *self.stream_copy_opts(), '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'])
return [], info
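
_needs_fixup is a generator consumed by all(), so the checks run lazily: the relatively expensive ffprobe call is only reached when the cheap extension and protocol checks have already passed. A minimal demonstration of the pattern:

    def needs_fixup(info, probe):
        yield info['ext'] in ('mp4', 'm4a')
        yield info['protocol'].startswith('m3u8')
        yield probe(info['filepath']) == 'mpegts'  # evaluated only if both above held

    info = {'ext': 'mkv', 'protocol': 'm3u8_native', 'filepath': 'f'}
    print(all(needs_fixup(info, lambda _: 'mpegts')))  # False; probe is never called
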
@@ -807,25 +901,34 @@ class FFmpegFixupTimestampPP(FFmpegFixupPostProcessor):
@PostProcessor._restrict_to(images=False)
def run(self, info):
- required_version = '4.4'
- if is_outdated_version(self._versions[self.basename], required_version):
+ if not self._features.get('setts'):
self.report_warning(
'A re-encode is needed to fix timestamps in older versions of ffmpeg. '
- f'Please install ffmpeg {required_version} or later to fixup without re-encoding')
+ 'Please install ffmpeg 4.4 or later to fixup without re-encoding')
opts = ['-vf', 'setpts=PTS-STARTPTS']
else:
opts = ['-c', 'copy', '-bsf', 'setts=ts=TS-STARTPTS']
- self._fixup('Fixing frame timestamp', info['filepath'], opts + ['-map', '0', '-dn', '-ss', self.trim])
+ self._fixup('Fixing frame timestamp', info['filepath'], opts + [*self.stream_copy_opts(False), '-ss', self.trim])
return [], info
-class FFmpegFixupDurationPP(FFmpegFixupPostProcessor):
+class FFmpegCopyStreamPP(FFmpegFixupPostProcessor):
+ MESSAGE = 'Copying stream'
+
@PostProcessor._restrict_to(images=False)
def run(self, info):
- self._fixup('Fixing video duration', info['filepath'], ['-c', 'copy', '-map', '0', '-dn'])
+ self._fixup(self.MESSAGE, info['filepath'], self.stream_copy_opts())
return [], info
+class FFmpegFixupDurationPP(FFmpegCopyStreamPP):
+ MESSAGE = 'Fixing video duration'
+
+
+class FFmpegFixupDuplicateMoovPP(FFmpegCopyStreamPP):
+ MESSAGE = 'Fixing duplicate MOOV atoms'
+
+
class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
SUPPORTED_EXTS = ('srt', 'vtt', 'ass', 'lrc')
@@ -942,14 +1045,14 @@ class FFmpegSplitChaptersPP(FFmpegPostProcessor):
self.to_screen('Splitting video by chapters; %d chapters found' % len(chapters))
for idx, chapter in enumerate(chapters):
destination, opts = self._ffmpeg_args_for_chapter(idx + 1, chapter, info)
- self.real_run_ffmpeg([(in_file, opts)], [(destination, ['-c', 'copy'])])
+ self.real_run_ffmpeg([(in_file, opts)], [(destination, self.stream_copy_opts())])
if in_file != info['filepath']:
os.remove(in_file)
return [], info
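
Illustrative only: the per-chapter call above takes one option list per input and per output, so each split ends up shaped roughly like this (paths and timestamps are made up):

    # real_run_ffmpeg(
    #     [('video.mkv', ['-ss', '0.000000', '-to', '120.000000'])],
    #     [('video - 001 Intro.mkv', ['-map', '0', '-dn', '-c', 'copy'])],
    # )
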
class FFmpegThumbnailsConvertorPP(FFmpegPostProcessor):
- SUPPORTED_EXTS = ('jpg', 'png')
+ SUPPORTED_EXTS = ('jpg', 'png', 'webp')
def __init__(self, downloader=None, format=None):
super(FFmpegThumbnailsConvertorPP, self).__init__(downloader)
@@ -993,12 +1096,12 @@ class FFmpegThumbnailsConvertorPP(FFmpegPostProcessor):
files_to_delete = []
has_thumbnail = False
- for idx, thumbnail_dict in enumerate(info['thumbnails']):
- if 'filepath' not in thumbnail_dict:
+ for idx, thumbnail_dict in enumerate(info.get('thumbnails') or []):
+ original_thumbnail = thumbnail_dict.get('filepath')
+ if not original_thumbnail:
continue
has_thumbnail = True
self.fixup_webp(info, idx)
- original_thumbnail = thumbnail_dict['filepath']
_, thumbnail_ext = os.path.splitext(original_thumbnail)
if thumbnail_ext:
thumbnail_ext = thumbnail_ext[1:].lower()
@@ -1015,3 +1118,57 @@ class FFmpegThumbnailsConvertorPP(FFmpegPostProcessor):
if not has_thumbnail:
self.to_screen('There aren\'t any thumbnails to convert')
return files_to_delete, info
+
+
+class FFmpegConcatPP(FFmpegPostProcessor):
+ def __init__(self, downloader, only_multi_video=False):
+ self._only_multi_video = only_multi_video
+ super().__init__(downloader)
+
+ def _get_codecs(self, file):
+ codecs = traverse_obj(self.get_metadata_object(file), ('streams', ..., 'codec_name'))
+ self.write_debug(f'Codecs = {", ".join(codecs)}')
+ return tuple(codecs)
+
+ def concat_files(self, in_files, out_file):
+ if not self._downloader._ensure_dir_exists(out_file):
+ return
+ if len(in_files) == 1:
+ if os.path.realpath(in_files[0]) != os.path.realpath(out_file):
+ self.to_screen(f'Moving "{in_files[0]}" to "{out_file}"')
+ os.replace(in_files[0], out_file)
+ return []
+
+ if len(set(map(self._get_codecs, in_files))) > 1:
+ raise PostProcessingError(
+ 'The files have different streams/codecs and cannot be concatenated. '
+ 'Either select different formats or --recode-video them to a common format')
+
+ self.to_screen(f'Concatenating {len(in_files)} files; Destination: {out_file}')
+ super().concat_files(in_files, out_file)
+ return in_files
+
+ @PostProcessor._restrict_to(images=False, simulated=False)
+ def run(self, info):
+ entries = info.get('entries') or []
+ if not any(entries) or (self._only_multi_video and info['_type'] != 'multi_video'):
+ return [], info
+ elif traverse_obj(entries, (..., 'requested_downloads', lambda _, v: len(v) > 1)):
+ raise PostProcessingError('Concatenation is not supported when downloading multiple separate formats')
+
+ in_files = traverse_obj(entries, (..., 'requested_downloads', 0, 'filepath')) or []
+ if len(in_files) < len(entries):
+ raise PostProcessingError('Aborting concatenation because some downloads failed')
+
+ ie_copy = self._downloader._playlist_infodict(info)
+ exts = traverse_obj(entries, (..., 'requested_downloads', 0, 'ext'), (..., 'ext'))
+ ie_copy['ext'] = exts[0] if len(set(exts)) == 1 else 'mkv'
+ out_file = self._downloader.prepare_filename(ie_copy, 'pl_video')
+
+ files_to_delete = self.concat_files(in_files, out_file)
+
+ info['requested_downloads'] = [{
+ 'filepath': out_file,
+ 'ext': ie_copy['ext'],
+ }]
+ return files_to_delete, info
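
The codec check above only allows lossless concatenation when every input reports the same ordered codec tuple. The core of that test, reduced to a sketch:

    def can_concat(codec_lists):
        # identical ordered codec tuples across all inputs
        return len({tuple(c) for c in codec_lists}) == 1

    print(can_concat([['h264', 'aac'], ['h264', 'aac']]))  # True
    print(can_concat([['h264', 'aac'], ['vp9', 'opus']]))  # False
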
diff --git a/hypervideo_dl/postprocessor/metadataparser.py b/hypervideo_dl/postprocessor/metadataparser.py
index 96aac9b..01ee6c1 100644
--- a/hypervideo_dl/postprocessor/metadataparser.py
+++ b/hypervideo_dl/postprocessor/metadataparser.py
@@ -1,5 +1,4 @@
import re
-
from enum import Enum
from .common import PostProcessor
@@ -16,7 +15,7 @@ class MetadataParserPP(PostProcessor):
for f in actions:
action = f[0]
assert isinstance(action, self.Actions)
- self._actions.append(getattr(self, action._value_)(*f[1:]))
+ self._actions.append(getattr(self, action.value)(*f[1:]))
@classmethod
def validate_action(cls, action, *data):
@@ -26,12 +25,17 @@ class MetadataParserPP(PostProcessor):
'''
if not isinstance(action, cls.Actions):
raise ValueError(f'{action!r} is not a valid action')
- getattr(cls, action._value_)(cls, *data)
+        getattr(cls, action.value)(cls, *data)  # invoked so that invalid data raises an error during validation
@staticmethod
def field_to_template(tmpl):
if re.match(r'[a-zA-Z_]+$', tmpl):
return f'%({tmpl})s'
+
+ from ..YoutubeDL import YoutubeDL
+ err = YoutubeDL.validate_outtmpl(tmpl)
+ if err:
+ raise err
return tmpl
@staticmethod
@@ -66,7 +70,7 @@ class MetadataParserPP(PostProcessor):
self.write_debug(f'Searching for {out_re.pattern!r} in {template!r}')
match = out_re.search(data_to_parse)
if match is None:
- self.report_warning(f'Could not interpret {inp!r} as {out!r}')
+ self.to_screen(f'Could not interpret {inp!r} as {out!r}')
return
for attribute, value in match.groupdict().items():
info[attribute] = value
@@ -80,7 +84,7 @@ class MetadataParserPP(PostProcessor):
def f(info):
val = info.get(field)
if val is None:
- self.report_warning(f'Video does not have a {field}')
+ self.to_screen(f'Video does not have a {field}')
return
elif not isinstance(val, str):
self.report_warning(f'Cannot replace in field {field} since it is a {type(val).__name__}')
@@ -99,18 +103,23 @@ class MetadataParserPP(PostProcessor):
class MetadataFromFieldPP(MetadataParserPP):
@classmethod
def to_action(cls, f):
- match = re.match(r'(?P<in>.*?)(?<!\\):(?P<out>.+)$', f)
+ match = re.match(r'(?s)(?P<in>.*?)(?<!\\):(?P<out>.+)$', f)
if match is None:
raise ValueError(f'it should be FROM:TO, not {f!r}')
return (
cls.Actions.INTERPRET,
match.group('in').replace('\\:', ':'),
- match.group('out'))
+ match.group('out'),
+ )
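
to_action splits FROM:TO on the first unescaped colon; the new (?s) flag lets FROM and TO span newlines, and a backslash escapes a literal colon inside FROM. A standalone sketch of the split:

    import re

    def split_from_to(f):
        match = re.match(r'(?s)(?P<in>.*?)(?<!\\):(?P<out>.+)$', f)
        assert match, f'it should be FROM:TO, not {f!r}'
        return match.group('in').replace('\\:', ':'), match.group('out')

    print(split_from_to('%(title)s:%(artist)s - %(track)s'))
    # ('%(title)s', '%(artist)s - %(track)s')
    print(split_from_to(r'a\:b:c'))
    # ('a:b', 'c')
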
def __init__(self, downloader, formats):
- MetadataParserPP.__init__(self, downloader, [self.to_action(f) for f in formats])
+ super().__init__(downloader, [self.to_action(f) for f in formats])
-class MetadataFromTitlePP(MetadataParserPP): # for backward compatibility
+# Deprecated
+class MetadataFromTitlePP(MetadataParserPP):
def __init__(self, downloader, titleformat):
- MetadataParserPP.__init__(self, downloader, [(self.Actions.INTERPRET, 'title', titleformat)])
+ super().__init__(downloader, [(self.Actions.INTERPRET, 'title', titleformat)])
+ self.deprecation_warning(
+ 'hypervideo_dl.postprocessor.MetadataFromTitlePP is deprecated '
+ 'and may be removed in a future version. Use hypervideo_dl.postprocessor.MetadataFromFieldPP instead')
diff --git a/hypervideo_dl/postprocessor/modify_chapters.py b/hypervideo_dl/postprocessor/modify_chapters.py
index a0818c4..22506bc 100644
--- a/hypervideo_dl/postprocessor/modify_chapters.py
+++ b/hypervideo_dl/postprocessor/modify_chapters.py
@@ -24,19 +24,21 @@ class ModifyChaptersPP(FFmpegPostProcessor):
*, sponsorblock_chapter_title=DEFAULT_SPONSORBLOCK_CHAPTER_TITLE, force_keyframes=False):
FFmpegPostProcessor.__init__(self, downloader)
self._remove_chapters_patterns = set(remove_chapters_patterns or [])
- self._remove_sponsor_segments = set(remove_sponsor_segments or [])
+ self._remove_sponsor_segments = set(remove_sponsor_segments or []) - set(SponsorBlockPP.POI_CATEGORIES.keys())
self._ranges_to_remove = set(remove_ranges or [])
self._sponsorblock_chapter_title = sponsorblock_chapter_title
self._force_keyframes = force_keyframes
@PostProcessor._restrict_to(images=False)
def run(self, info):
+ # Chapters must be preserved intact when downloading multiple formats of the same video.
chapters, sponsor_chapters = self._mark_chapters_to_remove(
- info.get('chapters') or [], info.get('sponsorblock_chapters') or [])
+ copy.deepcopy(info.get('chapters')) or [],
+ copy.deepcopy(info.get('sponsorblock_chapters')) or [])
if not chapters and not sponsor_chapters:
return [], info
- real_duration = self._get_real_video_duration(info)
+ real_duration = self._get_real_video_duration(info['filepath'])
if not chapters:
chapters = [{'start_time': 0, 'end_time': real_duration, 'title': info['title']}]
@@ -55,6 +57,7 @@ class ModifyChaptersPP(FFmpegPostProcessor):
self.write_debug('Expected and actual durations mismatch')
concat_opts = self._make_concat_opts(cuts, real_duration)
+ self.write_debug('Concat spec = %s' % ', '.join(f'{c.get("inpoint", 0.0)}-{c.get("outpoint", "inf")}' for c in concat_opts))
def remove_chapters(file, is_sub):
return file, self.remove_chapters(file, cuts, concat_opts, self._force_keyframes and not is_sub)
@@ -65,12 +68,13 @@ class ModifyChaptersPP(FFmpegPostProcessor):
# Renaming should only happen after all files are processed
files_to_remove = []
for in_file, out_file in in_out_files:
+ mtime = os.stat(in_file).st_mtime
uncut_file = prepend_extension(in_file, 'uncut')
os.replace(in_file, uncut_file)
os.replace(out_file, in_file)
+ self.try_utime(in_file, mtime, mtime)
files_to_remove.append(uncut_file)
- info['_real_duration'] = info['chapters'][-1]['end_time']
return files_to_remove, info
def _mark_chapters_to_remove(self, chapters, sponsor_chapters):
@@ -126,7 +130,7 @@ class ModifyChaptersPP(FFmpegPostProcessor):
cuts = []
def append_cut(c):
- assert 'remove' in c
+            assert 'remove' in c, 'Only cuts may be appended to cuts'
last_to_cut = cuts[-1] if cuts else None
if last_to_cut and last_to_cut['end_time'] >= c['start_time']:
last_to_cut['end_time'] = max(last_to_cut['end_time'], c['end_time'])
@@ -154,7 +158,7 @@ class ModifyChaptersPP(FFmpegPostProcessor):
new_chapters = []
def append_chapter(c):
- assert 'remove' not in c
+            assert 'remove' not in c, 'Cuts must not be appended to chapters'
length = c['end_time'] - c['start_time'] - excess_duration(c)
# Chapter is completely covered by cuts or sponsors.
if length <= 0:
@@ -237,7 +241,7 @@ class ModifyChaptersPP(FFmpegPostProcessor):
heapq.heappush(chapters, (c['start_time'], i, c))
# (normal, sponsor) and (sponsor, sponsor)
else:
- assert '_categories' in c
+ assert '_categories' in c, 'Normal chapters overlap'
cur_chapter['_was_cut'] = True
c['_was_cut'] = True
# Push the part after the sponsor to PQ.
@@ -301,7 +305,7 @@ class ModifyChaptersPP(FFmpegPostProcessor):
'name': SponsorBlockPP.CATEGORIES[category],
'category_names': [SponsorBlockPP.CATEGORIES[c] for c in cats]
})
- c['title'] = self._downloader.evaluate_outtmpl(self._sponsorblock_chapter_title, c)
+ c['title'] = self._downloader.evaluate_outtmpl(self._sponsorblock_chapter_title, c.copy())
# Merge identically named sponsors.
if (new_chapters and 'categories' in new_chapters[-1]
and new_chapters[-1]['title'] == c['title']):
@@ -331,6 +335,6 @@ class ModifyChaptersPP(FFmpegPostProcessor):
continue
opts[-1]['outpoint'] = f'{s["start_time"]:.6f}'
# Do not create 0 duration chunk at the end.
- if s['end_time'] != duration:
+ if s['end_time'] < duration:
opts.append({'inpoint': f'{s["end_time"]:.6f}'})
return opts
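
With the comparison changed to <, a cut that runs exactly to the end of the video no longer produces an empty trailing chunk. For a 90-second video with cuts at 10-20s and 50-60s, the concat script entries come out roughly as (values illustrative):

    # [{'outpoint': '10.000000'},
    #  {'inpoint': '20.000000', 'outpoint': '50.000000'},
    #  {'inpoint': '60.000000'}]
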
diff --git a/hypervideo_dl/postprocessor/sponskrub.py b/hypervideo_dl/postprocessor/sponskrub.py
index 932555a..400cbcc 100644
--- a/hypervideo_dl/postprocessor/sponskrub.py
+++ b/hypervideo_dl/postprocessor/sponskrub.py
@@ -11,9 +11,9 @@ from ..utils import (
encodeFilename,
shell_quote,
str_or_none,
+ Popen,
PostProcessingError,
prepend_extension,
- process_communicate_or_kill,
)
@@ -22,13 +22,18 @@ class SponSkrubPP(PostProcessor):
_temp_ext = 'spons'
_exe_name = 'sponskrub'
- def __init__(self, downloader, path='', args=None, ignoreerror=False, cut=False, force=False):
+ def __init__(self, downloader, path='', args=None, ignoreerror=False, cut=False, force=False, _from_cli=False):
PostProcessor.__init__(self, downloader)
self.force = force
self.cutout = cut
self.args = str_or_none(args) or '' # For backward compatibility
self.path = self.get_exe(path)
+ if not _from_cli:
+ self.deprecation_warning(
+ 'hypervideo_dl.postprocessor.SponSkrubPP support is deprecated and may be removed in a future version. '
+ 'Use hypervideo_dl.postprocessor.SponsorBlock and hypervideo_dl.postprocessor.ModifyChaptersPP instead')
+
if not ignoreerror and self.path is None:
if path:
raise PostProcessingError('sponskrub not found in "%s"' % path)
@@ -81,8 +86,8 @@ class SponSkrubPP(PostProcessor):
self.write_debug('sponskrub command line: %s' % shell_quote(cmd))
pipe = None if self.get_param('verbose') else subprocess.PIPE
- p = subprocess.Popen(cmd, stdout=pipe)
- stdout = process_communicate_or_kill(p)[0]
+ p = Popen(cmd, stdout=pipe)
+ stdout = p.communicate_or_kill()[0]
if p.returncode == 0:
os.replace(temp_filename, filename)
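
Popen here is the project's subprocess wrapper; its communicate_or_kill() mirrors the removed process_communicate_or_kill() helper, which behaved roughly like this:

    def process_communicate_or_kill(p, *args, **kwargs):
        try:
            return p.communicate(*args, **kwargs)
        except BaseException:  # including KeyboardInterrupt
            p.kill()
            p.wait()
            raise
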
diff --git a/hypervideo_dl/postprocessor/sponsorblock.py b/hypervideo_dl/postprocessor/sponsorblock.py
index 7265a9d..7943014 100644
--- a/hypervideo_dl/postprocessor/sponsorblock.py
+++ b/hypervideo_dl/postprocessor/sponsorblock.py
@@ -1,25 +1,29 @@
+from hashlib import sha256
import json
import re
-from hashlib import sha256
from .ffmpeg import FFmpegPostProcessor
-from ..compat import compat_urllib_parse_urlencode, compat_HTTPError
-from ..utils import PostProcessingError, network_exceptions, sanitized_Request
+from ..compat import compat_urllib_parse_urlencode
class SponsorBlockPP(FFmpegPostProcessor):
-
+ # https://wiki.sponsor.ajay.app/w/Types
EXTRACTORS = {
'Youtube': 'YouTube',
}
+ POI_CATEGORIES = {
+ 'poi_highlight': 'Highlight',
+ }
CATEGORIES = {
'sponsor': 'Sponsor',
'intro': 'Intermission/Intro Animation',
'outro': 'Endcards/Credits',
'selfpromo': 'Unpaid/Self Promotion',
- 'interaction': 'Interaction Reminder',
'preview': 'Preview/Recap',
- 'music_offtopic': 'Non-Music Section'
+ 'filler': 'Filler Tangent',
+ 'interaction': 'Interaction Reminder',
+ 'music_offtopic': 'Non-Music Section',
+ **POI_CATEGORIES,
}
def __init__(self, downloader, categories=None, api='https://sponsor.ajay.app'):
@@ -33,6 +37,7 @@ class SponsorBlockPP(FFmpegPostProcessor):
self.to_screen(f'SponsorBlock is not supported for {extractor}')
return [], info
+ self.to_screen('Fetching SponsorBlock segments')
info['sponsorblock_chapters'] = self._get_sponsor_chapters(info, info['duration'])
return [], info
@@ -41,9 +46,15 @@ class SponsorBlockPP(FFmpegPostProcessor):
def duration_filter(s):
start_end = s['segment']
+ # Ignore entire video segments (https://wiki.sponsor.ajay.app/w/Types).
+ if start_end == (0, 0):
+ return False
# Ignore milliseconds difference at the start.
if start_end[0] <= 1:
start_end[0] = 0
+ # Make POI chapters 1 sec so that we can properly mark them
+ if s['category'] in self.POI_CATEGORIES.keys():
+ start_end[1] += 1
# Ignore milliseconds difference at the end.
# Never allow the segment to exceed the video.
if duration and duration - start_end[1] <= 1:
@@ -78,19 +89,9 @@ class SponsorBlockPP(FFmpegPostProcessor):
url = f'{self._API_URL}/api/skipSegments/{hash[:4]}?' + compat_urllib_parse_urlencode({
'service': service,
'categories': json.dumps(self._categories),
+ 'actionTypes': json.dumps(['skip', 'poi'])
})
- for d in self._get_json(url):
+ for d in self._download_json(url) or []:
if d['videoID'] == video_id:
return d['segments']
return []
-
- def _get_json(self, url):
- self.write_debug(f'SponsorBlock query: {url}')
- try:
- rsp = self._downloader.urlopen(sanitized_Request(url))
- except network_exceptions as e:
- if isinstance(e, compat_HTTPError) and e.code == 404:
- return []
- raise PostProcessingError(f'Unable to communicate with SponsorBlock API - {e}')
-
- return json.loads(rsp.read().decode(rsp.info().get_param('charset') or 'utf-8'))
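
The query above is privacy-preserving: only the first four hex characters of sha256(video_id) are sent, the API returns every video matching that prefix, and the exact videoID is then matched locally. A sketch of the URL construction (query parameters trimmed for brevity):

    from hashlib import sha256

    video_id = 'dQw4w9WgXcQ'
    prefix = sha256(video_id.encode('ascii')).hexdigest()[:4]
    url = f'https://sponsor.ajay.app/api/skipSegments/{prefix}?service=YouTube'
    # the caller then keeps only the entry whose videoID equals video_id
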
diff --git a/hypervideo_dl/utils.py b/hypervideo_dl/utils.py
index 0199f4c..6379872 100644
--- a/hypervideo_dl/utils.py
+++ b/hypervideo_dl/utils.py
@@ -3,6 +3,8 @@
from __future__ import unicode_literals
+import asyncio
+import atexit
import base64
import binascii
import calendar
@@ -38,12 +40,14 @@ import time
import traceback
import xml.etree.ElementTree
import zlib
+import mimetypes
from .compat import (
compat_HTMLParseError,
compat_HTMLParser,
compat_HTTPError,
compat_basestring,
+ compat_brotli,
compat_chr,
compat_cookiejar,
compat_ctypes_WINFUNCTYPE,
@@ -57,6 +61,7 @@ from .compat import (
compat_kwargs,
compat_os_name,
compat_parse_qs,
+ compat_shlex_split,
compat_shlex_quote,
compat_str,
compat_struct_pack,
@@ -71,6 +76,7 @@ from .compat import (
compat_urllib_parse_unquote_plus,
compat_urllib_request,
compat_urlparse,
+ compat_websockets,
compat_xpath,
)
@@ -79,6 +85,12 @@ from .socks import (
sockssocket,
)
+try:
+ import certifi
+ has_certifi = True
+except ImportError:
+ has_certifi = False
+
def register_socks_protocols():
# "Register" SOCKS protocols
@@ -96,1592 +108,59 @@ compiled_regex_type = type(re.compile(''))
def random_user_agent():
_USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
_CHROME_VERSIONS = (
- '74.0.3729.129',
- '76.0.3780.3',
- '76.0.3780.2',
- '74.0.3729.128',
- '76.0.3780.1',
- '76.0.3780.0',
- '75.0.3770.15',
- '74.0.3729.127',
- '74.0.3729.126',
- '76.0.3779.1',
- '76.0.3779.0',
- '75.0.3770.14',
- '74.0.3729.125',
- '76.0.3778.1',
- '76.0.3778.0',
- '75.0.3770.13',
- '74.0.3729.124',
- '74.0.3729.123',
- '73.0.3683.121',
- '76.0.3777.1',
- '76.0.3777.0',
- '75.0.3770.12',
- '74.0.3729.122',
- '76.0.3776.4',
- '75.0.3770.11',
- '74.0.3729.121',
- '76.0.3776.3',
- '76.0.3776.2',
- '73.0.3683.120',
- '74.0.3729.120',
- '74.0.3729.119',
- '74.0.3729.118',
- '76.0.3776.1',
- '76.0.3776.0',
- '76.0.3775.5',
- '75.0.3770.10',
- '74.0.3729.117',
- '76.0.3775.4',
- '76.0.3775.3',
- '74.0.3729.116',
- '75.0.3770.9',
- '76.0.3775.2',
- '76.0.3775.1',
- '76.0.3775.0',
- '75.0.3770.8',
- '74.0.3729.115',
- '74.0.3729.114',
- '76.0.3774.1',
- '76.0.3774.0',
- '75.0.3770.7',
- '74.0.3729.113',
- '74.0.3729.112',
- '74.0.3729.111',
- '76.0.3773.1',
- '76.0.3773.0',
- '75.0.3770.6',
- '74.0.3729.110',
- '74.0.3729.109',
- '76.0.3772.1',
- '76.0.3772.0',
- '75.0.3770.5',
- '74.0.3729.108',
- '74.0.3729.107',
- '76.0.3771.1',
- '76.0.3771.0',
- '75.0.3770.4',
- '74.0.3729.106',
- '74.0.3729.105',
- '75.0.3770.3',
- '74.0.3729.104',
- '74.0.3729.103',
- '74.0.3729.102',
- '75.0.3770.2',
- '74.0.3729.101',
- '75.0.3770.1',
- '75.0.3770.0',
- '74.0.3729.100',
- '75.0.3769.5',
- '75.0.3769.4',
- '74.0.3729.99',
- '75.0.3769.3',
- '75.0.3769.2',
- '75.0.3768.6',
- '74.0.3729.98',
- '75.0.3769.1',
- '75.0.3769.0',
- '74.0.3729.97',
- '73.0.3683.119',
- '73.0.3683.118',
- '74.0.3729.96',
- '75.0.3768.5',
- '75.0.3768.4',
- '75.0.3768.3',
- '75.0.3768.2',
- '74.0.3729.95',
- '74.0.3729.94',
- '75.0.3768.1',
- '75.0.3768.0',
- '74.0.3729.93',
- '74.0.3729.92',
- '73.0.3683.117',
- '74.0.3729.91',
- '75.0.3766.3',
- '74.0.3729.90',
- '75.0.3767.2',
- '75.0.3767.1',
- '75.0.3767.0',
- '74.0.3729.89',
- '73.0.3683.116',
- '75.0.3766.2',
- '74.0.3729.88',
- '75.0.3766.1',
- '75.0.3766.0',
- '74.0.3729.87',
- '73.0.3683.115',
- '74.0.3729.86',
- '75.0.3765.1',
- '75.0.3765.0',
- '74.0.3729.85',
- '73.0.3683.114',
- '74.0.3729.84',
- '75.0.3764.1',
- '75.0.3764.0',
- '74.0.3729.83',
- '73.0.3683.113',
- '75.0.3763.2',
- '75.0.3761.4',
- '74.0.3729.82',
- '75.0.3763.1',
- '75.0.3763.0',
- '74.0.3729.81',
- '73.0.3683.112',
- '75.0.3762.1',
- '75.0.3762.0',
- '74.0.3729.80',
- '75.0.3761.3',
- '74.0.3729.79',
- '73.0.3683.111',
- '75.0.3761.2',
- '74.0.3729.78',
- '74.0.3729.77',
- '75.0.3761.1',
- '75.0.3761.0',
- '73.0.3683.110',
- '74.0.3729.76',
- '74.0.3729.75',
- '75.0.3760.0',
- '74.0.3729.74',
- '75.0.3759.8',
- '75.0.3759.7',
- '75.0.3759.6',
- '74.0.3729.73',
- '75.0.3759.5',
- '74.0.3729.72',
- '73.0.3683.109',
- '75.0.3759.4',
- '75.0.3759.3',
- '74.0.3729.71',
- '75.0.3759.2',
- '74.0.3729.70',
- '73.0.3683.108',
- '74.0.3729.69',
- '75.0.3759.1',
- '75.0.3759.0',
- '74.0.3729.68',
- '73.0.3683.107',
- '74.0.3729.67',
- '75.0.3758.1',
- '75.0.3758.0',
- '74.0.3729.66',
- '73.0.3683.106',
- '74.0.3729.65',
- '75.0.3757.1',
- '75.0.3757.0',
- '74.0.3729.64',
- '73.0.3683.105',
- '74.0.3729.63',
- '75.0.3756.1',
- '75.0.3756.0',
- '74.0.3729.62',
- '73.0.3683.104',
- '75.0.3755.3',
- '75.0.3755.2',
- '73.0.3683.103',
- '75.0.3755.1',
- '75.0.3755.0',
- '74.0.3729.61',
- '73.0.3683.102',
- '74.0.3729.60',
- '75.0.3754.2',
- '74.0.3729.59',
- '75.0.3753.4',
- '74.0.3729.58',
- '75.0.3754.1',
- '75.0.3754.0',
- '74.0.3729.57',
- '73.0.3683.101',
- '75.0.3753.3',
- '75.0.3752.2',
- '75.0.3753.2',
- '74.0.3729.56',
- '75.0.3753.1',
- '75.0.3753.0',
- '74.0.3729.55',
- '73.0.3683.100',
- '74.0.3729.54',
- '75.0.3752.1',
- '75.0.3752.0',
- '74.0.3729.53',
- '73.0.3683.99',
- '74.0.3729.52',
- '75.0.3751.1',
- '75.0.3751.0',
- '74.0.3729.51',
- '73.0.3683.98',
- '74.0.3729.50',
- '75.0.3750.0',
- '74.0.3729.49',
- '74.0.3729.48',
- '74.0.3729.47',
- '75.0.3749.3',
- '74.0.3729.46',
- '73.0.3683.97',
- '75.0.3749.2',
- '74.0.3729.45',
- '75.0.3749.1',
- '75.0.3749.0',
- '74.0.3729.44',
- '73.0.3683.96',
- '74.0.3729.43',
- '74.0.3729.42',
- '75.0.3748.1',
- '75.0.3748.0',
- '74.0.3729.41',
- '75.0.3747.1',
- '73.0.3683.95',
- '75.0.3746.4',
- '74.0.3729.40',
- '74.0.3729.39',
- '75.0.3747.0',
- '75.0.3746.3',
- '75.0.3746.2',
- '74.0.3729.38',
- '75.0.3746.1',
- '75.0.3746.0',
- '74.0.3729.37',
- '73.0.3683.94',
- '75.0.3745.5',
- '75.0.3745.4',
- '75.0.3745.3',
- '75.0.3745.2',
- '74.0.3729.36',
- '75.0.3745.1',
- '75.0.3745.0',
- '75.0.3744.2',
- '74.0.3729.35',
- '73.0.3683.93',
- '74.0.3729.34',
- '75.0.3744.1',
- '75.0.3744.0',
- '74.0.3729.33',
- '73.0.3683.92',
- '74.0.3729.32',
- '74.0.3729.31',
- '73.0.3683.91',
- '75.0.3741.2',
- '75.0.3740.5',
- '74.0.3729.30',
- '75.0.3741.1',
- '75.0.3741.0',
- '74.0.3729.29',
- '75.0.3740.4',
- '73.0.3683.90',
- '74.0.3729.28',
- '75.0.3740.3',
- '73.0.3683.89',
- '75.0.3740.2',
- '74.0.3729.27',
- '75.0.3740.1',
- '75.0.3740.0',
- '74.0.3729.26',
- '73.0.3683.88',
- '73.0.3683.87',
- '74.0.3729.25',
- '75.0.3739.1',
- '75.0.3739.0',
- '73.0.3683.86',
- '74.0.3729.24',
- '73.0.3683.85',
- '75.0.3738.4',
- '75.0.3738.3',
- '75.0.3738.2',
- '75.0.3738.1',
- '75.0.3738.0',
- '74.0.3729.23',
- '73.0.3683.84',
- '74.0.3729.22',
- '74.0.3729.21',
- '75.0.3737.1',
- '75.0.3737.0',
- '74.0.3729.20',
- '73.0.3683.83',
- '74.0.3729.19',
- '75.0.3736.1',
- '75.0.3736.0',
- '74.0.3729.18',
- '73.0.3683.82',
- '74.0.3729.17',
- '75.0.3735.1',
- '75.0.3735.0',
- '74.0.3729.16',
- '73.0.3683.81',
- '75.0.3734.1',
- '75.0.3734.0',
- '74.0.3729.15',
- '73.0.3683.80',
- '74.0.3729.14',
- '75.0.3733.1',
- '75.0.3733.0',
- '75.0.3732.1',
- '74.0.3729.13',
- '74.0.3729.12',
- '73.0.3683.79',
- '74.0.3729.11',
- '75.0.3732.0',
- '74.0.3729.10',
- '73.0.3683.78',
- '74.0.3729.9',
- '74.0.3729.8',
- '74.0.3729.7',
- '75.0.3731.3',
- '75.0.3731.2',
- '75.0.3731.0',
- '74.0.3729.6',
- '73.0.3683.77',
- '73.0.3683.76',
- '75.0.3730.5',
- '75.0.3730.4',
- '73.0.3683.75',
- '74.0.3729.5',
- '73.0.3683.74',
- '75.0.3730.3',
- '75.0.3730.2',
- '74.0.3729.4',
- '73.0.3683.73',
- '73.0.3683.72',
- '75.0.3730.1',
- '75.0.3730.0',
- '74.0.3729.3',
- '73.0.3683.71',
- '74.0.3729.2',
- '73.0.3683.70',
- '74.0.3729.1',
- '74.0.3729.0',
- '74.0.3726.4',
- '73.0.3683.69',
- '74.0.3726.3',
- '74.0.3728.0',
- '74.0.3726.2',
- '73.0.3683.68',
- '74.0.3726.1',
- '74.0.3726.0',
- '74.0.3725.4',
- '73.0.3683.67',
- '73.0.3683.66',
- '74.0.3725.3',
- '74.0.3725.2',
- '74.0.3725.1',
- '74.0.3724.8',
- '74.0.3725.0',
- '73.0.3683.65',
- '74.0.3724.7',
- '74.0.3724.6',
- '74.0.3724.5',
- '74.0.3724.4',
- '74.0.3724.3',
- '74.0.3724.2',
- '74.0.3724.1',
- '74.0.3724.0',
- '73.0.3683.64',
- '74.0.3723.1',
- '74.0.3723.0',
- '73.0.3683.63',
- '74.0.3722.1',
- '74.0.3722.0',
- '73.0.3683.62',
- '74.0.3718.9',
- '74.0.3702.3',
- '74.0.3721.3',
- '74.0.3721.2',
- '74.0.3721.1',
- '74.0.3721.0',
- '74.0.3720.6',
- '73.0.3683.61',
- '72.0.3626.122',
- '73.0.3683.60',
- '74.0.3720.5',
- '72.0.3626.121',
- '74.0.3718.8',
- '74.0.3720.4',
- '74.0.3720.3',
- '74.0.3718.7',
- '74.0.3720.2',
- '74.0.3720.1',
- '74.0.3720.0',
- '74.0.3718.6',
- '74.0.3719.5',
- '73.0.3683.59',
- '74.0.3718.5',
- '74.0.3718.4',
- '74.0.3719.4',
- '74.0.3719.3',
- '74.0.3719.2',
- '74.0.3719.1',
- '73.0.3683.58',
- '74.0.3719.0',
- '73.0.3683.57',
- '73.0.3683.56',
- '74.0.3718.3',
- '73.0.3683.55',
- '74.0.3718.2',
- '74.0.3718.1',
- '74.0.3718.0',
- '73.0.3683.54',
- '74.0.3717.2',
- '73.0.3683.53',
- '74.0.3717.1',
- '74.0.3717.0',
- '73.0.3683.52',
- '74.0.3716.1',
- '74.0.3716.0',
- '73.0.3683.51',
- '74.0.3715.1',
- '74.0.3715.0',
- '73.0.3683.50',
- '74.0.3711.2',
- '74.0.3714.2',
- '74.0.3713.3',
- '74.0.3714.1',
- '74.0.3714.0',
- '73.0.3683.49',
- '74.0.3713.1',
- '74.0.3713.0',
- '72.0.3626.120',
- '73.0.3683.48',
- '74.0.3712.2',
- '74.0.3712.1',
- '74.0.3712.0',
- '73.0.3683.47',
- '72.0.3626.119',
- '73.0.3683.46',
- '74.0.3710.2',
- '72.0.3626.118',
- '74.0.3711.1',
- '74.0.3711.0',
- '73.0.3683.45',
- '72.0.3626.117',
- '74.0.3710.1',
- '74.0.3710.0',
- '73.0.3683.44',
- '72.0.3626.116',
- '74.0.3709.1',
- '74.0.3709.0',
- '74.0.3704.9',
- '73.0.3683.43',
- '72.0.3626.115',
- '74.0.3704.8',
- '74.0.3704.7',
- '74.0.3708.0',
- '74.0.3706.7',
- '74.0.3704.6',
- '73.0.3683.42',
- '72.0.3626.114',
- '74.0.3706.6',
- '72.0.3626.113',
- '74.0.3704.5',
- '74.0.3706.5',
- '74.0.3706.4',
- '74.0.3706.3',
- '74.0.3706.2',
- '74.0.3706.1',
- '74.0.3706.0',
- '73.0.3683.41',
- '72.0.3626.112',
- '74.0.3705.1',
- '74.0.3705.0',
- '73.0.3683.40',
- '72.0.3626.111',
- '73.0.3683.39',
- '74.0.3704.4',
- '73.0.3683.38',
- '74.0.3704.3',
- '74.0.3704.2',
- '74.0.3704.1',
- '74.0.3704.0',
- '73.0.3683.37',
- '72.0.3626.110',
- '72.0.3626.109',
- '74.0.3703.3',
- '74.0.3703.2',
- '73.0.3683.36',
- '74.0.3703.1',
- '74.0.3703.0',
- '73.0.3683.35',
- '72.0.3626.108',
- '74.0.3702.2',
- '74.0.3699.3',
- '74.0.3702.1',
- '74.0.3702.0',
- '73.0.3683.34',
- '72.0.3626.107',
- '73.0.3683.33',
- '74.0.3701.1',
- '74.0.3701.0',
- '73.0.3683.32',
- '73.0.3683.31',
- '72.0.3626.105',
- '74.0.3700.1',
- '74.0.3700.0',
- '73.0.3683.29',
- '72.0.3626.103',
- '74.0.3699.2',
- '74.0.3699.1',
- '74.0.3699.0',
- '73.0.3683.28',
- '72.0.3626.102',
- '73.0.3683.27',
- '73.0.3683.26',
- '74.0.3698.0',
- '74.0.3696.2',
- '72.0.3626.101',
- '73.0.3683.25',
- '74.0.3696.1',
- '74.0.3696.0',
- '74.0.3694.8',
- '72.0.3626.100',
- '74.0.3694.7',
- '74.0.3694.6',
- '74.0.3694.5',
- '74.0.3694.4',
- '72.0.3626.99',
- '72.0.3626.98',
- '74.0.3694.3',
- '73.0.3683.24',
- '72.0.3626.97',
- '72.0.3626.96',
- '72.0.3626.95',
- '73.0.3683.23',
- '72.0.3626.94',
- '73.0.3683.22',
- '73.0.3683.21',
- '72.0.3626.93',
- '74.0.3694.2',
- '72.0.3626.92',
- '74.0.3694.1',
- '74.0.3694.0',
- '74.0.3693.6',
- '73.0.3683.20',
- '72.0.3626.91',
- '74.0.3693.5',
- '74.0.3693.4',
- '74.0.3693.3',
- '74.0.3693.2',
- '73.0.3683.19',
- '74.0.3693.1',
- '74.0.3693.0',
- '73.0.3683.18',
- '72.0.3626.90',
- '74.0.3692.1',
- '74.0.3692.0',
- '73.0.3683.17',
- '72.0.3626.89',
- '74.0.3687.3',
- '74.0.3691.1',
- '74.0.3691.0',
- '73.0.3683.16',
- '72.0.3626.88',
- '72.0.3626.87',
- '73.0.3683.15',
- '74.0.3690.1',
- '74.0.3690.0',
- '73.0.3683.14',
- '72.0.3626.86',
- '73.0.3683.13',
- '73.0.3683.12',
- '74.0.3689.1',
- '74.0.3689.0',
- '73.0.3683.11',
- '72.0.3626.85',
- '73.0.3683.10',
- '72.0.3626.84',
- '73.0.3683.9',
- '74.0.3688.1',
- '74.0.3688.0',
- '73.0.3683.8',
- '72.0.3626.83',
- '74.0.3687.2',
- '74.0.3687.1',
- '74.0.3687.0',
- '73.0.3683.7',
- '72.0.3626.82',
- '74.0.3686.4',
- '72.0.3626.81',
- '74.0.3686.3',
- '74.0.3686.2',
- '74.0.3686.1',
- '74.0.3686.0',
- '73.0.3683.6',
- '72.0.3626.80',
- '74.0.3685.1',
- '74.0.3685.0',
- '73.0.3683.5',
- '72.0.3626.79',
- '74.0.3684.1',
- '74.0.3684.0',
- '73.0.3683.4',
- '72.0.3626.78',
- '72.0.3626.77',
- '73.0.3683.3',
- '73.0.3683.2',
- '72.0.3626.76',
- '73.0.3683.1',
- '73.0.3683.0',
- '72.0.3626.75',
- '71.0.3578.141',
- '73.0.3682.1',
- '73.0.3682.0',
- '72.0.3626.74',
- '71.0.3578.140',
- '73.0.3681.4',
- '73.0.3681.3',
- '73.0.3681.2',
- '73.0.3681.1',
- '73.0.3681.0',
- '72.0.3626.73',
- '71.0.3578.139',
- '72.0.3626.72',
- '72.0.3626.71',
- '73.0.3680.1',
- '73.0.3680.0',
- '72.0.3626.70',
- '71.0.3578.138',
- '73.0.3678.2',
- '73.0.3679.1',
- '73.0.3679.0',
- '72.0.3626.69',
- '71.0.3578.137',
- '73.0.3678.1',
- '73.0.3678.0',
- '71.0.3578.136',
- '73.0.3677.1',
- '73.0.3677.0',
- '72.0.3626.68',
- '72.0.3626.67',
- '71.0.3578.135',
- '73.0.3676.1',
- '73.0.3676.0',
- '73.0.3674.2',
- '72.0.3626.66',
- '71.0.3578.134',
- '73.0.3674.1',
- '73.0.3674.0',
- '72.0.3626.65',
- '71.0.3578.133',
- '73.0.3673.2',
- '73.0.3673.1',
- '73.0.3673.0',
- '72.0.3626.64',
- '71.0.3578.132',
- '72.0.3626.63',
- '72.0.3626.62',
- '72.0.3626.61',
- '72.0.3626.60',
- '73.0.3672.1',
- '73.0.3672.0',
- '72.0.3626.59',
- '71.0.3578.131',
- '73.0.3671.3',
- '73.0.3671.2',
- '73.0.3671.1',
- '73.0.3671.0',
- '72.0.3626.58',
- '71.0.3578.130',
- '73.0.3670.1',
- '73.0.3670.0',
- '72.0.3626.57',
- '71.0.3578.129',
- '73.0.3669.1',
- '73.0.3669.0',
- '72.0.3626.56',
- '71.0.3578.128',
- '73.0.3668.2',
- '73.0.3668.1',
- '73.0.3668.0',
- '72.0.3626.55',
- '71.0.3578.127',
- '73.0.3667.2',
- '73.0.3667.1',
- '73.0.3667.0',
- '72.0.3626.54',
- '71.0.3578.126',
- '73.0.3666.1',
- '73.0.3666.0',
- '72.0.3626.53',
- '71.0.3578.125',
- '73.0.3665.4',
- '73.0.3665.3',
- '72.0.3626.52',
- '73.0.3665.2',
- '73.0.3664.4',
- '73.0.3665.1',
- '73.0.3665.0',
- '72.0.3626.51',
- '71.0.3578.124',
- '72.0.3626.50',
- '73.0.3664.3',
- '73.0.3664.2',
- '73.0.3664.1',
- '73.0.3664.0',
- '73.0.3663.2',
- '72.0.3626.49',
- '71.0.3578.123',
- '73.0.3663.1',
- '73.0.3663.0',
- '72.0.3626.48',
- '71.0.3578.122',
- '73.0.3662.1',
- '73.0.3662.0',
- '72.0.3626.47',
- '71.0.3578.121',
- '73.0.3661.1',
- '72.0.3626.46',
- '73.0.3661.0',
- '72.0.3626.45',
- '71.0.3578.120',
- '73.0.3660.2',
- '73.0.3660.1',
- '73.0.3660.0',
- '72.0.3626.44',
- '71.0.3578.119',
- '73.0.3659.1',
- '73.0.3659.0',
- '72.0.3626.43',
- '71.0.3578.118',
- '73.0.3658.1',
- '73.0.3658.0',
- '72.0.3626.42',
- '71.0.3578.117',
- '73.0.3657.1',
- '73.0.3657.0',
- '72.0.3626.41',
- '71.0.3578.116',
- '73.0.3656.1',
- '73.0.3656.0',
- '72.0.3626.40',
- '71.0.3578.115',
- '73.0.3655.1',
- '73.0.3655.0',
- '72.0.3626.39',
- '71.0.3578.114',
- '73.0.3654.1',
- '73.0.3654.0',
- '72.0.3626.38',
- '71.0.3578.113',
- '73.0.3653.1',
- '73.0.3653.0',
- '72.0.3626.37',
- '71.0.3578.112',
- '73.0.3652.1',
- '73.0.3652.0',
- '72.0.3626.36',
- '71.0.3578.111',
- '73.0.3651.1',
- '73.0.3651.0',
- '72.0.3626.35',
- '71.0.3578.110',
- '73.0.3650.1',
- '73.0.3650.0',
- '72.0.3626.34',
- '71.0.3578.109',
- '73.0.3649.1',
- '73.0.3649.0',
- '72.0.3626.33',
- '71.0.3578.108',
- '73.0.3648.2',
- '73.0.3648.1',
- '73.0.3648.0',
- '72.0.3626.32',
- '71.0.3578.107',
- '73.0.3647.2',
- '73.0.3647.1',
- '73.0.3647.0',
- '72.0.3626.31',
- '71.0.3578.106',
- '73.0.3635.3',
- '73.0.3646.2',
- '73.0.3646.1',
- '73.0.3646.0',
- '72.0.3626.30',
- '71.0.3578.105',
- '72.0.3626.29',
- '73.0.3645.2',
- '73.0.3645.1',
- '73.0.3645.0',
- '72.0.3626.28',
- '71.0.3578.104',
- '72.0.3626.27',
- '72.0.3626.26',
- '72.0.3626.25',
- '72.0.3626.24',
- '73.0.3644.0',
- '73.0.3643.2',
- '72.0.3626.23',
- '71.0.3578.103',
- '73.0.3643.1',
- '73.0.3643.0',
- '72.0.3626.22',
- '71.0.3578.102',
- '73.0.3642.1',
- '73.0.3642.0',
- '72.0.3626.21',
- '71.0.3578.101',
- '73.0.3641.1',
- '73.0.3641.0',
- '72.0.3626.20',
- '71.0.3578.100',
- '72.0.3626.19',
- '73.0.3640.1',
- '73.0.3640.0',
- '72.0.3626.18',
- '73.0.3639.1',
- '71.0.3578.99',
- '73.0.3639.0',
- '72.0.3626.17',
- '73.0.3638.2',
- '72.0.3626.16',
- '73.0.3638.1',
- '73.0.3638.0',
- '72.0.3626.15',
- '71.0.3578.98',
- '73.0.3635.2',
- '71.0.3578.97',
- '73.0.3637.1',
- '73.0.3637.0',
- '72.0.3626.14',
- '71.0.3578.96',
- '71.0.3578.95',
- '72.0.3626.13',
- '71.0.3578.94',
- '73.0.3636.2',
- '71.0.3578.93',
- '73.0.3636.1',
- '73.0.3636.0',
- '72.0.3626.12',
- '71.0.3578.92',
- '73.0.3635.1',
- '73.0.3635.0',
- '72.0.3626.11',
- '71.0.3578.91',
- '73.0.3634.2',
- '73.0.3634.1',
- '73.0.3634.0',
- '72.0.3626.10',
- '71.0.3578.90',
- '71.0.3578.89',
- '73.0.3633.2',
- '73.0.3633.1',
- '73.0.3633.0',
- '72.0.3610.4',
- '72.0.3626.9',
- '71.0.3578.88',
- '73.0.3632.5',
- '73.0.3632.4',
- '73.0.3632.3',
- '73.0.3632.2',
- '73.0.3632.1',
- '73.0.3632.0',
- '72.0.3626.8',
- '71.0.3578.87',
- '73.0.3631.2',
- '73.0.3631.1',
- '73.0.3631.0',
- '72.0.3626.7',
- '71.0.3578.86',
- '72.0.3626.6',
- '73.0.3630.1',
- '73.0.3630.0',
- '72.0.3626.5',
- '71.0.3578.85',
- '72.0.3626.4',
- '73.0.3628.3',
- '73.0.3628.2',
- '73.0.3629.1',
- '73.0.3629.0',
- '72.0.3626.3',
- '71.0.3578.84',
- '73.0.3628.1',
- '73.0.3628.0',
- '71.0.3578.83',
- '73.0.3627.1',
- '73.0.3627.0',
- '72.0.3626.2',
- '71.0.3578.82',
- '71.0.3578.81',
- '71.0.3578.80',
- '72.0.3626.1',
- '72.0.3626.0',
- '71.0.3578.79',
- '70.0.3538.124',
- '71.0.3578.78',
- '72.0.3623.4',
- '72.0.3625.2',
- '72.0.3625.1',
- '72.0.3625.0',
- '71.0.3578.77',
- '70.0.3538.123',
- '72.0.3624.4',
- '72.0.3624.3',
- '72.0.3624.2',
- '71.0.3578.76',
- '72.0.3624.1',
- '72.0.3624.0',
- '72.0.3623.3',
- '71.0.3578.75',
- '70.0.3538.122',
- '71.0.3578.74',
- '72.0.3623.2',
- '72.0.3610.3',
- '72.0.3623.1',
- '72.0.3623.0',
- '72.0.3622.3',
- '72.0.3622.2',
- '71.0.3578.73',
- '70.0.3538.121',
- '72.0.3622.1',
- '72.0.3622.0',
- '71.0.3578.72',
- '70.0.3538.120',
- '72.0.3621.1',
- '72.0.3621.0',
- '71.0.3578.71',
- '70.0.3538.119',
- '72.0.3620.1',
- '72.0.3620.0',
- '71.0.3578.70',
- '70.0.3538.118',
- '71.0.3578.69',
- '72.0.3619.1',
- '72.0.3619.0',
- '71.0.3578.68',
- '70.0.3538.117',
- '71.0.3578.67',
- '72.0.3618.1',
- '72.0.3618.0',
- '71.0.3578.66',
- '70.0.3538.116',
- '72.0.3617.1',
- '72.0.3617.0',
- '71.0.3578.65',
- '70.0.3538.115',
- '72.0.3602.3',
- '71.0.3578.64',
- '72.0.3616.1',
- '72.0.3616.0',
- '71.0.3578.63',
- '70.0.3538.114',
- '71.0.3578.62',
- '72.0.3615.1',
- '72.0.3615.0',
- '71.0.3578.61',
- '70.0.3538.113',
- '72.0.3614.1',
- '72.0.3614.0',
- '71.0.3578.60',
- '70.0.3538.112',
- '72.0.3613.1',
- '72.0.3613.0',
- '71.0.3578.59',
- '70.0.3538.111',
- '72.0.3612.2',
- '72.0.3612.1',
- '72.0.3612.0',
- '70.0.3538.110',
- '71.0.3578.58',
- '70.0.3538.109',
- '72.0.3611.2',
- '72.0.3611.1',
- '72.0.3611.0',
- '71.0.3578.57',
- '70.0.3538.108',
- '72.0.3610.2',
- '71.0.3578.56',
- '71.0.3578.55',
- '72.0.3610.1',
- '72.0.3610.0',
- '71.0.3578.54',
- '70.0.3538.107',
- '71.0.3578.53',
- '72.0.3609.3',
- '71.0.3578.52',
- '72.0.3609.2',
- '71.0.3578.51',
- '72.0.3608.5',
- '72.0.3609.1',
- '72.0.3609.0',
- '71.0.3578.50',
- '70.0.3538.106',
- '72.0.3608.4',
- '72.0.3608.3',
- '72.0.3608.2',
- '71.0.3578.49',
- '72.0.3608.1',
- '72.0.3608.0',
- '70.0.3538.105',
- '71.0.3578.48',
- '72.0.3607.1',
- '72.0.3607.0',
- '71.0.3578.47',
- '70.0.3538.104',
- '72.0.3606.2',
- '72.0.3606.1',
- '72.0.3606.0',
- '71.0.3578.46',
- '70.0.3538.103',
- '70.0.3538.102',
- '72.0.3605.3',
- '72.0.3605.2',
- '72.0.3605.1',
- '72.0.3605.0',
- '71.0.3578.45',
- '70.0.3538.101',
- '71.0.3578.44',
- '71.0.3578.43',
- '70.0.3538.100',
- '70.0.3538.99',
- '71.0.3578.42',
- '72.0.3604.1',
- '72.0.3604.0',
- '71.0.3578.41',
- '70.0.3538.98',
- '71.0.3578.40',
- '72.0.3603.2',
- '72.0.3603.1',
- '72.0.3603.0',
- '71.0.3578.39',
- '70.0.3538.97',
- '72.0.3602.2',
- '71.0.3578.38',
- '71.0.3578.37',
- '72.0.3602.1',
- '72.0.3602.0',
- '71.0.3578.36',
- '70.0.3538.96',
- '72.0.3601.1',
- '72.0.3601.0',
- '71.0.3578.35',
- '70.0.3538.95',
- '72.0.3600.1',
- '72.0.3600.0',
- '71.0.3578.34',
- '70.0.3538.94',
- '72.0.3599.3',
- '72.0.3599.2',
- '72.0.3599.1',
- '72.0.3599.0',
- '71.0.3578.33',
- '70.0.3538.93',
- '72.0.3598.1',
- '72.0.3598.0',
- '71.0.3578.32',
- '70.0.3538.87',
- '72.0.3597.1',
- '72.0.3597.0',
- '72.0.3596.2',
- '71.0.3578.31',
- '70.0.3538.86',
- '71.0.3578.30',
- '71.0.3578.29',
- '72.0.3596.1',
- '72.0.3596.0',
- '71.0.3578.28',
- '70.0.3538.85',
- '72.0.3595.2',
- '72.0.3591.3',
- '72.0.3595.1',
- '72.0.3595.0',
- '71.0.3578.27',
- '70.0.3538.84',
- '72.0.3594.1',
- '72.0.3594.0',
- '71.0.3578.26',
- '70.0.3538.83',
- '72.0.3593.2',
- '72.0.3593.1',
- '72.0.3593.0',
- '71.0.3578.25',
- '70.0.3538.82',
- '72.0.3589.3',
- '72.0.3592.2',
- '72.0.3592.1',
- '72.0.3592.0',
- '71.0.3578.24',
- '72.0.3589.2',
- '70.0.3538.81',
- '70.0.3538.80',
- '72.0.3591.2',
- '72.0.3591.1',
- '72.0.3591.0',
- '71.0.3578.23',
- '70.0.3538.79',
- '71.0.3578.22',
- '72.0.3590.1',
- '72.0.3590.0',
- '71.0.3578.21',
- '70.0.3538.78',
- '70.0.3538.77',
- '72.0.3589.1',
- '72.0.3589.0',
- '71.0.3578.20',
- '70.0.3538.76',
- '71.0.3578.19',
- '70.0.3538.75',
- '72.0.3588.1',
- '72.0.3588.0',
- '71.0.3578.18',
- '70.0.3538.74',
- '72.0.3586.2',
- '72.0.3587.0',
- '71.0.3578.17',
- '70.0.3538.73',
- '72.0.3586.1',
- '72.0.3586.0',
- '71.0.3578.16',
- '70.0.3538.72',
- '72.0.3585.1',
- '72.0.3585.0',
- '71.0.3578.15',
- '70.0.3538.71',
- '71.0.3578.14',
- '72.0.3584.1',
- '72.0.3584.0',
- '71.0.3578.13',
- '70.0.3538.70',
- '72.0.3583.2',
- '71.0.3578.12',
- '72.0.3583.1',
- '72.0.3583.0',
- '71.0.3578.11',
- '70.0.3538.69',
- '71.0.3578.10',
- '72.0.3582.0',
- '72.0.3581.4',
- '71.0.3578.9',
- '70.0.3538.67',
- '72.0.3581.3',
- '72.0.3581.2',
- '72.0.3581.1',
- '72.0.3581.0',
- '71.0.3578.8',
- '70.0.3538.66',
- '72.0.3580.1',
- '72.0.3580.0',
- '71.0.3578.7',
- '70.0.3538.65',
- '71.0.3578.6',
- '72.0.3579.1',
- '72.0.3579.0',
- '71.0.3578.5',
- '70.0.3538.64',
- '71.0.3578.4',
- '71.0.3578.3',
- '71.0.3578.2',
- '71.0.3578.1',
- '71.0.3578.0',
- '70.0.3538.63',
- '69.0.3497.128',
- '70.0.3538.62',
- '70.0.3538.61',
- '70.0.3538.60',
- '70.0.3538.59',
- '71.0.3577.1',
- '71.0.3577.0',
- '70.0.3538.58',
- '69.0.3497.127',
- '71.0.3576.2',
- '71.0.3576.1',
- '71.0.3576.0',
- '70.0.3538.57',
- '70.0.3538.56',
- '71.0.3575.2',
- '70.0.3538.55',
- '69.0.3497.126',
- '70.0.3538.54',
- '71.0.3575.1',
- '71.0.3575.0',
- '71.0.3574.1',
- '71.0.3574.0',
- '70.0.3538.53',
- '69.0.3497.125',
- '70.0.3538.52',
- '71.0.3573.1',
- '71.0.3573.0',
- '70.0.3538.51',
- '69.0.3497.124',
- '71.0.3572.1',
- '71.0.3572.0',
- '70.0.3538.50',
- '69.0.3497.123',
- '71.0.3571.2',
- '70.0.3538.49',
- '69.0.3497.122',
- '71.0.3571.1',
- '71.0.3571.0',
- '70.0.3538.48',
- '69.0.3497.121',
- '71.0.3570.1',
- '71.0.3570.0',
- '70.0.3538.47',
- '69.0.3497.120',
- '71.0.3568.2',
- '71.0.3569.1',
- '71.0.3569.0',
- '70.0.3538.46',
- '69.0.3497.119',
- '70.0.3538.45',
- '71.0.3568.1',
- '71.0.3568.0',
- '70.0.3538.44',
- '69.0.3497.118',
- '70.0.3538.43',
- '70.0.3538.42',
- '71.0.3567.1',
- '71.0.3567.0',
- '70.0.3538.41',
- '69.0.3497.117',
- '71.0.3566.1',
- '71.0.3566.0',
- '70.0.3538.40',
- '69.0.3497.116',
- '71.0.3565.1',
- '71.0.3565.0',
- '70.0.3538.39',
- '69.0.3497.115',
- '71.0.3564.1',
- '71.0.3564.0',
- '70.0.3538.38',
- '69.0.3497.114',
- '71.0.3563.0',
- '71.0.3562.2',
- '70.0.3538.37',
- '69.0.3497.113',
- '70.0.3538.36',
- '70.0.3538.35',
- '71.0.3562.1',
- '71.0.3562.0',
- '70.0.3538.34',
- '69.0.3497.112',
- '70.0.3538.33',
- '71.0.3561.1',
- '71.0.3561.0',
- '70.0.3538.32',
- '69.0.3497.111',
- '71.0.3559.6',
- '71.0.3560.1',
- '71.0.3560.0',
- '71.0.3559.5',
- '71.0.3559.4',
- '70.0.3538.31',
- '69.0.3497.110',
- '71.0.3559.3',
- '70.0.3538.30',
- '69.0.3497.109',
- '71.0.3559.2',
- '71.0.3559.1',
- '71.0.3559.0',
- '70.0.3538.29',
- '69.0.3497.108',
- '71.0.3558.2',
- '71.0.3558.1',
- '71.0.3558.0',
- '70.0.3538.28',
- '69.0.3497.107',
- '71.0.3557.2',
- '71.0.3557.1',
- '71.0.3557.0',
- '70.0.3538.27',
- '69.0.3497.106',
- '71.0.3554.4',
- '70.0.3538.26',
- '71.0.3556.1',
- '71.0.3556.0',
- '70.0.3538.25',
- '71.0.3554.3',
- '69.0.3497.105',
- '71.0.3554.2',
- '70.0.3538.24',
- '69.0.3497.104',
- '71.0.3555.2',
- '70.0.3538.23',
- '71.0.3555.1',
- '71.0.3555.0',
- '70.0.3538.22',
- '69.0.3497.103',
- '71.0.3554.1',
- '71.0.3554.0',
- '70.0.3538.21',
- '69.0.3497.102',
- '71.0.3553.3',
- '70.0.3538.20',
- '69.0.3497.101',
- '71.0.3553.2',
- '69.0.3497.100',
- '71.0.3553.1',
- '71.0.3553.0',
- '70.0.3538.19',
- '69.0.3497.99',
- '69.0.3497.98',
- '69.0.3497.97',
- '71.0.3552.6',
- '71.0.3552.5',
- '71.0.3552.4',
- '71.0.3552.3',
- '71.0.3552.2',
- '71.0.3552.1',
- '71.0.3552.0',
- '70.0.3538.18',
- '69.0.3497.96',
- '71.0.3551.3',
- '71.0.3551.2',
- '71.0.3551.1',
- '71.0.3551.0',
- '70.0.3538.17',
- '69.0.3497.95',
- '71.0.3550.3',
- '71.0.3550.2',
- '71.0.3550.1',
- '71.0.3550.0',
- '70.0.3538.16',
- '69.0.3497.94',
- '71.0.3549.1',
- '71.0.3549.0',
- '70.0.3538.15',
- '69.0.3497.93',
- '69.0.3497.92',
- '71.0.3548.1',
- '71.0.3548.0',
- '70.0.3538.14',
- '69.0.3497.91',
- '71.0.3547.1',
- '71.0.3547.0',
- '70.0.3538.13',
- '69.0.3497.90',
- '71.0.3546.2',
- '69.0.3497.89',
- '71.0.3546.1',
- '71.0.3546.0',
- '70.0.3538.12',
- '69.0.3497.88',
- '71.0.3545.4',
- '71.0.3545.3',
- '71.0.3545.2',
- '71.0.3545.1',
- '71.0.3545.0',
- '70.0.3538.11',
- '69.0.3497.87',
- '71.0.3544.5',
- '71.0.3544.4',
- '71.0.3544.3',
- '71.0.3544.2',
- '71.0.3544.1',
- '71.0.3544.0',
- '69.0.3497.86',
- '70.0.3538.10',
- '69.0.3497.85',
- '70.0.3538.9',
- '69.0.3497.84',
- '71.0.3543.4',
- '70.0.3538.8',
- '71.0.3543.3',
- '71.0.3543.2',
- '71.0.3543.1',
- '71.0.3543.0',
- '70.0.3538.7',
- '69.0.3497.83',
- '71.0.3542.2',
- '71.0.3542.1',
- '71.0.3542.0',
- '70.0.3538.6',
- '69.0.3497.82',
- '69.0.3497.81',
- '71.0.3541.1',
- '71.0.3541.0',
- '70.0.3538.5',
- '69.0.3497.80',
- '71.0.3540.1',
- '71.0.3540.0',
- '70.0.3538.4',
- '69.0.3497.79',
- '70.0.3538.3',
- '71.0.3539.1',
- '71.0.3539.0',
- '69.0.3497.78',
- '68.0.3440.134',
- '69.0.3497.77',
- '70.0.3538.2',
- '70.0.3538.1',
- '70.0.3538.0',
- '69.0.3497.76',
- '68.0.3440.133',
- '69.0.3497.75',
- '70.0.3537.2',
- '70.0.3537.1',
- '70.0.3537.0',
- '69.0.3497.74',
- '68.0.3440.132',
- '70.0.3536.0',
- '70.0.3535.5',
- '70.0.3535.4',
- '70.0.3535.3',
- '69.0.3497.73',
- '68.0.3440.131',
- '70.0.3532.8',
- '70.0.3532.7',
- '69.0.3497.72',
- '69.0.3497.71',
- '70.0.3535.2',
- '70.0.3535.1',
- '70.0.3535.0',
- '69.0.3497.70',
- '68.0.3440.130',
- '69.0.3497.69',
- '68.0.3440.129',
- '70.0.3534.4',
- '70.0.3534.3',
- '70.0.3534.2',
- '70.0.3534.1',
- '70.0.3534.0',
- '69.0.3497.68',
- '68.0.3440.128',
- '70.0.3533.2',
- '70.0.3533.1',
- '70.0.3533.0',
- '69.0.3497.67',
- '68.0.3440.127',
- '70.0.3532.6',
- '70.0.3532.5',
- '70.0.3532.4',
- '69.0.3497.66',
- '68.0.3440.126',
- '70.0.3532.3',
- '70.0.3532.2',
- '70.0.3532.1',
- '69.0.3497.60',
- '69.0.3497.65',
- '69.0.3497.64',
- '70.0.3532.0',
- '70.0.3531.0',
- '70.0.3530.4',
- '70.0.3530.3',
- '70.0.3530.2',
- '69.0.3497.58',
- '68.0.3440.125',
- '69.0.3497.57',
- '69.0.3497.56',
- '69.0.3497.55',
- '69.0.3497.54',
- '70.0.3530.1',
- '70.0.3530.0',
- '69.0.3497.53',
- '68.0.3440.124',
- '69.0.3497.52',
- '70.0.3529.3',
- '70.0.3529.2',
- '70.0.3529.1',
- '70.0.3529.0',
- '69.0.3497.51',
- '70.0.3528.4',
- '68.0.3440.123',
- '70.0.3528.3',
- '70.0.3528.2',
- '70.0.3528.1',
- '70.0.3528.0',
- '69.0.3497.50',
- '68.0.3440.122',
- '70.0.3527.1',
- '70.0.3527.0',
- '69.0.3497.49',
- '68.0.3440.121',
- '70.0.3526.1',
- '70.0.3526.0',
- '68.0.3440.120',
- '69.0.3497.48',
- '69.0.3497.47',
- '68.0.3440.119',
- '68.0.3440.118',
- '70.0.3525.5',
- '70.0.3525.4',
- '70.0.3525.3',
- '68.0.3440.117',
- '69.0.3497.46',
- '70.0.3525.2',
- '70.0.3525.1',
- '70.0.3525.0',
- '69.0.3497.45',
- '68.0.3440.116',
- '70.0.3524.4',
- '70.0.3524.3',
- '69.0.3497.44',
- '70.0.3524.2',
- '70.0.3524.1',
- '70.0.3524.0',
- '70.0.3523.2',
- '69.0.3497.43',
- '68.0.3440.115',
- '70.0.3505.9',
- '69.0.3497.42',
- '70.0.3505.8',
- '70.0.3523.1',
- '70.0.3523.0',
- '69.0.3497.41',
- '68.0.3440.114',
- '70.0.3505.7',
- '69.0.3497.40',
- '70.0.3522.1',
- '70.0.3522.0',
- '70.0.3521.2',
- '69.0.3497.39',
- '68.0.3440.113',
- '70.0.3505.6',
- '70.0.3521.1',
- '70.0.3521.0',
- '69.0.3497.38',
- '68.0.3440.112',
- '70.0.3520.1',
- '70.0.3520.0',
- '69.0.3497.37',
- '68.0.3440.111',
- '70.0.3519.3',
- '70.0.3519.2',
- '70.0.3519.1',
- '70.0.3519.0',
- '69.0.3497.36',
- '68.0.3440.110',
- '70.0.3518.1',
- '70.0.3518.0',
- '69.0.3497.35',
- '69.0.3497.34',
- '68.0.3440.109',
- '70.0.3517.1',
- '70.0.3517.0',
- '69.0.3497.33',
- '68.0.3440.108',
- '69.0.3497.32',
- '70.0.3516.3',
- '70.0.3516.2',
- '70.0.3516.1',
- '70.0.3516.0',
- '69.0.3497.31',
- '68.0.3440.107',
- '70.0.3515.4',
- '68.0.3440.106',
- '70.0.3515.3',
- '70.0.3515.2',
- '70.0.3515.1',
- '70.0.3515.0',
- '69.0.3497.30',
- '68.0.3440.105',
- '68.0.3440.104',
- '70.0.3514.2',
- '70.0.3514.1',
- '70.0.3514.0',
- '69.0.3497.29',
- '68.0.3440.103',
- '70.0.3513.1',
- '70.0.3513.0',
- '69.0.3497.28',
+ '90.0.4430.212',
+ '90.0.4430.24',
+ '90.0.4430.70',
+ '90.0.4430.72',
+ '90.0.4430.85',
+ '90.0.4430.93',
+ '91.0.4472.101',
+ '91.0.4472.106',
+ '91.0.4472.114',
+ '91.0.4472.124',
+ '91.0.4472.164',
+ '91.0.4472.19',
+ '91.0.4472.77',
+ '92.0.4515.107',
+ '92.0.4515.115',
+ '92.0.4515.131',
+ '92.0.4515.159',
+ '92.0.4515.43',
+ '93.0.4556.0',
+ '93.0.4577.15',
+ '93.0.4577.63',
+ '93.0.4577.82',
+ '94.0.4606.41',
+ '94.0.4606.54',
+ '94.0.4606.61',
+ '94.0.4606.71',
+ '94.0.4606.81',
+ '94.0.4606.85',
+ '95.0.4638.17',
+ '95.0.4638.50',
+ '95.0.4638.54',
+ '95.0.4638.69',
+ '95.0.4638.74',
+ '96.0.4664.18',
+ '96.0.4664.45',
+ '96.0.4664.55',
+ '96.0.4664.93',
+ '97.0.4692.20',
)
return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
+SUPPORTED_ENCODINGS = [
+ 'gzip', 'deflate'
+]
+if compat_brotli:
+ SUPPORTED_ENCODINGS.append('br')
+
std_headers = {
'User-Agent': random_user_agent(),
- 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
- 'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-us,en;q=0.5',
+ 'Sec-Fetch-Mode': 'navigate',
}
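
SUPPORTED_ENCODINGS advertises brotli only when a brotli implementation is importable; the list is presumably joined into an Accept-Encoding header elsewhere. A stdlib-only sketch of the same feature detection:

    SUPPORTED_ENCODINGS = ['gzip', 'deflate']
    try:
        import brotli  # noqa: F401  (compat_brotli also accepts brotlicffi)
        SUPPORTED_ENCODINGS.append('br')
    except ImportError:
        pass

    accept_encoding = ', '.join(SUPPORTED_ENCODINGS)  # 'gzip, deflate, br' if available
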
@@ -1748,6 +227,7 @@ DATE_FORMATS = (
'%Y/%m/%d %H:%M:%S',
'%Y%m%d%H%M',
'%Y%m%d%H%M%S',
+ '%Y%m%d',
'%Y-%m-%d %H:%M',
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%d %H:%M:%S.%f',
@@ -1842,7 +322,7 @@ def write_json_file(obj, fn):
try:
with tf:
- json.dump(obj, tf)
+ json.dump(obj, tf, ensure_ascii=False)
if sys.platform == 'win32':
# Need to remove existing file on Windows, else os.rename raises
# WindowsError or FileExistsError.
@@ -1952,17 +432,33 @@ def get_element_by_id(id, html):
return get_element_by_attribute('id', id, html)
+def get_element_html_by_id(id, html):
+ """Return the html of the tag with the specified ID in the passed HTML document"""
+ return get_element_html_by_attribute('id', id, html)
+
+
def get_element_by_class(class_name, html):
"""Return the content of the first tag with the specified class in the passed HTML document"""
retval = get_elements_by_class(class_name, html)
return retval[0] if retval else None
+def get_element_html_by_class(class_name, html):
+ """Return the html of the first tag with the specified class in the passed HTML document"""
+ retval = get_elements_html_by_class(class_name, html)
+ return retval[0] if retval else None
+
+
def get_element_by_attribute(attribute, value, html, escape_value=True):
retval = get_elements_by_attribute(attribute, value, html, escape_value)
return retval[0] if retval else None
+def get_element_html_by_attribute(attribute, value, html, escape_value=True):
+ retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
+ return retval[0] if retval else None
+
+
def get_elements_by_class(class_name, html):
"""Return the content of all tags with the specified class in the passed HTML document as a list"""
return get_elements_by_attribute(
@@ -1970,29 +466,123 @@ def get_elements_by_class(class_name, html):
html, escape_value=False)
-def get_elements_by_attribute(attribute, value, html, escape_value=True):
+def get_elements_html_by_class(class_name, html):
+ """Return the html of all tags with the specified class in the passed HTML document as a list"""
+ return get_elements_html_by_attribute(
+ 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
+ html, escape_value=False)
+
+
+def get_elements_by_attribute(*args, **kwargs):
"""Return the content of the tag with the specified attribute in the passed HTML document"""
+ return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
+
+
+def get_elements_html_by_attribute(*args, **kwargs):
+ """Return the html of the tag with the specified attribute in the passed HTML document"""
+ return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
+
+
+def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
+ """
+ Return the text (content) and the html (whole) of the tag with the specified
+ attribute in the passed HTML document
+ """
+
+ value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
value = re.escape(value) if escape_value else value
- retlist = []
- for m in re.finditer(r'''(?xs)
- <([a-zA-Z0-9:._-]+)
- (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
- \s+%s=['"]?%s['"]?
- (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
- \s*>
- (?P<content>.*?)
- </\1>
- ''' % (re.escape(attribute), value), html):
- res = m.group('content')
+ partial_element_re = r'''(?x)
+ <(?P<tag>[a-zA-Z0-9:._-]+)
+ (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
+ \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
+ ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}
+
+ for m in re.finditer(partial_element_re, html):
+ content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
+
+ yield (
+ unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
+ whole
+ )
+
+
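The new `*_html_*` variants return the whole element rather than only its inner text. A minimal sketch, traced from the regex above rather than taken from any test suite:

    from hypervideo_dl.utils import get_elements_by_class, get_elements_html_by_class

    html = '<span class="post x">A</span><span class="post">B</span>'
    get_elements_by_class('post', html)       # ['A', 'B']
    get_elements_html_by_class('post', html)
    # ['<span class="post x">A</span>', '<span class="post">B</span>']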
+class HTMLBreakOnClosingTagParser(compat_HTMLParser):
+ """
+ HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
+ closing tag for the first opening tag it has encountered, and can be used
+ as a context manager
+ """
+
+ class HTMLBreakOnClosingTagException(Exception):
+ pass
+
+ def __init__(self):
+ self.tagstack = collections.deque()
+ compat_HTMLParser.__init__(self)
+
+ def __enter__(self):
+ return self
- if res.startswith('"') or res.startswith("'"):
- res = res[1:-1]
+ def __exit__(self, *_):
+ self.close()
- retlist.append(unescapeHTML(res))
+ def close(self):
+ # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
+ # so data remains buffered; we no longer have any interest in it, thus
+ # override this method to discard it
+ pass
- return retlist
+ def handle_starttag(self, tag, _):
+ self.tagstack.append(tag)
+
+ def handle_endtag(self, tag):
+ if not self.tagstack:
+ raise compat_HTMLParseError('no tags in the stack')
+ while self.tagstack:
+ inner_tag = self.tagstack.pop()
+ if inner_tag == tag:
+ break
+ else:
+ raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
+ if not self.tagstack:
+ raise self.HTMLBreakOnClosingTagException()
+
+
+def get_element_text_and_html_by_tag(tag, html):
+ """
+ For the first element with the specified tag in the passed HTML document
+    return its content (text) and the whole element (html)
+ """
+ def find_or_raise(haystack, needle, exc):
+ try:
+ return haystack.index(needle)
+ except ValueError:
+ raise exc
+ closing_tag = f'</{tag}>'
+ whole_start = find_or_raise(
+ html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
+ content_start = find_or_raise(
+ html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
+ content_start += whole_start + 1
+ with HTMLBreakOnClosingTagParser() as parser:
+ parser.feed(html[whole_start:content_start])
+ if not parser.tagstack or parser.tagstack[0] != tag:
+ raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
+ offset = content_start
+ while offset < len(html):
+ next_closing_tag_start = find_or_raise(
+ html[offset:], closing_tag,
+ compat_HTMLParseError(f'closing {tag} tag not found'))
+ next_closing_tag_end = next_closing_tag_start + len(closing_tag)
+ try:
+ parser.feed(html[offset:offset + next_closing_tag_end])
+ offset += next_closing_tag_end
+ except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
+ return html[content_start:offset + next_closing_tag_start], \
+ html[whole_start:offset + next_closing_tag_end]
+ raise compat_HTMLParseError('unexpected end of html')
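A sketch of the parser-backed lookup, worked out from the code above; the tag stack lets it tolerate nested tags, including same-name ones, and it returns both the inner text and the whole element:

    from hypervideo_dl.utils import get_element_text_and_html_by_tag

    text, whole = get_element_text_and_html_by_tag(
        'span', '<p><span id="x">hi <b>there</b></span></p>')
    # text == 'hi <b>there</b>'
    # whole == '<span id="x">hi <b>there</b></span>'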
class HTMLAttributeParser(compat_HTMLParser):
@@ -2006,6 +596,23 @@ class HTMLAttributeParser(compat_HTMLParser):
self.attrs = dict(attrs)
+class HTMLListAttrsParser(compat_HTMLParser):
+ """HTML parser to gather the attributes for the elements of a list"""
+
+ def __init__(self):
+ compat_HTMLParser.__init__(self)
+ self.items = []
+ self._level = 0
+
+ def handle_starttag(self, tag, attrs):
+ if tag == 'li' and self._level == 0:
+ self.items.append(dict(attrs))
+ self._level += 1
+
+ def handle_endtag(self, tag):
+ self._level -= 1
+
+
def extract_attributes(html_element):
"""Given a string for an HTML element such as
<el
@@ -2032,16 +639,24 @@ def extract_attributes(html_element):
return parser.attrs
+def parse_list(webpage):
+ """Given a string for an series of HTML <li> elements,
+ return a dictionary of their attributes"""
+ parser = HTMLListAttrsParser()
+ parser.feed(webpage)
+ parser.close()
+ return parser.items
+
+
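A sketch of the new list scraper, assuming the input is a run of top-level <li> elements as the docstring requires:

    from hypervideo_dl.utils import parse_list

    parse_list('<li data-id="1">One</li><li data-id="2">Two</li>')
    # [{'data-id': '1'}, {'data-id': '2'}]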
def clean_html(html):
"""Clean an HTML snippet into a readable string"""
if html is None: # Convenience for sanitizing descriptions etc.
return html
- # Newline vs <br />
- html = html.replace('\n', ' ')
- html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
- html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
+ html = re.sub(r'\s+', ' ', html)
+ html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
+ html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
# Strip html tags
html = re.sub('<.*?>', '', html)
# Replace html entities
@@ -2065,7 +680,7 @@ def sanitize_open(filename, open_mode):
import msvcrt
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
- stream = open(encodeFilename(filename), open_mode)
+ stream = locked_file(filename, open_mode, block=False).open()
return (stream, filename)
except (IOError, OSError) as err:
if err.errno in (errno.EACCES,):
@@ -2077,7 +692,7 @@ def sanitize_open(filename, open_mode):
raise
else:
# An exception here should be caught in the caller
- stream = open(encodeFilename(alt_filename), open_mode)
+ stream = locked_file(filename, open_mode, block=False).open()
return (stream, alt_filename)
@@ -2090,36 +705,40 @@ def timeconvert(timestr):
return timestamp
-def sanitize_filename(s, restricted=False, is_id=False):
+def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
"""Sanitizes a string so it could be used as part of a filename.
- If restricted is set, use a stricter subset of allowed characters.
- Set is_id if this is not an arbitrary string, but an ID that should be kept
- if possible.
+ @param restricted Use a stricter subset of allowed characters
+ @param is_id Whether this is an ID that should be kept unchanged if possible.
+ If unset, hypervideo's new sanitization rules are in effect
"""
+ if s == '':
+ return ''
+
def replace_insane(char):
if restricted and char in ACCENT_CHARS:
return ACCENT_CHARS[char]
elif not restricted and char == '\n':
- return ' '
+ return '\0 '
elif char == '?' or ord(char) < 32 or ord(char) == 127:
return ''
elif char == '"':
return '' if restricted else '\''
elif char == ':':
- return '_-' if restricted else ' -'
+ return '\0_\0-' if restricted else '\0 \0-'
elif char in '\\/|*<>':
- return '_'
- if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
- return '_'
- if restricted and ord(char) > 127:
- return '_'
+ return '\0_'
+ if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
+ return '\0_'
return char
- if s == '':
- return ''
- # Handle timestamps
- s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
+ s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
result = ''.join(map(replace_insane, s))
+ if is_id is NO_DEFAULT:
+ result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result) # Remove repeated substitute chars
+ STRIP_RE = '(?:\0.|[ _-])*'
+ result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
+ result = result.replace('\0', '') or '_'
+
if not is_id:
while '__' in result:
result = result.replace('__', '_')
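Under the new default rules (is_id left at NO_DEFAULT) the NUL-prefixed substitute characters are deduplicated and stripped from the ends before the final pass. A couple of traced examples:

    from hypervideo_dl.utils import sanitize_filename

    sanitize_filename('AT&T', restricted=True)        # 'AT_T'
    sanitize_filename('New World record at 0:12:34')  # 'New World record at 0_12_34'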
@@ -2272,6 +891,20 @@ def process_communicate_or_kill(p, *args, **kwargs):
raise
+class Popen(subprocess.Popen):
+ if sys.platform == 'win32':
+ _startupinfo = subprocess.STARTUPINFO()
+ _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
+ else:
+ _startupinfo = None
+
+ def __init__(self, *args, **kwargs):
+ super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)
+
+ def communicate_or_kill(self, *args, **kwargs):
+ return process_communicate_or_kill(self, *args, **kwargs)
+
+
def get_subprocess_encoding():
if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
# For subprocess calls, encode with locale encoding
@@ -2342,14 +975,25 @@ def decodeOption(optval):
return optval
+_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
+
+
+def timetuple_from_msec(msec):
+ secs, msec = divmod(msec, 1000)
+ mins, secs = divmod(secs, 60)
+ hrs, mins = divmod(mins, 60)
+ return _timetuple(hrs, mins, secs, msec)
+
+
def formatSeconds(secs, delim=':', msec=False):
- if secs > 3600:
- ret = '%d%s%02d%s%02d' % (secs // 3600, delim, (secs % 3600) // 60, delim, secs % 60)
- elif secs > 60:
- ret = '%d%s%02d' % (secs // 60, delim, secs % 60)
+ time = timetuple_from_msec(secs * 1000)
+ if time.hours:
+ ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
+ elif time.minutes:
+ ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
else:
- ret = '%d' % secs
- return '%s.%03d' % (ret, secs % 1) if msec else ret
+ ret = '%d' % time.seconds
+ return '%s.%03d' % (ret, time.milliseconds) if msec else ret
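timetuple_from_msec centralises the divmod chain that formatSeconds, the subtitle timecode helpers and webvtt now share. A quick sketch of the expected values:

    from hypervideo_dl.utils import timetuple_from_msec, formatSeconds

    timetuple_from_msec(3723004)  # Time(hours=1, minutes=2, seconds=3, milliseconds=4)
    formatSeconds(3723)           # '1:02:03'
    formatSeconds(90, msec=True)  # '1:30.000'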
def _ssl_load_windows_store_certs(ssl_context, storename):
@@ -2371,33 +1015,34 @@ def make_HTTPS_handler(params, **kwargs):
opts_check_certificate = not params.get('nocheckcertificate')
context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
context.check_hostname = opts_check_certificate
+ if params.get('legacyserverconnect'):
+ context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT
context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
if opts_check_certificate:
- try:
- context.load_default_certs()
- # Work around the issue in load_default_certs when there are bad certificates. See:
- # https://github.com/hypervideo/hypervideo/issues/1060,
- # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
- except ssl.SSLError:
- # enum_certificates is not present in mingw python. See https://github.com/hypervideo/hypervideo/issues/1151
- if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
- # Create a new context to discard any certificates that were already loaded
- context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
- context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
- for storename in ('CA', 'ROOT'):
- _ssl_load_windows_store_certs(context, storename)
- context.set_default_verify_paths()
+ if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
+ context.load_verify_locations(cafile=certifi.where())
+ else:
+ try:
+ context.load_default_certs()
+ # Work around the issue in load_default_certs when there are bad certificates. See:
+ # https://github.com/hypervideo/hypervideo/issues/1060,
+ # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
+ except ssl.SSLError:
+ # enum_certificates is not present in mingw python. See https://github.com/hypervideo/hypervideo/issues/1151
+ if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
+ # Create a new context to discard any certificates that were already loaded
+ context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+ context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
+ for storename in ('CA', 'ROOT'):
+ _ssl_load_windows_store_certs(context, storename)
+ context.set_default_verify_paths()
return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message(before=';'):
- if ytdl_is_updateable():
- update_cmd = 'type doas pacman -Sy hypervideo to update'
- else:
- update_cmd = 'see https://git.conocimientoslibres.ga/software/hypervideo.git/about/#how-do-i-update-hypervideo'
- msg = 'please report this issue on https://github.com/hypervideo/hypervideo .'
- msg += ' Make sure you are using the latest version; %s.' % update_cmd
- msg += ' Be sure to call hypervideo with the --verbose flag and include its complete output.'
+ msg = ('please report this issue on https://issues.hyperbola.info/ , '
+ 'filling out the appropriate issue template. '
+ 'Confirm you are on the latest version using pacman -Su')
before = before.rstrip()
if not before or before.endswith(('.', '!', '?')):
@@ -2408,7 +1053,14 @@ def bug_reports_message(before=';'):
class YoutubeDLError(Exception):
"""Base exception for YoutubeDL errors."""
- pass
+ msg = None
+
+ def __init__(self, msg=None):
+ if msg is not None:
+ self.msg = msg
+ elif self.msg is None:
+ self.msg = type(self).__name__
+ super().__init__(self.msg)
network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
@@ -2427,7 +1079,7 @@ class ExtractorError(YoutubeDLError):
if sys.exc_info()[0] in network_exceptions:
expected = True
- self.msg = str(msg)
+ self.orig_msg = str(msg)
self.traceback = tb
self.expected = expected
self.cause = cause
@@ -2438,14 +1090,15 @@ class ExtractorError(YoutubeDLError):
super(ExtractorError, self).__init__(''.join((
format_field(ie, template='[%s] '),
format_field(video_id, template='%s: '),
- self.msg,
+ msg,
format_field(cause, template=' (caused by %r)'),
'' if expected else bug_reports_message())))
def format_traceback(self):
- if self.traceback is None:
- return None
- return ''.join(traceback.format_tb(self.traceback))
+ return join_nonempty(
+ self.traceback and ''.join(traceback.format_tb(self.traceback)),
+ self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
+ delim='\n') or None
class UnsupportedError(ExtractorError):
@@ -2467,9 +1120,9 @@ class GeoRestrictedError(ExtractorError):
geographic location due to geographic restrictions imposed by a website.
"""
- def __init__(self, msg, countries=None):
- super(GeoRestrictedError, self).__init__(msg, expected=True)
- self.msg = msg
+ def __init__(self, msg, countries=None, **kwargs):
+ kwargs['expected'] = True
+ super(GeoRestrictedError, self).__init__(msg, **kwargs)
self.countries = countries
@@ -2493,7 +1146,7 @@ class EntryNotInPlaylist(YoutubeDLError):
This exception will be thrown by YoutubeDL when a requested entry
is not found in the playlist info_dict
"""
- pass
+ msg = 'Entry not found in info'
class SameFileError(YoutubeDLError):
@@ -2502,7 +1155,12 @@ class SameFileError(YoutubeDLError):
This exception will be thrown by FileDownloader objects if they detect
multiple files would have to be downloaded to the same file on disk.
"""
- pass
+ msg = 'Fixed output name but more than one file to download'
+
+ def __init__(self, filename=None):
+ if filename is not None:
+ self.msg += f': {filename}'
+ super().__init__(self.msg)
class PostProcessingError(YoutubeDLError):
@@ -2512,29 +1170,41 @@ class PostProcessingError(YoutubeDLError):
indicate an error in the postprocessing task.
"""
- def __init__(self, msg):
- super(PostProcessingError, self).__init__(msg)
- self.msg = msg
+class DownloadCancelled(YoutubeDLError):
+ """ Exception raised when the download queue should be interrupted """
+ msg = 'The download was cancelled'
-class ExistingVideoReached(YoutubeDLError):
- """ --max-downloads limit has been reached. """
- pass
+
+class ExistingVideoReached(DownloadCancelled):
+ """ --break-on-existing triggered """
+ msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
+
+
+class RejectedVideoReached(DownloadCancelled):
+ """ --break-on-reject triggered """
+ msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
-class RejectedVideoReached(YoutubeDLError):
+class MaxDownloadsReached(DownloadCancelled):
""" --max-downloads limit has been reached. """
- pass
+ msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
-class ThrottledDownload(YoutubeDLError):
- """ Download speed below --throttled-rate. """
- pass
+class ReExtractInfo(YoutubeDLError):
+ """ Video info needs to be re-extracted. """
+ def __init__(self, msg, expected=False):
+ super().__init__(msg)
+ self.expected = expected
-class MaxDownloadsReached(YoutubeDLError):
- """ --max-downloads limit has been reached. """
- pass
+
+class ThrottledDownload(ReExtractInfo):
+ """ Download speed below --throttled-rate. """
+ msg = 'The download speed is below throttle limit'
+
+ def __init__(self):
+ super().__init__(self.msg, expected=False)
class UnavailableVideoError(YoutubeDLError):
@@ -2543,7 +1213,12 @@ class UnavailableVideoError(YoutubeDLError):
This exception will be thrown when a video is requested
in a format that is not available for that video.
"""
- pass
+ msg = 'Unable to download video'
+
+ def __init__(self, err=None):
+ if err is not None:
+ self.msg += f': {err}'
+ super().__init__(self.msg)
class ContentTooShortError(YoutubeDLError):
@@ -2701,6 +1376,12 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
except zlib.error:
return zlib.decompress(data)
+ @staticmethod
+ def brotli(data):
+ if not data:
+ return data
+ return compat_brotli.decompress(data)
+
def http_request(self, req):
# According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
# always respected by websites, some tend to give out URLs with non percent-encoded
@@ -2717,12 +1398,15 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
if url != url_escaped:
req = update_Request(req, url=url_escaped)
- for h, v in std_headers.items():
+ for h, v in self._params.get('http_headers', std_headers).items():
# Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
# The dict keys are capitalized because of this bug by urllib
if h.capitalize() not in req.headers:
req.add_header(h, v)
+ if 'Accept-encoding' not in req.headers:
+ req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
+
req.headers = handle_youtubedl_headers(req.headers)
if sys.version_info < (2, 7) and '#' in req.get_full_url():
@@ -2761,6 +1445,12 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
del resp.headers['Content-encoding']
+ # brotli
+ if resp.headers.get('Content-encoding', '') == 'br':
+ resp = compat_urllib_request.addinfourl(
+ io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
+ resp.msg = old_resp.msg
+ del resp.headers['Content-encoding']
# Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
# https://github.com/ytdl-org/youtube-dl/issues/6457).
if 300 <= resp.code < 400:
@@ -3180,7 +1870,7 @@ def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
"""
Return a datetime object from a string in the format YYYYMMDD or
- (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
+ (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
format: string date format used to return datetime object from
precision: round the time portion of a datetime object.
@@ -3191,7 +1881,7 @@ def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
if precision == 'auto':
auto_precision = True
precision = 'microsecond'
- today = datetime_round(datetime.datetime.now(), precision)
+ today = datetime_round(datetime.datetime.utcnow(), precision)
if date_str in ('now', 'today'):
return today
if date_str == 'yesterday':
@@ -3219,13 +1909,17 @@ def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
return datetime_round(datetime.datetime.strptime(date_str, format), precision)
-def date_from_str(date_str, format='%Y%m%d'):
+def date_from_str(date_str, format='%Y%m%d', strict=False):
"""
Return a datetime object from a string in the format YYYYMMDD or
- (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
+ (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
+
+ If "strict", only (now|today)[+-][0-9](day|week|month|year)(s)? is allowed
format: string date format used to return datetime object from
"""
+ if strict and not re.fullmatch(r'\d{8}|(now|today)[+-]\d+(day|week|month|year)(s)?', date_str):
+ raise ValueError(f'Invalid date format {date_str}')
return datetime_from_str(date_str, precision='microsecond', format=format).date()
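With strict=True (now used by DateRange below) only the coarse units survive the fullmatch. A sketch:

    from hypervideo_dl.utils import date_from_str

    date_from_str('now-1week', strict=True)      # a datetime.date, 7 days back
    date_from_str('now-30seconds', strict=True)  # raises ValueError: finer units are rejected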
@@ -3272,11 +1966,11 @@ class DateRange(object):
def __init__(self, start=None, end=None):
"""start and end must be strings in the format accepted by date"""
if start is not None:
- self.start = date_from_str(start)
+ self.start = date_from_str(start, strict=True)
else:
self.start = datetime.datetime.min.date()
if end is not None:
- self.end = date_from_str(end)
+ self.end = date_from_str(end, strict=True)
else:
self.end = datetime.datetime.max.date()
if self.start > self.end:
@@ -3320,7 +2014,6 @@ def _windows_write_string(s, out):
False if it has yet to be written out."""
# Adapted from http://stackoverflow.com/a/3259271/35070
- import ctypes
import ctypes.wintypes
WIN_OUTPUT_IDS = {
@@ -3464,38 +2157,52 @@ if sys.platform == 'win32':
whole_low = 0xffffffff
whole_high = 0x7fffffff
- def _lock_file(f, exclusive):
+ def _lock_file(f, exclusive, block):
overlapped = OVERLAPPED()
overlapped.Offset = 0
overlapped.OffsetHigh = 0
overlapped.hEvent = 0
f._lock_file_overlapped_p = ctypes.pointer(overlapped)
- handle = msvcrt.get_osfhandle(f.fileno())
- if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
- whole_low, whole_high, f._lock_file_overlapped_p):
- raise OSError('Locking file failed: %r' % ctypes.FormatError())
+
+ if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
+ (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
+ 0, whole_low, whole_high, f._lock_file_overlapped_p):
+ raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
assert f._lock_file_overlapped_p
handle = msvcrt.get_osfhandle(f.fileno())
- if not UnlockFileEx(handle, 0,
- whole_low, whole_high, f._lock_file_overlapped_p):
+ if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
else:
- # Some platforms, such as Jython, is missing fcntl
try:
import fcntl
- def _lock_file(f, exclusive):
- fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
+ def _lock_file(f, exclusive, block):
+ try:
+ fcntl.flock(f,
+ fcntl.LOCK_SH if not exclusive
+ else fcntl.LOCK_EX if block
+ else fcntl.LOCK_EX | fcntl.LOCK_NB)
+ except BlockingIOError:
+ raise
+ except OSError: # AOSP does not have flock()
+ fcntl.lockf(f,
+ fcntl.LOCK_SH if not exclusive
+ else fcntl.LOCK_EX if block
+ else fcntl.LOCK_EX | fcntl.LOCK_NB)
def _unlock_file(f):
- fcntl.flock(f, fcntl.LOCK_UN)
+ try:
+ fcntl.flock(f, fcntl.LOCK_UN)
+ except OSError:
+ fcntl.lockf(f, fcntl.LOCK_UN)
+
except ImportError:
UNSUPPORTED_MSG = 'file locking is not supported on this platform'
- def _lock_file(f, exclusive):
+ def _lock_file(f, exclusive, block):
raise IOError(UNSUPPORTED_MSG)
def _unlock_file(f):
@@ -3503,15 +2210,18 @@ else:
class locked_file(object):
- def __init__(self, filename, mode, encoding=None):
- assert mode in ['r', 'a', 'w']
+ _closed = False
+
+ def __init__(self, filename, mode, block=True, encoding=None):
+ assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb']
self.f = io.open(filename, mode, encoding=encoding)
self.mode = mode
+ self.block = block
def __enter__(self):
- exclusive = self.mode != 'r'
+ exclusive = 'r' not in self.mode
try:
- _lock_file(self.f, exclusive)
+ _lock_file(self.f, exclusive, self.block)
except IOError:
self.f.close()
raise
@@ -3519,9 +2229,11 @@ class locked_file(object):
def __exit__(self, etype, value, traceback):
try:
- _unlock_file(self.f)
+ if not self._closed:
+ _unlock_file(self.f)
finally:
self.f.close()
+ self._closed = True
def __iter__(self):
return iter(self.f)
@@ -3532,6 +2244,15 @@ class locked_file(object):
def read(self, *args):
return self.f.read(*args)
+ def flush(self):
+ self.f.flush()
+
+ def open(self):
+ return self.__enter__()
+
+    def close(self):
+        self.__exit__(None, None, None)
+
def get_filesystem_encoding():
encoding = sys.getfilesystemencoding()
@@ -3568,18 +2289,22 @@ def unsmuggle_url(smug_url, default=None):
return url, data
+def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
+ """ Formats numbers with decimal sufixes like K, M, etc """
+ num, factor = float_or_none(num), float(factor)
+ if num is None or num < 0:
+ return None
+ POSSIBLE_SUFFIXES = 'kMGTPEZY'
+ exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
+ suffix = ['', *POSSIBLE_SUFFIXES][exponent]
+ if factor == 1024:
+ suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
+ converted = num / (factor ** exponent)
+ return fmt % (converted, suffix)
+
+
def format_bytes(bytes):
- if bytes is None:
- return 'N/A'
- if type(bytes) is str:
- bytes = float(bytes)
- if bytes == 0.0:
- exponent = 0
- else:
- exponent = int(math.log(bytes, 1024.0))
- suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
- converted = float(bytes) / float(1024 ** exponent)
- return '%.2f%s' % (converted, suffix)
+ return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
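format_bytes is now a thin wrapper over format_decimal_suffix. Traced values, noting that the default '%d%s' template truncates:

    from hypervideo_dl.utils import format_decimal_suffix, format_bytes

    format_decimal_suffix(1234000)            # '1M'
    format_decimal_suffix(1234000, '%.1f%s')  # '1.2M'
    format_bytes(1536)                        # '1.50KiB'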
def lookup_unit_table(unit_table, s):
@@ -3668,7 +2393,7 @@ def parse_count(s):
if s is None:
return None
- s = s.strip()
+ s = re.sub(r'^[^\d]+\s', '', s).strip()
if re.match(r'^[\d,.]+$', s):
return str_to_int(s)
@@ -3680,23 +2405,34 @@ def parse_count(s):
'M': 1000 ** 2,
'kk': 1000 ** 2,
'KK': 1000 ** 2,
+ 'b': 1000 ** 3,
+ 'B': 1000 ** 3,
}
- return lookup_unit_table(_UNIT_TABLE, s)
+ ret = lookup_unit_table(_UNIT_TABLE, s)
+ if ret is not None:
+ return ret
+
+ mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
+ if mobj:
+ return str_to_int(mobj.group(1))
-def parse_resolution(s):
+def parse_resolution(s, *, lenient=False):
if s is None:
return {}
- mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
+ if lenient:
+ mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
+ else:
+ mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
if mobj:
return {
'width': int(mobj.group('w')),
'height': int(mobj.group('h')),
}
- mobj = re.search(r'\b(\d+)[pPiI]\b', s)
+ mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
if mobj:
return {'height': int(mobj.group(1))}
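The lookarounds replace the old \b word boundaries so digits glued to letters no longer count as a resolution. A sketch of both modes:

    from hypervideo_dl.utils import parse_resolution

    parse_resolution('1920x1080')                 # {'width': 1920, 'height': 1080}
    parse_resolution('720p')                      # {'height': 720}
    parse_resolution('1920x1080p')                # {} under the strict default
    parse_resolution('1920x1080p', lenient=True)  # {'width': 1920, 'height': 1080}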
@@ -3827,16 +2563,11 @@ class PUTRequest(compat_urllib_request.Request):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
- if get_attr:
- if v is not None:
- v = getattr(v, get_attr, None)
- if v == '':
- v = None
- if v is None:
- return default
+ if get_attr and v is not None:
+ v = getattr(v, get_attr, None)
try:
return int(v) * invscale // scale
- except (ValueError, TypeError):
+ except (ValueError, TypeError, OverflowError):
return default
@@ -3877,6 +2608,13 @@ def url_or_none(url):
return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
+def request_to_url(req):
+ if isinstance(req, compat_urllib_request.Request):
+ return req.get_full_url()
+ else:
+ return req
+
+
def strftime_or_none(timestamp, date_format, default=None):
datetime_object = None
try:
@@ -3892,34 +2630,40 @@ def strftime_or_none(timestamp, date_format, default=None):
def parse_duration(s):
if not isinstance(s, compat_basestring):
return None
-
s = s.strip()
+ if not s:
+ return None
days, hours, mins, secs, ms = [None] * 5
- m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
+ m = re.match(r'''(?x)
+ (?P<before_secs>
+ (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
+ (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
+ (?P<ms>[.:][0-9]+)?Z?$
+ ''', s)
if m:
- days, hours, mins, secs, ms = m.groups()
+ days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
else:
m = re.match(
r'''(?ix)(?:P?
(?:
- [0-9]+\s*y(?:ears?)?\s*
+ [0-9]+\s*y(?:ears?)?,?\s*
)?
(?:
- [0-9]+\s*m(?:onths?)?\s*
+ [0-9]+\s*m(?:onths?)?,?\s*
)?
(?:
- [0-9]+\s*w(?:eeks?)?\s*
+ [0-9]+\s*w(?:eeks?)?,?\s*
)?
(?:
- (?P<days>[0-9]+)\s*d(?:ays?)?\s*
+ (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
)?
T)?
(?:
- (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
+ (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
)?
(?:
- (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
+ (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
)?
(?:
(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
@@ -3943,7 +2687,7 @@ def parse_duration(s):
if days:
duration += float(days) * 24 * 60 * 60
if ms:
- duration += float(ms)
+ duration += float(ms.replace(':', '.'))
return duration
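The tightened regex caps bare seconds after a colon group at two digits, accepts ':' as a millisecond separator, and tolerates commas in the verbose form. Traced values:

    from hypervideo_dl.utils import parse_duration

    parse_duration('1:02:03.500')      # 3723.5
    parse_duration('1:02:03:500')      # 3723.5 -- ':' treated as the ms separator
    parse_duration('3 hours, 2 mins')  # 10920.0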
@@ -3966,30 +2710,27 @@ def check_executable(exe, args=[]):
""" Checks if the given binary is installed somewhere in PATH, and returns its name.
args can be a list of arguments for a short output (like -version) """
try:
- process_communicate_or_kill(subprocess.Popen(
- [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE))
+ Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
except OSError:
return False
return exe
-def get_exe_version(exe, args=['--version'],
- version_re=None, unrecognized='present'):
- """ Returns the version of the specified executable,
- or False if the executable is not present """
+def _get_exe_version_output(exe, args, *, to_screen=None):
+ if to_screen:
+ to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
try:
# STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
# SIGTTOU if hypervideo is run in the background.
# See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
- out, _ = process_communicate_or_kill(subprocess.Popen(
- [encodeArgument(exe)] + args,
- stdin=subprocess.PIPE,
- stdout=subprocess.PIPE, stderr=subprocess.STDOUT))
+ out, _ = Popen(
+ [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
except OSError:
return False
if isinstance(out, bytes): # Python 2.x
out = out.decode('ascii', 'ignore')
- return detect_exe_version(out, version_re, unrecognized)
+ return out
def detect_exe_version(output, version_re=None, unrecognized='present'):
@@ -4003,6 +2744,14 @@ def detect_exe_version(output, version_re=None, unrecognized='present'):
return unrecognized
+def get_exe_version(exe, args=['--version'],
+ version_re=None, unrecognized='present'):
+ """ Returns the version of the specified executable,
+ or False if the executable is not present """
+ out = _get_exe_version_output(exe, args)
+ return detect_exe_version(out, version_re, unrecognized) if out else False
+
+
class LazyList(collections.abc.Sequence):
''' Lazy immutable list from an iterable
Note that slices of a LazyList are lists and not LazyList'''
@@ -4010,10 +2759,10 @@ class LazyList(collections.abc.Sequence):
class IndexError(IndexError):
pass
- def __init__(self, iterable):
+ def __init__(self, iterable, *, reverse=False, _cache=None):
self.__iterable = iter(iterable)
- self.__cache = []
- self.__reversed = False
+ self.__cache = [] if _cache is None else _cache
+ self.__reversed = reverse
def __iter__(self):
if self.__reversed:
@@ -4027,6 +2776,8 @@ class LazyList(collections.abc.Sequence):
def __exhaust(self):
self.__cache.extend(self.__iterable)
+ # Discard the emptied iterable to make it pickle-able
+ self.__iterable = []
return self.__cache
def exhaust(self):
@@ -4077,9 +2828,11 @@ class LazyList(collections.abc.Sequence):
self.__exhaust()
return len(self.__cache)
- def reverse(self):
- self.__reversed = not self.__reversed
- return self
+ def __reversed__(self):
+ return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
+
+ def __copy__(self):
+ return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
def __repr__(self):
# repr and str should mimic a list. So we exhaust the iterable
@@ -4090,6 +2843,10 @@ class LazyList(collections.abc.Sequence):
class PagedList:
+
+ class IndexError(IndexError):
+ pass
+
def __len__(self):
# This is only useful for tests
return len(self.getslice())
@@ -4097,11 +2854,14 @@ class PagedList:
def __init__(self, pagefunc, pagesize, use_cache=True):
self._pagefunc = pagefunc
self._pagesize = pagesize
+ self._pagecount = float('inf')
self._use_cache = use_cache
self._cache = {}
def getpage(self, pagenum):
- page_results = self._cache.get(pagenum) or list(self._pagefunc(pagenum))
+ page_results = self._cache.get(pagenum)
+ if page_results is None:
+ page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
if self._use_cache:
self._cache[pagenum] = page_results
return page_results
@@ -4113,14 +2873,17 @@ class PagedList:
raise NotImplementedError('This method must be implemented by subclasses')
def __getitem__(self, idx):
- # NOTE: cache must be enabled if this is used
+ assert self._use_cache, 'Indexing PagedList requires cache'
if not isinstance(idx, int) or idx < 0:
raise TypeError('indices must be non-negative integers')
entries = self.getslice(idx, idx + 1)
- return entries[0] if entries else None
+ if not entries:
+ raise self.IndexError()
+ return entries[0]
class OnDemandPagedList(PagedList):
+ """Download pages until a page with less than maximum results"""
def _getslice(self, start, end):
for pagenum in itertools.count(start // self._pagesize):
firstid = pagenum * self._pagesize
@@ -4137,7 +2900,11 @@ class OnDemandPagedList(PagedList):
if (end is not None and firstid <= end <= nextfirstid)
else None)
- page_results = self.getpage(pagenum)
+ try:
+ page_results = self.getpage(pagenum)
+ except Exception:
+ self._pagecount = pagenum - 1
+ raise
if startv != 0 or endv is not None:
page_results = page_results[startv:endv]
yield from page_results
@@ -4156,14 +2923,14 @@ class OnDemandPagedList(PagedList):
class InAdvancePagedList(PagedList):
+ """PagedList with total number of pages known in advance"""
def __init__(self, pagefunc, pagecount, pagesize):
- self._pagecount = pagecount
PagedList.__init__(self, pagefunc, pagesize, True)
+ self._pagecount = pagecount
def _getslice(self, start, end):
start_page = start // self._pagesize
- end_page = (
- self._pagecount if end is None else (end // self._pagesize + 1))
+ end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
skip_elems = start - start_page * self._pagesize
only_more = None if end is None else end - start
for pagenum in range(start_page, end_page):
@@ -4325,36 +3092,37 @@ def multipart_encode(data, boundary=None):
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
- if isinstance(key_or_keys, (list, tuple)):
- for key in key_or_keys:
- if key not in d or d[key] is None or skip_false_values and not d[key]:
- continue
- return d[key]
- return default
- return d.get(key_or_keys, default)
+ for val in map(d.get, variadic(key_or_keys)):
+ if val is not None and (val or not skip_false_values):
+ return val
+ return default
-def try_get(src, getter, expected_type=None):
- for get in variadic(getter):
+def try_call(*funcs, expected_type=None, args=[], kwargs={}):
+ for f in funcs:
try:
- v = get(src)
- except (AttributeError, KeyError, TypeError, IndexError):
+ val = f(*args, **kwargs)
+ except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
pass
else:
- if expected_type is None or isinstance(v, expected_type):
- return v
+ if expected_type is None or isinstance(val, expected_type):
+ return val
+
+
+def try_get(src, getter, expected_type=None):
+ return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
+
+
+def filter_dict(dct, cndn=lambda _, v: v is not None):
+ return {k: v for k, v in dct.items() if cndn(k, v)}
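The rewritten dict_get and the new try_call/filter_dict helpers compose like this (a sketch, traced from the definitions above):

    from hypervideo_dl.utils import dict_get, try_call, filter_dict

    meta = {'title': '', 'fulltitle': 'Example', 'id': None}
    dict_get(meta, ('title', 'fulltitle'))  # 'Example' -- '' skipped by default
    try_call(lambda: meta['missing'], lambda: meta['fulltitle'])  # 'Example'
    filter_dict(meta)                       # {'title': '', 'fulltitle': 'Example'}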
def merge_dicts(*dicts):
merged = {}
for a_dict in dicts:
for k, v in a_dict.items():
- if v is None:
- continue
- if (k not in merged
- or (isinstance(v, compat_str) and v
- and isinstance(merged[k], compat_str)
- and not merged[k])):
+ if (v is not None and k not in merged
+ or isinstance(v, str) and merged[k] == ''):
merged[k] = v
return merged
@@ -4446,6 +3214,8 @@ def js_to_json(code, vars={}):
return '"%s"' % v
+ code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
+
return re.sub(r'''(?sx)
"(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
'(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
@@ -4467,6 +3237,9 @@ def qualities(quality_ids):
return q
+POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
+
+
DEFAULT_OUTTMPL = {
'default': '%(title)s [%(id)s].%(ext)s',
'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
@@ -4478,6 +3251,8 @@ OUTTMPL_TYPES = {
'description': 'description',
'annotation': 'annotations.xml',
'infojson': 'info.json',
+ 'link': None,
+ 'pl_video': None,
'pl_thumbnail': None,
'pl_description': 'description',
'pl_infojson': 'info.json',
@@ -4548,6 +3323,10 @@ def error_to_compat_str(err):
return err_str
+def error_to_str(err):
+ return f'{type(err).__name__}: {err}'
+
+
def mimetype2ext(mt):
if mt is None:
return None
@@ -4612,43 +3391,53 @@ def mimetype2ext(mt):
return subtype.replace('+', '.')
+def ext2mimetype(ext_or_url):
+ if not ext_or_url:
+ return None
+ if '.' not in ext_or_url:
+ ext_or_url = f'file.{ext_or_url}'
+ return mimetypes.guess_type(ext_or_url)[0]
+
+
def parse_codecs(codecs_str):
# http://tools.ietf.org/html/rfc6381
if not codecs_str:
return {}
split_codecs = list(filter(None, map(
str.strip, codecs_str.strip().strip(',').split(','))))
- vcodec, acodec, hdr = None, None, None
+ vcodec, acodec, tcodec, hdr = None, None, None, None
for full_codec in split_codecs:
- codec = full_codec.split('.')[0]
- if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora', 'dvh1', 'dvhe'):
+ parts = full_codec.split('.')
+ codec = parts[0].replace('0', '')
+ if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
+ 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
if not vcodec:
- vcodec = full_codec
+ vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
if codec in ('dvh1', 'dvhe'):
hdr = 'DV'
- elif codec == 'vp9' and vcodec.startswith('vp9.2'):
+ elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
hdr = 'HDR10'
- elif codec == 'av01':
- parts = full_codec.split('.')
- if len(parts) > 3 and parts[3] == '10':
- hdr = 'HDR10'
- vcodec = '.'.join(parts[:4])
- elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
+ elif full_codec.replace('0', '').startswith('vp9.2'):
+ hdr = 'HDR10'
+ elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
if not acodec:
acodec = full_codec
+ elif codec in ('stpp', 'wvtt',):
+ if not tcodec:
+ tcodec = full_codec
else:
write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
- if not vcodec and not acodec:
- if len(split_codecs) == 2:
- return {
- 'vcodec': split_codecs[0],
- 'acodec': split_codecs[1],
- }
- else:
+ if vcodec or acodec or tcodec:
return {
'vcodec': vcodec or 'none',
'acodec': acodec or 'none',
'dynamic_range': hdr,
+ **({'tcodec': tcodec} if tcodec is not None else {}),
+ }
+ elif len(split_codecs) == 2:
+ return {
+ 'vcodec': split_codecs[0],
+ 'acodec': split_codecs[1],
}
return {}
@@ -4706,7 +3495,7 @@ def determine_protocol(info_dict):
if protocol is not None:
return protocol
- url = info_dict['url']
+ url = sanitize_url(info_dict['url'])
if url.startswith('rtmp'):
return 'rtmp'
elif url.startswith('mms'):
@@ -4723,26 +3512,36 @@ def determine_protocol(info_dict):
return compat_urllib_parse_urlparse(url).scheme
-def render_table(header_row, data, delim=False, extraGap=0, hideEmpty=False):
- """ Render a list of rows, each as a list of values """
+def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
+ """ Render a list of rows, each as a list of values.
+ Text after a \t will be right aligned """
+ def width(string):
+ return len(remove_terminal_sequences(string).replace('\t', ''))
def get_max_lens(table):
- return [max(len(compat_str(v)) for v in col) for col in zip(*table)]
+ return [max(width(str(v)) for v in col) for col in zip(*table)]
def filter_using_list(row, filterArray):
- return [col for (take, col) in zip(filterArray, row) if take]
+ return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
- if hideEmpty:
- max_lens = get_max_lens(data)
- header_row = filter_using_list(header_row, max_lens)
- data = [filter_using_list(row, max_lens) for row in data]
+ max_lens = get_max_lens(data) if hide_empty else []
+ header_row = filter_using_list(header_row, max_lens)
+ data = [filter_using_list(row, max_lens) for row in data]
table = [header_row] + data
max_lens = get_max_lens(table)
+ extra_gap += 1
if delim:
- table = [header_row] + [['-' * ml for ml in max_lens]] + data
- format_str = ' '.join('%-' + compat_str(ml + extraGap) + 's' for ml in max_lens[:-1]) + ' %s'
- return '\n'.join(format_str % tuple(row) for row in table)
+ table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
+ table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
+ for row in table:
+ for pos, text in enumerate(map(str, row)):
+ if '\t' in text:
+ row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
+ else:
+ row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
+ ret = '\n'.join(''.join(row).rstrip() for row in table)
+ return ret
def _match_one(filter_part, dct, incomplete):
@@ -4762,6 +3561,11 @@ def _match_one(filter_part, dct, incomplete):
'=': operator.eq,
}
+ if isinstance(incomplete, bool):
+ is_incomplete = lambda _: incomplete
+ else:
+ is_incomplete = lambda k: k in incomplete
+
operator_rex = re.compile(r'''(?x)\s*
(?P<key>[a-z_]+)
\s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
@@ -4800,7 +3604,7 @@ def _match_one(filter_part, dct, incomplete):
if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
raise ValueError('Operator %s only supports string values!' % m['op'])
if actual_value is None:
- return incomplete or m['none_inclusive']
+ return is_incomplete(m['key']) or m['none_inclusive']
return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
UNARY_OPERATORS = {
@@ -4815,7 +3619,7 @@ def _match_one(filter_part, dct, incomplete):
if m:
op = UNARY_OPERATORS[m.group('op')]
actual_value = dct.get(m.group('key'))
- if incomplete and actual_value is None:
+ if is_incomplete(m.group('key')) and actual_value is None:
return True
return op(actual_value)
@@ -4823,21 +3627,29 @@ def _match_one(filter_part, dct, incomplete):
def match_str(filter_str, dct, incomplete=False):
- """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
- When incomplete, all conditions passes on missing fields
+ """ Filter a dictionary with a simple string syntax.
+ @returns Whether the filter passes
+ @param incomplete Set of keys that is expected to be missing from dct.
+ Can be True/False to indicate all/none of the keys may be missing.
+ All conditions on incomplete keys pass if the key is missing
"""
return all(
_match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
for filter_part in re.split(r'(?<!\\)&', filter_str))
-def match_filter_func(filter_str):
+def match_filter_func(filters):
+ if not filters:
+ return None
+ filters = variadic(filters)
+
def _match_func(info_dict, *args, **kwargs):
- if match_str(filter_str, info_dict, *args, **kwargs):
+ if any(match_str(f, info_dict, *args, **kwargs) for f in filters):
return None
else:
- video_title = info_dict.get('title', info_dict.get('id', 'video'))
- return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
+ video_title = info_dict.get('title') or info_dict.get('id') or 'video'
+ filter_str = ') | ('.join(map(str.strip, filters))
+ return f'{video_title} does not pass filter ({filter_str}), skipping ..'
return _match_func
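match_filter_func now accepts several filters and passes a video if any of them matches. A sketch using only operators visible above:

    from hypervideo_dl.utils import match_str, match_filter_func

    match_str('duration < 30 & like_count >? 100', {'duration': 10})  # True
    accept = match_filter_func(['duration < 60', 'like_count > 100'])
    accept({'duration': 30})  # None -- passes
    accept({'duration': 90, 'title': 'clip'})
    # 'clip does not pass filter (duration < 60) | (like_count > 100), skipping ..'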
@@ -4855,7 +3667,12 @@ def parse_dfxp_time_expr(time_expr):
def srt_subtitles_timecode(seconds):
- return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
+ return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
+
+
+def ass_subtitles_timecode(seconds):
+ time = timetuple_from_msec(seconds * 1000)
+ return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
def dfxp2srt(dfxp_data):
@@ -6139,11 +4956,11 @@ def write_xattr(path, key, value):
+ [encodeFilename(path, True)])
try:
- p = subprocess.Popen(
+ p = Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
except EnvironmentError as e:
raise XAttrMetadataError(e.errno, e.strerror)
- stdout, stderr = process_communicate_or_kill(p)
+ stdout, stderr = p.communicate_or_kill()
stderr = stderr.decode('utf-8', 'replace')
if p.returncode != 0:
raise XAttrMetadataError(p.returncode, stderr)
@@ -6201,6 +5018,12 @@ URL=%(url)s
Icon=text-html
'''.lstrip()
+LINK_TEMPLATES = {
+ 'url': DOT_URL_LINK_TEMPLATE,
+ 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
+ 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
+}
+
def iri_to_uri(iri):
"""
@@ -6255,13 +5078,10 @@ def to_high_limit_path(path):
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
- if field is None:
- val = obj if obj is not None else default
- else:
- val = obj.get(field, default)
- if func and val not in ignore:
- val = func(val)
- return template % val if val not in ignore else default
+ val = traverse_obj(obj, *variadic(field))
+ if val in ignore:
+ return default
+ return template % (func(val) if func else val)
def clean_podcast_url(url):
@@ -6337,11 +5157,12 @@ def traverse_obj(
''' Traverse nested list/dict/tuple
@param path_list A list of paths which are checked one by one.
Each path is a list of keys where each key is a string,
- a function, a tuple of strings or "...".
- When a fuction is given, it takes the key as argument and
- returns whether the key matches or not. When a tuple is given,
+ a function, a tuple of strings/None or "...".
+        When a function is given, it takes the key and value as arguments
+ and returns whether the key matches or not. When a tuple is given,
all the keys given in the tuple are traversed, and
"..." traverses all the keys in the object
+ "None" returns the object without traversal
@param default Default value to return
@param expected_type Only accept final value of this type (Can also be any callable)
@param get_all Return all the values obtained from a path or only the first one
@@ -6358,10 +5179,10 @@ def traverse_obj(
def _traverse_obj(obj, path, _current_depth=0):
nonlocal depth
- if obj is None:
- return None
path = tuple(variadic(path))
for i, key in enumerate(path):
+ if None in (key, obj):
+ return obj
if isinstance(key, (list, tuple)):
obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
key = ...
@@ -6383,7 +5204,7 @@ def traverse_obj(
obj = str(obj)
_current_depth += 1
depth = max(depth, _current_depth)
- return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
+ return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
elif isinstance(obj, dict) and not (is_user_input and key == ':'):
obj = (obj.get(key) if casesense or (key in obj)
else next((v for k, v in obj.items() if _lower(k) == key), None))
@@ -6430,15 +5251,35 @@ def traverse_obj(
def traverse_dict(dictn, keys, casesense=True):
- ''' For backward compatibility. Do not use '''
- return traverse_obj(dictn, keys, casesense=casesense,
- is_user_input=True, traverse_string=True)
+ write_string('DeprecationWarning: hypervideo_dl.utils.traverse_dict is deprecated '
+ 'and may be removed in a future version. Use hypervideo_dl.utils.traverse_obj instead')
+ return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
+
+def get_first(obj, keys, **kwargs):
+ return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
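get_first is sugar for branching over a list of candidate dicts; together with the new None/branching rules of traverse_obj, a sketch of expected results:

    from hypervideo_dl.utils import traverse_obj, get_first

    info = {'formats': [{'url': 'http://a', 'height': 720}, {'url': 'http://b'}]}
    traverse_obj(info, ('formats', 0, 'url'))       # 'http://a'
    traverse_obj(info, ('formats', ..., 'height'))  # [720] -- missing values dropped
    get_first([{}, info], ('formats', 0, 'url'))    # 'http://a'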
-def variadic(x, allowed_types=(str, bytes)):
+
+def variadic(x, allowed_types=(str, bytes, dict)):
return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
+def decode_base(value, digits):
+    # Convert the given base-x string to an int
+ table = {char: index for index, char in enumerate(digits)}
+ result = 0
+ base = len(digits)
+ for chr in value:
+ result *= base
+ result += table[chr]
+ return result
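decode_base is plain positional-notation decoding; the digits string defines the base:

    from hypervideo_dl.utils import decode_base

    decode_base('ff', '0123456789abcdef')  # 255
    decode_base('10', '01')                # 2 -- binary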
+
+
+def time_seconds(**kwargs):
+ t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
+ return t.timestamp()
+
+
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
@@ -6458,9 +5299,17 @@ def jwt_encode_hs256(payload_data, key, headers={}):
return token
+# Can be extended in the future to verify the signature, parse the header and return the algorithm used if it's not HS256
+def jwt_decode_hs256(jwt):
+ header_b64, payload_b64, signature_b64 = jwt.split('.')
+ payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
+ return payload_data
+
+
def supports_terminal_sequences(stream):
if compat_os_name == 'nt':
- if get_windows_version() < (10, 0, 10586):
+ from .compat import WINDOWS_VT_MODE # Must be imported locally
+ if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
return False
elif not os.getenv('TERM'):
return False
@@ -6470,12 +5319,223 @@ def supports_terminal_sequences(stream):
return False
-TERMINAL_SEQUENCES = {
- 'DOWN': '\n',
- 'UP': '\x1b[A',
- 'ERASE_LINE': '\x1b[K',
- 'RED': '\033[0;31m',
- 'YELLOW': '\033[0;33m',
- 'BLUE': '\033[0;34m',
- 'RESET_STYLE': '\033[0m',
-}
+_terminal_sequences_re = re.compile('\033\\[[^m]+m')
+
+
+def remove_terminal_sequences(string):
+ return _terminal_sequences_re.sub('', string)
+
+
+def number_of_digits(number):
+ return len('%d' % number)
+
+
+def join_nonempty(*values, delim='-', from_dict=None):
+ if from_dict is not None:
+ values = map(from_dict.get, values)
+ return delim.join(map(str, filter(None, values)))
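join_nonempty drops falsy parts before joining, optionally pulling the values out of a dict first. Traced examples:

    from hypervideo_dl.utils import join_nonempty

    join_nonempty('mp4', None, '', 720, delim='-')  # 'mp4-720'
    join_nonempty('album', 'track', from_dict={'album': 'A', 'track': None})  # 'A'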
+
+
+def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
+ """
+ Find the largest format dimensions in terms of video width and, for each thumbnail:
+ * Modify the URL: Match the width with the provided regex and replace with the former width
+ * Update dimensions
+
+ This function is useful with video services that scale the provided thumbnails on demand
+ """
+ _keys = ('width', 'height')
+ max_dimensions = max(
+ [tuple(format.get(k) or 0 for k in _keys) for format in formats],
+ default=(0, 0))
+ if not max_dimensions[0]:
+ return thumbnails
+ return [
+ merge_dicts(
+ {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
+ dict(zip(_keys, max_dimensions)), thumbnail)
+ for thumbnail in thumbnails
+ ]
+
+
+def parse_http_range(range):
+ """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
+ if not range:
+ return None, None, None
+ crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
+ if not crg:
+ return None, None, None
+ return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
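parse_http_range deliberately returns a uniform 3-tuple so callers can unpack blindly:

    from hypervideo_dl.utils import parse_http_range

    parse_http_range('bytes 0-499/1234')  # (0, 499, 1234)
    parse_http_range('bytes=500-')        # (500, None, None)
    parse_http_range(None)                # (None, None, None)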
+
+
+class Config:
+ own_args = None
+ filename = None
+ __initialized = False
+
+ def __init__(self, parser, label=None):
+ self._parser, self.label = parser, label
+ self._loaded_paths, self.configs = set(), []
+
+ def init(self, args=None, filename=None):
+ assert not self.__initialized
+ directory = ''
+ if filename:
+ location = os.path.realpath(filename)
+ directory = os.path.dirname(location)
+ if location in self._loaded_paths:
+ return False
+ self._loaded_paths.add(location)
+
+ self.__initialized = True
+ self.own_args, self.filename = args, filename
+ for location in self._parser.parse_args(args)[0].config_locations or []:
+ location = os.path.join(directory, expand_path(location))
+ if os.path.isdir(location):
+ location = os.path.join(location, 'hypervideo.conf')
+ if not os.path.exists(location):
+ self._parser.error(f'config location {location} does not exist')
+ self.append_config(self.read_file(location), location)
+ return True
+
+ def __str__(self):
+ label = join_nonempty(
+ self.label, 'config', f'"{self.filename}"' if self.filename else '',
+ delim=' ')
+ return join_nonempty(
+ self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
+ *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
+ delim='\n')
+
+ @staticmethod
+ def read_file(filename, default=[]):
+ try:
+ optionf = open(filename)
+ except IOError:
+ return default # silently skip if file is not present
+ try:
+ # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
+ contents = optionf.read()
+ if sys.version_info < (3,):
+ contents = contents.decode(preferredencoding())
+ res = compat_shlex_split(contents, comments=True)
+ finally:
+ optionf.close()
+ return res
+
+ @staticmethod
+ def hide_login_info(opts):
+ PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
+ eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
+
+ def _scrub_eq(o):
+ m = eqre.match(o)
+ if m:
+ return m.group('key') + '=PRIVATE'
+ else:
+ return o
+
+ opts = list(map(_scrub_eq, opts))
+ for idx, opt in enumerate(opts):
+ if opt in PRIVATE_OPTS and idx + 1 < len(opts):
+ opts[idx + 1] = 'PRIVATE'
+ return opts
+
+ def append_config(self, *args, label=None):
+ config = type(self)(self._parser, label)
+ config._loaded_paths = self._loaded_paths
+ if config.init(*args):
+ self.configs.append(config)
+
+ @property
+ def all_args(self):
+ for config in reversed(self.configs):
+ yield from config.all_args
+ yield from self.own_args or []
+
+ def parse_args(self):
+ return self._parser.parse_args(list(self.all_args))
+
+
+class WebSocketsWrapper():
+ """Wraps websockets module to use in non-async scopes"""
+    pool = None
+
+ def __init__(self, url, headers=None, connect=True):
+ self.loop = asyncio.events.new_event_loop()
+ self.conn = compat_websockets.connect(
+ url, extra_headers=headers, ping_interval=None,
+ close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
+ if connect:
+ self.__enter__()
+ atexit.register(self.__exit__, None, None, None)
+
+ def __enter__(self):
+ if not self.pool:
+ self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
+ return self
+
+ def send(self, *args):
+ self.run_with_loop(self.pool.send(*args), self.loop)
+
+ def recv(self, *args):
+ return self.run_with_loop(self.pool.recv(*args), self.loop)
+
+ def __exit__(self, type, value, traceback):
+ try:
+ return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
+ finally:
+ self._cancel_all_tasks(self.loop)
+ self.loop.close()
+
+ # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
+ # for contributors: if any new asyncio-based library needs to be run in non-async code, move these functions out of this class
+ @staticmethod
+ def run_with_loop(main, loop):
+ if not asyncio.coroutines.iscoroutine(main):
+ raise ValueError(f'a coroutine was expected, got {main!r}')
+
+ try:
+ return loop.run_until_complete(main)
+ finally:
+ loop.run_until_complete(loop.shutdown_asyncgens())
+ if hasattr(loop, 'shutdown_default_executor'):
+ loop.run_until_complete(loop.shutdown_default_executor())
+
+ @staticmethod
+ def _cancel_all_tasks(loop):
+ to_cancel = asyncio.tasks.all_tasks(loop)
+
+ if not to_cancel:
+ return
+
+ for task in to_cancel:
+ task.cancel()
+
+ loop.run_until_complete(
+ asyncio.tasks.gather(*to_cancel, loop=loop, return_exceptions=True))
+
+ for task in to_cancel:
+ if task.cancelled():
+ continue
+ if task.exception() is not None:
+ loop.call_exception_handler({
+ 'message': 'unhandled exception during asyncio.run() shutdown',
+ 'exception': task.exception(),
+ 'task': task,
+ })
+
+
+has_websockets = bool(compat_websockets)
+
+
+def merge_headers(*dicts):
+ """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
+ return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
+
+
+class classproperty:
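+ """Behaves like @property, but on the class: accessing the attribute returns f(cls)"""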
+ def __init__(self, f):
+ self.f = f
+
+ def __get__(self, _, cls):
+ return self.f(cls)
diff --git a/hypervideo_dl/version.py b/hypervideo_dl/version.py
index 839f10e..107fefb 100644
--- a/hypervideo_dl/version.py
+++ b/hypervideo_dl/version.py
@@ -1,3 +1,5 @@
-from __future__ import unicode_literals
+# Autogenerated by devscripts/update-version.py
-__version__ = '1.1.12'
+__version__ = '1.1.13'
+
+RELEASE_GIT_HEAD = 'c0c2c57d3'
diff --git a/hypervideo_dl/webvtt.py b/hypervideo_dl/webvtt.py
index b5ad01f..0e602a7 100644
--- a/hypervideo_dl/webvtt.py
+++ b/hypervideo_dl/webvtt.py
@@ -13,7 +13,7 @@ in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
import re
import io
-from .utils import int_or_none
+from .utils import int_or_none, timetuple_from_msec
from .compat import (
compat_str as str,
compat_Pattern,
@@ -124,11 +124,7 @@ def _format_ts(ts):
Convert an MPEG PES timestamp into a WebVTT timestamp.
This will lose sub-millisecond precision.
"""
- msec = int((ts + 45) // 90)
- secs, msec = divmod(msec, 1000)
- mins, secs = divmod(secs, 60)
- hrs, mins = divmod(mins, 60)
- return '%02u:%02u:%02u.%03u' % (hrs, mins, secs, msec)
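+ # e.g. ts=90000 (90 kHz clock) -> 1000 msec -> '00:00:01.000'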
+ return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts + 45) // 90))
class Block(object):
diff --git a/requirements.txt b/requirements.txt
index 6a982fa..b65d254 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,6 @@
mutagen
pycryptodome
websockets
+brotli; platform_python_implementation=='CPython'
+brotlicffi; platform_python_implementation!='CPython'
+certifi
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 850ca07..d183924 100644
--- a/setup.py
+++ b/setup.py
@@ -16,12 +16,12 @@ from distutils.spawn import spawn
exec(compile(open('hypervideo_dl/version.py').read(), 'hypervideo_dl/version.py', 'exec'))
-DESCRIPTION = 'Command-line program to download videos from YouTube.com and many other other video platforms.'
+DESCRIPTION = 'A youtube-dl fork with additional features and patches'
LONG_DESCRIPTION = '\n\n'.join((
'Official repository: <https://github.com/hypervideo/hypervideo>',
'**PS**: Some links in this document will not work since this is a copy of the README.md from Github',
- open('README.md', 'r', encoding='utf-8').read()))
+ open('README.md', encoding='utf-8').read()))
REQUIREMENTS = ['mutagen', 'pycryptodome', 'websockets']
@@ -29,7 +29,7 @@ REQUIREMENTS = ['mutagen', 'pycryptodome', 'websockets']
if sys.argv[1:2] == ['py2exe']:
import py2exe
warnings.warn(
- 'Building with py2exe is not officially supported. '
- 'py2exe builds do not support pycryptodomex and need VC++14 to run. '
'The recommended way is to use "pyinst.py" to build using pyinstaller')
params = {
'console': [{
diff --git a/test/helper.py b/test/helper.py
index 0d8822e..1f1ccfa 100644
--- a/test/helper.py
+++ b/test/helper.py
@@ -194,6 +194,45 @@ def expect_dict(self, got_dict, expected_dict):
expect_value(self, got, expected, info_field)
+def sanitize_got_info_dict(got_dict):
+ IGNORED_FIELDS = (
+ *YoutubeDL._format_fields,
+
+ # Lists
+ 'formats', 'thumbnails', 'subtitles', 'automatic_captions', 'comments', 'entries',
+
+ # Auto-generated
+ 'autonumber', 'playlist', 'format_index', 'video_ext', 'audio_ext', 'duration_string', 'epoch',
+ 'fulltitle', 'extractor', 'extractor_key', 'filepath', 'infojson_filename', 'original_url', 'n_entries',
+
+ # Only live_status needs to be checked
+ 'is_live', 'was_live',
+ )
+
+ IGNORED_PREFIXES = ('', 'playlist', 'requested', 'webpage')
+
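+ # Stabilize volatile values: long strings become their md5, long lists
+ # their count, and *_count fields the `int` type (only the type is checked).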
+ def sanitize(key, value):
+ if isinstance(value, str) and len(value) > 100 and key != 'thumbnail':
+ return f'md5:{md5(value)}'
+ elif isinstance(value, list) and len(value) > 10:
+ return f'count:{len(value)}'
+ elif key.endswith('_count') and isinstance(value, int):
+ return int
+ return value
+
+ test_info_dict = {
+ key: sanitize(key, value) for key, value in got_dict.items()
+ if value is not None and key not in IGNORED_FIELDS and not any(
+ key.startswith(f'{prefix}_') for prefix in IGNORED_PREFIXES)
+ }
+
+ # display_id may be generated from id
+ if test_info_dict.get('display_id') == test_info_dict.get('id'):
+ test_info_dict.pop('display_id')
+
+ return test_info_dict
+
+
def expect_info_dict(self, got_dict, expected_dict):
expect_dict(self, got_dict, expected_dict)
# Check for the presence of mandatory fields
@@ -207,15 +246,15 @@ def expect_info_dict(self, got_dict, expected_dict):
for key in ['webpage_url', 'extractor', 'extractor_key']:
self.assertTrue(got_dict.get(key), 'Missing field: %s' % key)
- # Are checkable fields missing from the test case definition?
- test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
- for key, value in got_dict.items()
- if value and key in ('id', 'title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location', 'age_limit'))
+ test_info_dict = sanitize_got_info_dict(got_dict)
+
missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys())
if missing_keys:
def _repr(v):
if isinstance(v, compat_str):
return "'%s'" % v.replace('\\', '\\\\').replace("'", "\\'").replace('\n', '\\n')
+ elif isinstance(v, type):
+ return v.__name__
else:
return repr(v)
info_dict_str = ''
diff --git a/test/parameters.json b/test/parameters.json
index 9ca7d2c..06fe3e3 100644
--- a/test/parameters.json
+++ b/test/parameters.json
@@ -9,7 +9,7 @@
"forcetitle": false,
"forceurl": false,
"force_write_download_archive": false,
- "format": "best",
+ "format": "b/bv",
"ignoreerrors": false,
"listformats": null,
"logtostderr": false,
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index e892095..8494105 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -99,10 +99,10 @@ class TestInfoExtractor(unittest.TestCase):
self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True)
def test_search_json_ld_realworld(self):
- # https://github.com/ytdl-org/youtube-dl/issues/23306
- expect_dict(
- self,
- self.ie._search_json_ld(r'''<script type="application/ld+json">
+ _TESTS = [
+ # https://github.com/ytdl-org/youtube-dl/issues/23306
+ (
+ r'''<script type="application/ld+json">
{
"@context": "http://schema.org/",
"@type": "VideoObject",
@@ -135,17 +135,171 @@ class TestInfoExtractor(unittest.TestCase):
"name": "Kleio Valentien",
"url": "https://www.eporner.com/pornstar/kleio-valentien/"
}]}
-</script>''', None),
- {
- 'title': '1 On 1 With Kleio',
- 'description': 'Kleio Valentien',
- 'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4',
- 'timestamp': 1449347075,
- 'duration': 743.0,
- 'view_count': 1120958,
- 'width': 1920,
- 'height': 1080,
- })
+ </script>''',
+ {
+ 'title': '1 On 1 With Kleio',
+ 'description': 'Kleio Valentien',
+ 'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4',
+ 'timestamp': 1449347075,
+ 'duration': 743.0,
+ 'view_count': 1120958,
+ 'width': 1920,
+ 'height': 1080,
+ },
+ {},
+ ),
+ (
+ r'''<script type="application/ld+json">
+ {
+ "@context": "https://schema.org",
+ "@graph": [
+ {
+ "@type": "NewsArticle",
+ "mainEntityOfPage": {
+ "@type": "WebPage",
+ "@id": "https://www.ant1news.gr/Society/article/620286/symmoria-anilikon-dikigoros-thymaton-ithelan-na-toys-apoteleiosoyn"
+ },
+ "headline": "Συμμορία ανηλίκων – δικηγόρος θυμάτων: ήθελαν να τους αποτελειώσουν",
+ "name": "Συμμορία ανηλίκων – δικηγόρος θυμάτων: ήθελαν να τους αποτελειώσουν",
+ "description": "Τα παιδιά δέχθηκαν την επίθεση επειδή αρνήθηκαν να γίνουν μέλη της συμμορίας, ανέφερε ο Γ. Ζαχαρόπουλος.",
+ "image": {
+ "@type": "ImageObject",
+ "url": "https://ant1media.azureedge.net/imgHandler/1100/a635c968-be71-447c-bf9c-80d843ece21e.jpg",
+ "width": 1100,
+ "height": 756 },
+ "datePublished": "2021-11-10T08:50:00+03:00",
+ "dateModified": "2021-11-10T08:52:53+03:00",
+ "author": {
+ "@type": "Person",
+ "@id": "https://www.ant1news.gr/",
+ "name": "Ant1news",
+ "image": "https://www.ant1news.gr/images/logo-e5d7e4b3e714c88e8d2eca96130142f6.png",
+ "url": "https://www.ant1news.gr/"
+ },
+ "publisher": {
+ "@type": "Organization",
+ "@id": "https://www.ant1news.gr#publisher",
+ "name": "Ant1news",
+ "url": "https://www.ant1news.gr",
+ "logo": {
+ "@type": "ImageObject",
+ "url": "https://www.ant1news.gr/images/logo-e5d7e4b3e714c88e8d2eca96130142f6.png",
+ "width": 400,
+ "height": 400 },
+ "sameAs": [
+ "https://www.facebook.com/Ant1news.gr",
+ "https://twitter.com/antennanews",
+ "https://www.youtube.com/channel/UC0smvAbfczoN75dP0Hw4Pzw",
+ "https://www.instagram.com/ant1news/"
+ ]
+ },
+
+ "keywords": "μαχαίρωμα,συμμορία ανηλίκων,ΕΙΔΗΣΕΙΣ,ΕΙΔΗΣΕΙΣ ΣΗΜΕΡΑ,ΝΕΑ,Κοινωνία - Ant1news",
+
+
+ "articleSection": "Κοινωνία"
+ }
+ ]
+ }
+ </script>''',
+ {
+ 'timestamp': 1636523400,
+ 'title': 'md5:91fe569e952e4d146485740ae927662b',
+ },
+ {'expected_type': 'NewsArticle'},
+ ),
+ (
+ r'''<script type="application/ld+json">
+ {"url":"/vrtnu/a-z/het-journaal/2021/het-journaal-het-journaal-19u-20211231/",
+ "name":"Het journaal 19u",
+ "description":"Het journaal 19u van vrijdag 31 december 2021.",
+ "potentialAction":{"url":"https://vrtnu.page.link/pfVy6ihgCAJKgHqe8","@type":"ShareAction"},
+ "mainEntityOfPage":{"@id":"1640092242445","@type":"WebPage"},
+ "publication":[{
+ "startDate":"2021-12-31T19:00:00.000+01:00",
+ "endDate":"2022-01-30T23:55:00.000+01:00",
+ "publishedBy":{"name":"een","@type":"Organization"},
+ "publishedOn":{"url":"https://www.vrt.be/vrtnu/","name":"VRT NU","@type":"BroadcastService"},
+ "@id":"pbs-pub-3a7ec233-da95-4c1e-9b2b-cf5fdfebcbe8",
+ "@type":"BroadcastEvent"
+ }],
+ "video":{
+ "name":"Het journaal - Aflevering 365 (Seizoen 2021)",
+ "description":"Het journaal 19u van vrijdag 31 december 2021. Bekijk aflevering 365 van seizoen 2021 met VRT NU via de site of app.",
+ "thumbnailUrl":"//images.vrt.be/width1280/2021/12/31/80d5ed00-6a64-11ec-b07d-02b7b76bf47f.jpg",
+ "expires":"2022-01-30T23:55:00.000+01:00",
+ "hasPart":[
+ {"name":"Explosie Turnhout","startOffset":70,"@type":"Clip"},
+ {"name":"Jaarwisseling","startOffset":440,"@type":"Clip"},
+ {"name":"Natuurbranden Colorado","startOffset":1179,"@type":"Clip"},
+ {"name":"Klimaatverandering","startOffset":1263,"@type":"Clip"},
+ {"name":"Zacht weer","startOffset":1367,"@type":"Clip"},
+ {"name":"Financiële balans","startOffset":1383,"@type":"Clip"},
+ {"name":"Club Brugge","startOffset":1484,"@type":"Clip"},
+ {"name":"Mentale gezondheid bij topsporters","startOffset":1575,"@type":"Clip"},
+ {"name":"Olympische Winterspelen","startOffset":1728,"@type":"Clip"},
+ {"name":"Sober oudjaar in Nederland","startOffset":1873,"@type":"Clip"}
+ ],
+ "duration":"PT34M39.23S",
+ "uploadDate":"2021-12-31T19:00:00.000+01:00",
+ "@id":"vid-9457d0c6-b8ac-4aba-b5e1-15aa3a3295b5",
+ "@type":"VideoObject"
+ },
+ "genre":["Nieuws en actua"],
+ "episodeNumber":365,
+ "partOfSeries":{"name":"Het journaal","@id":"222831405527","@type":"TVSeries"},
+ "partOfSeason":{"name":"Seizoen 2021","@id":"961809365527","@type":"TVSeason"},
+ "@context":"https://schema.org","@id":"961685295527","@type":"TVEpisode"}</script>
+ ''',
+ {
+ 'chapters': [
+ {"title": "Explosie Turnhout", "start_time": 70, "end_time": 440},
+ {"title": "Jaarwisseling", "start_time": 440, "end_time": 1179},
+ {"title": "Natuurbranden Colorado", "start_time": 1179, "end_time": 1263},
+ {"title": "Klimaatverandering", "start_time": 1263, "end_time": 1367},
+ {"title": "Zacht weer", "start_time": 1367, "end_time": 1383},
+ {"title": "Financiële balans", "start_time": 1383, "end_time": 1484},
+ {"title": "Club Brugge", "start_time": 1484, "end_time": 1575},
+ {"title": "Mentale gezondheid bij topsporters", "start_time": 1575, "end_time": 1728},
+ {"title": "Olympische Winterspelen", "start_time": 1728, "end_time": 1873},
+ {"title": "Sober oudjaar in Nederland", "start_time": 1873, "end_time": 2079.23}
+ ],
+ 'title': 'Het journaal - Aflevering 365 (Seizoen 2021)'
+ }, {}
+ ),
+ (
+ # test multiple thumbnails in a list
+ r'''
+<script type="application/ld+json">
+{"@context":"https://schema.org",
+"@type":"VideoObject",
+"thumbnailUrl":["https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg"]}
+</script>''',
+ {
+ 'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}],
+ },
+ {},
+ ),
+ (
+ # test single thumbnail
+ r'''
+<script type="application/ld+json">
+{"@context":"https://schema.org",
+"@type":"VideoObject",
+"thumbnailUrl":"https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg"}
+</script>''',
+ {
+ 'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}],
+ },
+ {},
+ )
+ ]
+ for html, expected_dict, search_json_ld_kwargs in _TESTS:
+ expect_dict(
+ self,
+ self.ie._search_json_ld(html, None, **search_json_ld_kwargs),
+ expected_dict
+ )
def test_download_json(self):
uri = encode_data_uri(b'{"foo": "blah"}', 'application/json')
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index c9dd498..fe0fd35 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -30,8 +30,7 @@ class YDL(FakeYDL):
self.msgs = []
def process_info(self, info_dict):
- info_dict.pop('__original_infodict', None)
- self.downloaded_info_dicts.append(info_dict)
+ self.downloaded_info_dicts.append(info_dict.copy())
def to_screen(self, msg):
self.msgs.append(msg)
@@ -137,7 +136,7 @@ class TestFormatSelection(unittest.TestCase):
test('webm/mp4', '47')
test('3gp/40/mp4', '35')
test('example-with-dashes', 'example-with-dashes')
- test('all', '35', 'example-with-dashes', '45', '47', '2') # Order doesn't actually matter for this
+ test('all', '2', '47', '45', 'example-with-dashes', '35')
test('mergeall', '2+47+45+example-with-dashes+35', multi=True)
def test_format_selection_audio(self):
@@ -520,7 +519,7 @@ class TestFormatSelection(unittest.TestCase):
ydl = YDL({'format': 'all[width>=400][width<=600]'})
ydl.process_ie_result(info_dict)
downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts]
- self.assertEqual(downloaded_ids, ['B', 'C', 'D'])
+ self.assertEqual(downloaded_ids, ['D', 'C', 'B'])
ydl = YDL({'format': 'best[height<40]'})
try:
@@ -645,6 +644,7 @@ class TestYoutubeDL(unittest.TestCase):
'ext': 'mp4',
'width': None,
'height': 1080,
+ 'filesize': 1024,
'title1': '$PATH',
'title2': '%PATH%',
'title3': 'foo/bar\\test',
@@ -656,7 +656,7 @@ class TestYoutubeDL(unittest.TestCase):
'playlist_autonumber': 2,
'_last_playlist_index': 100,
'n_entries': 10,
- 'formats': [{'id': 'id1'}, {'id': 'id2'}, {'id': 'id3'}]
+ 'formats': [{'id': 'id 1'}, {'id': 'id 2'}, {'id': 'id 3'}]
}
def test_prepare_outtmpl_and_filename(self):
@@ -717,6 +717,7 @@ class TestYoutubeDL(unittest.TestCase):
test('%(id)s', '.abcd', info={'id': '.abcd'})
test('%(id)s', 'ab__cd', info={'id': 'ab__cd'})
test('%(id)s', ('ab:cd', 'ab -cd'), info={'id': 'ab:cd'})
+ test('%(id.0)s', '-', info={'id': '--'})
# Invalid templates
self.assertTrue(isinstance(YoutubeDL.validate_outtmpl('%(title)'), ValueError))
@@ -737,6 +738,7 @@ class TestYoutubeDL(unittest.TestCase):
test(NA_TEST_OUTTMPL, 'NA-NA-def-1234.mp4')
test(NA_TEST_OUTTMPL, 'none-none-def-1234.mp4', outtmpl_na_placeholder='none')
test(NA_TEST_OUTTMPL, '--def-1234.mp4', outtmpl_na_placeholder='')
+ test('%(non_existent.0)s', 'NA')
# String formatting
FMT_TEST_OUTTMPL = '%%(height)%s.%%(ext)s'
@@ -762,23 +764,33 @@ class TestYoutubeDL(unittest.TestCase):
test('a%(width|)d', 'a', outtmpl_na_placeholder='none')
FORMATS = self.outtmpl_info['formats']
- sanitize = lambda x: x.replace(':', ' -').replace('"', "'")
+ sanitize = lambda x: x.replace(':', ' -').replace('"', "'").replace('\n', ' ')
# Custom type casting
- test('%(formats.:.id)l', 'id1, id2, id3')
- test('%(formats.:.id)#l', ('id1\nid2\nid3', 'id1 id2 id3'))
+ test('%(formats.:.id)l', 'id 1, id 2, id 3')
+ test('%(formats.:.id)#l', ('id 1\nid 2\nid 3', 'id 1 id 2 id 3'))
test('%(ext)l', 'mp4')
- test('%(formats.:.id) 15l', ' id1, id2, id3')
+ test('%(formats.:.id) 18l', ' id 1, id 2, id 3')
test('%(formats)j', (json.dumps(FORMATS), sanitize(json.dumps(FORMATS))))
+ test('%(formats)#j', (json.dumps(FORMATS, indent=4), sanitize(json.dumps(FORMATS, indent=4))))
test('%(title5).3B', 'á')
test('%(title5)U', 'áéí 𝐀')
test('%(title5)#U', 'a\u0301e\u0301i\u0301 𝐀')
test('%(title5)+U', 'áéí A')
test('%(title5)+#U', 'a\u0301e\u0301i\u0301 A')
+ test('%(height)D', '1k')
+ test('%(filesize)#D', '1Ki')
+ test('%(height)5.2D', ' 1.08k')
+ test('%(title4)#S', 'foo_bar_test')
+ test('%(title4).10S', ('foo \'bar\' ', 'foo \'bar\'' + ('#' if compat_os_name == 'nt' else ' ')))
if compat_os_name == 'nt':
test('%(title4)q', ('"foo \\"bar\\" test"', "'foo _'bar_' test'"))
+ test('%(formats.:.id)#q', ('"id 1" "id 2" "id 3"', "'id 1' 'id 2' 'id 3'"))
+ test('%(formats.0.id)#q', ('"id 1"', "'id 1'"))
else:
test('%(title4)q', ('\'foo "bar" test\'', "'foo 'bar' test'"))
+ test('%(formats.:.id)#q', "'id 1' 'id 2' 'id 3'")
+ test('%(formats.0.id)#q', "'id 1'")
# Internal formatting
test('%(timestamp-1000>%H-%M-%S)s', '11-43-20')
@@ -802,6 +814,13 @@ class TestYoutubeDL(unittest.TestCase):
test('%(width-100,height+width|def)s', 'def')
test('%(timestamp-x>%H\\,%M\\,%S,timestamp>%H\\,%M\\,%S)s', '12,00,00')
+ # Replacement
+ test('%(id&foo)s.bar', 'foo.bar')
+ test('%(title&foo)s.bar', 'NA.bar')
+ test('%(title&foo|baz)s.bar', 'baz.bar')
+ test('%(x,id&foo|baz)s.bar', 'foo.bar')
+ test('%(x,title&foo|baz)s.bar', 'baz.bar')
+
# Laziness
def gen():
yield from range(5)
@@ -879,20 +898,6 @@ class TestYoutubeDL(unittest.TestCase):
os.unlink(filename)
def test_match_filter(self):
- class FilterYDL(YDL):
- def __init__(self, *args, **kwargs):
- super(FilterYDL, self).__init__(*args, **kwargs)
- self.params['simulate'] = True
-
- def process_info(self, info_dict):
- super(YDL, self).process_info(info_dict)
-
- def _match_entry(self, info_dict, incomplete=False):
- res = super(FilterYDL, self)._match_entry(info_dict, incomplete)
- if res is None:
- self.downloaded_info_dicts.append(info_dict)
- return res
-
first = {
'id': '1',
'url': TEST_URL,
@@ -920,7 +925,7 @@ class TestYoutubeDL(unittest.TestCase):
videos = [first, second]
def get_videos(filter_=None):
- ydl = FilterYDL({'match_filter': filter_})
+ ydl = YDL({'match_filter': filter_, 'simulate': True})
for v in videos:
ydl.process_ie_result(v, download=True)
return [v['id'] for v in ydl.downloaded_info_dicts]
@@ -928,7 +933,7 @@ class TestYoutubeDL(unittest.TestCase):
res = get_videos()
self.assertEqual(res, ['1', '2'])
- def f(v):
+ def f(v, incomplete):
if v['id'] == '1':
return None
else:
@@ -1135,6 +1140,7 @@ class TestYoutubeDL(unittest.TestCase):
self.assertTrue(entries[1] is None)
self.assertEqual(len(ydl.downloaded_info_dicts), 1)
downloaded = ydl.downloaded_info_dicts[0]
+ entries[2].pop('requested_downloads', None)
self.assertEqual(entries[2], downloaded)
self.assertEqual(downloaded['url'], TEST_URL)
self.assertEqual(downloaded['title'], 'Video Transparent 2')
diff --git a/test/test_aes.py b/test/test_aes.py
index 746e447..9d260b5 100644
--- a/test/test_aes.py
+++ b/test/test_aes.py
@@ -10,6 +10,8 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from hypervideo_dl.aes import (
aes_decrypt,
aes_encrypt,
+ aes_ecb_encrypt,
+ aes_ecb_decrypt,
aes_cbc_decrypt,
aes_cbc_decrypt_bytes,
aes_cbc_encrypt,
@@ -17,7 +19,8 @@ from hypervideo_dl.aes import (
aes_ctr_encrypt,
aes_gcm_decrypt_and_verify,
aes_gcm_decrypt_and_verify_bytes,
- aes_decrypt_text
+ aes_decrypt_text,
+ BLOCK_SIZE_BYTES,
)
from hypervideo_dl.compat import compat_pycrypto_AES
from hypervideo_dl.utils import bytes_to_intlist, intlist_to_bytes
@@ -94,6 +97,19 @@ class TestAES(unittest.TestCase):
decrypted = (aes_decrypt_text(encrypted, password, 32))
self.assertEqual(decrypted, self.secret_msg)
+ def test_ecb_encrypt(self):
+ data = bytes_to_intlist(self.secret_msg)
+ data += [0x08] * (BLOCK_SIZE_BYTES - len(data) % BLOCK_SIZE_BYTES)
+ encrypted = intlist_to_bytes(aes_ecb_encrypt(data, self.key, self.iv))
+ self.assertEqual(
+ encrypted,
+ b'\xaa\x86]\x81\x97>\x02\x92\x9d\x1bR[[L/u\xd3&\xd1(h\xde{\x81\x94\xba\x02\xae\xbd\xa6\xd0:')
+
+ def test_ecb_decrypt(self):
+ data = bytes_to_intlist(b'\xaa\x86]\x81\x97>\x02\x92\x9d\x1bR[[L/u\xd3&\xd1(h\xde{\x81\x94\xba\x02\xae\xbd\xa6\xd0:')
+ decrypted = intlist_to_bytes(aes_ecb_decrypt(data, self.key, self.iv))
+ self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg)
+
if __name__ == '__main__':
unittest.main()
diff --git a/test/test_all_urls.py b/test/test_all_urls.py
index d9e4bad..74634cb 100644
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -38,7 +38,6 @@ class TestAllURLsMatching(unittest.TestCase):
assertTab('https://www.youtube.com/AsapSCIENCE')
assertTab('https://www.youtube.com/embedded')
assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
- assertTab('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668
self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M'))
diff --git a/test/test_cookies.py b/test/test_cookies.py
index fb034fc..053e45b 100644
--- a/test/test_cookies.py
+++ b/test/test_cookies.py
@@ -8,6 +8,8 @@ from hypervideo_dl.cookies import (
WindowsChromeCookieDecryptor,
parse_safari_cookies,
pbkdf2_sha1,
+ _get_linux_desktop_environment,
+ _LinuxDesktopEnvironment,
)
@@ -42,6 +44,37 @@ class MonkeyPatch:
class TestCookies(unittest.TestCase):
+ def test_get_desktop_environment(self):
+ """ based on https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util_unittest.cc """
+ test_cases = [
+ ({}, _LinuxDesktopEnvironment.OTHER),
+
+ ({'DESKTOP_SESSION': 'gnome'}, _LinuxDesktopEnvironment.GNOME),
+ ({'DESKTOP_SESSION': 'mate'}, _LinuxDesktopEnvironment.GNOME),
+ ({'DESKTOP_SESSION': 'kde4'}, _LinuxDesktopEnvironment.KDE),
+ ({'DESKTOP_SESSION': 'kde'}, _LinuxDesktopEnvironment.KDE),
+ ({'DESKTOP_SESSION': 'xfce'}, _LinuxDesktopEnvironment.XFCE),
+
+ ({'GNOME_DESKTOP_SESSION_ID': 1}, _LinuxDesktopEnvironment.GNOME),
+ ({'KDE_FULL_SESSION': 1}, _LinuxDesktopEnvironment.KDE),
+
+ ({'XDG_CURRENT_DESKTOP': 'X-Cinnamon'}, _LinuxDesktopEnvironment.CINNAMON),
+ ({'XDG_CURRENT_DESKTOP': 'GNOME'}, _LinuxDesktopEnvironment.GNOME),
+ ({'XDG_CURRENT_DESKTOP': 'GNOME:GNOME-Classic'}, _LinuxDesktopEnvironment.GNOME),
+ ({'XDG_CURRENT_DESKTOP': 'GNOME : GNOME-Classic'}, _LinuxDesktopEnvironment.GNOME),
+
+ ({'XDG_CURRENT_DESKTOP': 'Unity', 'DESKTOP_SESSION': 'gnome-fallback'}, _LinuxDesktopEnvironment.GNOME),
+ ({'XDG_CURRENT_DESKTOP': 'KDE', 'KDE_SESSION_VERSION': '5'}, _LinuxDesktopEnvironment.KDE),
+ ({'XDG_CURRENT_DESKTOP': 'KDE'}, _LinuxDesktopEnvironment.KDE),
+ ({'XDG_CURRENT_DESKTOP': 'Pantheon'}, _LinuxDesktopEnvironment.PANTHEON),
+ ({'XDG_CURRENT_DESKTOP': 'Unity'}, _LinuxDesktopEnvironment.UNITY),
+ ({'XDG_CURRENT_DESKTOP': 'Unity:Unity7'}, _LinuxDesktopEnvironment.UNITY),
+ ({'XDG_CURRENT_DESKTOP': 'Unity:Unity8'}, _LinuxDesktopEnvironment.UNITY),
+ ]
+
+ for env, expected_desktop_environment in test_cases:
+ self.assertEqual(_get_linux_desktop_environment(env), expected_desktop_environment)
+
def test_chrome_cookie_decryptor_linux_derive_key(self):
key = LinuxChromeCookieDecryptor.derive_key(b'abc')
self.assertEqual(key, b'7\xa1\xec\xd4m\xfcA\xc7\xb19Z\xd0\x19\xdcM\x17')
@@ -58,8 +91,7 @@ class TestCookies(unittest.TestCase):
self.assertEqual(decryptor.decrypt(encrypted_value), value)
def test_chrome_cookie_decryptor_linux_v11(self):
- with MonkeyPatch(cookies, {'_get_linux_keyring_password': lambda *args, **kwargs: b'',
- 'KEYRING_AVAILABLE': True}):
+ with MonkeyPatch(cookies, {'_get_linux_keyring_password': lambda *args, **kwargs: b''}):
encrypted_value = b'v11#\x81\x10>`w\x8f)\xc0\xb2\xc1\r\xf4\x1al\xdd\x93\xfd\xf8\xf8N\xf2\xa9\x83\xf1\xe9o\x0elVQd'
value = 'tz=Europe.London'
decryptor = LinuxChromeCookieDecryptor('Chrome', Logger())
diff --git a/test/test_download.py b/test/test_download.py
index 8b5eea5..3cca13b 100644..100755
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -53,7 +53,7 @@ class YoutubeDL(hypervideo_dl.YoutubeDL):
raise ExtractorError(message)
def process_info(self, info_dict):
- self.processed_info_dicts.append(info_dict)
+ self.processed_info_dicts.append(info_dict.copy())
return super(YoutubeDL, self).process_info(info_dict)
diff --git a/test/test_netrc.py b/test/test_netrc.py
index 50b9e5b..c7f5272 100644
--- a/test/test_netrc.py
+++ b/test/test_netrc.py
@@ -7,18 +7,19 @@ import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from hypervideo_dl.extractor import (
- gen_extractors,
-)
+from hypervideo_dl.extractor import gen_extractor_classes
+from hypervideo_dl.extractor.common import InfoExtractor
+
+NO_LOGIN = InfoExtractor._perform_login
class TestNetRc(unittest.TestCase):
def test_netrc_present(self):
- for ie in gen_extractors():
- if not hasattr(ie, '_login'):
+ for ie in gen_extractor_classes():
+ if ie._perform_login is NO_LOGIN:
continue
self.assertTrue(
- hasattr(ie, '_NETRC_MACHINE'),
+ ie._NETRC_MACHINE,
'Extractor %s supports login, but is missing a _NETRC_MACHINE property' % ie.IE_NAME)
diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py
index 42f37b8..e0b8347 100644
--- a/test/test_postprocessors.py
+++ b/test/test_postprocessors.py
@@ -124,11 +124,11 @@ class TestModifyChaptersPP(unittest.TestCase):
chapters = self._chapters([70], ['c']) + [
self._sponsor_chapter(10, 20, 'sponsor'),
self._sponsor_chapter(30, 40, 'preview'),
- self._sponsor_chapter(50, 60, 'sponsor')]
+ self._sponsor_chapter(50, 60, 'filler')]
expected = self._chapters(
[10, 20, 30, 40, 50, 60, 70],
['c', '[SponsorBlock]: Sponsor', 'c', '[SponsorBlock]: Preview/Recap',
- 'c', '[SponsorBlock]: Sponsor', 'c'])
+ 'c', '[SponsorBlock]: Filler Tangent', 'c'])
self._remove_marked_arrange_sponsors_test_impl(chapters, expected, [])
def test_remove_marked_arrange_sponsors_UniqueNamesForOverlappingSponsors(self):
diff --git a/test/test_subtitles.py b/test/test_subtitles.py
index e94df35..10fa0ca 100644
--- a/test/test_subtitles.py
+++ b/test/test_subtitles.py
@@ -13,7 +13,7 @@ from test.helper import FakeYDL, md5, is_download_test
from hypervideo_dl.extractor import (
YoutubeIE,
DailymotionIE,
- TEDIE,
+ TedTalkIE,
VimeoIE,
WallaIE,
CeskaTelevizeIE,
@@ -141,7 +141,7 @@ class TestDailymotionSubtitles(BaseTestSubtitles):
@is_download_test
class TestTedSubtitles(BaseTestSubtitles):
url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html'
- IE = TEDIE
+ IE = TedTalkIE
def test_allsubtitles(self):
self.DL.params['writesubtitles'] = True
diff --git a/test/test_utils.py b/test/test_utils.py
index 1cd2b2f..039900c 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -23,6 +23,7 @@ from hypervideo_dl.utils import (
caesar,
clean_html,
clean_podcast_url,
+ Config,
date_from_str,
datetime_from_str,
DateRange,
@@ -37,11 +38,18 @@ from hypervideo_dl.utils import (
ExtractorError,
find_xpath_attr,
fix_xml_ampersands,
+ format_bytes,
float_or_none,
get_element_by_class,
get_element_by_attribute,
get_elements_by_class,
get_elements_by_attribute,
+ get_element_html_by_class,
+ get_element_html_by_attribute,
+ get_elements_html_by_class,
+ get_elements_html_by_attribute,
+ get_elements_text_and_html_by_attribute,
+ get_element_text_and_html_by_tag,
InAdvancePagedList,
int_or_none,
intlist_to_bytes,
@@ -116,6 +124,7 @@ from hypervideo_dl.compat import (
compat_chr,
compat_etree_fromstring,
compat_getenv,
+ compat_HTMLParseError,
compat_os_name,
compat_setenv,
)
@@ -151,10 +160,12 @@ class TestUtil(unittest.TestCase):
sanitize_filename('New World record at 0:12:34'),
'New World record at 0_12_34')
- self.assertEqual(sanitize_filename('--gasdgf'), '_-gasdgf')
+ self.assertEqual(sanitize_filename('--gasdgf'), '--gasdgf')
self.assertEqual(sanitize_filename('--gasdgf', is_id=True), '--gasdgf')
- self.assertEqual(sanitize_filename('.gasdgf'), 'gasdgf')
+ self.assertEqual(sanitize_filename('--gasdgf', is_id=False), '_-gasdgf')
+ self.assertEqual(sanitize_filename('.gasdgf'), '.gasdgf')
self.assertEqual(sanitize_filename('.gasdgf', is_id=True), '.gasdgf')
+ self.assertEqual(sanitize_filename('.gasdgf', is_id=False), 'gasdgf')
forbidden = '"\0\\/'
for fc in forbidden:
@@ -616,6 +627,8 @@ class TestUtil(unittest.TestCase):
self.assertEqual(parse_duration('3h 11m 53s'), 11513)
self.assertEqual(parse_duration('3 hours 11 minutes 53 seconds'), 11513)
self.assertEqual(parse_duration('3 hours 11 mins 53 secs'), 11513)
+ self.assertEqual(parse_duration('3 hours, 11 minutes, 53 seconds'), 11513)
+ self.assertEqual(parse_duration('3 hours, 11 mins, 53 secs'), 11513)
self.assertEqual(parse_duration('62m45s'), 3765)
self.assertEqual(parse_duration('6m59s'), 419)
self.assertEqual(parse_duration('49s'), 49)
@@ -634,6 +647,8 @@ class TestUtil(unittest.TestCase):
self.assertEqual(parse_duration('PT1H0.040S'), 3600.04)
self.assertEqual(parse_duration('PT00H03M30SZ'), 210)
self.assertEqual(parse_duration('P0Y0M0DT0H4M20.880S'), 260.88)
+ self.assertEqual(parse_duration('01:02:03:050'), 3723.05)
+ self.assertEqual(parse_duration('103:050'), 103.05)
def test_fix_xml_ampersands(self):
self.assertEqual(
@@ -1122,7 +1137,7 @@ class TestUtil(unittest.TestCase):
def test_clean_html(self):
self.assertEqual(clean_html('a:\nb'), 'a: b')
- self.assertEqual(clean_html('a:\n "b"'), 'a: "b"')
+ self.assertEqual(clean_html('a:\n "b"'), 'a: "b"')
self.assertEqual(clean_html('a<br>\xa0b'), 'a\nb')
def test_intlist_to_bytes(self):
@@ -1156,19 +1171,29 @@ class TestUtil(unittest.TestCase):
self.assertEqual(parse_count('1000'), 1000)
self.assertEqual(parse_count('1.000'), 1000)
self.assertEqual(parse_count('1.1k'), 1100)
+ self.assertEqual(parse_count('1.1 k'), 1100)
+ self.assertEqual(parse_count('1,1 k'), 1100)
self.assertEqual(parse_count('1.1kk'), 1100000)
self.assertEqual(parse_count('1.1kk '), 1100000)
+ self.assertEqual(parse_count('1,1kk'), 1100000)
+ self.assertEqual(parse_count('100 views'), 100)
+ self.assertEqual(parse_count('1,100 views'), 1100)
self.assertEqual(parse_count('1.1kk views'), 1100000)
+ self.assertEqual(parse_count('10M views'), 10000000)
+ self.assertEqual(parse_count('has 10M views'), 10000000)
def test_parse_resolution(self):
self.assertEqual(parse_resolution(None), {})
self.assertEqual(parse_resolution(''), {})
- self.assertEqual(parse_resolution('1920x1080'), {'width': 1920, 'height': 1080})
- self.assertEqual(parse_resolution('1920×1080'), {'width': 1920, 'height': 1080})
+ self.assertEqual(parse_resolution(' 1920x1080'), {'width': 1920, 'height': 1080})
+ self.assertEqual(parse_resolution('1920×1080 '), {'width': 1920, 'height': 1080})
self.assertEqual(parse_resolution('1920 x 1080'), {'width': 1920, 'height': 1080})
self.assertEqual(parse_resolution('720p'), {'height': 720})
self.assertEqual(parse_resolution('4k'), {'height': 2160})
self.assertEqual(parse_resolution('8K'), {'height': 4320})
+ self.assertEqual(parse_resolution('pre_1920x1080_post'), {'width': 1920, 'height': 1080})
+ self.assertEqual(parse_resolution('ep1x2'), {})
+ self.assertEqual(parse_resolution('1920, 1080'), {'width': 1920, 'height': 1080})
def test_parse_bitrate(self):
self.assertEqual(parse_bitrate(None), None)
@@ -1219,12 +1244,49 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')
def test_render_table(self):
self.assertEqual(
render_table(
+ ['a', 'empty', 'bcd'],
+ [[123, '', 4], [9999, '', 51]]),
+ 'a empty bcd\n'
+ '123 4\n'
+ '9999 51')
+
+ self.assertEqual(
+ render_table(
+ ['a', 'empty', 'bcd'],
+ [[123, '', 4], [9999, '', 51]],
+ hide_empty=True),
+ 'a bcd\n'
+ '123 4\n'
+ '9999 51')
+
+ self.assertEqual(
+ render_table(
+ ['\ta', 'bcd'],
+ [['1\t23', 4], ['\t9999', 51]]),
+ ' a bcd\n'
+ '1 23 4\n'
+ '9999 51')
+
+ self.assertEqual(
+ render_table(
['a', 'bcd'],
- [[123, 4], [9999, 51]]),
+ [[123, 4], [9999, 51]],
+ delim='-'),
'a bcd\n'
+ '--------\n'
'123 4\n'
'9999 51')
+ self.assertEqual(
+ render_table(
+ ['a', 'bcd'],
+ [[123, 4], [9999, 51]],
+ delim='-', extra_gap=2),
+ 'a bcd\n'
+ '----------\n'
+ '123 4\n'
+ '9999 51')
+
def test_match_str(self):
# Unary
self.assertFalse(match_str('xy', {'x': 1200}))
@@ -1390,21 +1452,21 @@ The first line
</body>
</tt>'''.encode('utf-8')
srt_data = '''1
-00:00:02,080 --> 00:00:05,839
+00:00:02,080 --> 00:00:05,840
<font color="white" face="sansSerif" size="16">default style<font color="red">custom style</font></font>
2
-00:00:02,080 --> 00:00:05,839
+00:00:02,080 --> 00:00:05,840
<b><font color="cyan" face="sansSerif" size="16"><font color="lime">part 1
</font>part 2</font></b>
3
-00:00:05,839 --> 00:00:09,560
+00:00:05,840 --> 00:00:09,560
<u><font color="lime">line 3
part 3</font></u>
4
-00:00:09,560 --> 00:00:12,359
+00:00:09,560 --> 00:00:12,360
<i><u><font color="yellow"><font color="lime">inner
</font>style</font></u></i>
@@ -1526,46 +1588,116 @@ Line 1
self.assertEqual(urshift(3, 1), 1)
self.assertEqual(urshift(-3, 1), 2147483646)
+ GET_ELEMENT_BY_CLASS_TEST_STRING = '''
+ <span class="foo bar">nice</span>
+ '''
+
def test_get_element_by_class(self):
- html = '''
- <span class="foo bar">nice</span>
- '''
+ html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
self.assertEqual(get_element_by_class('foo', html), 'nice')
self.assertEqual(get_element_by_class('no-such-class', html), None)
+ def test_get_element_html_by_class(self):
+ html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_element_html_by_class('foo', html), html.strip())
+ self.assertEqual(get_element_by_class('no-such-class', html), None)
+
+ GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
+ <div itemprop="author" itemscope>foo</div>
+ '''
+
def test_get_element_by_attribute(self):
- html = '''
- <span class="foo bar">nice</span>
- '''
+ html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice')
self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
- html = '''
- <div itemprop="author" itemscope>foo</div>
- '''
+ html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')
+ def test_get_element_html_by_attribute(self):
+ html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), html.strip())
+ self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None)
+ self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None)
+
+ html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
+
+ self.assertEqual(get_element_html_by_attribute('itemprop', 'author', html), html.strip())
+
+ GET_ELEMENTS_BY_CLASS_TEST_STRING = '''
+ <span class="foo bar">nice</span><span class="foo bar">also nice</span>
+ '''
+ GET_ELEMENTS_BY_CLASS_RES = ['<span class="foo bar">nice</span>', '<span class="foo bar">also nice</span>']
+
def test_get_elements_by_class(self):
- html = '''
- <span class="foo bar">nice</span><span class="foo bar">also nice</span>
- '''
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice'])
self.assertEqual(get_elements_by_class('no-such-class', html), [])
+ def test_get_elements_html_by_class(self):
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_elements_html_by_class('foo', html), self.GET_ELEMENTS_BY_CLASS_RES)
+ self.assertEqual(get_elements_html_by_class('no-such-class', html), [])
+
def test_get_elements_by_attribute(self):
- html = '''
- <span class="foo bar">nice</span><span class="foo bar">also nice</span>
- '''
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice'])
self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])
+ def test_get_elements_html_by_attribute(self):
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_elements_html_by_attribute('class', 'foo bar', html), self.GET_ELEMENTS_BY_CLASS_RES)
+ self.assertEqual(get_elements_html_by_attribute('class', 'foo', html), [])
+ self.assertEqual(get_elements_html_by_attribute('class', 'no-such-foo', html), [])
+
+ def test_get_elements_text_and_html_by_attribute(self):
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+ self.assertEqual(
+ list(get_elements_text_and_html_by_attribute('class', 'foo bar', html)),
+ list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES)))
+ self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'foo', html)), [])
+ self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html)), [])
+
+ GET_ELEMENT_BY_TAG_TEST_STRING = '''
+ random text lorem ipsum</p>
+ <div>
+ this should be returned
+ <span>this should also be returned</span>
+ <div>
+ this should also be returned
+ </div>
+ closing tag above should not trick, so this should also be returned
+ </div>
+ but this text should not be returned
+ '''
+ GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[32:276]
+ GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT = GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML[5:-6]
+ GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[78:119]
+ GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT = GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML[6:-7]
+
+ def test_get_element_text_and_html_by_tag(self):
+ html = self.GET_ELEMENT_BY_TAG_TEST_STRING
+
+ self.assertEqual(
+ get_element_text_and_html_by_tag('div', html),
+ (self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT, self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML))
+ self.assertEqual(
+ get_element_text_and_html_by_tag('span', html),
+ (self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML))
+ self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
+
def test_iri_to_uri(self):
self.assertEqual(
iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
@@ -1617,9 +1749,9 @@ Line 1
self.assertEqual(repr(LazyList(it)), repr(it))
self.assertEqual(str(LazyList(it)), str(it))
- self.assertEqual(list(LazyList(it).reverse()), it[::-1])
- self.assertEqual(list(LazyList(it).reverse()[1:3:7]), it[::-1][1:3:7])
- self.assertEqual(list(LazyList(it).reverse()[::-1]), it)
+ self.assertEqual(list(LazyList(it, reverse=True)), it[::-1])
+ self.assertEqual(list(reversed(LazyList(it))[::-1]), it)
+ self.assertEqual(list(reversed(LazyList(it))[1:3:7]), it[::-1][1:3:7])
def test_LazyList_laziness(self):
@@ -1632,15 +1764,37 @@ Line 1
test(ll, 5, 5, range(6))
test(ll, -3, 7, range(10))
- ll = LazyList(range(10)).reverse()
+ ll = LazyList(range(10), reverse=True)
test(ll, -1, 0, range(1))
test(ll, 3, 6, range(10))
ll = LazyList(itertools.count())
test(ll, 10, 10, range(11))
- ll.reverse()
+ ll = reversed(ll)
test(ll, -15, 14, range(15))
+ def test_format_bytes(self):
+ self.assertEqual(format_bytes(0), '0.00B')
+ self.assertEqual(format_bytes(1000), '1000.00B')
+ self.assertEqual(format_bytes(1024), '1.00KiB')
+ self.assertEqual(format_bytes(1024**2), '1.00MiB')
+ self.assertEqual(format_bytes(1024**3), '1.00GiB')
+ self.assertEqual(format_bytes(1024**4), '1.00TiB')
+ self.assertEqual(format_bytes(1024**5), '1.00PiB')
+ self.assertEqual(format_bytes(1024**6), '1.00EiB')
+ self.assertEqual(format_bytes(1024**7), '1.00ZiB')
+ self.assertEqual(format_bytes(1024**8), '1.00YiB')
+ self.assertEqual(format_bytes(1024**9), '1024.00YiB')
+
+ def test_hide_login_info(self):
+ self.assertEqual(Config.hide_login_info(['-u', 'foo', '-p', 'bar']),
+ ['-u', 'PRIVATE', '-p', 'PRIVATE'])
+ self.assertEqual(Config.hide_login_info(['-u']), ['-u'])
+ self.assertEqual(Config.hide_login_info(['-u', 'foo', '-u', 'bar']),
+ ['-u', 'PRIVATE', '-u', 'PRIVATE'])
+ self.assertEqual(Config.hide_login_info(['--username=foo']),
+ ['--username=PRIVATE'])
+
if __name__ == '__main__':
unittest.main()
diff --git a/test/test_verbose_output.py b/test/test_verbose_output.py
index 050fd76..98c6d70 100644
--- a/test/test_verbose_output.py
+++ b/test/test_verbose_output.py
@@ -19,52 +19,52 @@ class TestVerboseOutput(unittest.TestCase):
[
sys.executable, 'hypervideo_dl/__main__.py', '-v',
'--username', 'johnsmith@gmail.com',
- '--password', 'secret',
+ '--password', 'my_secret_password',
], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
sout, serr = outp.communicate()
self.assertTrue(b'--username' in serr)
self.assertTrue(b'johnsmith' not in serr)
self.assertTrue(b'--password' in serr)
- self.assertTrue(b'secret' not in serr)
+ self.assertTrue(b'my_secret_password' not in serr)
def test_private_info_shortarg(self):
outp = subprocess.Popen(
[
sys.executable, 'hypervideo_dl/__main__.py', '-v',
'-u', 'johnsmith@gmail.com',
- '-p', 'secret',
+ '-p', 'my_secret_password',
], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
sout, serr = outp.communicate()
self.assertTrue(b'-u' in serr)
self.assertTrue(b'johnsmith' not in serr)
self.assertTrue(b'-p' in serr)
- self.assertTrue(b'secret' not in serr)
+ self.assertTrue(b'my_secret_password' not in serr)
def test_private_info_eq(self):
outp = subprocess.Popen(
[
sys.executable, 'hypervideo_dl/__main__.py', '-v',
'--username=johnsmith@gmail.com',
- '--password=secret',
+ '--password=my_secret_password',
], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
sout, serr = outp.communicate()
self.assertTrue(b'--username' in serr)
self.assertTrue(b'johnsmith' not in serr)
self.assertTrue(b'--password' in serr)
- self.assertTrue(b'secret' not in serr)
+ self.assertTrue(b'my_secret_password' not in serr)
def test_private_info_shortarg_eq(self):
outp = subprocess.Popen(
[
sys.executable, 'hypervideo_dl/__main__.py', '-v',
'-u=johnsmith@gmail.com',
- '-p=secret',
+ '-p=my_secret_password',
], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
sout, serr = outp.communicate()
self.assertTrue(b'-u' in serr)
self.assertTrue(b'johnsmith' not in serr)
self.assertTrue(b'-p' in serr)
- self.assertTrue(b'secret' not in serr)
+ self.assertTrue(b'my_secret_password' not in serr)
if __name__ == '__main__':
diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py
index 2da1a50..b94b733 100644
--- a/test/test_youtube_lists.py
+++ b/test/test_youtube_lists.py
@@ -9,11 +9,9 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import FakeYDL, is_download_test
-
from hypervideo_dl.extractor import (
- YoutubePlaylistIE,
- YoutubeTabIE,
YoutubeIE,
+ YoutubeTabIE,
)
@@ -26,38 +24,20 @@ class TestYoutubeLists(unittest.TestCase):
def test_youtube_playlist_noplaylist(self):
dl = FakeYDL()
dl.params['noplaylist'] = True
- ie = YoutubePlaylistIE(dl)
- result = ie.extract('https://www.youtube.com/watch?v=FXxLjLQi3Fg&list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')
+ ie = YoutubeTabIE(dl)
+ result = ie.extract('https://www.youtube.com/watch?v=OmJ-4B-mS-Y&list=PLydZ2Hrp_gPRJViZjLFKaBMgCQOYEEkyp&index=2')
self.assertEqual(result['_type'], 'url')
- self.assertEqual(YoutubeIE().extract_id(result['url']), 'FXxLjLQi3Fg')
-
- def test_youtube_course(self):
- dl = FakeYDL()
- ie = YoutubePlaylistIE(dl)
- # TODO find a > 100 (paginating?) videos course
- result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
- entries = list(result['entries'])
- self.assertEqual(YoutubeIE().extract_id(entries[0]['url']), 'j9WZyLZCBzs')
- self.assertEqual(len(entries), 25)
- self.assertEqual(YoutubeIE().extract_id(entries[-1]['url']), 'rYefUsYuEp0')
+ self.assertEqual(result['ie_key'], YoutubeIE.ie_key())
+ self.assertEqual(YoutubeIE.extract_id(result['url']), 'OmJ-4B-mS-Y')
def test_youtube_mix(self):
dl = FakeYDL()
- ie = YoutubePlaylistIE(dl)
- result = ie.extract('https://www.youtube.com/watch?v=W01L70IGBgE&index=2&list=RDOQpdSVF_k_w')
- entries = result['entries']
+ ie = YoutubeTabIE(dl)
+ result = ie.extract('https://www.youtube.com/watch?v=tyITL_exICo&list=RDCLAK5uy_kLWIr9gv1XLlPbaDS965-Db4TrBoUTxQ8')
+ entries = list(result['entries'])
self.assertTrue(len(entries) >= 50)
original_video = entries[0]
- self.assertEqual(original_video['id'], 'OQpdSVF_k_w')
-
- def test_youtube_toptracks(self):
- print('Skipping: The playlist page gives error 500')
- return
- dl = FakeYDL()
- ie = YoutubePlaylistIE(dl)
- result = ie.extract('https://www.youtube.com/playlist?list=MCUS')
- entries = result['entries']
- self.assertEqual(len(entries), 100)
+ self.assertEqual(original_video['id'], 'tyITL_exICo')
def test_youtube_flat_playlist_extraction(self):
dl = FakeYDL()
@@ -68,10 +48,10 @@ class TestYoutubeLists(unittest.TestCase):
entries = list(result['entries'])
self.assertTrue(len(entries) == 1)
video = entries[0]
- self.assertEqual(video['_type'], 'url_transparent')
+ self.assertEqual(video['_type'], 'url')
self.assertEqual(video['ie_key'], 'Youtube')
self.assertEqual(video['id'], 'BaW_jenozKc')
- self.assertEqual(video['url'], 'BaW_jenozKc')
+ self.assertEqual(video['url'], 'https://www.youtube.com/watch?v=BaW_jenozKc')
self.assertEqual(video['title'], 'youtube-dl test video "\'/\\ä↭𝕐')
self.assertEqual(video['duration'], 10)
self.assertEqual(video['uploader'], 'Philipp Hagemeister')