aboutsummaryrefslogtreecommitdiffstats
path: root/hypervideo_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'hypervideo_dl/extractor')
-rw-r--r--hypervideo_dl/extractor/__init__.py9
-rw-r--r--hypervideo_dl/extractor/abc.py104
-rw-r--r--hypervideo_dl/extractor/abcnews.py3
-rw-r--r--hypervideo_dl/extractor/abcotvs.py3
-rw-r--r--hypervideo_dl/extractor/acast.py3
-rw-r--r--hypervideo_dl/extractor/adobepass.py155
-rw-r--r--hypervideo_dl/extractor/adobetv.py8
-rw-r--r--hypervideo_dl/extractor/adultswim.py3
-rw-r--r--hypervideo_dl/extractor/aenetworks.py11
-rw-r--r--hypervideo_dl/extractor/afreecatv.py23
-rw-r--r--hypervideo_dl/extractor/aljazeera.py5
-rw-r--r--hypervideo_dl/extractor/alura.py179
-rw-r--r--hypervideo_dl/extractor/amcnetworks.py69
-rw-r--r--hypervideo_dl/extractor/americastestkitchen.py5
-rw-r--r--hypervideo_dl/extractor/animelab.py285
-rw-r--r--hypervideo_dl/extractor/anvato.py25
-rw-r--r--hypervideo_dl/extractor/anvato_token_generator/__init__.py7
-rw-r--r--hypervideo_dl/extractor/anvato_token_generator/common.py6
-rw-r--r--hypervideo_dl/extractor/anvato_token_generator/nfl.py30
-rw-r--r--hypervideo_dl/extractor/aol.py9
-rw-r--r--hypervideo_dl/extractor/apa.py2
-rw-r--r--hypervideo_dl/extractor/aparat.py3
-rw-r--r--hypervideo_dl/extractor/appleconnect.py13
-rw-r--r--hypervideo_dl/extractor/appletrailers.py2
-rw-r--r--hypervideo_dl/extractor/archiveorg.py427
-rw-r--r--hypervideo_dl/extractor/arcpublishing.py11
-rw-r--r--hypervideo_dl/extractor/ard.py181
-rw-r--r--hypervideo_dl/extractor/arkena.py6
-rw-r--r--hypervideo_dl/extractor/arte.py15
-rw-r--r--hypervideo_dl/extractor/asiancrush.py4
-rw-r--r--hypervideo_dl/extractor/atresplayer.py13
-rw-r--r--hypervideo_dl/extractor/atvat.py125
-rw-r--r--hypervideo_dl/extractor/audius.py274
-rw-r--r--hypervideo_dl/extractor/awaan.py7
-rw-r--r--hypervideo_dl/extractor/azmedien.py3
-rw-r--r--hypervideo_dl/extractor/baidu.py3
-rw-r--r--hypervideo_dl/extractor/bandcamp.py48
-rw-r--r--hypervideo_dl/extractor/bannedvideo.py158
-rw-r--r--hypervideo_dl/extractor/bbc.py9
-rw-r--r--hypervideo_dl/extractor/beatport.py4
-rw-r--r--hypervideo_dl/extractor/beeg.py4
-rw-r--r--hypervideo_dl/extractor/behindkink.py3
-rw-r--r--hypervideo_dl/extractor/bellmedia.py3
-rw-r--r--hypervideo_dl/extractor/bet.py2
-rw-r--r--hypervideo_dl/extractor/bilibili.py468
-rw-r--r--hypervideo_dl/extractor/bitchute.py32
-rw-r--r--hypervideo_dl/extractor/bitwave.py61
-rw-r--r--hypervideo_dl/extractor/blackboardcollaborate.py67
-rw-r--r--hypervideo_dl/extractor/blinkx.py86
-rw-r--r--hypervideo_dl/extractor/bokecc.py5
-rw-r--r--hypervideo_dl/extractor/bongacams.py3
-rw-r--r--hypervideo_dl/extractor/box.py3
-rw-r--r--hypervideo_dl/extractor/bpb.py2
-rw-r--r--hypervideo_dl/extractor/br.py5
-rw-r--r--hypervideo_dl/extractor/bravotv.py38
-rw-r--r--hypervideo_dl/extractor/breakcom.py3
-rw-r--r--hypervideo_dl/extractor/brightcove.py33
-rw-r--r--hypervideo_dl/extractor/byutv.py17
-rw-r--r--hypervideo_dl/extractor/c56.py3
-rw-r--r--hypervideo_dl/extractor/cam4.py32
-rw-r--r--hypervideo_dl/extractor/cammodels.py2
-rw-r--r--hypervideo_dl/extractor/canalplus.py5
-rw-r--r--hypervideo_dl/extractor/canvas.py83
-rw-r--r--hypervideo_dl/extractor/cbc.py476
-rw-r--r--hypervideo_dl/extractor/cbs.py134
-rw-r--r--hypervideo_dl/extractor/cbsinteractive.py3
-rw-r--r--hypervideo_dl/extractor/cbssports.py3
-rw-r--r--hypervideo_dl/extractor/ccma.py3
-rw-r--r--hypervideo_dl/extractor/cctv.py2
-rw-r--r--hypervideo_dl/extractor/cda.py44
-rw-r--r--hypervideo_dl/extractor/ceskatelevize.py5
-rw-r--r--hypervideo_dl/extractor/cgtn.py64
-rw-r--r--hypervideo_dl/extractor/channel9.py8
-rw-r--r--hypervideo_dl/extractor/chilloutzone.py3
-rw-r--r--hypervideo_dl/extractor/chingari.py209
-rw-r--r--hypervideo_dl/extractor/cinemax.py3
-rw-r--r--hypervideo_dl/extractor/ciscolive.py7
-rw-r--r--hypervideo_dl/extractor/ciscowebex.py90
-rw-r--r--hypervideo_dl/extractor/cjsw.py3
-rw-r--r--hypervideo_dl/extractor/clyp.py7
-rw-r--r--hypervideo_dl/extractor/cmt.py6
-rw-r--r--hypervideo_dl/extractor/cnbc.py3
-rw-r--r--hypervideo_dl/extractor/cnn.py3
-rw-r--r--hypervideo_dl/extractor/comedycentral.py5
-rw-r--r--hypervideo_dl/extractor/common.py1653
-rw-r--r--hypervideo_dl/extractor/commonmistakes.py4
-rw-r--r--hypervideo_dl/extractor/commonprotocols.py14
-rw-r--r--hypervideo_dl/extractor/condenast.py2
-rw-r--r--hypervideo_dl/extractor/corus.py5
-rw-r--r--hypervideo_dl/extractor/coub.py6
-rw-r--r--hypervideo_dl/extractor/crackle.py279
-rw-r--r--hypervideo_dl/extractor/crunchyroll.py133
-rw-r--r--hypervideo_dl/extractor/cultureunplugged.py3
-rw-r--r--hypervideo_dl/extractor/curiositystream.py22
-rw-r--r--hypervideo_dl/extractor/dailymotion.py10
-rw-r--r--hypervideo_dl/extractor/damtomo.py113
-rw-r--r--hypervideo_dl/extractor/daum.py9
-rw-r--r--hypervideo_dl/extractor/dbtv.py2
-rw-r--r--hypervideo_dl/extractor/deezer.py127
-rw-r--r--hypervideo_dl/extractor/dfb.py3
-rw-r--r--hypervideo_dl/extractor/digiteka.py2
-rw-r--r--hypervideo_dl/extractor/discovery.py3
-rw-r--r--hypervideo_dl/extractor/discoverynetworks.py3
-rw-r--r--hypervideo_dl/extractor/discoveryplusindia.py98
-rw-r--r--hypervideo_dl/extractor/disney.py5
-rw-r--r--hypervideo_dl/extractor/dispeak.py2
-rw-r--r--hypervideo_dl/extractor/dlive.py3
-rw-r--r--hypervideo_dl/extractor/doodstream.py71
-rw-r--r--hypervideo_dl/extractor/dplay.py112
-rw-r--r--hypervideo_dl/extractor/drbonanza.py3
-rw-r--r--hypervideo_dl/extractor/dropbox.py4
-rw-r--r--hypervideo_dl/extractor/drtuber.py2
-rw-r--r--hypervideo_dl/extractor/drtv.py4
-rw-r--r--hypervideo_dl/extractor/dtube.py3
-rw-r--r--hypervideo_dl/extractor/duboku.py242
-rw-r--r--hypervideo_dl/extractor/dw.py14
-rw-r--r--hypervideo_dl/extractor/eagleplatform.py2
-rw-r--r--hypervideo_dl/extractor/egghead.py19
-rw-r--r--hypervideo_dl/extractor/eighttracks.py20
-rw-r--r--hypervideo_dl/extractor/einthusan.py3
-rw-r--r--hypervideo_dl/extractor/elonet.py89
-rw-r--r--hypervideo_dl/extractor/epicon.py119
-rw-r--r--hypervideo_dl/extractor/eporner.py3
-rw-r--r--hypervideo_dl/extractor/eroprofile.py39
-rw-r--r--hypervideo_dl/extractor/espn.py2
-rw-r--r--hypervideo_dl/extractor/europa.py4
-rw-r--r--hypervideo_dl/extractor/euscreen.py64
-rw-r--r--hypervideo_dl/extractor/everyonesmixtape.py76
-rw-r--r--hypervideo_dl/extractor/extractors.py320
-rw-r--r--hypervideo_dl/extractor/facebook.py151
-rw-r--r--hypervideo_dl/extractor/fancode.py187
-rw-r--r--hypervideo_dl/extractor/fc2.py3
-rw-r--r--hypervideo_dl/extractor/filmmodu.py74
-rw-r--r--hypervideo_dl/extractor/filmweb.py3
-rw-r--r--hypervideo_dl/extractor/firsttv.py2
-rw-r--r--hypervideo_dl/extractor/fivetv.py3
-rw-r--r--hypervideo_dl/extractor/flickr.py2
-rw-r--r--hypervideo_dl/extractor/fourtube.py4
-rw-r--r--hypervideo_dl/extractor/foxnews.py2
-rw-r--r--hypervideo_dl/extractor/francetv.py385
-rw-r--r--hypervideo_dl/extractor/frontendmasters.py2
-rw-r--r--hypervideo_dl/extractor/funimation.py408
-rw-r--r--hypervideo_dl/extractor/funk.py3
-rw-r--r--hypervideo_dl/extractor/fxnetworks.py77
-rw-r--r--hypervideo_dl/extractor/gab.py64
-rw-r--r--hypervideo_dl/extractor/gaia.py3
-rw-r--r--hypervideo_dl/extractor/gamestar.py3
-rw-r--r--hypervideo_dl/extractor/gaskrank.py2
-rw-r--r--hypervideo_dl/extractor/gazeta.py3
-rw-r--r--hypervideo_dl/extractor/gdcvault.py2
-rw-r--r--hypervideo_dl/extractor/gedidigital.py57
-rw-r--r--hypervideo_dl/extractor/generic.py320
-rw-r--r--hypervideo_dl/extractor/gettr.py110
-rw-r--r--hypervideo_dl/extractor/giantbomb.py3
-rw-r--r--hypervideo_dl/extractor/globo.py157
-rw-r--r--hypervideo_dl/extractor/go.py19
-rw-r--r--hypervideo_dl/extractor/godtube.py3
-rw-r--r--hypervideo_dl/extractor/googledrive.py4
-rw-r--r--hypervideo_dl/extractor/googlepodcasts.py3
-rw-r--r--hypervideo_dl/extractor/googlesearch.py28
-rw-r--r--hypervideo_dl/extractor/gopro.py110
-rw-r--r--hypervideo_dl/extractor/gotostage.py73
-rw-r--r--hypervideo_dl/extractor/gronkh.py43
-rw-r--r--hypervideo_dl/extractor/hearthisat.py90
-rw-r--r--hypervideo_dl/extractor/hidive.py100
-rw-r--r--hypervideo_dl/extractor/hotstar.py303
-rw-r--r--hypervideo_dl/extractor/hrfensehen.py102
-rw-r--r--hypervideo_dl/extractor/hrti.py5
-rw-r--r--hypervideo_dl/extractor/hungama.py58
-rw-r--r--hypervideo_dl/extractor/ichinanalive.py167
-rw-r--r--hypervideo_dl/extractor/ign.py2
-rw-r--r--hypervideo_dl/extractor/imggaming.py5
-rw-r--r--hypervideo_dl/extractor/imgur.py2
-rw-r--r--hypervideo_dl/extractor/instagram.py35
-rw-r--r--hypervideo_dl/extractor/internetvideoarchive.py7
-rw-r--r--hypervideo_dl/extractor/iprima.py2
-rw-r--r--hypervideo_dl/extractor/iqiyi.py2
-rw-r--r--hypervideo_dl/extractor/itv.py173
-rw-r--r--hypervideo_dl/extractor/ivi.py33
-rw-r--r--hypervideo_dl/extractor/ivideon.py3
-rw-r--r--hypervideo_dl/extractor/iwara.py24
-rw-r--r--hypervideo_dl/extractor/jeuxvideo.py3
-rw-r--r--hypervideo_dl/extractor/joj.py216
-rw-r--r--hypervideo_dl/extractor/jove.py3
-rw-r--r--hypervideo_dl/extractor/jwplatform.py11
-rw-r--r--hypervideo_dl/extractor/kakao.py121
-rw-r--r--hypervideo_dl/extractor/kaltura.py4
-rw-r--r--hypervideo_dl/extractor/kanalplay.py96
-rw-r--r--hypervideo_dl/extractor/keezmovies.py4
-rw-r--r--hypervideo_dl/extractor/kinja.py2
-rw-r--r--hypervideo_dl/extractor/koo.py116
-rw-r--r--hypervideo_dl/extractor/kusi.py3
-rw-r--r--hypervideo_dl/extractor/kuwo.py2
-rw-r--r--hypervideo_dl/extractor/la7.py174
-rw-r--r--hypervideo_dl/extractor/lbry.py30
-rw-r--r--hypervideo_dl/extractor/lecturio.py4
-rw-r--r--hypervideo_dl/extractor/leeco.py2
-rw-r--r--hypervideo_dl/extractor/lego.py3
-rw-r--r--hypervideo_dl/extractor/libsyn.py3
-rw-r--r--hypervideo_dl/extractor/lifenews.py2
-rw-r--r--hypervideo_dl/extractor/limelight.py11
-rw-r--r--hypervideo_dl/extractor/line.py10
-rw-r--r--hypervideo_dl/extractor/linkedin.py32
-rw-r--r--hypervideo_dl/extractor/linuxacademy.py21
-rw-r--r--hypervideo_dl/extractor/litv.py2
-rw-r--r--hypervideo_dl/extractor/livestream.py8
-rw-r--r--hypervideo_dl/extractor/lnkgo.py3
-rw-r--r--hypervideo_dl/extractor/localnews8.py3
-rw-r--r--hypervideo_dl/extractor/lovehomeporn.py3
-rw-r--r--hypervideo_dl/extractor/lrt.py3
-rw-r--r--hypervideo_dl/extractor/lynda.py6
-rw-r--r--hypervideo_dl/extractor/magentamusik360.py61
-rw-r--r--hypervideo_dl/extractor/mailru.py25
-rw-r--r--hypervideo_dl/extractor/manoto.py138
-rw-r--r--hypervideo_dl/extractor/massengeschmacktv.py2
-rw-r--r--hypervideo_dl/extractor/mdr.py4
-rw-r--r--hypervideo_dl/extractor/medaltv.py4
-rw-r--r--hypervideo_dl/extractor/mediaite.py93
-rw-r--r--hypervideo_dl/extractor/mediaklikk.py104
-rw-r--r--hypervideo_dl/extractor/mediaset.py116
-rw-r--r--hypervideo_dl/extractor/mediasite.py69
-rw-r--r--hypervideo_dl/extractor/metacafe.py4
-rw-r--r--hypervideo_dl/extractor/metacritic.py2
-rw-r--r--hypervideo_dl/extractor/mgoon.py3
-rw-r--r--hypervideo_dl/extractor/microsoftvirtualacademy.py4
-rw-r--r--hypervideo_dl/extractor/mildom.py258
-rw-r--r--hypervideo_dl/extractor/minoto.py5
-rw-r--r--hypervideo_dl/extractor/mirrativ.py134
-rw-r--r--hypervideo_dl/extractor/mit.py2
-rw-r--r--hypervideo_dl/extractor/mixcloud.py7
-rw-r--r--hypervideo_dl/extractor/moevideo.py3
-rw-r--r--hypervideo_dl/extractor/mojvideo.py3
-rw-r--r--hypervideo_dl/extractor/morningstar.py3
-rw-r--r--hypervideo_dl/extractor/motherless.py30
-rw-r--r--hypervideo_dl/extractor/moviezine.py3
-rw-r--r--hypervideo_dl/extractor/msn.py4
-rw-r--r--hypervideo_dl/extractor/mtv.py188
-rw-r--r--hypervideo_dl/extractor/muenchentv.py2
-rw-r--r--hypervideo_dl/extractor/musescore.py67
-rw-r--r--hypervideo_dl/extractor/mxplayer.py222
-rw-r--r--hypervideo_dl/extractor/mychannels.py3
-rw-r--r--hypervideo_dl/extractor/myspace.py16
-rw-r--r--hypervideo_dl/extractor/myvideoge.py56
-rw-r--r--hypervideo_dl/extractor/n1.py136
-rw-r--r--hypervideo_dl/extractor/naver.py85
-rw-r--r--hypervideo_dl/extractor/nba.py13
-rw-r--r--hypervideo_dl/extractor/nbc.py161
-rw-r--r--hypervideo_dl/extractor/ndr.py179
-rw-r--r--hypervideo_dl/extractor/nebula.py238
-rw-r--r--hypervideo_dl/extractor/neteasemusic.py2
-rw-r--r--hypervideo_dl/extractor/netzkino.py50
-rw-r--r--hypervideo_dl/extractor/newgrounds.py217
-rw-r--r--hypervideo_dl/extractor/nexx.py2
-rw-r--r--hypervideo_dl/extractor/nfhsnetwork.py144
-rw-r--r--hypervideo_dl/extractor/nhk.py5
-rw-r--r--hypervideo_dl/extractor/nhl.py3
-rw-r--r--hypervideo_dl/extractor/nick.py85
-rw-r--r--hypervideo_dl/extractor/niconico.py660
-rw-r--r--hypervideo_dl/extractor/ninecninemedia.py9
-rw-r--r--hypervideo_dl/extractor/ninenow.py58
-rw-r--r--hypervideo_dl/extractor/nitter.py228
-rw-r--r--hypervideo_dl/extractor/noco.py235
-rw-r--r--hypervideo_dl/extractor/nova.py4
-rw-r--r--hypervideo_dl/extractor/novaplay.py63
-rw-r--r--hypervideo_dl/extractor/npo.py7
-rw-r--r--hypervideo_dl/extractor/nrk.py8
-rw-r--r--hypervideo_dl/extractor/ntvde.py2
-rw-r--r--hypervideo_dl/extractor/nuvid.py86
-rw-r--r--hypervideo_dl/extractor/nytimes.py10
-rw-r--r--hypervideo_dl/extractor/nzherald.py98
-rw-r--r--hypervideo_dl/extractor/odnoklassniki.py5
-rw-r--r--hypervideo_dl/extractor/olympics.py56
-rw-r--r--hypervideo_dl/extractor/on24.py91
-rw-r--r--hypervideo_dl/extractor/ondemandkorea.py38
-rw-r--r--hypervideo_dl/extractor/onet.py4
-rw-r--r--hypervideo_dl/extractor/ooyala.py5
-rw-r--r--hypervideo_dl/extractor/openload.py3
-rw-r--r--hypervideo_dl/extractor/openrec.py126
-rw-r--r--hypervideo_dl/extractor/ora.py2
-rw-r--r--hypervideo_dl/extractor/orf.py5
-rw-r--r--hypervideo_dl/extractor/packtpub.py5
-rw-r--r--hypervideo_dl/extractor/palcomp3.py7
-rw-r--r--hypervideo_dl/extractor/pandoratv.py7
-rw-r--r--hypervideo_dl/extractor/paramountplus.py145
-rw-r--r--hypervideo_dl/extractor/parliamentliveuk.py76
-rw-r--r--hypervideo_dl/extractor/parlview.py68
-rw-r--r--hypervideo_dl/extractor/patreon.py86
-rw-r--r--hypervideo_dl/extractor/pbs.py33
-rw-r--r--hypervideo_dl/extractor/peertube.py818
-rw-r--r--hypervideo_dl/extractor/peloton.py222
-rw-r--r--hypervideo_dl/extractor/performgroup.py3
-rw-r--r--hypervideo_dl/extractor/periscope.py8
-rw-r--r--hypervideo_dl/extractor/philharmoniedeparis.py2
-rw-r--r--hypervideo_dl/extractor/photobucket.py3
-rw-r--r--hypervideo_dl/extractor/piksel.py2
-rw-r--r--hypervideo_dl/extractor/pinterest.py6
-rw-r--r--hypervideo_dl/extractor/pladform.py4
-rw-r--r--hypervideo_dl/extractor/playfm.py3
-rw-r--r--hypervideo_dl/extractor/playplustv.py3
-rw-r--r--hypervideo_dl/extractor/playtvak.py2
-rw-r--r--hypervideo_dl/extractor/playwire.py3
-rw-r--r--hypervideo_dl/extractor/pluralsight.py9
-rw-r--r--hypervideo_dl/extractor/plutotv.py184
-rw-r--r--hypervideo_dl/extractor/podomatic.py3
-rw-r--r--hypervideo_dl/extractor/pokemon.py73
-rw-r--r--hypervideo_dl/extractor/polskieradio.py47
-rw-r--r--hypervideo_dl/extractor/popcorntimes.py3
-rw-r--r--hypervideo_dl/extractor/popcorntv.py3
-rw-r--r--hypervideo_dl/extractor/porncom.py2
-rw-r--r--hypervideo_dl/extractor/pornflip.py82
-rw-r--r--hypervideo_dl/extractor/pornhd.py3
-rw-r--r--hypervideo_dl/extractor/pornhub.py126
-rw-r--r--hypervideo_dl/extractor/pornovoisines.py3
-rw-r--r--hypervideo_dl/extractor/pornoxo.py3
-rw-r--r--hypervideo_dl/extractor/presstv.py3
-rw-r--r--hypervideo_dl/extractor/projectveritas.py55
-rw-r--r--hypervideo_dl/extractor/prosiebensat1.py4
-rw-r--r--hypervideo_dl/extractor/pyvideo.py2
-rw-r--r--hypervideo_dl/extractor/qqmusic.py2
-rw-r--r--hypervideo_dl/extractor/radiko.py234
-rw-r--r--hypervideo_dl/extractor/radiocanada.py3
-rw-r--r--hypervideo_dl/extractor/radiofrance.py4
-rw-r--r--hypervideo_dl/extractor/radlive.py179
-rw-r--r--hypervideo_dl/extractor/rai.py146
-rw-r--r--hypervideo_dl/extractor/raywenderlich.py2
-rw-r--r--hypervideo_dl/extractor/rbmaradio.py3
-rw-r--r--hypervideo_dl/extractor/rcs.py427
-rw-r--r--hypervideo_dl/extractor/rcti.py354
-rw-r--r--hypervideo_dl/extractor/redbulltv.py5
-rw-r--r--hypervideo_dl/extractor/reddit.py32
-rw-r--r--hypervideo_dl/extractor/redtube.py3
-rw-r--r--hypervideo_dl/extractor/rice.py2
-rw-r--r--hypervideo_dl/extractor/rmcdecouverte.py29
-rw-r--r--hypervideo_dl/extractor/roosterteeth.py26
-rw-r--r--hypervideo_dl/extractor/roxwel.py3
-rw-r--r--hypervideo_dl/extractor/rtbf.py4
-rw-r--r--hypervideo_dl/extractor/rtl2.py4
-rw-r--r--hypervideo_dl/extractor/rtp.py88
-rw-r--r--hypervideo_dl/extractor/rts.py2
-rw-r--r--hypervideo_dl/extractor/rtve.py3
-rw-r--r--hypervideo_dl/extractor/rumble.py47
-rw-r--r--hypervideo_dl/extractor/rutube.py11
-rw-r--r--hypervideo_dl/extractor/rutv.py8
-rw-r--r--hypervideo_dl/extractor/ruutu.py6
-rw-r--r--hypervideo_dl/extractor/safari.py11
-rw-r--r--hypervideo_dl/extractor/saitosan.py78
-rw-r--r--hypervideo_dl/extractor/sapo.py2
-rw-r--r--hypervideo_dl/extractor/savefrom.py3
-rw-r--r--hypervideo_dl/extractor/scrippsnetworks.py5
-rw-r--r--hypervideo_dl/extractor/seeker.py2
-rw-r--r--hypervideo_dl/extractor/senateisvp.py2
-rw-r--r--hypervideo_dl/extractor/sendtonews.py4
-rw-r--r--hypervideo_dl/extractor/sevenplus.py48
-rw-r--r--hypervideo_dl/extractor/seznamzpravy.py4
-rw-r--r--hypervideo_dl/extractor/shahid.py6
-rw-r--r--hypervideo_dl/extractor/shemaroome.py104
-rw-r--r--hypervideo_dl/extractor/simplecast.py2
-rw-r--r--hypervideo_dl/extractor/sina.py9
-rw-r--r--hypervideo_dl/extractor/sixplay.py8
-rw-r--r--hypervideo_dl/extractor/skynewsau.py46
-rw-r--r--hypervideo_dl/extractor/slideshare.py3
-rw-r--r--hypervideo_dl/extractor/snotr.py3
-rw-r--r--hypervideo_dl/extractor/sohu.py4
-rw-r--r--hypervideo_dl/extractor/sonyliv.py72
-rw-r--r--hypervideo_dl/extractor/soundcloud.py279
-rw-r--r--hypervideo_dl/extractor/soundgasm.py2
-rw-r--r--hypervideo_dl/extractor/southpark.py64
-rw-r--r--hypervideo_dl/extractor/sovietscloset.py221
-rw-r--r--hypervideo_dl/extractor/spankbang.py32
-rw-r--r--hypervideo_dl/extractor/spankwire.py2
-rw-r--r--hypervideo_dl/extractor/spiegeltv.py17
-rw-r--r--hypervideo_dl/extractor/sport5.py3
-rw-r--r--hypervideo_dl/extractor/sportdeutschland.py11
-rw-r--r--hypervideo_dl/extractor/springboardplatform.py2
-rw-r--r--hypervideo_dl/extractor/srgssr.py19
-rw-r--r--hypervideo_dl/extractor/stanfordoc.py2
-rw-r--r--hypervideo_dl/extractor/startv.py103
-rw-r--r--hypervideo_dl/extractor/steam.py4
-rw-r--r--hypervideo_dl/extractor/streamable.py8
-rw-r--r--hypervideo_dl/extractor/streamanity.py51
-rw-r--r--hypervideo_dl/extractor/streamcloud.py4
-rw-r--r--hypervideo_dl/extractor/stv.py3
-rw-r--r--hypervideo_dl/extractor/svt.py10
-rw-r--r--hypervideo_dl/extractor/tagesschau.py4
-rw-r--r--hypervideo_dl/extractor/tastytrade.py43
-rw-r--r--hypervideo_dl/extractor/tbs.py11
-rw-r--r--hypervideo_dl/extractor/teachable.py4
-rw-r--r--hypervideo_dl/extractor/teachertube.py2
-rw-r--r--hypervideo_dl/extractor/techtalks.py2
-rw-r--r--hypervideo_dl/extractor/tele13.py2
-rw-r--r--hypervideo_dl/extractor/tele5.py4
-rw-r--r--hypervideo_dl/extractor/telemb.py4
-rw-r--r--hypervideo_dl/extractor/telemundo.py58
-rw-r--r--hypervideo_dl/extractor/tennistv.py10
-rw-r--r--hypervideo_dl/extractor/tenplay.py88
-rw-r--r--hypervideo_dl/extractor/testurl.py2
-rw-r--r--hypervideo_dl/extractor/tf1.py3
-rw-r--r--hypervideo_dl/extractor/theplatform.py13
-rw-r--r--hypervideo_dl/extractor/theta.py87
-rw-r--r--hypervideo_dl/extractor/theweatherchannel.py3
-rw-r--r--hypervideo_dl/extractor/thisav.py3
-rw-r--r--hypervideo_dl/extractor/threeqsdn.py24
-rw-r--r--hypervideo_dl/extractor/tiktok.py602
-rw-r--r--hypervideo_dl/extractor/tinypic.py2
-rw-r--r--hypervideo_dl/extractor/tmz.py240
-rw-r--r--hypervideo_dl/extractor/tnaflix.py2
-rw-r--r--hypervideo_dl/extractor/toggle.py10
-rw-r--r--hypervideo_dl/extractor/tokentube.py152
-rw-r--r--hypervideo_dl/extractor/toongoggles.py3
-rw-r--r--hypervideo_dl/extractor/toutv.py2
-rw-r--r--hypervideo_dl/extractor/traileraddict.py2
-rw-r--r--hypervideo_dl/extractor/trovo.py73
-rw-r--r--hypervideo_dl/extractor/trutv.py3
-rw-r--r--hypervideo_dl/extractor/tubitv.py43
-rw-r--r--hypervideo_dl/extractor/tumblr.py3
-rw-r--r--hypervideo_dl/extractor/turbo.py2
-rw-r--r--hypervideo_dl/extractor/turner.py7
-rw-r--r--hypervideo_dl/extractor/tv2.py136
-rw-r--r--hypervideo_dl/extractor/tv2hu.py132
-rw-r--r--hypervideo_dl/extractor/tv4.py31
-rw-r--r--hypervideo_dl/extractor/tv5mondeplus.py40
-rw-r--r--hypervideo_dl/extractor/tv5unis.py3
-rw-r--r--hypervideo_dl/extractor/tver.py3
-rw-r--r--hypervideo_dl/extractor/tvigle.py3
-rw-r--r--hypervideo_dl/extractor/tvland.py7
-rw-r--r--hypervideo_dl/extractor/tvnow.py172
-rw-r--r--hypervideo_dl/extractor/tvp.py2
-rw-r--r--hypervideo_dl/extractor/tvplay.py48
-rw-r--r--hypervideo_dl/extractor/twentyfourvideo.py3
-rw-r--r--hypervideo_dl/extractor/twentythreevideo.py3
-rw-r--r--hypervideo_dl/extractor/twitcasting.py111
-rw-r--r--hypervideo_dl/extractor/twitch.py10
-rw-r--r--hypervideo_dl/extractor/twitter.py46
-rw-r--r--hypervideo_dl/extractor/udemy.py2
-rw-r--r--hypervideo_dl/extractor/ukcolumn.py72
-rw-r--r--hypervideo_dl/extractor/umg.py10
-rw-r--r--hypervideo_dl/extractor/unistra.py2
-rw-r--r--hypervideo_dl/extractor/uol.py1
-rw-r--r--hypervideo_dl/extractor/uplynk.py5
-rw-r--r--hypervideo_dl/extractor/urort.py2
-rw-r--r--hypervideo_dl/extractor/urplay.py17
-rw-r--r--hypervideo_dl/extractor/usanetwork.py2
-rw-r--r--hypervideo_dl/extractor/ustream.py4
-rw-r--r--hypervideo_dl/extractor/ustudio.py5
-rw-r--r--hypervideo_dl/extractor/utreon.py85
-rw-r--r--hypervideo_dl/extractor/varzesh3.py7
-rw-r--r--hypervideo_dl/extractor/veo.py74
-rw-r--r--hypervideo_dl/extractor/vesti.py2
-rw-r--r--hypervideo_dl/extractor/vevo.py140
-rw-r--r--hypervideo_dl/extractor/vgtv.py2
-rw-r--r--hypervideo_dl/extractor/vh1.py27
-rw-r--r--hypervideo_dl/extractor/vice.py6
-rw-r--r--hypervideo_dl/extractor/viddler.py3
-rw-r--r--hypervideo_dl/extractor/videa.py54
-rw-r--r--hypervideo_dl/extractor/videomore.py12
-rw-r--r--hypervideo_dl/extractor/vidio.py234
-rw-r--r--hypervideo_dl/extractor/vidzi.py68
-rw-r--r--hypervideo_dl/extractor/vier.py4
-rw-r--r--hypervideo_dl/extractor/viewlift.py6
-rw-r--r--hypervideo_dl/extractor/viidea.py2
-rw-r--r--hypervideo_dl/extractor/viki.py328
-rw-r--r--hypervideo_dl/extractor/vimeo.py368
-rw-r--r--hypervideo_dl/extractor/vine.py4
-rw-r--r--hypervideo_dl/extractor/viu.py151
-rw-r--r--hypervideo_dl/extractor/vk.py4
-rw-r--r--hypervideo_dl/extractor/vlive.py77
-rw-r--r--hypervideo_dl/extractor/voicy.py147
-rw-r--r--hypervideo_dl/extractor/voot.py58
-rw-r--r--hypervideo_dl/extractor/vrt.py11
-rw-r--r--hypervideo_dl/extractor/vrv.py3
-rw-r--r--hypervideo_dl/extractor/vube.py10
-rw-r--r--hypervideo_dl/extractor/vupload.py51
-rw-r--r--hypervideo_dl/extractor/vvvvid.py4
-rw-r--r--hypervideo_dl/extractor/vzaar.py2
-rw-r--r--hypervideo_dl/extractor/wakanim.py14
-rw-r--r--hypervideo_dl/extractor/walla.py2
-rw-r--r--hypervideo_dl/extractor/wat.py16
-rw-r--r--hypervideo_dl/extractor/watchbox.py3
-rw-r--r--hypervideo_dl/extractor/watchindianporn.py2
-rw-r--r--hypervideo_dl/extractor/wdr.py17
-rw-r--r--hypervideo_dl/extractor/whowatch.py99
-rw-r--r--hypervideo_dl/extractor/wimtv.py163
-rw-r--r--hypervideo_dl/extractor/wistia.py2
-rw-r--r--hypervideo_dl/extractor/xboxclips.py7
-rw-r--r--hypervideo_dl/extractor/xfileshare.py2
-rw-r--r--hypervideo_dl/extractor/xhamster.py9
-rw-r--r--hypervideo_dl/extractor/ximalaya.py2
-rw-r--r--hypervideo_dl/extractor/xnxx.py2
-rw-r--r--hypervideo_dl/extractor/xstream.py4
-rw-r--r--hypervideo_dl/extractor/xtube.py18
-rw-r--r--hypervideo_dl/extractor/xxxymovies.py3
-rw-r--r--hypervideo_dl/extractor/yahoo.py63
-rw-r--r--hypervideo_dl/extractor/yandexdisk.py3
-rw-r--r--hypervideo_dl/extractor/yandexmusic.py13
-rw-r--r--hypervideo_dl/extractor/yandexvideo.py88
-rw-r--r--hypervideo_dl/extractor/youjizz.py3
-rw-r--r--hypervideo_dl/extractor/youku.py2
-rw-r--r--hypervideo_dl/extractor/youporn.py2
-rw-r--r--hypervideo_dl/extractor/youtube.py3600
-rw-r--r--hypervideo_dl/extractor/zapiks.py2
-rw-r--r--hypervideo_dl/extractor/zaq1.py101
-rw-r--r--hypervideo_dl/extractor/zattoo.py6
-rw-r--r--hypervideo_dl/extractor/zdf.py42
-rw-r--r--hypervideo_dl/extractor/zee5.py244
-rw-r--r--hypervideo_dl/extractor/zingmp3.py5
-rw-r--r--hypervideo_dl/extractor/zoom.py15
-rw-r--r--hypervideo_dl/extractor/zype.py7
507 files changed, 24726 insertions, 6247 deletions
diff --git a/hypervideo_dl/extractor/__init__.py b/hypervideo_dl/extractor/__init__.py
index 18d8dbc..198c4ae 100644
--- a/hypervideo_dl/extractor/__init__.py
+++ b/hypervideo_dl/extractor/__init__.py
@@ -1,13 +1,17 @@
from __future__ import unicode_literals
+from ..utils import load_plugins
+
try:
from .lazy_extractors import *
from .lazy_extractors import _ALL_CLASSES
_LAZY_LOADER = True
+ _PLUGIN_CLASSES = {}
except ImportError:
_LAZY_LOADER = False
- from .extractors import *
+if not _LAZY_LOADER:
+ from .extractors import *
_ALL_CLASSES = [
klass
for name, klass in globals().items()
@@ -15,6 +19,9 @@ except ImportError:
]
_ALL_CLASSES.append(GenericIE)
+ _PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals())
+ _ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES
+
def gen_extractor_classes():
""" Return a list of supported extractors.
diff --git a/hypervideo_dl/extractor/abc.py b/hypervideo_dl/extractor/abc.py
index 6637f4f..3e20216 100644
--- a/hypervideo_dl/extractor/abc.py
+++ b/hypervideo_dl/extractor/abc.py
@@ -12,6 +12,7 @@ from ..utils import (
js_to_json,
int_or_none,
parse_iso8601,
+ str_or_none,
try_get,
unescapeHTML,
update_url_query,
@@ -20,7 +21,7 @@ from ..utils import (
class ABCIE(InfoExtractor):
IE_NAME = 'abc.net.au'
- _VALID_URL = r'https?://(?:www\.)?abc\.net\.au/news/(?:[^/]+/){1,2}(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?abc\.net\.au/(?:news|btn)/(?:[^/]+/){1,4}(?P<id>\d{5,})'
_TESTS = [{
'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334',
@@ -34,7 +35,7 @@ class ABCIE(InfoExtractor):
'skip': 'this video has expired',
}, {
'url': 'http://www.abc.net.au/news/2015-08-17/warren-entsch-introduces-same-sex-marriage-bill/6702326',
- 'md5': 'db2a5369238b51f9811ad815b69dc086',
+ 'md5': '4ebd61bdc82d9a8b722f64f1f4b4d121',
'info_dict': {
'id': 'NvqvPeNZsHU',
'ext': 'mp4',
@@ -58,39 +59,102 @@ class ABCIE(InfoExtractor):
}, {
'url': 'http://www.abc.net.au/news/2015-10-19/6866214',
'only_matching': True,
+ }, {
+ 'url': 'https://www.abc.net.au/btn/classroom/wwi-centenary/10527914',
+ 'info_dict': {
+ 'id': '10527914',
+ 'ext': 'mp4',
+ 'title': 'WWI Centenary',
+ 'description': 'md5:c2379ec0ca84072e86b446e536954546',
+ }
+ }, {
+ 'url': 'https://www.abc.net.au/news/programs/the-world/2020-06-10/black-lives-matter-protests-spawn-support-for/12342074',
+ 'info_dict': {
+ 'id': '12342074',
+ 'ext': 'mp4',
+ 'title': 'Black Lives Matter protests spawn support for Papuans in Indonesia',
+ 'description': 'md5:2961a17dc53abc558589ccd0fb8edd6f',
+ }
+ }, {
+ 'url': 'https://www.abc.net.au/btn/newsbreak/btn-newsbreak-20200814/12560476',
+ 'info_dict': {
+ 'id': 'tDL8Ld4dK_8',
+ 'ext': 'mp4',
+ 'title': 'Fortnite Banned From Apple and Google App Stores',
+ 'description': 'md5:a6df3f36ce8f816b74af4bd6462f5651',
+ 'upload_date': '20200813',
+ 'uploader': 'Behind the News',
+ 'uploader_id': 'behindthenews',
+ }
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- mobj = re.search(
- r'inline(?P<type>Video|Audio|YouTube)Data\.push\((?P<json_data>[^)]+)\);',
- webpage)
+ mobj = re.search(r'<a\s+href="(?P<url>[^"]+)"\s+data-duration="\d+"\s+title="Download audio directly">', webpage)
+ if mobj:
+ urls_info = mobj.groupdict()
+ youtube = False
+ video = False
+ else:
+ mobj = re.search(r'<a href="(?P<url>http://www\.youtube\.com/watch\?v=[^"]+)"><span><strong>External Link:</strong>',
+ webpage)
+ if mobj is None:
+ mobj = re.search(r'<iframe width="100%" src="(?P<url>//www\.youtube-nocookie\.com/embed/[^?"]+)', webpage)
+ if mobj:
+ urls_info = mobj.groupdict()
+ youtube = True
+ video = True
+
if mobj is None:
- expired = self._html_search_regex(r'(?s)class="expired-(?:video|audio)".+?<span>(.+?)</span>', webpage, 'expired', None)
- if expired:
- raise ExtractorError('%s said: %s' % (self.IE_NAME, expired), expected=True)
- raise ExtractorError('Unable to extract video urls')
+ mobj = re.search(r'(?P<type>)"sources": (?P<json_data>\[[^\]]+\]),', webpage)
+ if mobj is None:
+ mobj = re.search(
+ r'inline(?P<type>Video|Audio|YouTube)Data\.push\((?P<json_data>[^)]+)\);',
+ webpage)
+ if mobj is None:
+ expired = self._html_search_regex(r'(?s)class="expired-(?:video|audio)".+?<span>(.+?)</span>', webpage, 'expired', None)
+ if expired:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, expired), expected=True)
+ raise ExtractorError('Unable to extract video urls')
- urls_info = self._parse_json(
- mobj.group('json_data'), video_id, transform_source=js_to_json)
+ urls_info = self._parse_json(
+ mobj.group('json_data'), video_id, transform_source=js_to_json)
+ youtube = mobj.group('type') == 'YouTube'
+ video = mobj.group('type') == 'Video' or urls_info[0]['contentType'] == 'video/mp4'
if not isinstance(urls_info, list):
urls_info = [urls_info]
- if mobj.group('type') == 'YouTube':
+ if youtube:
return self.playlist_result([
self.url_result(url_info['url']) for url_info in urls_info])
- formats = [{
- 'url': url_info['url'],
- 'vcodec': url_info.get('codec') if mobj.group('type') == 'Video' else 'none',
- 'width': int_or_none(url_info.get('width')),
- 'height': int_or_none(url_info.get('height')),
- 'tbr': int_or_none(url_info.get('bitrate')),
- 'filesize': int_or_none(url_info.get('filesize')),
- } for url_info in urls_info]
+ formats = []
+ for url_info in urls_info:
+ height = int_or_none(url_info.get('height'))
+ bitrate = int_or_none(url_info.get('bitrate'))
+ width = int_or_none(url_info.get('width'))
+ format_id = None
+ mobj = re.search(r'_(?:(?P<height>\d+)|(?P<bitrate>\d+)k)\.mp4$', url_info['url'])
+ if mobj:
+ height_from_url = mobj.group('height')
+ if height_from_url:
+ height = height or int_or_none(height_from_url)
+ width = width or int_or_none(url_info.get('label'))
+ else:
+ bitrate = bitrate or int_or_none(mobj.group('bitrate'))
+ format_id = str_or_none(url_info.get('label'))
+ formats.append({
+ 'url': url_info['url'],
+ 'vcodec': url_info.get('codec') if video else 'none',
+ 'width': width,
+ 'height': height,
+ 'tbr': bitrate,
+ 'filesize': int_or_none(url_info.get('filesize')),
+ 'format_id': format_id
+ })
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/abcnews.py b/hypervideo_dl/extractor/abcnews.py
index 908c833..296b8ce 100644
--- a/hypervideo_dl/extractor/abcnews.py
+++ b/hypervideo_dl/extractor/abcnews.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .amp import AMPIE
from .common import InfoExtractor
@@ -59,7 +58,7 @@ class AbcNewsVideoIE(AMPIE):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
display_id = mobj.group('display_id')
video_id = mobj.group('id')
info_dict = self._extract_feed_info(
diff --git a/hypervideo_dl/extractor/abcotvs.py b/hypervideo_dl/extractor/abcotvs.py
index 0bc69a6..5bff466 100644
--- a/hypervideo_dl/extractor/abcotvs.py
+++ b/hypervideo_dl/extractor/abcotvs.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import compat_str
@@ -55,7 +54,7 @@ class ABCOTVSIE(InfoExtractor):
}
def _real_extract(self, url):
- site, display_id, video_id = re.match(self._VALID_URL, url).groups()
+ site, display_id, video_id = self._match_valid_url(url).groups()
display_id = display_id or video_id
station = self._SITE_MAP[site]
diff --git a/hypervideo_dl/extractor/acast.py b/hypervideo_dl/extractor/acast.py
index b9355a2..63587c5 100644
--- a/hypervideo_dl/extractor/acast.py
+++ b/hypervideo_dl/extractor/acast.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -80,7 +79,7 @@ class ACastIE(ACastBaseIE):
}]
def _real_extract(self, url):
- channel, display_id = re.match(self._VALID_URL, url).groups()
+ channel, display_id = self._match_valid_url(url).groups()
episode = self._call_api(
'%s/episodes/%s' % (channel, display_id),
display_id, {'showInfo': 'true'})
diff --git a/hypervideo_dl/extractor/adobepass.py b/hypervideo_dl/extractor/adobepass.py
index 38dca1b..9378c33 100644
--- a/hypervideo_dl/extractor/adobepass.py
+++ b/hypervideo_dl/extractor/adobepass.py
@@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
+import json
import re
import time
import xml.etree.ElementTree as etree
@@ -9,6 +10,7 @@ from .common import InfoExtractor
from ..compat import (
compat_kwargs,
compat_urlparse,
+ compat_getpass
)
from ..utils import (
unescapeHTML,
@@ -35,6 +37,11 @@ MSO_INFO = {
'username_field': 'email',
'password_field': 'loginpassword',
},
+ 'RCN': {
+ 'name': 'RCN',
+ 'username_field': 'UserName',
+ 'password_field': 'UserPassword',
+ },
'Rogers': {
'name': 'Rogers',
'username_field': 'UserName',
@@ -60,11 +67,25 @@ MSO_INFO = {
'username_field': 'IDToken1',
'password_field': 'IDToken2',
},
+ 'Spectrum': {
+ 'name': 'Spectrum',
+ 'username_field': 'IDToken1',
+ 'password_field': 'IDToken2',
+ },
+ 'Philo': {
+ 'name': 'Philo',
+ 'username_field': 'ident'
+ },
'Verizon': {
'name': 'Verizon FiOS',
'username_field': 'IDToken1',
'password_field': 'IDToken2',
},
+ 'Cablevision': {
+ 'name': 'Optimum/Cablevision',
+ 'username_field': 'j_username',
+ 'password_field': 'j_password',
+ },
'thr030': {
'name': '3 Rivers Communications'
},
@@ -1319,6 +1340,11 @@ MSO_INFO = {
'cou060': {
'name': 'Zito Media'
},
+ 'slingtv': {
+ 'name': 'Sling TV',
+ 'username_field': 'username',
+ 'password_field': 'password',
+ },
}
@@ -1409,7 +1435,7 @@ class AdobePassIE(InfoExtractor):
authn_token = None
if not authn_token:
# TODO add support for other TV Providers
- mso_id = self._downloader.params.get('ap_mso')
+ mso_id = self.get_param('ap_mso')
if not mso_id:
raise_mvpd_required()
username, password = self._get_login_info('ap_username', 'ap_password', mso_id)
@@ -1438,6 +1464,13 @@ class AdobePassIE(InfoExtractor):
provider_redirect_page, 'oauth redirect')
self._download_webpage(
oauth_redirect_url, video_id, 'Confirming auto login')
+ elif 'automatically signed in with' in provider_redirect_page:
+                    # Seems like Comcast is rolling out a new way of automatically signing in customers
+ oauth_redirect_url = self._html_search_regex(
+ r'continue:\s*"(https://oauth.xfinity.com/oauth/authorize\?.+)"', provider_redirect_page,
+ 'oauth redirect (signed)')
+ # Just need to process the request. No useful data comes back
+ self._download_webpage(oauth_redirect_url, video_id, 'Confirming auto login')
else:
if '<form name="signin"' in provider_redirect_page:
provider_login_page_res = provider_redirect_page_res
@@ -1460,11 +1493,28 @@ class AdobePassIE(InfoExtractor):
mvpd_confirm_page, urlh = mvpd_confirm_page_res
if '<button class="submit" value="Resume">Resume</button>' in mvpd_confirm_page:
post_form(mvpd_confirm_page_res, 'Confirming Login')
+ elif mso_id == 'Philo':
+                # Philo has a unique authentication method
+ self._download_webpage(
+ 'https://idp.philo.com/auth/init/login_code', video_id, 'Requesting auth code', data=urlencode_postdata({
+ 'ident': username,
+ 'device': 'web',
+ 'send_confirm_link': False,
+ 'send_token': True
+ }))
+ philo_code = compat_getpass('Type auth code you have received [Return]: ')
+ self._download_webpage(
+ 'https://idp.philo.com/auth/update/login_code', video_id, 'Submitting token', data=urlencode_postdata({
+ 'token': philo_code
+ }))
+ mvpd_confirm_page_res = self._download_webpage_handle('https://idp.philo.com/idp/submit', video_id, 'Confirming Philo Login')
+ post_form(mvpd_confirm_page_res, 'Confirming Login')
elif mso_id == 'Verizon':
# In general, if you're connecting from a Verizon-assigned IP,
# you will not actually pass your credentials.
provider_redirect_page, urlh = provider_redirect_page_res
- if 'Please wait ...' in provider_redirect_page:
+ # From non-Verizon IP, still gave 'Please wait', but noticed N==Y; will need to try on Verizon IP
+ if 'Please wait ...' in provider_redirect_page and '\'N\'== "Y"' not in provider_redirect_page:
saml_redirect_url = self._html_search_regex(
r'self\.parent\.location=(["\'])(?P<url>.+?)\1',
provider_redirect_page,
@@ -1472,7 +1522,8 @@ class AdobePassIE(InfoExtractor):
saml_login_page = self._download_webpage(
saml_redirect_url, video_id,
'Downloading SAML Login Page')
- else:
+ elif 'Verizon FiOS - sign in' in provider_redirect_page:
+ # FXNetworks from non-Verizon IP
saml_login_page_res = post_form(
provider_redirect_page_res, 'Logging in', {
mso_info['username_field']: username,
@@ -1482,6 +1533,26 @@ class AdobePassIE(InfoExtractor):
if 'Please try again.' in saml_login_page:
raise ExtractorError(
'We\'re sorry, but either the User ID or Password entered is not correct.')
+ else:
+ # ABC from non-Verizon IP
+ saml_redirect_url = self._html_search_regex(
+ r'var\surl\s*=\s*(["\'])(?P<url>.+?)\1',
+ provider_redirect_page,
+ 'SAML Redirect URL', group='url')
+ saml_redirect_url = saml_redirect_url.replace(r'\/', '/')
+ saml_redirect_url = saml_redirect_url.replace(r'\-', '-')
+ saml_redirect_url = saml_redirect_url.replace(r'\x26', '&')
+ saml_login_page = self._download_webpage(
+ saml_redirect_url, video_id,
+ 'Downloading SAML Login Page')
+ saml_login_page, urlh = post_form(
+ [saml_login_page, saml_redirect_url], 'Logging in', {
+ mso_info['username_field']: username,
+ mso_info['password_field']: password,
+ })
+ if 'Please try again.' in saml_login_page:
+ raise ExtractorError(
+ 'Failed to login, incorrect User ID or Password.')
saml_login_url = self._search_regex(
r'xmlHttp\.open\("POST"\s*,\s*(["\'])(?P<url>.+?)\1',
saml_login_page, 'SAML Login URL', group='url')
@@ -1496,6 +1567,75 @@ class AdobePassIE(InfoExtractor):
}), headers={
'Content-Type': 'application/x-www-form-urlencoded'
})
+ elif mso_id == 'Spectrum':
+                # Spectrum's login form is dynamically loaded via JS so we need to hardcode the flow
+ # as a one-off implementation.
+ provider_redirect_page, urlh = provider_redirect_page_res
+ provider_login_page_res = post_form(
+ provider_redirect_page_res, self._DOWNLOADING_LOGIN_PAGE)
+ saml_login_page, urlh = provider_login_page_res
+ relay_state = self._search_regex(
+ r'RelayState\s*=\s*"(?P<relay>.+?)";',
+ saml_login_page, 'RelayState', group='relay')
+ saml_request = self._search_regex(
+ r'SAMLRequest\s*=\s*"(?P<saml_request>.+?)";',
+ saml_login_page, 'SAMLRequest', group='saml_request')
+ login_json = {
+ mso_info['username_field']: username,
+ mso_info['password_field']: password,
+ 'RelayState': relay_state,
+ 'SAMLRequest': saml_request,
+ }
+ saml_response_json = self._download_json(
+ 'https://tveauthn.spectrum.net/tveauthentication/api/v1/manualAuth', video_id,
+ 'Downloading SAML Response',
+ data=json.dumps(login_json).encode(),
+ headers={
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json',
+ })
+ self._download_webpage(
+ saml_response_json['SAMLRedirectUri'], video_id,
+ 'Confirming Login', data=urlencode_postdata({
+ 'SAMLResponse': saml_response_json['SAMLResponse'],
+ 'RelayState': relay_state,
+ }), headers={
+ 'Content-Type': 'application/x-www-form-urlencoded'
+ })
+ elif mso_id == 'slingtv':
+ # SlingTV has a meta-refresh based authentication, but also
+ # looks at the tab history to count the number of times the
+ # browser has been on a page
+
+ first_bookend_page, urlh = provider_redirect_page_res
+
+ hidden_data = self._hidden_inputs(first_bookend_page)
+ hidden_data['history'] = 1
+
+ provider_login_page_res = self._download_webpage_handle(
+ urlh.geturl(), video_id, 'Sending first bookend',
+ query=hidden_data)
+
+ provider_association_redirect, urlh = post_form(
+ provider_login_page_res, 'Logging in', {
+ mso_info['username_field']: username,
+ mso_info['password_field']: password
+ })
+
+ provider_refresh_redirect_url = extract_redirect_url(
+ provider_association_redirect, url=urlh.geturl())
+
+ last_bookend_page, urlh = self._download_webpage_handle(
+ provider_refresh_redirect_url, video_id,
+ 'Downloading Auth Association Redirect Page')
+ hidden_data = self._hidden_inputs(last_bookend_page)
+ hidden_data['history'] = 3
+
+ mvpd_confirm_page_res = self._download_webpage_handle(
+ urlh.geturl(), video_id, 'Sending final bookend',
+ query=hidden_data)
+
+ post_form(mvpd_confirm_page_res, 'Confirming Login')
else:
# Some providers (e.g. DIRECTV NOW) have another meta refresh
# based redirect that should be followed.
@@ -1508,10 +1648,13 @@ class AdobePassIE(InfoExtractor):
'Downloading Provider Redirect Page (meta refresh)')
provider_login_page_res = post_form(
provider_redirect_page_res, self._DOWNLOADING_LOGIN_PAGE)
- mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', {
+ form_data = {
mso_info.get('username_field', 'username'): username,
- mso_info.get('password_field', 'password'): password,
- })
+ mso_info.get('password_field', 'password'): password
+ }
+ if mso_id == 'Cablevision':
+ form_data['_eventId_proceed'] = ''
+ mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', form_data)
if mso_id != 'Rogers':
post_form(mvpd_confirm_page_res, 'Confirming Login')
diff --git a/hypervideo_dl/extractor/adobetv.py b/hypervideo_dl/extractor/adobetv.py
index 80060f0..12b8192 100644
--- a/hypervideo_dl/extractor/adobetv.py
+++ b/hypervideo_dl/extractor/adobetv.py
@@ -66,7 +66,7 @@ class AdobeTVBaseIE(InfoExtractor):
if original_filename.startswith('s3://') and not s3_extracted:
formats.append({
'format_id': 'original',
- 'preference': 1,
+ 'quality': 1,
'url': original_filename.replace('s3://', 'https://s3.amazonaws.com/'),
})
s3_extracted = True
@@ -132,7 +132,7 @@ class AdobeTVIE(AdobeTVBaseIE):
}
def _real_extract(self, url):
- language, show_urlname, urlname = re.match(self._VALID_URL, url).groups()
+ language, show_urlname, urlname = self._match_valid_url(url).groups()
if not language:
language = 'en'
@@ -178,7 +178,7 @@ class AdobeTVShowIE(AdobeTVPlaylistBaseIE):
_process_data = AdobeTVBaseIE._parse_video_data
def _real_extract(self, url):
- language, show_urlname = re.match(self._VALID_URL, url).groups()
+ language, show_urlname = self._match_valid_url(url).groups()
if not language:
language = 'en'
query = {
@@ -215,7 +215,7 @@ class AdobeTVChannelIE(AdobeTVPlaylistBaseIE):
show_data['url'], 'AdobeTVShow', str_or_none(show_data.get('id')))
def _real_extract(self, url):
- language, channel_urlname, category_urlname = re.match(self._VALID_URL, url).groups()
+ language, channel_urlname, category_urlname = self._match_valid_url(url).groups()
if not language:
language = 'en'
query = {
diff --git a/hypervideo_dl/extractor/adultswim.py b/hypervideo_dl/extractor/adultswim.py
index 8d1d9ac..c97cfc1 100644
--- a/hypervideo_dl/extractor/adultswim.py
+++ b/hypervideo_dl/extractor/adultswim.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import json
-import re
from .turner import TurnerBaseIE
from ..utils import (
@@ -89,7 +88,7 @@ class AdultSwimIE(TurnerBaseIE):
}]
def _real_extract(self, url):
- show_path, episode_path = re.match(self._VALID_URL, url).groups()
+ show_path, episode_path = self._match_valid_url(url).groups()
display_id = episode_path or show_path
query = '''query {
getShowBySlug(slug:"%s") {
diff --git a/hypervideo_dl/extractor/aenetworks.py b/hypervideo_dl/extractor/aenetworks.py
index e55c03f..8025de5 100644
--- a/hypervideo_dl/extractor/aenetworks.py
+++ b/hypervideo_dl/extractor/aenetworks.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .theplatform import ThePlatformIE
from ..utils import (
@@ -20,8 +19,8 @@ class AENetworksBaseIE(ThePlatformIE):
(?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com|
fyi\.tv
)/'''
- _THEPLATFORM_KEY = 'crazyjava'
- _THEPLATFORM_SECRET = 's3cr3t'
+ _THEPLATFORM_KEY = '43jXaGRQud'
+ _THEPLATFORM_SECRET = 'S10BPXHMlb'
_DOMAIN_MAP = {
'history.com': ('HISTORY', 'history'),
'aetv.com': ('AETV', 'aetv'),
@@ -170,7 +169,7 @@ class AENetworksIE(AENetworksBaseIE):
}]
def _real_extract(self, url):
- domain, canonical = re.match(self._VALID_URL, url).groups()
+ domain, canonical = self._match_valid_url(url).groups()
return self._extract_aetn_info(domain, 'canonical', '/' + canonical, url)
@@ -187,7 +186,7 @@ class AENetworksListBaseIE(AENetworksBaseIE):
}))['data'][resource]
def _real_extract(self, url):
- domain, slug = re.match(self._VALID_URL, url).groups()
+ domain, slug = self._match_valid_url(url).groups()
_, brand = self._DOMAIN_MAP[domain]
playlist = self._call_api(self._RESOURCE, slug, brand, self._FIELDS)
base_url = 'http://watch.%s' % domain
@@ -309,7 +308,7 @@ class HistoryPlayerIE(AENetworksBaseIE):
_TESTS = []
def _real_extract(self, url):
- domain, video_id = re.match(self._VALID_URL, url).groups()
+ domain, video_id = self._match_valid_url(url).groups()
return self._extract_aetn_info(domain, 'id', video_id, url)
diff --git a/hypervideo_dl/extractor/afreecatv.py b/hypervideo_dl/extractor/afreecatv.py
index b56abb1..063872b 100644
--- a/hypervideo_dl/extractor/afreecatv.py
+++ b/hypervideo_dl/extractor/afreecatv.py
@@ -6,9 +6,11 @@ import re
from .common import InfoExtractor
from ..compat import compat_xpath
from ..utils import (
+ date_from_str,
determine_ext,
ExtractorError,
int_or_none,
+ unified_strdate,
url_or_none,
urlencode_postdata,
xpath_text,
@@ -237,6 +239,7 @@ class AfreecaTVIE(InfoExtractor):
r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id)
partial_view = False
+ adult_view = False
for _ in range(2):
query = {
'nTitleNo': video_id,
@@ -245,6 +248,8 @@ class AfreecaTVIE(InfoExtractor):
}
if partial_view:
query['partialView'] = 'SKIP_ADULT'
+ if adult_view:
+ query['adultView'] = 'ADULT_VIEW'
video_xml = self._download_xml(
'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php',
video_id, 'Downloading video info XML%s'
@@ -257,13 +262,16 @@ class AfreecaTVIE(InfoExtractor):
if flag and flag == 'SUCCEED':
break
if flag == 'PARTIAL_ADULT':
- self._downloader.report_warning(
+ self.report_warning(
'In accordance with local laws and regulations, underage users are restricted from watching adult content. '
'Only content suitable for all ages will be downloaded. '
'Provide account credentials if you wish to download restricted content.')
partial_view = True
continue
elif flag == 'ADULT':
+ if not adult_view:
+ adult_view = True
+ continue
error = 'Only users older than 19 are able to watch this video. Provide account credentials to download this content.'
else:
error = flag
@@ -309,8 +317,15 @@ class AfreecaTVIE(InfoExtractor):
if not file_url:
continue
key = file_element.get('key', '')
- upload_date = self._search_regex(
- r'^(\d{8})_', key, 'upload date', default=None)
+ upload_date = unified_strdate(self._search_regex(
+ r'^(\d{8})_', key, 'upload date', default=None))
+ if upload_date is not None:
+ # sometimes the upload date isn't included in the file name
+ # instead, another random ID is, which may parse as a valid
+ # date but be wildly out of a reasonable range
+ parsed_date = date_from_str(upload_date)
+ if parsed_date.year < 2000 or parsed_date.year >= 2100:
+ upload_date = None
file_duration = int_or_none(file_element.get('duration'))
format_id = key if key else '%s_%s' % (video_id, file_num)
if determine_ext(file_url) == 'm3u8':
@@ -323,7 +338,7 @@ class AfreecaTVIE(InfoExtractor):
'url': file_url,
'format_id': 'http',
}]
- if not formats:
+ if not formats and not self.get_param('ignore_no_formats'):
continue
self._sort_formats(formats)
file_info = common_entry.copy()
diff --git a/hypervideo_dl/extractor/aljazeera.py b/hypervideo_dl/extractor/aljazeera.py
index c4f915a..e829b45 100644
--- a/hypervideo_dl/extractor/aljazeera.py
+++ b/hypervideo_dl/extractor/aljazeera.py
@@ -1,7 +1,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
@@ -32,7 +31,7 @@ class AlJazeeraIE(InfoExtractor):
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
def _real_extract(self, url):
- post_type, name = re.match(self._VALID_URL, url).groups()
+ post_type, name = self._match_valid_url(url).groups()
post_type = {
'features': 'post',
'program': 'episode',
@@ -40,7 +39,7 @@ class AlJazeeraIE(InfoExtractor):
}[post_type.split('/')[0]]
video = self._download_json(
'https://www.aljazeera.com/graphql', name, query={
- 'operationName': 'SingleArticleQuery',
+ 'operationName': 'ArchipelagoSingleArticleQuery',
'variables': json.dumps({
'name': name,
'postType': post_type,
diff --git a/hypervideo_dl/extractor/alura.py b/hypervideo_dl/extractor/alura.py
new file mode 100644
index 0000000..f5325de
--- /dev/null
+++ b/hypervideo_dl/extractor/alura.py
@@ -0,0 +1,179 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+from ..compat import (
+ compat_urlparse,
+)
+
+from ..utils import (
+ urlencode_postdata,
+ urljoin,
+ int_or_none,
+ clean_html,
+ ExtractorError
+)
+
+
+class AluraIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<course_name>[^/]+)/task/(?P<id>\d+)'
+ _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm'
+ _VIDEO_URL = 'https://cursos.alura.com.br/course/%s/task/%s/video'
+ _NETRC_MACHINE = 'alura'
+ _TESTS = [{
+ 'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs/task/60095',
+ 'info_dict': {
+ 'id': '60095',
+ 'ext': 'mp4',
+ 'title': 'Referências, ref-set e alter'
+ },
+ 'skip': 'Requires alura account credentials'},
+ {
+ # URL without video
+ 'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs/task/60098',
+ 'only_matching': True},
+ {
+ 'url': 'https://cursos.alura.com.br/course/fundamentos-market-digital/task/55219',
+ 'only_matching': True}
+ ]
+
+ def _real_extract(self, url):
+
+        course, video_id = self._match_valid_url(url).group('course_name', 'id')
+ video_url = self._VIDEO_URL % (course, video_id)
+
+ video_dict = self._download_json(video_url, video_id, 'Searching for videos')
+
+ if video_dict:
+ webpage = self._download_webpage(url, video_id)
+ video_title = clean_html(self._search_regex(
+ r'<span[^>]+class=(["\'])task-body-header-title-text\1[^>]*>(?P<title>[^<]+)',
+ webpage, 'title', group='title'))
+
+ formats = []
+ for video_obj in video_dict:
+ video_url_m3u8 = video_obj.get('link')
+ video_format = self._extract_m3u8_formats(
+ video_url_m3u8, None, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False)
+ for f in video_format:
+ m = re.search(r'^[\w \W]*-(?P<res>\w*).mp4[\W \w]*', f['url'])
+ if m:
+ if not f.get('height'):
+ f['height'] = int('720' if m.group('res') == 'hd' else '480')
+ formats.extend(video_format)
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video_title,
+ "formats": formats
+ }
+
+ def _real_initialize(self):
+ self._login()
+
+ def _login(self):
+ username, password = self._get_login_info()
+ if username is None:
+ return
+ pass
+
+ login_page = self._download_webpage(
+ self._LOGIN_URL, None, 'Downloading login popup')
+
+ def is_logged(webpage):
+ return any(re.search(p, webpage) for p in (
+ r'href=[\"|\']?/signout[\"|\']',
+ r'>Logout<'))
+
+ # already logged in
+ if is_logged(login_page):
+ return
+
+ login_form = self._hidden_inputs(login_page)
+
+ login_form.update({
+ 'username': username,
+ 'password': password,
+ })
+
+ post_url = self._search_regex(
+ r'<form[^>]+class=["|\']signin-form["|\'] action=["|\'](?P<url>.+?)["|\']', login_page,
+ 'post url', default=self._LOGIN_URL, group='url')
+
+ if not post_url.startswith('http'):
+ post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
+
+ response = self._download_webpage(
+ post_url, None, 'Logging in',
+ data=urlencode_postdata(login_form),
+ headers={'Content-Type': 'application/x-www-form-urlencoded'})
+
+ if not is_logged(response):
+ error = self._html_search_regex(
+ r'(?s)<p[^>]+class="alert-message[^"]*">(.+?)</p>',
+ response, 'error message', default=None)
+ if error:
+ raise ExtractorError('Unable to login: %s' % error, expected=True)
+ raise ExtractorError('Unable to log in')
+
+
+class AluraCourseIE(AluraIE):
+
+ _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<id>[^/]+)'
+ _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm'
+ _NETRC_MACHINE = 'aluracourse'
+ _TESTS = [{
+ 'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if AluraIE.suitable(url) else super(AluraCourseIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+
+ course_path = self._match_id(url)
+ webpage = self._download_webpage(url, course_path)
+
+ course_title = self._search_regex(
+ r'<h1.*?>(.*?)<strong>(?P<course_title>.*?)</strong></h[0-9]>', webpage,
+ 'course title', default=course_path, group='course_title')
+
+ entries = []
+ if webpage:
+ for path in re.findall(r'<a\b(?=[^>]* class="[^"]*(?<=[" ])courseSectionList-section[" ])(?=[^>]* href="([^"]*))', webpage):
+ page_url = urljoin(url, path)
+ section_path = self._download_webpage(page_url, course_path)
+ for path_video in re.findall(r'<a\b(?=[^>]* class="[^"]*(?<=[" ])task-menu-nav-item-link-VIDEO[" ])(?=[^>]* href="([^"]*))', section_path):
+ chapter = clean_html(
+ self._search_regex(
+ r'<h3[^>]+class=(["\'])task-menu-section-title-text\1[^>]*>(?P<chapter>[^<]+)',
+ section_path,
+ 'chapter',
+ group='chapter'))
+
+ chapter_number = int_or_none(
+ self._search_regex(
+ r'<span[^>]+class=(["\'])task-menu-section-title-number[^>]*>(.*?)<strong>(?P<chapter_number>[^<]+)</strong>',
+ section_path,
+ 'chapter number',
+ group='chapter_number'))
+ video_url = urljoin(url, path_video)
+
+ entry = {
+ '_type': 'url_transparent',
+ 'id': self._match_id(video_url),
+ 'url': video_url,
+ 'id_key': self.ie_key(),
+ 'chapter': chapter,
+ 'chapter_number': chapter_number
+ }
+ entries.append(entry)
+ return self.playlist_result(entries, course_path, course_title)
diff --git a/hypervideo_dl/extractor/amcnetworks.py b/hypervideo_dl/extractor/amcnetworks.py
index b8027bb..e38e215 100644
--- a/hypervideo_dl/extractor/amcnetworks.py
+++ b/hypervideo_dl/extractor/amcnetworks.py
@@ -63,17 +63,37 @@ class AMCNetworksIE(ThePlatformIE):
}
def _real_extract(self, url):
- site, display_id = re.match(self._VALID_URL, url).groups()
+ site, display_id = self._match_valid_url(url).groups()
requestor_id = self._REQUESTOR_ID_MAP[site]
- properties = self._download_json(
- 'https://content-delivery-gw.svc.ds.amcn.com/api/v2/content/amcn/%s/url/%s' % (requestor_id.lower(), display_id),
- display_id)['data']['properties']
+ page_data = self._download_json(
+ 'https://content-delivery-gw.svc.ds.amcn.com/api/v2/content/amcn/%s/url/%s'
+ % (requestor_id.lower(), display_id), display_id)['data']
+ properties = page_data.get('properties') or {}
query = {
'mbr': 'true',
'manifest': 'm3u',
}
- tp_path = 'M_UwQC/media/' + properties['videoPid']
- media_url = 'https://link.theplatform.com/s/' + tp_path
+
+ video_player_count = 0
+ try:
+ for v in page_data['children']:
+ if v.get('type') == 'video-player':
+ releasePid = v['properties']['currentVideo']['meta']['releasePid']
+ tp_path = 'M_UwQC/' + releasePid
+ media_url = 'https://link.theplatform.com/s/' + tp_path
+ video_player_count += 1
+ except KeyError:
+ pass
+ if video_player_count > 1:
+ self.report_warning(
+ 'The JSON data has %d video players. Only one will be extracted' % video_player_count)
+
+ # Fall back to videoPid if releasePid not found.
+ # TODO: Fall back to videoPid if releasePid manifest uses DRM.
+ if not video_player_count:
+ tp_path = 'M_UwQC/media/' + properties['videoPid']
+ media_url = 'https://link.theplatform.com/s/' + tp_path
+
theplatform_metadata = self._download_theplatform_metadata(tp_path, display_id)
info = self._parse_theplatform_metadata(theplatform_metadata)
video_id = theplatform_metadata['pid']
@@ -90,30 +110,41 @@ class AMCNetworksIE(ThePlatformIE):
formats, subtitles = self._extract_theplatform_smil(
media_url, video_id)
self._sort_formats(formats)
+
+ thumbnails = []
+ thumbnail_urls = [properties.get('imageDesktop')]
+ if 'thumbnail' in info:
+ thumbnail_urls.append(info.pop('thumbnail'))
+ for thumbnail_url in thumbnail_urls:
+ if not thumbnail_url:
+ continue
+ mobj = re.search(r'(\d+)x(\d+)', thumbnail_url)
+ thumbnails.append({
+ 'url': thumbnail_url,
+ 'width': int(mobj.group(1)) if mobj else None,
+ 'height': int(mobj.group(2)) if mobj else None,
+ })
+
info.update({
+ 'age_limit': parse_age_limit(rating),
+ 'formats': formats,
'id': video_id,
'subtitles': subtitles,
- 'formats': formats,
- 'age_limit': parse_age_limit(parse_age_limit(rating)),
+ 'thumbnails': thumbnails,
})
ns_keys = theplatform_metadata.get('$xmlns', {}).keys()
if ns_keys:
ns = list(ns_keys)[0]
- series = theplatform_metadata.get(ns + '$show')
- season_number = int_or_none(
- theplatform_metadata.get(ns + '$season'))
- episode = theplatform_metadata.get(ns + '$episodeTitle')
+ episode = theplatform_metadata.get(ns + '$episodeTitle') or None
episode_number = int_or_none(
theplatform_metadata.get(ns + '$episode'))
- if season_number:
- title = 'Season %d - %s' % (season_number, title)
- if series:
- title = '%s - %s' % (series, title)
+ season_number = int_or_none(
+ theplatform_metadata.get(ns + '$season'))
+ series = theplatform_metadata.get(ns + '$show') or None
info.update({
- 'title': title,
- 'series': series,
- 'season_number': season_number,
'episode': episode,
'episode_number': episode_number,
+ 'season_number': season_number,
+ 'series': series,
})
return info
diff --git a/hypervideo_dl/extractor/americastestkitchen.py b/hypervideo_dl/extractor/americastestkitchen.py
index be960c0..6e6099a 100644
--- a/hypervideo_dl/extractor/americastestkitchen.py
+++ b/hypervideo_dl/extractor/americastestkitchen.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..utils import (
@@ -69,7 +68,7 @@ class AmericasTestKitchenIE(InfoExtractor):
}]
def _real_extract(self, url):
- resource_type, video_id = re.match(self._VALID_URL, url).groups()
+ resource_type, video_id = self._match_valid_url(url).groups()
is_episode = resource_type == 'episode'
if is_episode:
resource_type = 'episodes'
@@ -114,7 +113,7 @@ class AmericasTestKitchenSeasonIE(InfoExtractor):
}]
def _real_extract(self, url):
- show_name, season_number = re.match(self._VALID_URL, url).groups()
+ show_name, season_number = self._match_valid_url(url).groups()
season_number = int(season_number)
slug = 'atk' if show_name == 'americastestkitchen' else 'cco'
diff --git a/hypervideo_dl/extractor/animelab.py b/hypervideo_dl/extractor/animelab.py
new file mode 100644
index 0000000..4fb7ee4
--- /dev/null
+++ b/hypervideo_dl/extractor/animelab.py
@@ -0,0 +1,285 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+from ..utils import (
+ ExtractorError,
+ urlencode_postdata,
+ int_or_none,
+ str_or_none,
+ determine_ext,
+)
+
+from ..compat import compat_HTTPError
+
+
+class AnimeLabBaseIE(InfoExtractor):
+ _LOGIN_REQUIRED = True
+ _LOGIN_URL = 'https://www.animelab.com/login'
+ _NETRC_MACHINE = 'animelab'
+
+ def _login(self):
+ def is_logged_in(login_webpage):
+ return 'Sign In' not in login_webpage
+
+ login_page = self._download_webpage(
+ self._LOGIN_URL, None, 'Downloading login page')
+
+ # Check if already logged in
+ if is_logged_in(login_page):
+ return
+
+ (username, password) = self._get_login_info()
+ if username is None and self._LOGIN_REQUIRED:
+ self.raise_login_required('Login is required to access any AnimeLab content')
+
+ login_form = {
+ 'email': username,
+ 'password': password,
+ }
+
+ try:
+ response = self._download_webpage(
+ self._LOGIN_URL, None, 'Logging in', 'Wrong login info',
+ data=urlencode_postdata(login_form),
+ headers={'Content-Type': 'application/x-www-form-urlencoded'})
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+ raise ExtractorError('Unable to log in (wrong credentials?)', expected=True)
+ else:
+ raise
+
+ # if login was successful
+ if is_logged_in(response):
+ return
+
+ raise ExtractorError('Unable to login (cannot verify if logged in)')
+
+ def _real_initialize(self):
+ self._login()
+
+
+class AnimeLabIE(AnimeLabBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?animelab\.com/player/(?P<id>[^/]+)'
+
+ # the following tests require authentication, but a free account will suffice
+ # just set 'usenetrc' to true in test/local_parameters.json if you use a .netrc file
+ # or you can set 'username' and 'password' there
+ # the tests also select a specific format so that the same video is downloaded
+ # regardless of whether the user is premium or not (needs testing on a premium account)
+ _TEST = {
+ 'url': 'https://www.animelab.com/player/fullmetal-alchemist-brotherhood-episode-42',
+ 'md5': '05bde4b91a5d1ff46ef5b94df05b0f7f',
+ 'info_dict': {
+ 'id': '383',
+ 'ext': 'mp4',
+ 'display_id': 'fullmetal-alchemist-brotherhood-episode-42',
+ 'title': 'Fullmetal Alchemist: Brotherhood - Episode 42 - Signs of a Counteroffensive',
+ 'description': 'md5:103eb61dd0a56d3dfc5dbf748e5e83f4',
+ 'series': 'Fullmetal Alchemist: Brotherhood',
+ 'episode': 'Signs of a Counteroffensive',
+ 'episode_number': 42,
+ 'duration': 1469,
+ 'season': 'Season 1',
+ 'season_number': 1,
+ 'season_id': '38',
+ },
+ 'params': {
+ 'format': '[format_id=21711_yeshardsubbed_ja-JP][height=480]',
+ },
+ 'skip': 'All AnimeLab content requires authentication',
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ # unfortunately we can get different URLs for the same formats
+ # e.g. if we are using a "free" account so no dubs available
+ # (so _remove_duplicate_formats is not effective)
+ # so we use a dictionary as a workaround
+ formats = {}
+ for language_option_url in ('https://www.animelab.com/player/%s/subtitles',
+ 'https://www.animelab.com/player/%s/dubbed'):
+ actual_url = language_option_url % display_id
+ webpage = self._download_webpage(actual_url, display_id, 'Downloading URL ' + actual_url)
+
+ video_collection = self._parse_json(self._search_regex(r'new\s+?AnimeLabApp\.VideoCollection\s*?\((.*?)\);', webpage, 'AnimeLab VideoCollection'), display_id)
+ position = int_or_none(self._search_regex(r'playlistPosition\s*?=\s*?(\d+)', webpage, 'Playlist Position'))
+
+ raw_data = video_collection[position]['videoEntry']
+
+ video_id = str_or_none(raw_data['id'])
+
+ # create a title from many sources (while grabbing other info)
+ # TODO use more fallback sources to get some of these
+ series = raw_data.get('showTitle')
+ video_type = raw_data.get('videoEntryType', {}).get('name')
+ episode_number = raw_data.get('episodeNumber')
+ episode_name = raw_data.get('name')
+
+ title_parts = (series, video_type, episode_number, episode_name)
+ if None not in title_parts:
+ title = '%s - %s %s - %s' % title_parts
+ else:
+ title = episode_name
+
+ description = raw_data.get('synopsis') or self._og_search_description(webpage, default=None)
+
+ duration = int_or_none(raw_data.get('duration'))
+
+ thumbnail_data = raw_data.get('images', [])
+ thumbnails = []
+ for thumbnail in thumbnail_data:
+ for instance in thumbnail['imageInstances']:
+ image_data = instance.get('imageInfo', {})
+ thumbnails.append({
+ 'id': str_or_none(image_data.get('id')),
+ 'url': image_data.get('fullPath'),
+ 'width': image_data.get('width'),
+ 'height': image_data.get('height'),
+ })
+
+ season_data = raw_data.get('season', {}) or {}
+ season = str_or_none(season_data.get('name'))
+ season_number = int_or_none(season_data.get('seasonNumber'))
+ season_id = str_or_none(season_data.get('id'))
+
+ for video_data in raw_data['videoList']:
+ current_video_list = {}
+ current_video_list['language'] = video_data.get('language', {}).get('languageCode')
+
+ is_hardsubbed = video_data.get('hardSubbed')
+
+ for video_instance in video_data['videoInstances']:
+ httpurl = video_instance.get('httpUrl')
+ url = httpurl if httpurl else video_instance.get('rtmpUrl')
+ if url is None:
+ # this video format is unavailable to the user (not premium etc.)
+ continue
+
+ current_format = current_video_list.copy()
+
+ format_id_parts = []
+
+ format_id_parts.append(str_or_none(video_instance.get('id')))
+
+ if is_hardsubbed is not None:
+ if is_hardsubbed:
+ format_id_parts.append('yeshardsubbed')
+ else:
+ format_id_parts.append('nothardsubbed')
+
+ format_id_parts.append(current_format['language'])
+
+ format_id = '_'.join([x for x in format_id_parts if x is not None])
+
+ ext = determine_ext(url)
+ if ext == 'm3u8':
+ for format_ in self._extract_m3u8_formats(
+ url, video_id, m3u8_id=format_id, fatal=False):
+ formats[format_['format_id']] = format_
+ continue
+ elif ext == 'mpd':
+ for format_ in self._extract_mpd_formats(
+ url, video_id, mpd_id=format_id, fatal=False):
+ formats[format_['format_id']] = format_
+ continue
+
+ current_format['url'] = url
+ quality_data = video_instance.get('videoQuality')
+ if quality_data:
+ quality = quality_data.get('name') or quality_data.get('description')
+ else:
+ quality = None
+
+ height = None
+ if quality:
+ height = int_or_none(self._search_regex(r'(\d+)p?$', quality, 'Video format height', default=None))
+
+ if height is None:
+ self.report_warning('Could not get height of video')
+ else:
+ current_format['height'] = height
+ current_format['format_id'] = format_id
+
+ formats[current_format['format_id']] = current_format
+
+ formats = list(formats.values())
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'series': series,
+ 'episode': episode_name,
+ 'episode_number': int_or_none(episode_number),
+ 'thumbnails': thumbnails,
+ 'duration': duration,
+ 'formats': formats,
+ 'season': season,
+ 'season_number': season_number,
+ 'season_id': season_id,
+ }
+
+
+class AnimeLabShowsIE(AnimeLabBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?animelab\.com/shows/(?P<id>[^/]+)'
+
+ _TEST = {
+ 'url': 'https://www.animelab.com/shows/attack-on-titan',
+ 'info_dict': {
+ 'id': '45',
+ 'title': 'Attack on Titan',
+ 'description': 'md5:989d95a2677e9309368d5cf39ba91469',
+ },
+ 'playlist_count': 59,
+ 'skip': 'All AnimeLab content requires authentication',
+ }
+
+ def _real_extract(self, url):
+ _BASE_URL = 'http://www.animelab.com'
+ _SHOWS_API_URL = '/api/videoentries/show/videos/'
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id, 'Downloading requested URL')
+
+ show_data_str = self._search_regex(r'({"id":.*}),\svideoEntry', webpage, 'AnimeLab show data')
+ show_data = self._parse_json(show_data_str, display_id)
+
+ show_id = str_or_none(show_data.get('id'))
+ title = show_data.get('name')
+ description = show_data.get('shortSynopsis') or show_data.get('longSynopsis')
+
+ entries = []
+ for season in show_data['seasons']:
+ season_id = season['id']
+ get_data = urlencode_postdata({
+ 'seasonId': season_id,
+ 'limit': 1000,
+ })
+ # despite using urlencode_postdata, we are sending a GET request
+ target_url = _BASE_URL + _SHOWS_API_URL + show_id + "?" + get_data.decode('utf-8')
+ response = self._download_webpage(
+ target_url,
+ None, 'Season id %s' % season_id)
+
+ season_data = self._parse_json(response, display_id)
+
+ for video_data in season_data['list']:
+ entries.append(self.url_result(
+ _BASE_URL + '/player/' + video_data['slug'], 'AnimeLab',
+ str_or_none(video_data.get('id')), video_data.get('name')
+ ))
+
+ return {
+ '_type': 'playlist',
+ 'id': show_id,
+ 'title': title,
+ 'description': description,
+ 'entries': entries,
+ }
+
+# TODO implement myqueue
diff --git a/hypervideo_dl/extractor/anvato.py b/hypervideo_dl/extractor/anvato.py
index b739856..b82f0b5 100644
--- a/hypervideo_dl/extractor/anvato.py
+++ b/hypervideo_dl/extractor/anvato.py
@@ -21,6 +21,16 @@ from ..utils import (
unsmuggle_url,
)
+# This import causes a ModuleNotFoundError on some systems for unknown reason.
+# See issues:
+# https://github.com/hypervideo/hypervideo/issues/35
+# https://github.com/ytdl-org/youtube-dl/issues/27449
+# https://github.com/animelover1984/youtube-dl/issues/17
+try:
+ from .anvato_token_generator import NFLTokenGenerator
+except ImportError:
+ NFLTokenGenerator = None
+
def md5_text(s):
if not isinstance(s, compat_str):
@@ -203,6 +213,10 @@ class AnvatoIE(InfoExtractor):
'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
}
+ _TOKEN_GENERATORS = {
+ 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator,
+ }
+
_API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA'
_ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
@@ -262,9 +276,12 @@ class AnvatoIE(InfoExtractor):
'anvrid': anvrid,
'anvts': server_time,
}
- api['anvstk'] = md5_text('%s|%s|%d|%s' % (
- access_key, anvrid, server_time,
- self._ANVACK_TABLE.get(access_key, self._API_KEY)))
+ if self._TOKEN_GENERATORS.get(access_key) is not None:
+ api['anvstk2'] = self._TOKEN_GENERATORS[access_key].generate(self, access_key, video_id)
+ else:
+ api['anvstk'] = md5_text('%s|%s|%d|%s' % (
+ access_key, anvrid, server_time,
+ self._ANVACK_TABLE.get(access_key, self._API_KEY)))
return self._download_json(
video_data_url, video_id, transform_source=strip_jsonp,
@@ -373,7 +390,7 @@ class AnvatoIE(InfoExtractor):
'countries': smuggled_data.get('geo_countries'),
})
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
access_key, video_id = mobj.group('access_key_or_mcp', 'id')
if access_key not in self._ANVACK_TABLE:
access_key = self._MCP_TO_ACCESS_KEY_TABLE.get(
diff --git a/hypervideo_dl/extractor/anvato_token_generator/__init__.py b/hypervideo_dl/extractor/anvato_token_generator/__init__.py
new file mode 100644
index 0000000..6e223db
--- /dev/null
+++ b/hypervideo_dl/extractor/anvato_token_generator/__init__.py
@@ -0,0 +1,7 @@
+from __future__ import unicode_literals
+
+from .nfl import NFLTokenGenerator
+
+__all__ = [
+ 'NFLTokenGenerator',
+]
diff --git a/hypervideo_dl/extractor/anvato_token_generator/common.py b/hypervideo_dl/extractor/anvato_token_generator/common.py
new file mode 100644
index 0000000..b959a90
--- /dev/null
+++ b/hypervideo_dl/extractor/anvato_token_generator/common.py
@@ -0,0 +1,6 @@
+from __future__ import unicode_literals
+
+
+class TokenGenerator:
+ def generate(self, anvack, mcp_id):
+ raise NotImplementedError('This method must be implemented by subclasses')
diff --git a/hypervideo_dl/extractor/anvato_token_generator/nfl.py b/hypervideo_dl/extractor/anvato_token_generator/nfl.py
new file mode 100644
index 0000000..97a2b24
--- /dev/null
+++ b/hypervideo_dl/extractor/anvato_token_generator/nfl.py
@@ -0,0 +1,30 @@
+from __future__ import unicode_literals
+
+import json
+
+from .common import TokenGenerator
+
+
+class NFLTokenGenerator(TokenGenerator):
+ _AUTHORIZATION = None
+
+ def generate(ie, anvack, mcp_id):
+ if not NFLTokenGenerator._AUTHORIZATION:
+ reroute = ie._download_json(
+ 'https://api.nfl.com/v1/reroute', mcp_id,
+ data=b'grant_type=client_credentials',
+ headers={'X-Domain-Id': 100})
+ NFLTokenGenerator._AUTHORIZATION = '%s %s' % (reroute.get('token_type') or 'Bearer', reroute['access_token'])
+ return ie._download_json(
+ 'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({
+ 'query': '''{
+ viewer {
+ mediaToken(anvack: "%s", id: %s) {
+ token
+ }
+ }
+}''' % (anvack, mcp_id),
+ }).encode(), headers={
+ 'Authorization': NFLTokenGenerator._AUTHORIZATION,
+ 'Content-Type': 'application/json',
+ })['data']['viewer']['mediaToken']['token']
diff --git a/hypervideo_dl/extractor/aol.py b/hypervideo_dl/extractor/aol.py
index f6ecb84..4766a2c 100644
--- a/hypervideo_dl/extractor/aol.py
+++ b/hypervideo_dl/extractor/aol.py
@@ -4,13 +4,10 @@ from __future__ import unicode_literals
import re
from .yahoo import YahooIE
-from ..compat import (
- compat_parse_qs,
- compat_urllib_parse_urlparse,
-)
from ..utils import (
ExtractorError,
int_or_none,
+ parse_qs,
url_or_none,
)
@@ -119,13 +116,13 @@ class AolIE(YahooIE):
'height': int(mobj.group(2)),
})
else:
- qs = compat_parse_qs(compat_urllib_parse_urlparse(video_url).query)
+ qs = parse_qs(video_url)
f.update({
'width': int_or_none(qs.get('w', [None])[0]),
'height': int_or_none(qs.get('h', [None])[0]),
})
formats.append(f)
- self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id'))
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/hypervideo_dl/extractor/apa.py b/hypervideo_dl/extractor/apa.py
index cbc1c0e..1736cdf 100644
--- a/hypervideo_dl/extractor/apa.py
+++ b/hypervideo_dl/extractor/apa.py
@@ -42,7 +42,7 @@ class APAIE(InfoExtractor):
webpage)]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id, base_url = mobj.group('id', 'base_url')
webpage = self._download_webpage(
diff --git a/hypervideo_dl/extractor/aparat.py b/hypervideo_dl/extractor/aparat.py
index a9527e7..da06a3c 100644
--- a/hypervideo_dl/extractor/aparat.py
+++ b/hypervideo_dl/extractor/aparat.py
@@ -72,8 +72,7 @@ class AparatIE(InfoExtractor):
r'(\d+)[pP]', label or '', 'height',
default=None)),
})
- self._sort_formats(
- formats, field_preference=('height', 'width', 'tbr', 'format_id'))
+ self._sort_formats(formats)
info = self._search_json_ld(webpage, video_id, default={})
diff --git a/hypervideo_dl/extractor/appleconnect.py b/hypervideo_dl/extractor/appleconnect.py
index a84b8b1..494f833 100644
--- a/hypervideo_dl/extractor/appleconnect.py
+++ b/hypervideo_dl/extractor/appleconnect.py
@@ -9,10 +9,10 @@ from ..utils import (
class AppleConnectIE(InfoExtractor):
- _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)'
- _TEST = {
+ _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/(?:id)?sa\.(?P<id>[\w-]+)'
+ _TESTS = [{
'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
- 'md5': 'e7c38568a01ea45402570e6029206723',
+ 'md5': 'c1d41f72c8bcaf222e089434619316e4',
'info_dict': {
'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
'ext': 'm4v',
@@ -22,7 +22,10 @@ class AppleConnectIE(InfoExtractor):
'upload_date': '20150710',
'timestamp': 1436545535,
},
- }
+ }, {
+ 'url': 'https://itunes.apple.com/us/post/sa.0fe0229f-2457-11e5-9f40-1bb645f2d5d9',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -36,7 +39,7 @@ class AppleConnectIE(InfoExtractor):
video_data = self._parse_json(video_json, video_id)
timestamp = str_to_int(self._html_search_regex(r'data-timestamp="(\d+)"', webpage, 'timestamp'))
- like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count'))
+ like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count', default=None))
return {
'id': video_id,
diff --git a/hypervideo_dl/extractor/appletrailers.py b/hypervideo_dl/extractor/appletrailers.py
index 10442a5..0abfb43 100644
--- a/hypervideo_dl/extractor/appletrailers.py
+++ b/hypervideo_dl/extractor/appletrailers.py
@@ -94,7 +94,7 @@ class AppleTrailersIE(InfoExtractor):
_JSON_RE = r'iTunes.playURL\((.*?)\);'
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
movie = mobj.group('movie')
uploader_id = mobj.group('company')
diff --git a/hypervideo_dl/extractor/archiveorg.py b/hypervideo_dl/extractor/archiveorg.py
index e42ed5e..d90fcb1 100644
--- a/hypervideo_dl/extractor/archiveorg.py
+++ b/hypervideo_dl/extractor/archiveorg.py
@@ -1,9 +1,33 @@
+# coding: utf-8
from __future__ import unicode_literals
+import re
+import json
+
from .common import InfoExtractor
+from .youtube import YoutubeIE
+from ..compat import (
+ compat_urllib_parse_unquote,
+ compat_urllib_parse_unquote_plus,
+ compat_HTTPError
+)
from ..utils import (
clean_html,
+ determine_ext,
+ dict_get,
extract_attributes,
+ ExtractorError,
+ HEADRequest,
+ int_or_none,
+ KNOWN_EXTENSIONS,
+ merge_dicts,
+ mimetype2ext,
+ parse_duration,
+ parse_qs,
+ RegexNotFoundError,
+ str_to_int,
+ str_or_none,
+ try_get,
unified_strdate,
unified_timestamp,
)
@@ -11,22 +35,22 @@ from ..utils import (
class ArchiveOrgIE(InfoExtractor):
IE_NAME = 'archive.org'
- IE_DESC = 'archive.org videos'
- _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#&]+)'
+ IE_DESC = 'archive.org video and audio'
+ _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^?#]+)(?:[?].*)?$'
_TESTS = [{
'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
'md5': '8af1d4cf447933ed3c7f4871162602db',
'info_dict': {
'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
- 'ext': 'ogg',
+ 'ext': 'ogv',
'title': '1968 Demo - FJCC Conference Presentation Reel #1',
'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
- 'creator': 'SRI International',
'release_date': '19681210',
- 'uploader': 'SRI International',
'timestamp': 1268695290,
'upload_date': '20100315',
- }
+ 'creator': 'SRI International',
+ 'uploader': 'laura@archive.org',
+ },
}, {
'url': 'https://archive.org/details/Cops1922',
'md5': '0869000b4ce265e8ca62738b336b268a',
@@ -35,61 +59,360 @@ class ArchiveOrgIE(InfoExtractor):
'ext': 'mp4',
'title': 'Buster Keaton\'s "Cops" (1922)',
'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c',
+ 'uploader': 'yorkmba99@hotmail.com',
'timestamp': 1387699629,
- 'upload_date': '20131222',
- }
+ 'upload_date': "20131222",
+ },
}, {
'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
'only_matching': True,
}, {
- 'url': 'https://archive.org/details/MSNBCW_20131125_040000_To_Catch_a_Predator/',
- 'only_matching': True,
+ 'url': 'https://archive.org/details/Election_Ads',
+ 'md5': '284180e857160cf866358700bab668a3',
+ 'info_dict': {
+ 'id': 'Election_Ads/Commercial-JFK1960ElectionAdCampaignJingle.mpg',
+ 'title': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg',
+ 'ext': 'mp4',
+ },
+ }, {
+ 'url': 'https://archive.org/details/Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
+ 'md5': '7915213ef02559b5501fe630e1a53f59',
+ 'info_dict': {
+ 'id': 'Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
+ 'title': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg',
+ 'ext': 'mp4',
+ 'timestamp': 1205588045,
+ 'uploader': 'mikedavisstripmaster@yahoo.com',
+ 'description': '1960 Presidential Campaign Election Commercials John F Kennedy, Richard M Nixon',
+ 'upload_date': '20080315',
+ },
+ }, {
+ 'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16',
+ 'md5': '7d07ffb42aba6537c28e053efa4b54c9',
+ 'info_dict': {
+ 'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t01.flac',
+ 'title': 'Turning',
+ 'ext': 'flac',
+ },
+ }, {
+ 'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
+ 'md5': 'a07cd8c6ab4ee1560f8a0021717130f3',
+ 'info_dict': {
+ 'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
+ 'title': 'Deal',
+ 'ext': 'flac',
+ 'timestamp': 1205895624,
+ 'uploader': 'mvernon54@yahoo.com',
+ 'description': 'md5:6a31f1996db0aa0fc9da6d6e708a1bb0',
+ 'upload_date': '20080319',
+ 'location': 'Barton Hall - Cornell University',
+ },
+ }, {
+ 'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik',
+ 'md5': '7cb019baa9b332e82ea7c10403acd180',
+ 'info_dict': {
+ 'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/01.01. Bells Of Rostov.mp3',
+ 'title': 'Bells Of Rostov',
+ 'ext': 'mp3',
+ },
+ }, {
+ 'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02.+Song+And+Chorus+In+The+Polovetsian+Camp+From+%22Prince+Igor%22+(Act+2%2C+Scene+1).mp3',
+ 'md5': '1d0aabe03edca83ca58d9ed3b493a3c3',
+ 'info_dict': {
+ 'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02. Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1).mp3',
+ 'title': 'Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1)',
+ 'ext': 'mp3',
+ 'timestamp': 1569662587,
+ 'uploader': 'associate-joygen-odiongan@archive.org',
+ 'description': 'md5:012b2d668ae753be36896f343d12a236',
+ 'upload_date': '20190928',
+ },
}]
+ @staticmethod
+ def _playlist_data(webpage):
+ element = re.findall(r'''(?xs)
+ <input
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
+ \s+class=['"]?js-play8-playlist['"]?
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
+ \s*/>
+ ''', webpage)[0]
+
+ return json.loads(extract_attributes(element)['value'])
+
def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(
- 'http://archive.org/embed/' + video_id, video_id)
-
- playlist = None
- play8 = self._search_regex(
- r'(<[^>]+\bclass=["\']js-play8-playlist[^>]+>)', webpage,
- 'playlist', default=None)
- if play8:
- attrs = extract_attributes(play8)
- playlist = attrs.get('value')
- if not playlist:
- # Old jwplayer fallback
- playlist = self._search_regex(
- r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)",
- webpage, 'jwplayer playlist', default='[]')
- jwplayer_playlist = self._parse_json(playlist, video_id, fatal=False)
- if jwplayer_playlist:
- info = self._parse_jwplayer_data(
- {'playlist': jwplayer_playlist}, video_id, base_url=url)
- else:
- # HTML5 media fallback
- info = self._parse_html5_media_entries(url, webpage, video_id)[0]
- info['id'] = video_id
+ video_id = compat_urllib_parse_unquote_plus(self._match_id(url))
+ identifier, entry_id = (video_id.split('/', 1) + [None])[:2]
+
+ # Archive.org metadata API doesn't clearly demarcate playlist entries
+ # or subtitle tracks, so we get them from the embeddable player.
+ embed_page = self._download_webpage(
+ 'https://archive.org/embed/' + identifier, identifier)
+ playlist = self._playlist_data(embed_page)
+
+ entries = {}
+ for p in playlist:
+ # If the user specified a playlist entry in the URL, ignore the
+ # rest of the playlist.
+ if entry_id and p['orig'] != entry_id:
+ continue
- def get_optional(metadata, field):
- return metadata.get(field, [None])[0]
+ entries[p['orig']] = {
+ 'formats': [],
+ 'thumbnails': [],
+ 'artist': p.get('artist'),
+ 'track': p.get('title'),
+ 'subtitles': {}}
+
+ for track in p.get('tracks', []):
+ if track['kind'] != 'subtitles':
+ continue
+
+ entries[p['orig']][track['label']] = {
+ 'url': 'https://archive.org/' + track['file'].lstrip('/')}
metadata = self._download_json(
- 'http://archive.org/details/' + video_id, video_id, query={
- 'output': 'json',
- })['metadata']
- info.update({
- 'title': get_optional(metadata, 'title') or info.get('title'),
- 'description': clean_html(get_optional(metadata, 'description')),
- })
- if info.get('_type') != 'playlist':
- creator = get_optional(metadata, 'creator')
- info.update({
- 'creator': creator,
- 'release_date': unified_strdate(get_optional(metadata, 'date')),
- 'uploader': get_optional(metadata, 'publisher') or creator,
- 'timestamp': unified_timestamp(get_optional(metadata, 'publicdate')),
- 'language': get_optional(metadata, 'language'),
- })
+ 'http://archive.org/metadata/' + identifier, identifier)
+ m = metadata['metadata']
+ identifier = m['identifier']
+
+ info = {
+ 'id': identifier,
+ 'title': m['title'],
+ 'description': clean_html(m.get('description')),
+ 'uploader': dict_get(m, ['uploader', 'adder']),
+ 'creator': m.get('creator'),
+ 'license': m.get('licenseurl'),
+ 'release_date': unified_strdate(m.get('date')),
+ 'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
+ 'webpage_url': 'https://archive.org/details/' + identifier,
+ 'location': m.get('venue'),
+ 'release_year': int_or_none(m.get('year'))}
+
+ for f in metadata['files']:
+ if f['name'] in entries:
+ entries[f['name']] = merge_dicts(entries[f['name']], {
+ 'id': identifier + '/' + f['name'],
+ 'title': f.get('title') or f['name'],
+ 'display_id': f['name'],
+ 'description': clean_html(f.get('description')),
+ 'creator': f.get('creator'),
+ 'duration': parse_duration(f.get('length')),
+ 'track_number': int_or_none(f.get('track')),
+ 'album': f.get('album'),
+ 'discnumber': int_or_none(f.get('disc')),
+ 'release_year': int_or_none(f.get('year'))})
+ entry = entries[f['name']]
+ elif f.get('original') in entries:
+ entry = entries[f['original']]
+ else:
+ continue
+
+ if f.get('format') == 'Thumbnail':
+ entry['thumbnails'].append({
+ 'id': f['name'],
+ 'url': 'https://archive.org/download/' + identifier + '/' + f['name'],
+ 'width': int_or_none(f.get('width')),
+ 'height': int_or_none(f.get('width')),
+ 'filesize': int_or_none(f.get('size'))})
+
+ extension = (f['name'].rsplit('.', 1) + [None])[1]
+ if extension in KNOWN_EXTENSIONS:
+ entry['formats'].append({
+ 'url': 'https://archive.org/download/' + identifier + '/' + f['name'],
+ 'format': f.get('format'),
+ 'width': int_or_none(f.get('width')),
+ 'height': int_or_none(f.get('height')),
+ 'filesize': int_or_none(f.get('size')),
+ 'protocol': 'https'})
+
+ # Sort available formats by filesize
+ for entry in entries.values():
+ entry['formats'] = list(sorted(entry['formats'], key=lambda x: x.get('filesize', -1)))
+
+ if len(entries) == 1:
+ # If there's only one item, use it as the main info dict
+ only_video = entries[list(entries.keys())[0]]
+ if entry_id:
+ info = merge_dicts(only_video, info)
+ else:
+ info = merge_dicts(info, only_video)
+ else:
+ # Otherwise, we have a playlist.
+ info['_type'] = 'playlist'
+ info['entries'] = list(entries.values())
+
+ if metadata.get('reviews'):
+ info['comments'] = []
+ for review in metadata['reviews']:
+ info['comments'].append({
+ 'id': review.get('review_id'),
+ 'author': review.get('reviewer'),
+ 'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'),
+ 'timestamp': unified_timestamp(review.get('createdate')),
+ 'parent': 'root'})
+
return info
+
+
+class YoutubeWebArchiveIE(InfoExtractor):
+ IE_NAME = 'web.archive:youtube'
+ IE_DESC = 'web.archive.org saved youtube videos'
+ _VALID_URL = r"""(?x)^
+ (?:https?://)?web\.archive\.org/
+ (?:web/)?
+ (?:[0-9A-Za-z_*]+/)? # /web and the version index is optional
+
+ (?:https?(?::|%3[Aa])//)?
+ (?:
+ (?:\w+\.)?youtube\.com/watch(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL
+ |(wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url
+ )
+ (?P<id>[0-9A-Za-z_-]{11})(?:%26|\#|&|$)
+ """
+
+ _TESTS = [
+ {
+ 'url': 'https://web.archive.org/web/20150415002341/https://www.youtube.com/watch?v=aYAGB11YrSs',
+ 'info_dict': {
+ 'id': 'aYAGB11YrSs',
+ 'ext': 'webm',
+ 'title': 'Team Fortress 2 - Sandviches!'
+ }
+ },
+ {
+ # Internal link
+ 'url': 'https://web.archive.org/web/2oe/http://wayback-fakeurl.archive.org/yt/97t7Xj_iBv0',
+ 'info_dict': {
+ 'id': '97t7Xj_iBv0',
+ 'ext': 'mp4',
+ 'title': 'How Flexible Machines Could Save The World'
+ }
+ },
+ {
+ # Video from 2012, webm format itag 45.
+ 'url': 'https://web.archive.org/web/20120712231619/http://www.youtube.com/watch?v=AkhihxRKcrs&gl=US&hl=en',
+ 'info_dict': {
+ 'id': 'AkhihxRKcrs',
+ 'ext': 'webm',
+ 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)'
+ }
+ },
+ {
+ # Old flash-only video. Webpage title starts with "YouTube - ".
+ 'url': 'https://web.archive.org/web/20081211103536/http://www.youtube.com/watch?v=jNQXAC9IVRw',
+ 'info_dict': {
+ 'id': 'jNQXAC9IVRw',
+ 'ext': 'unknown_video',
+ 'title': 'Me at the zoo'
+ }
+ },
+ {
+ # Flash video with .flv extension (itag 34). Title has prefix "YouTube -"
+ # Title has some weird unicode characters too.
+ 'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA',
+ 'info_dict': {
+ 'id': 'lTx3G6h2xyA',
+ 'ext': 'flv',
+ 'title': '‪Madeon - Pop Culture (live mashup)‬‏'
+ }
+ },
+        { # Some versions of Youtube have "YouTube" as page title in html (and it is later rewritten by js).
+ 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw',
+ 'info_dict': {
+ 'id': 'kH-G_aIBlFw',
+ 'ext': 'mp4',
+ 'title': 'kH-G_aIBlFw'
+ },
+ 'expected_warnings': [
+ 'unable to extract title',
+ ]
+ },
+ {
+ # First capture is a 302 redirect intermediary page.
+ 'url': 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=0altSZ96U4M',
+ 'info_dict': {
+ 'id': '0altSZ96U4M',
+ 'ext': 'mp4',
+ 'title': '0altSZ96U4M'
+ },
+ 'expected_warnings': [
+ 'unable to extract title',
+ ]
+ },
+ {
+ # Video not archived, only capture is unavailable video page
+ 'url': 'https://web.archive.org/web/20210530071008/https://www.youtube.com/watch?v=lHJTf93HL1s&spfreload=10',
+ 'only_matching': True,
+ },
+ { # Encoded url
+ 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fgl%3DUS%26v%3DAkhihxRKcrs%26hl%3Den',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fv%3DAkhihxRKcrs%26gl%3DUS%26hl%3Den',
+ 'only_matching': True,
+ }
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+        title = video_id # fallback, used if we are not able to get a title
+
+ def _extract_title(webpage):
+ page_title = self._html_search_regex(
+ r'<title>([^<]*)</title>', webpage, 'title', fatal=False) or ''
+ # YouTube video pages appear to always have either 'YouTube -' as suffix or '- YouTube' as prefix.
+ try:
+ page_title = self._html_search_regex(
+ r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)',
+ page_title, 'title', default='')
+ except RegexNotFoundError:
+ page_title = None
+
+ if not page_title:
+ self.report_warning('unable to extract title', video_id=video_id)
+ return
+ return page_title
+
+ # If the video is no longer available, the oldest capture may be one before it was removed.
+ # Setting the capture date in url to early date seems to redirect to earliest capture.
+ webpage = self._download_webpage(
+ 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=%s' % video_id,
+ video_id=video_id, fatal=False, errnote='unable to download video webpage (probably not archived).')
+ if webpage:
+ title = _extract_title(webpage) or title
+
+ # Use link translator mentioned in https://github.com/ytdl-org/youtube-dl/issues/13655
+ internal_fake_url = 'https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id
+ try:
+ video_file_webpage = self._request_webpage(
+ HEADRequest(internal_fake_url), video_id,
+ note='Fetching video file url', expected_status=True)
+ except ExtractorError as e:
+ # HTTP Error 404 is expected if the video is not saved.
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+ raise ExtractorError(
+ 'HTTP Error %s. Most likely the video is not archived or issue with web.archive.org.' % e.cause.code,
+ expected=True)
+ raise
+ video_file_url = compat_urllib_parse_unquote(video_file_webpage.url)
+ video_file_url_qs = parse_qs(video_file_url)
+
+ # Attempt to recover any ext & format info from playback url
+ format = {'url': video_file_url}
+ itag = try_get(video_file_url_qs, lambda x: x['itag'][0])
+ if itag and itag in YoutubeIE._formats: # Naughty access but it works
+ format.update(YoutubeIE._formats[itag])
+ format.update({'format_id': itag})
+ else:
+ mime = try_get(video_file_url_qs, lambda x: x['mime'][0])
+ ext = mimetype2ext(mime) or determine_ext(video_file_url)
+ format.update({'ext': ext})
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': [format],
+ 'duration': str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0]))
+ }
diff --git a/hypervideo_dl/extractor/arcpublishing.py b/hypervideo_dl/extractor/arcpublishing.py
index ca6a6c4..5a9b818 100644
--- a/hypervideo_dl/extractor/arcpublishing.py
+++ b/hypervideo_dl/extractor/arcpublishing.py
@@ -86,7 +86,7 @@ class ArcPublishingIE(InfoExtractor):
return entries
def _real_extract(self, url):
- org, uuid = re.match(self._VALID_URL, url).groups()
+ org, uuid = self._match_valid_url(url).groups()
for orgs, tmpl in self._POWA_DEFAULTS:
if org in orgs:
base_api_tmpl = tmpl
@@ -129,10 +129,6 @@ class ArcPublishingIE(InfoExtractor):
if all([f.get('acodec') == 'none' for f in m3u8_formats]):
continue
for f in m3u8_formats:
- if f.get('acodec') == 'none':
- f['preference'] = -40
- elif f.get('vcodec') == 'none':
- f['preference'] = -50
height = f.get('height')
if not height:
continue
@@ -150,10 +146,9 @@ class ArcPublishingIE(InfoExtractor):
'height': int_or_none(s.get('height')),
'filesize': int_or_none(s.get('filesize')),
'url': s_url,
- 'preference': -1,
+ 'quality': -10,
})
- self._sort_formats(
- formats, ('preference', 'width', 'height', 'vbr', 'filesize', 'tbr', 'ext', 'format_id'))
+ self._sort_formats(formats)
subtitles = {}
for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []):
diff --git a/hypervideo_dl/extractor/ard.py b/hypervideo_dl/extractor/ard.py
index d45a9fe..048d30f 100644
--- a/hypervideo_dl/extractor/ard.py
+++ b/hypervideo_dl/extractor/ard.py
@@ -36,12 +36,12 @@ class ARDMediathekBaseIE(InfoExtractor):
if not formats:
if fsk:
- raise ExtractorError(
+ self.raise_no_formats(
'This video is only available after 20:00', expected=True)
elif media_info.get('_geoblocked'):
self.raise_geo_restricted(
'This video is not available due to geoblocking',
- countries=self._GEO_COUNTRIES)
+ countries=self._GEO_COUNTRIES, metadata_available=True)
self._sort_formats(formats)
@@ -62,6 +62,45 @@ class ARDMediathekBaseIE(InfoExtractor):
'subtitles': subtitles,
}
+ def _ARD_extract_episode_info(self, title):
+ """Try to extract season/episode data from the title."""
+ res = {}
+ if not title:
+ return res
+
+ for pattern in [
+ # Pattern for title like "Homo sapiens (S06/E07) - Originalversion"
+ # from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw
+ r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
+ # E.g.: title="Fritjof aus Norwegen (2) (AD)"
+ # from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/
+ r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
+ r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
+ # E.g.: title="Folge 25/42: Symmetrie"
+ # from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/
+ # E.g.: title="Folge 1063 - Vertrauen"
+ # from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/
+ r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
+ ]:
+ m = re.match(pattern, title)
+ if m:
+ groupdict = m.groupdict()
+ res['season_number'] = int_or_none(groupdict.get('season_number'))
+ res['episode_number'] = int_or_none(groupdict.get('episode_number'))
+ res['episode'] = str_or_none(groupdict.get('episode'))
+ # Build the episode title by removing numeric episode information:
+ if groupdict.get('ep_info') and not res['episode']:
+ res['episode'] = str_or_none(
+ title.replace(groupdict.get('ep_info'), ''))
+ if res['episode']:
+ res['episode'] = res['episode'].strip()
+ break
+
+ # As a fallback use the whole title as the episode name:
+ if not res.get('episode'):
+ res['episode'] = title.strip()
+ return res
+
def _extract_formats(self, media_info, video_id):
type_ = media_info.get('_type')
media_array = media_info.get('_mediaArray', [])
@@ -160,7 +199,7 @@ class ARDMediathekIE(ARDMediathekBaseIE):
def _real_extract(self, url):
# determine video id from url
- m = re.match(self._VALID_URL, url)
+ m = self._match_valid_url(url)
document_id = None
@@ -233,7 +272,8 @@ class ARDMediathekIE(ARDMediathekBaseIE):
else: # request JSON file
if not document_id:
video_id = self._search_regex(
- r'/play/(?:config|media)/(\d+)', webpage, 'media id')
+ (r'/play/(?:config|media|sola)/(\d+)', r'contentId["\']\s*:\s*(\d+)'),
+ webpage, 'media id', default=None)
info = self._extract_media_info(
'http://www.ardmediathek.de/play/media/%s' % video_id,
webpage, video_id)
@@ -244,6 +284,7 @@ class ARDMediathekIE(ARDMediathekBaseIE):
'description': description,
'thumbnail': thumbnail,
})
+ info.update(self._ARD_extract_episode_info(info['title']))
return info
@@ -270,6 +311,9 @@ class ARDIE(InfoExtractor):
'url': 'https://www.daserste.de/information/nachrichten-wetter/tagesthemen/videosextern/tagesthemen-17736.html',
'only_matching': True,
}, {
+ 'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/videos/diversity-tag-sanam-afrashteh100.html',
+ 'only_matching': True,
+ }, {
'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
'only_matching': True,
}, {
@@ -281,7 +325,7 @@ class ARDIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
display_id = mobj.group('id')
player_url = mobj.group('mainurl') + '~playerXml.xml'
@@ -344,7 +388,7 @@ class ARDIE(InfoExtractor):
class ARDBetaMediathekIE(ARDMediathekBaseIE):
- _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?:[^/]+/)?(?:player|live|video)/(?:[^/]+/)*(?P<id>Y3JpZDovL[a-zA-Z0-9]+)'
+ _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?P<mode>player|live|video|sendung|sammlung)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
@@ -375,21 +419,132 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
'only_matching': True,
}, {
- 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
+ # playlist of type 'sendung'
+ 'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
'only_matching': True,
}, {
- 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet',
+ # playlist of type 'sammlung'
+ 'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
'only_matching': True,
}]
+ def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber):
+ """ Query the ARD server for playlist information
+ and return the data in "raw" format """
+ if mode == 'sendung':
+ graphQL = json.dumps({
+ 'query': '''{
+ showPage(
+ client: "%s"
+ showId: "%s"
+ pageNumber: %d
+ ) {
+ pagination {
+ pageSize
+ totalElements
+ }
+ teasers { # Array
+ mediumTitle
+ links { target { id href title } }
+ type
+ }
+ }}''' % (client, playlist_id, pageNumber),
+ }).encode()
+ else: # mode == 'sammlung'
+ graphQL = json.dumps({
+ 'query': '''{
+ morePage(
+ client: "%s"
+ compilationId: "%s"
+ pageNumber: %d
+ ) {
+ widget {
+ pagination {
+ pageSize
+ totalElements
+ }
+ teasers { # Array
+ mediumTitle
+ links { target { id href title } }
+ type
+ }
+ }
+ }}''' % (client, playlist_id, pageNumber),
+ }).encode()
+ # Resources for ARD graphQL debugging:
+ # https://api-test.ardmediathek.de/public-gateway
+ show_page = self._download_json(
+ 'https://api.ardmediathek.de/public-gateway',
+ '[Playlist] %s' % display_id,
+ data=graphQL,
+ headers={'Content-Type': 'application/json'})['data']
+ # align the structure of the returned data:
+ if mode == 'sendung':
+ show_page = show_page['showPage']
+ else: # mode == 'sammlung'
+ show_page = show_page['morePage']['widget']
+ return show_page
+
+ def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
+ """ Collects all playlist entries and returns them as info dict.
+ Supports playlists of mode 'sendung' and 'sammlung', and also nested
+ playlists. """
+ entries = []
+ pageNumber = 0
+ while True: # iterate by pageNumber
+ show_page = self._ARD_load_playlist_snipped(
+ playlist_id, display_id, client, mode, pageNumber)
+ for teaser in show_page['teasers']: # process playlist items
+ if '/compilation/' in teaser['links']['target']['href']:
+ # alternative cond.: teaser['type'] == "compilation"
+ # => This is a nested compilation, e.g. like:
+ # https://www.ardmediathek.de/ard/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2/
+ link_mode = 'sammlung'
+ else:
+ link_mode = 'video'
+
+ item_url = 'https://www.ardmediathek.de/%s/%s/%s/%s/%s' % (
+ client, link_mode, display_id,
+ # perform URL slug quoting of episode title similar to ARD:
+ re.sub('^-|-$', '', # remove '-' from beginning/end
+ re.sub('[^a-zA-Z0-9]+', '-', # replace special chars by -
+ teaser['links']['target']['title'].lower()
+ .replace('ä', 'ae').replace('ö', 'oe')
+ .replace('ü', 'ue').replace('ß', 'ss'))),
+ teaser['links']['target']['id'])
+ entries.append(self.url_result(
+ item_url,
+ ie=ARDBetaMediathekIE.ie_key()))
+
+ if (show_page['pagination']['pageSize'] * (pageNumber + 1)
+ >= show_page['pagination']['totalElements']):
+ # we've processed enough pages to get all playlist entries
+ break
+ pageNumber = pageNumber + 1
+
+ return self.playlist_result(entries, playlist_title=display_id)
+
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = self._match_valid_url(url)
+ video_id = mobj.group('video_id')
+ display_id = mobj.group('display_id')
+ if display_id:
+ display_id = display_id.rstrip('/')
+ if not display_id:
+ display_id = video_id
+
+ if mobj.group('mode') in ('sendung', 'sammlung'):
+ # this is a playlist-URL
+ return self._ARD_extract_playlist(
+ url, video_id, display_id,
+ mobj.group('client'),
+ mobj.group('mode'))
player_page = self._download_json(
'https://api.ardmediathek.de/public-gateway',
- video_id, data=json.dumps({
+ display_id, data=json.dumps({
'query': '''{
- playerPage(client: "ard", clipId: "%s") {
+ playerPage(client:"%s", clipId: "%s") {
blockedByFsk
broadcastedOn
maturityContentRating
@@ -419,7 +574,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
}
}
}
-}''' % video_id,
+}''' % (mobj.group('client'), video_id),
}).encode(), headers={
'Content-Type': 'application/json'
})['data']['playerPage']
@@ -444,9 +599,11 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None))
info.update({
'age_limit': age_limit,
+ 'display_id': display_id,
'title': title,
'description': description,
'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
'series': try_get(player_page, lambda x: x['show']['title']),
})
+ info.update(self._ARD_extract_episode_info(info['title']))
return info
diff --git a/hypervideo_dl/extractor/arkena.py b/hypervideo_dl/extractor/arkena.py
index fd46b1c..4f4f457 100644
--- a/hypervideo_dl/extractor/arkena.py
+++ b/hypervideo_dl/extractor/arkena.py
@@ -4,12 +4,12 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urlparse
from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
parse_iso8601,
+ parse_qs,
try_get,
)
@@ -63,13 +63,13 @@ class ArkenaIE(InfoExtractor):
return mobj.group('url')
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
account_id = mobj.group('account_id')
# Handle http://video.arkena.com/play2/embed/player URL
if not video_id:
- qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ qs = parse_qs(url)
video_id = qs.get('mediaId', [None])[0]
account_id = qs.get('accountId', [None])[0]
if not video_id or not account_id:
diff --git a/hypervideo_dl/extractor/arte.py b/hypervideo_dl/extractor/arte.py
index 03abdbf..296b169 100644
--- a/hypervideo_dl/extractor/arte.py
+++ b/hypervideo_dl/extractor/arte.py
@@ -6,11 +6,11 @@ import re
from .common import InfoExtractor
from ..compat import (
compat_str,
- compat_urlparse,
)
from ..utils import (
ExtractorError,
int_or_none,
+ parse_qs,
qualities,
try_get,
unified_strdate,
@@ -49,7 +49,7 @@ class ArteTVIE(ArteTVBaseIE):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
lang = mobj.group('lang') or mobj.group('lang_2')
@@ -150,7 +150,6 @@ class ArteTVIE(ArteTVBaseIE):
format = {
'format_id': format_id,
- 'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
'language_preference': lang_pref,
'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')),
'width': int_or_none(f.get('width')),
@@ -168,12 +167,14 @@ class ArteTVIE(ArteTVBaseIE):
formats.append(format)
- self._sort_formats(formats)
+ # For this extractor, quality only represents the relative quality
+ # with respect to other formats with the same resolution
+ self._sort_formats(formats, ('res', 'quality'))
return {
'id': player_info.get('VID') or video_id,
'title': title,
- 'description': player_info.get('VDE'),
+ 'description': player_info.get('VDE') or player_info.get('V7T'),
'upload_date': unified_strdate(upload_date_str),
'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
'formats': formats,
@@ -203,7 +204,7 @@ class ArteTVEmbedIE(InfoExtractor):
webpage)]
def _real_extract(self, url):
- qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ qs = parse_qs(url)
json_url = qs['json_url'][0]
video_id = ArteTVIE._match_id(json_url)
return self.url_result(
@@ -226,7 +227,7 @@ class ArteTVPlaylistIE(ArteTVBaseIE):
}]
def _real_extract(self, url):
- lang, playlist_id = re.match(self._VALID_URL, url).groups()
+ lang, playlist_id = self._match_valid_url(url).groups()
collection = self._download_json(
'%s/collectionData/%s/%s?source=videos'
% (self._API_BASE, lang, playlist_id), playlist_id)
diff --git a/hypervideo_dl/extractor/asiancrush.py b/hypervideo_dl/extractor/asiancrush.py
index 66ce7c6..75a6329 100644
--- a/hypervideo_dl/extractor/asiancrush.py
+++ b/hypervideo_dl/extractor/asiancrush.py
@@ -111,7 +111,7 @@ class AsianCrushIE(AsianCrushBaseIE):
}]
def _real_extract(self, url):
- host, video_id = re.match(self._VALID_URL, url).groups()
+ host, video_id = self._match_valid_url(url).groups()
if host == 'cocoro.tv':
webpage = self._download_webpage(url, video_id)
@@ -161,7 +161,7 @@ class AsianCrushPlaylistIE(AsianCrushBaseIE):
yield self._parse_video_data(video)
def _real_extract(self, url):
- host, playlist_id = re.match(self._VALID_URL, url).groups()
+ host, playlist_id = self._match_valid_url(url).groups()
if host == 'cocoro.tv':
webpage = self._download_webpage(url, playlist_id)
diff --git a/hypervideo_dl/extractor/atresplayer.py b/hypervideo_dl/extractor/atresplayer.py
index c2cec98..8143eb4 100644
--- a/hypervideo_dl/extractor/atresplayer.py
+++ b/hypervideo_dl/extractor/atresplayer.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import compat_HTTPError
@@ -75,7 +74,7 @@ class AtresPlayerIE(InfoExtractor):
self._request_webpage(target_url, None, 'Following Target URL')
def _real_extract(self, url):
- display_id, video_id = re.match(self._VALID_URL, url).groups()
+ display_id, video_id = self._match_valid_url(url).groups()
try:
episode = self._download_json(
@@ -86,18 +85,19 @@ class AtresPlayerIE(InfoExtractor):
title = episode['titulo']
formats = []
+ subtitles = {}
for source in episode.get('sources', []):
src = source.get('src')
if not src:
continue
src_type = source.get('type')
if src_type == 'application/vnd.apple.mpegurl':
- formats.extend(self._extract_m3u8_formats(
+ formats, subtitles = self._extract_m3u8_formats(
src, video_id, 'mp4', 'm3u8_native',
- m3u8_id='hls', fatal=False))
+ m3u8_id='hls', fatal=False)
elif src_type == 'application/dash+xml':
- formats.extend(self._extract_mpd_formats(
- src, video_id, mpd_id='dash', fatal=False))
+ formats, subtitles = self._extract_mpd_formats(
+ src, video_id, mpd_id='dash', fatal=False)
self._sort_formats(formats)
heartbeat = episode.get('heartbeat') or {}
@@ -115,4 +115,5 @@ class AtresPlayerIE(InfoExtractor):
'channel': get_meta('channel'),
'season': get_meta('season'),
'episode_number': int_or_none(get_meta('episodeNumber')),
+ 'subtitles': subtitles,
}
diff --git a/hypervideo_dl/extractor/atvat.py b/hypervideo_dl/extractor/atvat.py
index 95e572d..7c30cfc 100644
--- a/hypervideo_dl/extractor/atvat.py
+++ b/hypervideo_dl/extractor/atvat.py
@@ -1,75 +1,106 @@
# coding: utf-8
from __future__ import unicode_literals
+import datetime
+
from .common import InfoExtractor
from ..utils import (
- determine_ext,
- int_or_none,
- unescapeHTML,
+ float_or_none,
+ jwt_encode_hs256,
+ try_get,
)
class ATVAtIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?atv\.at/(?:[^/]+/){2}(?P<id>[dv]\d+)'
+ _VALID_URL = r'https?://(?:www\.)?atv\.at/tv/(?:[^/]+/){2,3}(?P<id>.*)'
+
_TESTS = [{
- 'url': 'http://atv.at/aktuell/di-210317-2005-uhr/v1698449/',
- 'md5': 'c3b6b975fb3150fc628572939df205f2',
+ 'url': 'https://www.atv.at/tv/bauer-sucht-frau/staffel-18/bauer-sucht-frau/bauer-sucht-frau-staffel-18-folge-3-die-hofwochen',
+ 'md5': '3c3b4aaca9f63e32b35e04a9c2515903',
'info_dict': {
- 'id': '1698447',
+ 'id': 'v-ce9cgn1e70n5-1',
'ext': 'mp4',
- 'title': 'DI, 21.03.17 | 20:05 Uhr 1/1',
+ 'title': 'Bauer sucht Frau - Staffel 18 Folge 3 - Die Hofwochen',
}
}, {
- 'url': 'http://atv.at/aktuell/meinrad-knapp/d8416/',
+ 'url': 'https://www.atv.at/tv/bauer-sucht-frau/staffel-18/episode-01/bauer-sucht-frau-staffel-18-vorstellungsfolge-1',
'only_matching': True,
}]
- def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
- video_data = self._parse_json(unescapeHTML(self._search_regex(
- [r'flashPlayerOptions\s*=\s*(["\'])(?P<json>(?:(?!\1).)+)\1',
- r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="(?P<json>[^"]+)"'],
- webpage, 'player data', group='json')),
- display_id)['config']['initial_video']
+ # extracted from bootstrap.js function (search for e.encryption_key and use your browser's debugger)
+ _ACCESS_ID = 'x_atv'
+ _ENCRYPTION_KEY = 'Hohnaekeishoogh2omaeghooquooshia'
- video_id = video_data['id']
- video_title = video_data['title']
+ def _extract_video_info(self, url, content, video):
+ clip_id = content.get('splitId', content['id'])
+ formats = []
+ clip_urls = video['urls']
+ for protocol, variant in clip_urls.items():
+ source_url = try_get(variant, lambda x: x['clear']['url'])
+ if not source_url:
+ continue
+ if protocol == 'dash':
+ formats.extend(self._extract_mpd_formats(
+ source_url, clip_id, mpd_id=protocol, fatal=False))
+ elif protocol == 'hls':
+ formats.extend(self._extract_m3u8_formats(
+ source_url, clip_id, 'mp4', 'm3u8_native',
+ m3u8_id=protocol, fatal=False))
+ else:
+ formats.append({
+ 'url': source_url,
+ 'format_id': protocol,
+ })
+ self._sort_formats(formats)
- parts = []
- for part in video_data.get('parts', []):
- part_id = part['id']
- part_title = part['title']
+ return {
+ 'id': clip_id,
+ 'title': content.get('title'),
+ 'duration': float_or_none(content.get('duration')),
+ 'series': content.get('tvShowTitle'),
+ 'formats': formats,
+ }
- formats = []
- for source in part.get('sources', []):
- source_url = source.get('src')
- if not source_url:
- continue
- ext = determine_ext(source_url)
- if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- source_url, part_id, 'mp4', 'm3u8_native',
- m3u8_id='hls', fatal=False))
- else:
- formats.append({
- 'format_id': source.get('delivery'),
- 'url': source_url,
- })
- self._sort_formats(formats)
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ json_data = self._parse_json(
+ self._search_regex(r'<script id="state" type="text/plain">(.*)</script>', webpage, 'json_data'),
+ video_id=video_id)
+
+ video_title = json_data['views']['default']['page']['title']
+ contentResource = json_data['views']['default']['page']['contentResource']
+ content_id = contentResource[0]['id']
+ content_ids = [{'id': id, 'subclip_start': content['start'], 'subclip_end': content['end']}
+ for id, content in enumerate(contentResource)]
- parts.append({
- 'id': part_id,
- 'title': part_title,
- 'thumbnail': part.get('preview_image_url'),
- 'duration': int_or_none(part.get('duration')),
- 'is_live': part.get('is_livestream'),
- 'formats': formats,
+ time_of_request = datetime.datetime.now()
+ not_before = time_of_request - datetime.timedelta(minutes=5)
+ expire = time_of_request + datetime.timedelta(minutes=5)
+ payload = {
+ 'content_ids': {
+ content_id: content_ids,
+ },
+ 'secure_delivery': True,
+ 'iat': int(time_of_request.timestamp()),
+ 'nbf': int(not_before.timestamp()),
+ 'exp': int(expire.timestamp()),
+ }
+ jwt_token = jwt_encode_hs256(payload, self._ENCRYPTION_KEY, headers={'kid': self._ACCESS_ID})
+ videos = self._download_json(
+ 'https://vas-v4.p7s1video.net/4.0/getsources',
+ content_id, 'Downloading videos JSON', query={
+ 'token': jwt_token.decode('utf-8')
})
+ video_id, videos_data = list(videos['data'].items())[0]
+ entries = [
+ self._extract_video_info(url, contentResource[video['id']], video)
+ for video in videos_data]
+
return {
'_type': 'multi_video',
'id': video_id,
'title': video_title,
- 'entries': parts,
+ 'entries': entries,
}
diff --git a/hypervideo_dl/extractor/audius.py b/hypervideo_dl/extractor/audius.py
new file mode 100644
index 0000000..fa64995
--- /dev/null
+++ b/hypervideo_dl/extractor/audius.py
@@ -0,0 +1,274 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+
+from .common import InfoExtractor
+from ..utils import ExtractorError, try_get, compat_str, str_or_none
+from ..compat import compat_urllib_parse_unquote
+
+
+class AudiusBaseIE(InfoExtractor):
+ _API_BASE = None
+ _API_V = '/v1'
+
+ def _get_response_data(self, response):
+ if isinstance(response, dict):
+ response_data = response.get('data')
+ if response_data is not None:
+ return response_data
+ if len(response) == 1 and 'message' in response:
+ raise ExtractorError('API error: %s' % response['message'],
+ expected=True)
+ raise ExtractorError('Unexpected API response')
+
+ def _select_api_base(self):
+ """Selecting one of the currently available API hosts"""
+ response = super(AudiusBaseIE, self)._download_json(
+ 'https://api.audius.co/', None,
+ note='Requesting available API hosts',
+ errnote='Unable to request available API hosts')
+ hosts = self._get_response_data(response)
+ if isinstance(hosts, list):
+ self._API_BASE = random.choice(hosts)
+ return
+ raise ExtractorError('Unable to get available API hosts')
+
+ @staticmethod
+ def _prepare_url(url, title):
+ """
+ Audius removes forward slashes from the uri, but leaves backslashes.
+ The problem is that the current version of Chrome replaces backslashes
+ in the address bar with forward slashes, so if you copy the link from
+ there and paste it into youtube-dl, you won't be able to download
+ anything from this link, since the Audius API won't be able to resolve
+ this url
+ """
+ url = compat_urllib_parse_unquote(url)
+ title = compat_urllib_parse_unquote(title)
+ if '/' in title or '%2F' in title:
+ fixed_title = title.replace('/', '%5C').replace('%2F', '%5C')
+ return url.replace(title, fixed_title)
+ return url
+
+ def _api_request(self, path, item_id=None, note='Downloading JSON metadata',
+ errnote='Unable to download JSON metadata',
+ expected_status=None):
+ if self._API_BASE is None:
+ self._select_api_base()
+ try:
+ response = super(AudiusBaseIE, self)._download_json(
+ '%s%s%s' % (self._API_BASE, self._API_V, path), item_id, note=note,
+ errnote=errnote, expected_status=expected_status)
+ except ExtractorError as exc:
+ # some Audius API hosts may not work as expected and return HTML
+ if 'Failed to parse JSON' in compat_str(exc):
+ raise ExtractorError('An error occurred while receiving data. Try again',
+ expected=True)
+ raise exc
+ return self._get_response_data(response)
+
+ def _resolve_url(self, url, item_id):
+ return self._api_request('/resolve?url=%s' % url, item_id,
+ expected_status=404)
+
+
+class AudiusIE(AudiusBaseIE):
+ _VALID_URL = r'''(?x)https?://(?:www\.)?(?:audius\.co/(?P<uploader>[\w\d-]+)(?!/album|/playlist)/(?P<title>\S+))'''
+ IE_DESC = 'Audius.co'
+ _TESTS = [
+ {
+ # URL from Chrome address bar, which replaces backslashes with forward slashes
+ 'url': 'https://audius.co/test_acc/t%D0%B5%D0%B5%D0%B5est-1.%5E_%7B%7D/%22%3C%3E.%E2%84%96~%60-198631',
+ 'md5': '92c35d3e754d5a0f17eef396b0d33582',
+ 'info_dict': {
+ 'id': 'xd8gY',
+ 'title': '''Tеееest/ 1.!@#$%^&*()_+=[]{};'\\\":<>,.?/№~`''',
+ 'ext': 'mp3',
+ 'description': 'Description',
+ 'duration': 30,
+ 'track': '''Tеееest/ 1.!@#$%^&*()_+=[]{};'\\\":<>,.?/№~`''',
+ 'artist': 'test',
+ 'genre': 'Electronic',
+ 'thumbnail': r're:https?://.*\.jpg',
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ }
+ },
+ {
+ # Regular track
+ 'url': 'https://audius.co/voltra/radar-103692',
+ 'md5': '491898a0a8de39f20c5d6a8a80ab5132',
+ 'info_dict': {
+ 'id': 'KKdy2',
+ 'title': 'RADAR',
+ 'ext': 'mp3',
+ 'duration': 318,
+ 'track': 'RADAR',
+ 'artist': 'voltra',
+ 'genre': 'Trance',
+ 'thumbnail': r're:https?://.*\.jpg',
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ }
+ },
+ ]
+
+ _ARTWORK_MAP = {
+ "150x150": 150,
+ "480x480": 480,
+ "1000x1000": 1000
+ }
+
+ def _real_extract(self, url):
+ mobj = self._match_valid_url(url)
+ track_id = try_get(mobj, lambda x: x.group('track_id'))
+ if track_id is None:
+ title = mobj.group('title')
+ # uploader = mobj.group('uploader')
+ url = self._prepare_url(url, title)
+ track_data = self._resolve_url(url, title)
+ else: # API link
+ title = None
+ # uploader = None
+ track_data = self._api_request('/tracks/%s' % track_id, track_id)
+
+ if not isinstance(track_data, dict):
+ raise ExtractorError('Unexpected API response')
+
+ track_id = track_data.get('id')
+ if track_id is None:
+ raise ExtractorError('Unable to get ID of the track')
+
+ artworks_data = track_data.get('artwork')
+ thumbnails = []
+ if isinstance(artworks_data, dict):
+ for quality_key, thumbnail_url in artworks_data.items():
+ thumbnail = {
+ "url": thumbnail_url
+ }
+ quality_code = self._ARTWORK_MAP.get(quality_key)
+ if quality_code is not None:
+ thumbnail['preference'] = quality_code
+ thumbnails.append(thumbnail)
+
+ return {
+ 'id': track_id,
+ 'title': track_data.get('title', title),
+ 'url': '%s/v1/tracks/%s/stream' % (self._API_BASE, track_id),
+ 'ext': 'mp3',
+ 'description': track_data.get('description'),
+ 'duration': track_data.get('duration'),
+ 'track': track_data.get('title'),
+ 'artist': try_get(track_data, lambda x: x['user']['name'], compat_str),
+ 'genre': track_data.get('genre'),
+ 'thumbnails': thumbnails,
+ 'view_count': track_data.get('play_count'),
+ 'like_count': track_data.get('favorite_count'),
+ 'repost_count': track_data.get('repost_count'),
+ }
+
+
+class AudiusTrackIE(AudiusIE):
+ _VALID_URL = r'''(?x)(?:audius:)(?:https?://(?:www\.)?.+/v1/tracks/)?(?P<track_id>\w+)'''
+ IE_NAME = 'audius:track'
+ IE_DESC = 'Audius track ID or API link. Prepend with "audius:"'
+ _TESTS = [
+ {
+ 'url': 'audius:9RWlo',
+ 'only_matching': True
+ },
+ {
+ 'url': 'audius:http://discoveryprovider.audius.prod-us-west-2.staked.cloud/v1/tracks/9RWlo',
+ 'only_matching': True
+ },
+ ]
+
+
+class AudiusPlaylistIE(AudiusBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?audius\.co/(?P<uploader>[\w\d-]+)/(?:album|playlist)/(?P<title>\S+)'
+ IE_NAME = 'audius:playlist'
+ IE_DESC = 'Audius.co playlists'
+ _TEST = {
+ 'url': 'https://audius.co/test_acc/playlist/test-playlist-22910',
+ 'info_dict': {
+ 'id': 'DNvjN',
+ 'title': 'test playlist',
+ 'description': 'Test description\n\nlol',
+ },
+ 'playlist_count': 175,
+ }
+
+ def _build_playlist(self, tracks):
+ entries = []
+ for track in tracks:
+ if not isinstance(track, dict):
+ raise ExtractorError('Unexpected API response')
+ track_id = str_or_none(track.get('id'))
+ if not track_id:
+ raise ExtractorError('Unable to get track ID from playlist')
+ entries.append(self.url_result(
+ 'audius:%s' % track_id,
+ ie=AudiusTrackIE.ie_key(), video_id=track_id))
+ return entries
+
+ def _real_extract(self, url):
+ self._select_api_base()
+ mobj = self._match_valid_url(url)
+ title = mobj.group('title')
+ # uploader = mobj.group('uploader')
+ url = self._prepare_url(url, title)
+ playlist_response = self._resolve_url(url, title)
+
+ if not isinstance(playlist_response, list) or len(playlist_response) != 1:
+ raise ExtractorError('Unexpected API response')
+
+ playlist_data = playlist_response[0]
+ if not isinstance(playlist_data, dict):
+ raise ExtractorError('Unexpected API response')
+
+ playlist_id = playlist_data.get('id')
+ if playlist_id is None:
+ raise ExtractorError('Unable to get playlist ID')
+
+ playlist_tracks = self._api_request(
+ '/playlists/%s/tracks' % playlist_id,
+ title, note='Downloading playlist tracks metadata',
+ errnote='Unable to download playlist tracks metadata')
+ if not isinstance(playlist_tracks, list):
+ raise ExtractorError('Unexpected API response')
+
+ entries = self._build_playlist(playlist_tracks)
+ return self.playlist_result(entries, playlist_id,
+ playlist_data.get('playlist_name', title),
+ playlist_data.get('description'))
+
+
+class AudiusProfileIE(AudiusPlaylistIE):
+ IE_NAME = 'audius:artist'
+ IE_DESC = 'Audius.co profile/artist pages'
+ _VALID_URL = r'https?://(?:www)?audius\.co/(?P<id>[^\/]+)/?(?:[?#]|$)'
+ _TEST = {
+ 'url': 'https://audius.co/pzl/',
+ 'info_dict': {
+ 'id': 'ezRo7',
+ 'description': 'TAMALE\n\nContact: officialpzl@gmail.com',
+ 'title': 'pzl',
+ },
+ 'playlist_count': 24,
+ }
+
+ def _real_extract(self, url):
+ self._select_api_base()
+ profile_id = self._match_id(url)
+ try:
+ _profile_data = self._api_request('/full/users/handle/' + profile_id, profile_id)
+ except ExtractorError as e:
+ raise ExtractorError('Could not download profile info; ' + str(e))
+ profile_audius_id = _profile_data[0]['id']
+ profile_bio = _profile_data[0].get('bio')
+
+ api_call = self._api_request('/full/users/handle/%s/tracks' % profile_id, profile_id)
+ return self.playlist_result(self._build_playlist(api_call), profile_audius_id, profile_id, profile_bio)
diff --git a/hypervideo_dl/extractor/awaan.py b/hypervideo_dl/extractor/awaan.py
index 3a7700c..22cc10d 100644
--- a/hypervideo_dl/extractor/awaan.py
+++ b/hypervideo_dl/extractor/awaan.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
import base64
from .common import InfoExtractor
@@ -19,10 +18,10 @@ from ..utils import (
class AWAANIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?'
+ _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<id>\d+)/(?P<season_id>\d+))?'
def _real_extract(self, url):
- show_id, video_id, season_id = re.match(self._VALID_URL, url).groups()
+ show_id, video_id, season_id = self._match_valid_url(url).groups()
if video_id and int(video_id) > 0:
return self.url_result(
'http://awaan.ae/media/%s' % video_id, 'AWAANVideo')
@@ -154,7 +153,7 @@ class AWAANSeasonIE(InfoExtractor):
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
- show_id, season_id = re.match(self._VALID_URL, url).groups()
+ show_id, season_id = self._match_valid_url(url).groups()
data = {}
if season_id:
diff --git a/hypervideo_dl/extractor/azmedien.py b/hypervideo_dl/extractor/azmedien.py
index 9302669..fee640e 100644
--- a/hypervideo_dl/extractor/azmedien.py
+++ b/hypervideo_dl/extractor/azmedien.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from .kaltura import KalturaIE
@@ -51,7 +50,7 @@ class AZMedienIE(InfoExtractor):
_PARTNER_ID = '1719221'
def _real_extract(self, url):
- host, display_id, article_id, entry_id = re.match(self._VALID_URL, url).groups()
+ host, display_id, article_id, entry_id = self._match_valid_url(url).groups()
if not entry_id:
entry_id = self._download_json(
diff --git a/hypervideo_dl/extractor/baidu.py b/hypervideo_dl/extractor/baidu.py
index 234a661..364fd94 100644
--- a/hypervideo_dl/extractor/baidu.py
+++ b/hypervideo_dl/extractor/baidu.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import unescapeHTML
@@ -33,7 +32,7 @@ class BaiduVideoIE(InfoExtractor):
path, category, playlist_id), playlist_id, note)
def _real_extract(self, url):
- category, playlist_id = re.match(self._VALID_URL, url).groups()
+ category, playlist_id = self._match_valid_url(url).groups()
if category == 'show':
category = 'tvshow'
if category == 'tv':
diff --git a/hypervideo_dl/extractor/bandcamp.py b/hypervideo_dl/extractor/bandcamp.py
index dbe57c7..b664145 100644
--- a/hypervideo_dl/extractor/bandcamp.py
+++ b/hypervideo_dl/extractor/bandcamp.py
@@ -31,9 +31,9 @@ class BandcampIE(InfoExtractor):
'info_dict': {
'id': '1812978515',
'ext': 'mp3',
- 'title': "hypervideo \"'/\\ä↭ - hypervideo \"'/\\ä↭ - hypervideo test song \"'/\\ä↭",
+ 'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭",
'duration': 9.8485,
- 'uploader': 'hypervideo "\'/\\ä↭',
+ 'uploader': 'youtube-dl "\'/\\ä↭',
'upload_date': '20121129',
'timestamp': 1354224127,
},
@@ -212,7 +212,7 @@ class BandcampIE(InfoExtractor):
class BandcampAlbumIE(BandcampIE):
IE_NAME = 'Bandcamp:album'
- _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?'
+ _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?!/music)(?:/album/(?P<id>[^/?#&]+))?'
_TESTS = [{
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@@ -294,7 +294,7 @@ class BandcampAlbumIE(BandcampIE):
else super(BandcampAlbumIE, cls).suitable(url))
def _real_extract(self, url):
- uploader_id, album_id = re.match(self._VALID_URL, url).groups()
+ uploader_id, album_id = self._match_valid_url(url).groups()
playlist_id = album_id or uploader_id
webpage = self._download_webpage(url, playlist_id)
tralbum = self._extract_data_attr(webpage, playlist_id)
@@ -389,3 +389,43 @@ class BandcampWeeklyIE(BandcampIE):
'episode_id': show_id,
'formats': formats
}
+
+
+class BandcampMusicIE(InfoExtractor):
+ _VALID_URL = r'https?://(?P<id>[^/]+)\.bandcamp\.com/music'
+ _TESTS = [{
+ 'url': 'https://steviasphere.bandcamp.com/music',
+ 'playlist_mincount': 47,
+ 'info_dict': {
+ 'id': 'steviasphere',
+ },
+ }, {
+ 'url': 'https://coldworldofficial.bandcamp.com/music',
+ 'playlist_mincount': 10,
+ 'info_dict': {
+ 'id': 'coldworldofficial',
+ },
+ }, {
+ 'url': 'https://nuclearwarnowproductions.bandcamp.com/music',
+ 'playlist_mincount': 399,
+ 'info_dict': {
+ 'id': 'nuclearwarnowproductions',
+ },
+ }
+ ]
+
+ _TYPE_IE_DICT = {
+ 'album': BandcampAlbumIE.ie_key(),
+ 'track': BandcampIE.ie_key()
+ }
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ items = re.findall(r'href\=\"\/(?P<path>(?P<type>album|track)+/[^\"]+)', webpage)
+ entries = [
+ self.url_result(
+ f'https://{id}.bandcamp.com/{item[0]}',
+ ie=self._TYPE_IE_DICT[item[1]])
+ for item in items]
+ return self.playlist_result(entries, id)
diff --git a/hypervideo_dl/extractor/bannedvideo.py b/hypervideo_dl/extractor/bannedvideo.py
new file mode 100644
index 0000000..3db1151
--- /dev/null
+++ b/hypervideo_dl/extractor/bannedvideo.py
@@ -0,0 +1,158 @@
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ try_get,
+ int_or_none,
+ url_or_none,
+ float_or_none,
+ unified_timestamp,
+)
+
+
+class BannedVideoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?banned\.video/watch\?id=(?P<id>[0-f]{24})'
+ _TESTS = [{
+ 'url': 'https://banned.video/watch?id=5e7a859644e02200c6ef5f11',
+ 'md5': '14b6e81d41beaaee2215cd75c6ed56e4',
+ 'info_dict': {
+ 'id': '5e7a859644e02200c6ef5f11',
+ 'ext': 'mp4',
+ 'title': 'China Discovers Origin of Corona Virus: Issues Emergency Statement',
+ 'thumbnail': r're:^https?://(?:www\.)?assets\.infowarsmedia.com/images/',
+ 'description': 'md5:560d96f02abbebe6c6b78b47465f6b28',
+ 'upload_date': '20200324',
+ 'timestamp': 1585087895,
+ }
+ }]
+
+ _GRAPHQL_GETMETADATA_QUERY = '''
+query GetVideoAndComments($id: String!) {
+ getVideo(id: $id) {
+ streamUrl
+ directUrl
+ unlisted
+ live
+ tags {
+ name
+ }
+ title
+ summary
+ playCount
+ largeImage
+ videoDuration
+ channel {
+ _id
+ title
+ }
+ createdAt
+ }
+ getVideoComments(id: $id, limit: 999999, offset: 0) {
+ _id
+ content
+ user {
+ _id
+ username
+ }
+ voteCount {
+ positive
+ }
+ createdAt
+ replyCount
+ }
+}'''
+
+ _GRAPHQL_GETCOMMENTSREPLIES_QUERY = '''
+query GetCommentReplies($id: String!) {
+ getCommentReplies(id: $id, limit: 999999, offset: 0) {
+ _id
+ content
+ user {
+ _id
+ username
+ }
+ voteCount {
+ positive
+ }
+ createdAt
+ replyCount
+ }
+}'''
+
+ _GRAPHQL_QUERIES = {
+ 'GetVideoAndComments': _GRAPHQL_GETMETADATA_QUERY,
+ 'GetCommentReplies': _GRAPHQL_GETCOMMENTSREPLIES_QUERY,
+ }
+
+ def _call_api(self, video_id, id, operation, note):
+ return self._download_json(
+ 'https://api.infowarsmedia.com/graphql', video_id, note=note,
+ headers={
+ 'Content-Type': 'application/json; charset=utf-8'
+ }, data=json.dumps({
+ 'variables': {'id': id},
+ 'operationName': operation,
+ 'query': self._GRAPHQL_QUERIES[operation]
+ }).encode('utf8')).get('data')
+
+ def _get_comments(self, video_id, comments, comment_data):
+ yield from comments
+ for comment in comment_data.copy():
+ comment_id = comment.get('_id')
+ if comment.get('replyCount') > 0:
+ reply_json = self._call_api(
+ video_id, comment_id, 'GetCommentReplies',
+ f'Downloading replies for comment {comment_id}')
+ for reply in reply_json.get('getCommentReplies'):
+ yield self._parse_comment(reply, comment_id)
+
+ @staticmethod
+ def _parse_comment(comment_data, parent):
+ return {
+ 'id': comment_data.get('_id'),
+ 'text': comment_data.get('content'),
+ 'author': try_get(comment_data, lambda x: x['user']['username']),
+ 'author_id': try_get(comment_data, lambda x: x['user']['_id']),
+ 'timestamp': unified_timestamp(comment_data.get('createdAt')),
+ 'parent': parent,
+ 'like_count': try_get(comment_data, lambda x: x['voteCount']['positive']),
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ video_json = self._call_api(video_id, video_id, 'GetVideoAndComments', 'Downloading video metadata')
+ video_info = video_json['getVideo']
+ is_live = video_info.get('live')
+ comments = [self._parse_comment(comment, 'root') for comment in video_json.get('getVideoComments')]
+
+ formats = [{
+ 'format_id': 'direct',
+ 'quality': 1,
+ 'url': video_info.get('directUrl'),
+ 'ext': 'mp4',
+ }] if url_or_none(video_info.get('directUrl')) else []
+ if video_info.get('streamUrl'):
+ formats.extend(self._extract_m3u8_formats(
+ video_info.get('streamUrl'), video_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls', live=True))
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video_info.get('title')[:-1],
+ 'formats': formats,
+ 'is_live': is_live,
+ 'description': video_info.get('summary'),
+ 'channel': try_get(video_info, lambda x: x['channel']['title']),
+ 'channel_id': try_get(video_info, lambda x: x['channel']['_id']),
+ 'view_count': int_or_none(video_info.get('playCount')),
+ 'thumbnail': url_or_none(video_info.get('largeImage')),
+ 'duration': float_or_none(video_info.get('videoDuration')),
+ 'timestamp': unified_timestamp(video_info.get('createdAt')),
+ 'tags': [tag.get('name') for tag in video_info.get('tags')],
+ 'availability': self._availability(is_unlisted=video_info.get('unlisted')),
+ 'comments': comments,
+ '__post_extractor': self.extract_comments(video_id, comments, video_json.get('getVideoComments'))
+ }
diff --git a/hypervideo_dl/extractor/bbc.py b/hypervideo_dl/extractor/bbc.py
index 247d982..4e2dcd7 100644
--- a/hypervideo_dl/extractor/bbc.py
+++ b/hypervideo_dl/extractor/bbc.py
@@ -10,9 +10,7 @@ from .common import InfoExtractor
from ..compat import (
compat_etree_Element,
compat_HTTPError,
- compat_parse_qs,
compat_str,
- compat_urllib_parse_urlparse,
compat_urlparse,
)
from ..utils import (
@@ -26,6 +24,7 @@ from ..utils import (
js_to_json,
parse_duration,
parse_iso8601,
+ parse_qs,
strip_or_none,
try_get,
unescapeHTML,
@@ -589,8 +588,8 @@ class BBCIE(BBCCoUkIE):
_VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
_MEDIA_SETS = [
- 'mobile-tablet-main',
'pc',
+ 'mobile-tablet-main',
]
_TESTS = [{
@@ -1271,7 +1270,7 @@ class BBCIE(BBCCoUkIE):
entries = []
for num, media_meta in enumerate(medias, start=1):
formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
- if not formats:
+ if not formats and not self.get_param('ignore_no_formats'):
continue
self._sort_formats(formats)
@@ -1410,7 +1409,7 @@ class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
def _real_extract(self, url):
pid = self._match_id(url)
- qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ qs = parse_qs(url)
series_id = qs.get('seriesId', [None])[0]
page = qs.get('page', [None])[0]
per_page = 36 if page else self._PAGE_SIZE
diff --git a/hypervideo_dl/extractor/beatport.py b/hypervideo_dl/extractor/beatport.py
index e607094..e1cf8b4 100644
--- a/hypervideo_dl/extractor/beatport.py
+++ b/hypervideo_dl/extractor/beatport.py
@@ -40,7 +40,7 @@ class BeatportIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
track_id = mobj.group('id')
display_id = mobj.group('display_id')
@@ -69,12 +69,10 @@ class BeatportIE(InfoExtractor):
'vcodec': 'none',
}
if ext == 'mp3':
- fmt['preference'] = 0
fmt['acodec'] = 'mp3'
fmt['abr'] = 96
fmt['asr'] = 44100
elif ext == 'mp4':
- fmt['preference'] = 1
fmt['acodec'] = 'aac'
fmt['abr'] = 96
fmt['asr'] = 44100
diff --git a/hypervideo_dl/extractor/beeg.py b/hypervideo_dl/extractor/beeg.py
index 5788d13..8fbabe7 100644
--- a/hypervideo_dl/extractor/beeg.py
+++ b/hypervideo_dl/extractor/beeg.py
@@ -3,10 +3,10 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
compat_str,
- compat_urlparse,
)
from ..utils import (
int_or_none,
+ parse_qs,
unified_timestamp,
)
@@ -57,7 +57,7 @@ class BeegIE(InfoExtractor):
query = {
'v': 2,
}
- qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ qs = parse_qs(url)
t = qs.get('t', [''])[0].split('-')
if len(t) > 1:
query.update({
diff --git a/hypervideo_dl/extractor/behindkink.py b/hypervideo_dl/extractor/behindkink.py
index 9bca853..2c97f98 100644
--- a/hypervideo_dl/extractor/behindkink.py
+++ b/hypervideo_dl/extractor/behindkink.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import url_basename
@@ -24,7 +23,7 @@ class BehindKinkIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
display_id = mobj.group('id')
webpage = self._download_webpage(url, display_id)
diff --git a/hypervideo_dl/extractor/bellmedia.py b/hypervideo_dl/extractor/bellmedia.py
index 9f9de96..904c17e 100644
--- a/hypervideo_dl/extractor/bellmedia.py
+++ b/hypervideo_dl/extractor/bellmedia.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
@@ -78,7 +77,7 @@ class BellMediaIE(InfoExtractor):
}
def _real_extract(self, url):
- domain, video_id = re.match(self._VALID_URL, url).groups()
+ domain, video_id = self._match_valid_url(url).groups()
domain = domain.split('.')[0]
return {
'_type': 'url_transparent',
diff --git a/hypervideo_dl/extractor/bet.py b/hypervideo_dl/extractor/bet.py
index d7ceaa8..2c71442 100644
--- a/hypervideo_dl/extractor/bet.py
+++ b/hypervideo_dl/extractor/bet.py
@@ -3,6 +3,8 @@ from __future__ import unicode_literals
from .mtv import MTVServicesInfoExtractor
from ..utils import unified_strdate
+# TODO Remove - Reason: Outdated Site
+
class BetIE(MTVServicesInfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P<id>.+?)\.html'
diff --git a/hypervideo_dl/extractor/bilibili.py b/hypervideo_dl/extractor/bilibili.py
index 08e12cc..8d66b43 100644
--- a/hypervideo_dl/extractor/bilibili.py
+++ b/hypervideo_dl/extractor/bilibili.py
@@ -1,25 +1,33 @@
# coding: utf-8
-from __future__ import unicode_literals
import hashlib
+import itertools
+import functools
import re
+import math
-from .common import InfoExtractor
+from .common import InfoExtractor, SearchInfoExtractor
from ..compat import (
compat_parse_qs,
compat_urlparse,
+ compat_urllib_parse_urlparse
)
from ..utils import (
ExtractorError,
int_or_none,
float_or_none,
parse_iso8601,
+ traverse_obj,
+ try_get,
smuggle_url,
+ srt_subtitles_timecode,
str_or_none,
+ str_to_int,
strip_jsonp,
unified_timestamp,
unsmuggle_url,
urlencode_postdata,
+ OnDemandPagedList
)
@@ -32,13 +40,14 @@ class BiliBiliIE(InfoExtractor):
(?:
video/[aA][vV]|
anime/(?P<anime_id>\d+)/play\#
- )(?P<id_bv>\d+)|
- video/[bB][vV](?P<id>[^/?#&]+)
+ )(?P<id>\d+)|
+ (s/)?video/[bB][vV](?P<id_bv>[^/?#&]+)
)
+ (?:/?\?p=(?P<page>\d+))?
'''
_TESTS = [{
- 'url': 'http://www.bilibili.tv/video/av1074402/',
+ 'url': 'http://www.bilibili.com/video/av1074402/',
'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
'info_dict': {
'id': '1074402',
@@ -57,6 +66,10 @@ class BiliBiliIE(InfoExtractor):
'url': 'http://bangumi.bilibili.com/anime/1869/play#40062',
'only_matching': True,
}, {
+ # bilibili.tv
+ 'url': 'http://www.bilibili.tv/video/av1074402/',
+ 'only_matching': True,
+ }, {
'url': 'http://bangumi.bilibili.com/anime/5802/play#100643',
'md5': '3f721ad1e75030cc06faf73587cfec57',
'info_dict': {
@@ -86,7 +99,7 @@ class BiliBiliIE(InfoExtractor):
'upload_date': '20170301',
},
'params': {
- 'skip_download': True, # Test metadata only
+ 'skip_download': True,
},
}, {
'info_dict': {
@@ -100,13 +113,21 @@ class BiliBiliIE(InfoExtractor):
'upload_date': '20170301',
},
'params': {
- 'skip_download': True, # Test metadata only
+ 'skip_download': True,
},
}]
}, {
# new BV video id format
'url': 'https://www.bilibili.com/video/BV1JE411F741',
'only_matching': True,
+ }, {
+ # Anthology
+ 'url': 'https://www.bilibili.com/video/BV1bK411W797',
+ 'info_dict': {
+ 'id': 'BV1bK411W797',
+ 'title': '物语中的人物是如何吐槽自己的OP的'
+ },
+ 'playlist_count': 17,
}]
_APP_KEY = 'iVGUTjsxvpLeuDCf'
@@ -123,13 +144,32 @@ class BiliBiliIE(InfoExtractor):
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id') or mobj.group('id_bv')
+ mobj = self._match_valid_url(url)
+ video_id = mobj.group('id_bv') or mobj.group('id')
+
+ av_id, bv_id = self._get_video_id_set(video_id, mobj.group('id_bv') is not None)
+ video_id = av_id
+
anime_id = mobj.group('anime_id')
+ page_id = mobj.group('page')
webpage = self._download_webpage(url, video_id)
+ # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself.
+ # If the video has no page argument, check to see if it's an anthology
+ if page_id is None:
+ if not self.get_param('noplaylist'):
+ r = self._extract_anthology_entries(bv_id, video_id, webpage)
+ if r is not None:
+ self.to_screen('Downloading anthology %s - add --no-playlist to just download video' % video_id)
+ return r
+ else:
+ self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
if 'anime/' not in url:
cid = self._search_regex(
+ r'\bcid(?:["\']:|=)(\d+),["\']page(?:["\']:|=)' + str(page_id), webpage, 'cid',
+ default=None
+ ) or self._search_regex(
r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid',
default=None
) or compat_parse_qs(self._search_regex(
@@ -190,7 +230,7 @@ class BiliBiliIE(InfoExtractor):
formats.append({
'url': backup_url,
# backup URLs have lower priorities
- 'preference': -2 if 'hd.mp4' in backup_url else -3,
+ 'quality': -2 if 'hd.mp4' in backup_url else -3,
})
for a_format in formats:
@@ -208,9 +248,20 @@ class BiliBiliIE(InfoExtractor):
break
title = self._html_search_regex(
- ('<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
- '(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
+ (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
+ r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
group='title')
+
+ # Get part title for anthologies
+ if page_id is not None:
+ # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video
+ part_title = try_get(
+ self._download_json(
+ f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp',
+ video_id, note='Extracting videos in anthology'),
+ lambda x: x['data'][int(page_id) - 1]['part'])
+ title = part_title or title
+
description = self._html_search_meta('description', webpage)
timestamp = unified_timestamp(self._html_search_regex(
r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time',
@@ -220,7 +271,8 @@ class BiliBiliIE(InfoExtractor):
# TODO 'view_count' requires deobfuscating Javascript
info = {
- 'id': video_id,
+ 'id': str(video_id) if page_id is None else '%s_part%s' % (video_id, page_id),
+ 'cid': cid,
'title': title,
'description': description,
'timestamp': timestamp,
@@ -229,33 +281,117 @@ class BiliBiliIE(InfoExtractor):
}
uploader_mobj = re.search(
- r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>(?P<name>[^<]+)',
+ r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>\s*(?P<name>[^<]+?)\s*<',
webpage)
if uploader_mobj:
info.update({
- 'uploader': uploader_mobj.group('name'),
+ 'uploader': uploader_mobj.group('name').strip(),
'uploader_id': uploader_mobj.group('id'),
})
+
if not info.get('uploader'):
info['uploader'] = self._html_search_meta(
'author', webpage, 'uploader', default=None)
+ top_level_info = {
+ 'tags': traverse_obj(self._download_json(
+ f'https://api.bilibili.com/x/tag/archive/tags?aid={video_id}',
+ video_id, fatal=False, note='Downloading tags'), ('data', ..., 'tag_name')),
+ }
+
+ entries[0]['subtitles'] = {
+ 'danmaku': [{
+ 'ext': 'xml',
+ 'url': f'https://comment.bilibili.com/{cid}.xml',
+ }]
+ }
+
+ r'''
+ # Requires https://github.com/m13253/danmaku2ass which is licenced under GPL3
+ # See https://github.com/animelover1984/youtube-dl
+
+ raw_danmaku = self._download_webpage(
+ f'https://comment.bilibili.com/{cid}.xml', video_id, fatal=False, note='Downloading danmaku comments')
+ danmaku = NiconicoIE.CreateDanmaku(raw_danmaku, commentType='Bilibili', x=1024, y=576)
+ entries[0]['subtitles'] = {
+ 'danmaku': [{
+ 'ext': 'ass',
+ 'data': danmaku
+ }]
+ }
+ '''
+
+ top_level_info['__post_extractor'] = self.extract_comments(video_id)
+
for entry in entries:
entry.update(info)
if len(entries) == 1:
+ entries[0].update(top_level_info)
return entries[0]
- else:
- for idx, entry in enumerate(entries):
- entry['id'] = '%s_part%d' % (video_id, (idx + 1))
-
- return {
- '_type': 'multi_video',
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'entries': entries,
- }
+
+ for idx, entry in enumerate(entries):
+ entry['id'] = '%s_part%d' % (video_id, (idx + 1))
+
+ return {
+ '_type': 'multi_video',
+ 'id': str(video_id),
+ 'bv_id': bv_id,
+ 'title': title,
+ 'description': description,
+ 'entries': entries,
+ **info, **top_level_info
+ }
+
+ def _extract_anthology_entries(self, bv_id, video_id, webpage):
+ title = self._html_search_regex(
+ (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
+ r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
+ group='title')
+ json_data = self._download_json(
+ f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp',
+ video_id, note='Extracting videos in anthology')
+
+ if json_data['data']:
+ return self.playlist_from_matches(
+ json_data['data'], bv_id, title, ie=BiliBiliIE.ie_key(),
+ getter=lambda entry: 'https://www.bilibili.com/video/%s?p=%d' % (bv_id, entry['page']))
+
+ def _get_video_id_set(self, id, is_bv):
+ query = {'bvid': id} if is_bv else {'aid': id}
+ response = self._download_json(
+ "http://api.bilibili.cn/x/web-interface/view",
+ id, query=query,
+ note='Grabbing original ID via API')
+
+ if response['code'] == -400:
+ raise ExtractorError('Video ID does not exist', expected=True, video_id=id)
+ elif response['code'] != 0:
+ raise ExtractorError(f'Unknown error occurred during API check (code {response["code"]})',
+ expected=True, video_id=id)
+ return response['data']['aid'], response['data']['bvid']
+
+ def _get_comments(self, video_id, commentPageNumber=0):
+ for idx in itertools.count(1):
+ replies = traverse_obj(
+ self._download_json(
+ f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={video_id}&type=1&jsonp=jsonp&sort=2&_=1567227301685',
+ video_id, note=f'Extracting comments from page {idx}'),
+ ('data', 'replies')) or []
+ for children in map(self._get_all_children, replies):
+ yield from children
+
+ def _get_all_children(self, reply):
+ yield {
+ 'author': traverse_obj(reply, ('member', 'uname')),
+ 'author_id': traverse_obj(reply, ('member', 'mid')),
+ 'id': reply.get('rpid'),
+ 'text': traverse_obj(reply, ('content', 'message')),
+ 'timestamp': reply.get('ctime'),
+ 'parent': reply.get('parent') or 'root',
+ }
+ for children in map(self._get_all_children, reply.get('replies') or []):
+ yield from children
class BiliBiliBangumiIE(InfoExtractor):
@@ -325,6 +461,136 @@ class BiliBiliBangumiIE(InfoExtractor):
season_info.get('bangumi_title'), season_info.get('evaluate'))
+class BilibiliChannelIE(InfoExtractor):
+ _VALID_URL = r'https?://space.bilibili\.com/(?P<id>\d+)'
+ _API_URL = "https://api.bilibili.com/x/space/arc/search?mid=%s&pn=%d&jsonp=jsonp"
+ _TESTS = [{
+ 'url': 'https://space.bilibili.com/3985676/video',
+ 'info_dict': {},
+ 'playlist_mincount': 112,
+ }]
+
+ def _entries(self, list_id):
+ count, max_count = 0, None
+
+ for page_num in itertools.count(1):
+ data = self._download_json(
+ self._API_URL % (list_id, page_num), list_id, note=f'Downloading page {page_num}')['data']
+
+ max_count = max_count or try_get(data, lambda x: x['page']['count'])
+
+ entries = try_get(data, lambda x: x['list']['vlist'])
+ if not entries:
+ return
+ for entry in entries:
+ yield self.url_result(
+ 'https://www.bilibili.com/video/%s' % entry['bvid'],
+ BiliBiliIE.ie_key(), entry['bvid'])
+
+ count += len(entries)
+ if max_count and count >= max_count:
+ return
+
+ def _real_extract(self, url):
+ list_id = self._match_id(url)
+ return self.playlist_result(self._entries(list_id), list_id)
+
+
+class BilibiliCategoryIE(InfoExtractor):
+ IE_NAME = 'Bilibili category extractor'
+ _MAX_RESULTS = 1000000
+ _VALID_URL = r'https?://www\.bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+'
+ _TESTS = [{
+ 'url': 'https://www.bilibili.com/v/kichiku/mad',
+ 'info_dict': {
+ 'id': 'kichiku: mad',
+ 'title': 'kichiku: mad'
+ },
+ 'playlist_mincount': 45,
+ 'params': {
+ 'playlistend': 45
+ }
+ }]
+
+ def _fetch_page(self, api_url, num_pages, query, page_num):
+ parsed_json = self._download_json(
+ api_url, query, query={'Search_key': query, 'pn': page_num},
+ note='Extracting results from page %s of %s' % (page_num, num_pages))
+
+ video_list = try_get(parsed_json, lambda x: x['data']['archives'], list)
+ if not video_list:
+ raise ExtractorError('Failed to retrieve video list for page %d' % page_num)
+
+ for video in video_list:
+ yield self.url_result(
+ 'https://www.bilibili.com/video/%s' % video['bvid'], 'BiliBili', video['bvid'])
+
+ def _entries(self, category, subcategory, query):
+ # map of categories : subcategories : RIDs
+ rid_map = {
+ 'kichiku': {
+ 'mad': 26,
+ 'manual_vocaloid': 126,
+ 'guide': 22,
+ 'theatre': 216,
+ 'course': 127
+ },
+ }
+
+ if category not in rid_map:
+ raise ExtractorError(
+ f'The category {category} isn\'t supported. Supported categories: {list(rid_map.keys())}')
+ if subcategory not in rid_map[category]:
+ raise ExtractorError(
+ f'The subcategory {subcategory} isn\'t supported for this category. Supported subcategories: {list(rid_map[category].keys())}')
+ rid_value = rid_map[category][subcategory]
+
+ api_url = 'https://api.bilibili.com/x/web-interface/newlist?rid=%d&type=1&ps=20&jsonp=jsonp' % rid_value
+ page_json = self._download_json(api_url, query, query={'Search_key': query, 'pn': '1'})
+ page_data = try_get(page_json, lambda x: x['data']['page'], dict)
+ count, size = int_or_none(page_data.get('count')), int_or_none(page_data.get('size'))
+ if count is None or not size:
+ raise ExtractorError('Failed to calculate either page count or size')
+
+ num_pages = math.ceil(count / size)
+
+ return OnDemandPagedList(functools.partial(
+ self._fetch_page, api_url, num_pages, query), size)
+
+ def _real_extract(self, url):
+ u = compat_urllib_parse_urlparse(url)
+ category, subcategory = u.path.split('/')[2:4]
+ query = '%s: %s' % (category, subcategory)
+
+ return self.playlist_result(self._entries(category, subcategory, query), query, query)
+
+
+class BiliBiliSearchIE(SearchInfoExtractor):
+ IE_DESC = 'Bilibili video search, "bilisearch" keyword'
+ _MAX_RESULTS = 100000
+ _SEARCH_KEY = 'bilisearch'
+
+ def _search_results(self, query):
+ for page_num in itertools.count(1):
+ videos = self._download_json(
+ 'https://api.bilibili.com/x/web-interface/search/type', query,
+ note=f'Extracting results from page {page_num}', query={
+ 'Search_key': query,
+ 'keyword': query,
+ 'page': page_num,
+ 'context': '',
+ 'order': 'pubdate',
+ 'duration': 0,
+ 'tids_2': '',
+ '__refresh__': 'true',
+ 'search_type': 'video',
+ 'tids': 0,
+ 'highlight': 1,
+ })['data'].get('result') or []
+ for video in videos:
+ yield self.url_result(video['arcurl'], 'BiliBili', str(video['aid']))
+
+
class BilibiliAudioBaseIE(InfoExtractor):
def _call_api(self, path, sid, query=None):
if not query:
@@ -367,6 +633,7 @@ class BilibiliAudioIE(BilibiliAudioBaseIE):
formats = [{
'url': play_data['cdns'][0],
'filesize': int_or_none(play_data.get('size')),
+ 'vcodec': 'none'
}]
song = self._call_api('song/info', au_id)
@@ -449,3 +716,152 @@ class BiliBiliPlayerIE(InfoExtractor):
return self.url_result(
'http://www.bilibili.tv/video/av%s/' % video_id,
ie=BiliBiliIE.ie_key(), video_id=video_id)
+
+
+class BiliIntlBaseIE(InfoExtractor):
+ _API_URL = 'https://api.bili{}/intl/gateway{}'
+
+ def _call_api(self, type, endpoint, id):
+ return self._download_json(self._API_URL.format(type, endpoint), id)['data']
+
+ def json2srt(self, json):
+ data = '\n\n'.join(
+ f'{i + 1}\n{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n{line["content"]}'
+ for i, line in enumerate(json['body']))
+ return data
+
+ def _get_subtitles(self, type, ep_id):
+ sub_json = self._call_api(type, f'/m/subtitle?ep_id={ep_id}&platform=web', ep_id)
+ subtitles = {}
+ for sub in sub_json.get('subtitles', []):
+ sub_url = sub.get('url')
+ if not sub_url:
+ continue
+ sub_data = self._download_json(sub_url, ep_id, fatal=False)
+ if not sub_data:
+ continue
+ subtitles.setdefault(sub.get('key', 'en'), []).append({
+ 'ext': 'srt',
+ 'data': self.json2srt(sub_data)
+ })
+ return subtitles
+
+ def _get_formats(self, type, ep_id):
+ video_json = self._call_api(type, f'/web/playurl?ep_id={ep_id}&platform=web', ep_id)
+ if not video_json:
+ self.raise_login_required(method='cookies')
+ video_json = video_json['playurl']
+ formats = []
+ for vid in video_json.get('video', []):
+ video_res = vid.get('video_resource') or {}
+ video_info = vid.get('stream_info') or {}
+ if not video_res.get('url'):
+ continue
+ formats.append({
+ 'url': video_res['url'],
+ 'ext': 'mp4',
+ 'format_note': video_info.get('desc_words'),
+ 'width': video_res.get('width'),
+ 'height': video_res.get('height'),
+ 'vbr': video_res.get('bandwidth'),
+ 'acodec': 'none',
+ 'vcodec': video_res.get('codecs'),
+ 'filesize': video_res.get('size'),
+ })
+ for aud in video_json.get('audio_resource', []):
+ if not aud.get('url'):
+ continue
+ formats.append({
+ 'url': aud['url'],
+ 'ext': 'mp4',
+ 'abr': aud.get('bandwidth'),
+ 'acodec': aud.get('codecs'),
+ 'vcodec': 'none',
+ 'filesize': aud.get('size'),
+ })
+
+ self._sort_formats(formats)
+ return formats
+
+ def _extract_ep_info(self, type, episode_data, ep_id):
+ return {
+ 'id': ep_id,
+ 'title': episode_data.get('long_title') or episode_data['title'],
+ 'thumbnail': episode_data.get('cover'),
+ 'episode_number': str_to_int(episode_data.get('title')),
+ 'formats': self._get_formats(type, ep_id),
+ 'subtitles': self._get_subtitles(type, ep_id),
+ 'extractor_key': BiliIntlIE.ie_key(),
+ }
+
+
+class BiliIntlIE(BiliIntlBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?bili(?P<type>bili\.tv|intl.com)/(?:[a-z]{2}/)?play/(?P<season_id>\d+)/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.bilibili.tv/en/play/34613/341736',
+ 'info_dict': {
+ 'id': '341736',
+ 'ext': 'mp4',
+ 'title': 'The First Night',
+ 'thumbnail': 'https://i0.hdslb.com/bfs/intl/management/91e30e5521235d9b163339a26a0b030ebda54310.png',
+ 'episode_number': 2,
+ },
+ 'params': {
+ 'format': 'bv',
+ },
+ }, {
+ 'url': 'https://www.biliintl.com/en/play/34613/341736',
+ 'info_dict': {
+ 'id': '341736',
+ 'ext': 'mp4',
+ 'title': 'The First Night',
+ 'thumbnail': 'https://i0.hdslb.com/bfs/intl/management/91e30e5521235d9b163339a26a0b030ebda54310.png',
+ 'episode_number': 2,
+ },
+ 'params': {
+ 'format': 'bv',
+ },
+ }]
+
+ def _real_extract(self, url):
+ type, season_id, id = self._match_valid_url(url).groups()
+ data_json = self._call_api(type, f'/web/view/ogv_collection?season_id={season_id}', id)
+ episode_data = next(
+ episode for episode in data_json.get('episodes', [])
+ if str(episode.get('ep_id')) == id)
+ return self._extract_ep_info(type, episode_data, id)
+
+
+class BiliIntlSeriesIE(BiliIntlBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?bili(?P<type>bili\.tv|intl.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)$'
+ _TESTS = [{
+ 'url': 'https://www.bilibili.tv/en/play/34613',
+ 'playlist_mincount': 15,
+ 'info_dict': {
+ 'id': '34613',
+ },
+ 'params': {
+ 'skip_download': True,
+ 'format': 'bv',
+ },
+ }, {
+ 'url': 'https://www.biliintl.com/en/play/34613',
+ 'playlist_mincount': 15,
+ 'info_dict': {
+ 'id': '34613',
+ },
+ 'params': {
+ 'skip_download': True,
+ 'format': 'bv',
+ },
+ }]
+
+ def _entries(self, id, type):
+ data_json = self._call_api(type, f'/web/view/ogv_collection?season_id={id}', id)
+ for episode in data_json.get('episodes', []):
+ episode_id = str(episode.get('ep_id'))
+ yield self._extract_ep_info(type, episode, episode_id)
+
+ def _real_extract(self, url):
+ type, id = self._match_valid_url(url).groups()
+ return self.playlist_result(self._entries(id, type), playlist_id=id)
diff --git a/hypervideo_dl/extractor/bitchute.py b/hypervideo_dl/extractor/bitchute.py
index 0c773e6..dcae6f4 100644
--- a/hypervideo_dl/extractor/bitchute.py
+++ b/hypervideo_dl/extractor/bitchute.py
@@ -6,6 +6,8 @@ import re
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
+ GeoRestrictedError,
orderedSet,
unified_strdate,
urlencode_postdata,
@@ -15,16 +17,16 @@ from ..utils import (
class BitChuteIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
_TESTS = [{
- 'url': 'https://www.bitchute.com/video/szoMrox2JEI/',
- 'md5': '66c4a70e6bfc40dcb6be3eb1d74939eb',
+ 'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
+ 'md5': '7e427d7ed7af5a75b5855705ec750e2b',
'info_dict': {
'id': 'szoMrox2JEI',
'ext': 'mp4',
- 'title': 'Fuck bitches get money',
- 'description': 'md5:3f21f6fb5b1d17c3dee9cf6b5fe60b3a',
+ 'title': 'This is the first video on #BitChute !',
+ 'description': 'md5:a0337e7b1fe39e32336974af8173a034',
'thumbnail': r're:^https?://.*\.jpg$',
- 'uploader': 'Victoria X Rave',
- 'upload_date': '20170813',
+ 'uploader': 'BitChute',
+ 'upload_date': '20170103',
},
}, {
'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/',
@@ -34,6 +36,14 @@ class BitChuteIE(InfoExtractor):
'only_matching': True,
}]
+ @staticmethod
+ def _extract_urls(webpage):
+ return [
+ mobj.group('url')
+ for mobj in re.finditer(
+ r'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>%s)' % BitChuteIE._VALID_URL,
+ webpage)]
+
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -59,8 +69,14 @@ class BitChuteIE(InfoExtractor):
for format_url in orderedSet(format_urls)]
if not formats:
- formats = self._parse_html5_media_entries(
- url, webpage, video_id)[0]['formats']
+ entries = self._parse_html5_media_entries(
+ url, webpage, video_id)
+ if not entries:
+ error = self._html_search_regex(r'<h1 class="page-title">([^<]+)</h1>', webpage, 'error', default='Cannot find video')
+ if error == 'Video Unavailable':
+ raise GeoRestrictedError(error)
+ raise ExtractorError(error)
+ formats = entries[0]['formats']
self._check_formats(formats, video_id)
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/bitwave.py b/hypervideo_dl/extractor/bitwave.py
new file mode 100644
index 0000000..eb16c46
--- /dev/null
+++ b/hypervideo_dl/extractor/bitwave.py
@@ -0,0 +1,61 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class BitwaveReplayIE(InfoExtractor):
+ IE_NAME = 'bitwave:replay'
+ _VALID_URL = r'https?://(?:www\.)?bitwave\.tv/(?P<user>\w+)/replay/(?P<id>\w+)/?$'
+ _TEST = {
+ 'url': 'https://bitwave.tv/RhythmicCarnage/replay/z4P6eq5L7WDrM85UCrVr',
+ 'only_matching': True
+ }
+
+ def _real_extract(self, url):
+ replay_id = self._match_id(url)
+ replay = self._download_json(
+ 'https://api.bitwave.tv/v1/replays/' + replay_id,
+ replay_id
+ )
+
+ return {
+ 'id': replay_id,
+ 'title': replay['data']['title'],
+ 'uploader': replay['data']['name'],
+ 'uploader_id': replay['data']['name'],
+ 'url': replay['data']['url'],
+ 'thumbnails': [
+ {'url': x} for x in replay['data']['thumbnails']
+ ],
+ }
+
+
+class BitwaveStreamIE(InfoExtractor):
+ IE_NAME = 'bitwave:stream'
+ _VALID_URL = r'https?://(?:www\.)?bitwave\.tv/(?P<id>\w+)/?$'
+ _TEST = {
+ 'url': 'https://bitwave.tv/doomtube',
+ 'only_matching': True
+ }
+
+ def _real_extract(self, url):
+ username = self._match_id(url)
+ channel = self._download_json(
+ 'https://api.bitwave.tv/v1/channels/' + username,
+ username)
+
+ formats = self._extract_m3u8_formats(
+ channel['data']['url'], username,
+ 'mp4')
+ self._sort_formats(formats)
+
+ return {
+ 'id': username,
+ 'title': self._live_title(channel['data']['title']),
+ 'uploader': username,
+ 'uploader_id': username,
+ 'formats': formats,
+ 'thumbnail': channel['data']['thumbnail'],
+ 'is_live': True,
+ 'view_count': channel['data']['viewCount']
+ }
diff --git a/hypervideo_dl/extractor/blackboardcollaborate.py b/hypervideo_dl/extractor/blackboardcollaborate.py
new file mode 100644
index 0000000..8ae2941
--- /dev/null
+++ b/hypervideo_dl/extractor/blackboardcollaborate.py
@@ -0,0 +1,67 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+
+from .common import InfoExtractor
+from ..utils import parse_iso8601
+
+
class BlackboardCollaborateIE(InfoExtractor):
    """Extract a Blackboard Collaborate session recording.

    Recordings live on per-region hosts (us, eu, ca, au, us-lti, ...); the
    region captured from the URL selects which API host to query.
    """
    _VALID_URL = r'''(?x)
                        https?://
                        (?P<region>[a-z-]+)\.bbcollab\.com/
                        (?:
                            collab/ui/session/playback/load|
                            recording
                        )/
                        (?P<id>[^/]+)'''
    _TESTS = [
        {
            'url': 'https://us-lti.bbcollab.com/collab/ui/session/playback/load/0a633b6a88824deb8c918f470b22b256',
            'md5': 'bb7a055682ee4f25fdb5838cdf014541',
            'info_dict': {
                'id': '0a633b6a88824deb8c918f470b22b256',
                'title': 'HESI A2 Information Session - Thursday, May 6, 2021 - recording_1',
                'ext': 'mp4',
                'duration': 1896000,
                'timestamp': 1620331399,
                'upload_date': '20210506',
            },
        },
        {
            'url': 'https://us.bbcollab.com/collab/ui/session/playback/load/76761522adfe4345a0dee6794bbcabda',
            'only_matching': True,
        },
        {
            'url': 'https://ca.bbcollab.com/collab/ui/session/playback/load/b6399dcb44df4f21b29ebe581e22479d',
            'only_matching': True,
        },
        {
            'url': 'https://eu.bbcollab.com/recording/51ed7b50810c4444a106e48cefb3e6b5',
            'only_matching': True,
        },
        {
            'url': 'https://au.bbcollab.com/collab/ui/session/playback/load/2bccf7165d7c419ab87afc1ec3f3bb15',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
        region, video_id = mobj.group('region'), mobj.group('id')
        info = self._download_json(
            'https://{}.bbcollab.com/collab/api/csa/recordings/{}/data'.format(region, video_id),
            video_id)
        # Each entry in `streams` maps a quality/track key to a direct URL.
        # (Renamed loop variables so the method's `url` parameter is not shadowed.)
        formats = [
            {'format_id': stream_id, 'url': stream_url}
            for stream_id, stream_url in info['streams'].items()
        ]

        return {
            'duration': info.get('duration'),
            'formats': formats,
            'id': video_id,
            'timestamp': parse_iso8601(info.get('created')),
            'title': info['name'],
        }
diff --git a/hypervideo_dl/extractor/blinkx.py b/hypervideo_dl/extractor/blinkx.py
new file mode 100644
index 0000000..d70a3b3
--- /dev/null
+++ b/hypervideo_dl/extractor/blinkx.py
@@ -0,0 +1,86 @@
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ remove_start,
+ int_or_none,
+)
+
+
class BlinkxIE(InfoExtractor):
    """Extract videos from blinkx.com (also handles `blinkx:` pseudo-URLs)."""
    _VALID_URL = r'(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)'
    IE_NAME = 'blinkx'

    _TEST = {
        'url': 'http://www.blinkx.com/ce/Da0Gw3xc5ucpNduzLuDDlv4WC9PuI4fDi1-t6Y3LyfdY2SZS5Urbvn-UPJvrvbo8LTKTc67Wu2rPKSQDJyZeeORCR8bYkhs8lI7eqddznH2ofh5WEEdjYXnoRtj7ByQwt7atMErmXIeYKPsSDuMAAqJDlQZ-3Ff4HJVeH_s3Gh8oQ',
        'md5': '337cf7a344663ec79bf93a526a2e06c7',
        'info_dict': {
            'id': 'Da0Gw3xc',
            'ext': 'mp4',
            'title': 'No Daily Show for John Oliver; HBO Show Renewed - IGN News',
            'uploader': 'IGN News',
            'upload_date': '20150217',
            'timestamp': 1424215740,
            'description': 'HBO has renewed Last Week Tonight With John Oliver for two more seasons.',
            'duration': 47.743333,
        },
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        # Only the first 8 characters of the long token are the display id.
        display_id = video_id[:8]

        api_url = ('https://apib4.blinkx.com/api.php?action=play_video&'
                   + 'video=%s' % video_id)
        data = json.loads(
            self._download_webpage(api_url, display_id))['api']['results'][0]

        duration = None
        thumbnails = []
        formats = []
        for media in data['media']:
            media_type = media['type']
            if media_type == 'jpg':
                thumbnails.append({
                    'url': media['link'],
                    'width': int(media['w']),
                    'height': int(media['h']),
                })
            elif media_type == 'original':
                # The 'original' entry carries the duration, not a format.
                duration = float(media['d'])
            elif media_type == 'youtube':
                # Delegate YouTube-hosted media to the YouTube extractor.
                yt_id = media['link']
                self.to_screen('Youtube video detected: %s' % yt_id)
                return self.url_result(yt_id, 'Youtube', video_id=yt_id)
            elif media_type in ('flv', 'mp4'):
                # Codec names come prefixed with 'ff' (ffmpeg naming).
                vcodec = remove_start(media['vcodec'], 'ff')
                acodec = remove_start(media['acodec'], 'ff')
                # Bitrates arrive in bit/s under either of two key names.
                vbr = int_or_none(media.get('vbr') or media.get('vbitrate'), 1000)
                abr = int_or_none(media.get('abr') or media.get('abitrate'), 1000)
                tbr = vbr + abr if vbr and abr else None
                format_id = '%s-%sk-%s' % (vcodec, tbr, media['w'])
                formats.append({
                    'format_id': format_id,
                    'url': media['link'],
                    'vcodec': vcodec,
                    'acodec': acodec,
                    'abr': abr,
                    'vbr': vbr,
                    'tbr': tbr,
                    'width': int_or_none(media.get('w')),
                    'height': int_or_none(media.get('h')),
                })

        self._sort_formats(formats)

        return {
            'id': display_id,
            'fullid': video_id,
            'title': data['title'],
            'formats': formats,
            'uploader': data.get('channel_name'),
            'timestamp': data.get('pubdate_epoch'),
            'description': data.get('description'),
            'thumbnails': thumbnails,
            'duration': duration,
        }
diff --git a/hypervideo_dl/extractor/bokecc.py b/hypervideo_dl/extractor/bokecc.py
index 6017e83..6a89d36 100644
--- a/hypervideo_dl/extractor/bokecc.py
+++ b/hypervideo_dl/extractor/bokecc.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import compat_parse_qs
@@ -23,7 +22,7 @@ class BokeCCBaseIE(InfoExtractor):
formats = [{
'format_id': format_id,
'url': quality.find('./copy').attrib['playurl'],
- 'preference': int(quality.attrib['value']),
+ 'quality': int(quality.attrib['value']),
} for quality in info_xml.findall('./video/quality')]
self._sort_formats(formats)
@@ -45,7 +44,7 @@ class BokeCCIE(BokeCCBaseIE):
}]
def _real_extract(self, url):
- qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query'))
+ qs = compat_parse_qs(self._match_valid_url(url).group('query'))
if not qs.get('vid') or not qs.get('uid'):
raise ExtractorError('Invalid URL', expected=True)
diff --git a/hypervideo_dl/extractor/bongacams.py b/hypervideo_dl/extractor/bongacams.py
index 180542f..9e75511 100644
--- a/hypervideo_dl/extractor/bongacams.py
+++ b/hypervideo_dl/extractor/bongacams.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import compat_str
@@ -22,7 +21,7 @@ class BongaCamsIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
host = mobj.group('host')
channel_id = mobj.group('id')
diff --git a/hypervideo_dl/extractor/box.py b/hypervideo_dl/extractor/box.py
index aae82d1..8214086 100644
--- a/hypervideo_dl/extractor/box.py
+++ b/hypervideo_dl/extractor/box.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..utils import (
@@ -30,7 +29,7 @@ class BoxIE(InfoExtractor):
}
def _real_extract(self, url):
- shared_name, file_id = re.match(self._VALID_URL, url).groups()
+ shared_name, file_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, file_id)
request_token = self._parse_json(self._search_regex(
r'Box\.config\s*=\s*({.+?});', webpage,
diff --git a/hypervideo_dl/extractor/bpb.py b/hypervideo_dl/extractor/bpb.py
index 0783353..8f6ef3c 100644
--- a/hypervideo_dl/extractor/bpb.py
+++ b/hypervideo_dl/extractor/bpb.py
@@ -47,7 +47,7 @@ class BpbIE(InfoExtractor):
quality = 'high' if '_high' in video_url else 'low'
formats.append({
'url': video_url,
- 'preference': 10 if quality == 'high' else 0,
+ 'quality': 10 if quality == 'high' else 0,
'format_note': quality,
'format_id': '%s-%s' % (quality, determine_ext(video_url)),
})
diff --git a/hypervideo_dl/extractor/br.py b/hypervideo_dl/extractor/br.py
index 9bde7f2..7169ece 100644
--- a/hypervideo_dl/extractor/br.py
+++ b/hypervideo_dl/extractor/br.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..utils import (
@@ -86,7 +85,7 @@ class BRIE(InfoExtractor):
]
def _real_extract(self, url):
- base_url, display_id = re.search(self._VALID_URL, url).groups()
+ base_url, display_id = self._match_valid_url(url).groups()
page = self._download_webpage(url, display_id)
xml_url = self._search_regex(
r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL')
@@ -114,7 +113,7 @@ class BRIE(InfoExtractor):
medias.append(media)
if len(medias) > 1:
- self._downloader.report_warning(
+ self.report_warning(
'found multiple medias; please '
'report this with the video URL to http://yt-dl.org/bug')
if not medias:
diff --git a/hypervideo_dl/extractor/bravotv.py b/hypervideo_dl/extractor/bravotv.py
index bae2aed..139d51c 100644
--- a/hypervideo_dl/extractor/bravotv.py
+++ b/hypervideo_dl/extractor/bravotv.py
@@ -8,6 +8,9 @@ from ..utils import (
smuggle_url,
update_url_query,
int_or_none,
+ float_or_none,
+ try_get,
+ dict_get,
)
@@ -24,6 +27,11 @@ class BravoTVIE(AdobePassIE):
'uploader': 'NBCU-BRAV',
'upload_date': '20190314',
'timestamp': 1552591860,
+ 'season_number': 16,
+ 'episode_number': 15,
+ 'series': 'Top Chef',
+ 'episode': 'The Top Chef Season 16 Winner Is...',
+ 'duration': 190.0,
}
}, {
'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1',
@@ -34,7 +42,7 @@ class BravoTVIE(AdobePassIE):
}]
def _real_extract(self, url):
- site, display_id = re.match(self._VALID_URL, url).groups()
+ site, display_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, display_id)
settings = self._parse_json(self._search_regex(
r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'),
@@ -79,12 +87,34 @@ class BravoTVIE(AdobePassIE):
'episode_number': int_or_none(metadata.get('episode_num')),
})
query['switch'] = 'progressive'
+
+ tp_url = 'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path)
+
+ tp_metadata = self._download_json(
+ update_url_query(tp_url, {'format': 'preview'}),
+ display_id, fatal=False)
+ if tp_metadata:
+ info.update({
+ 'title': tp_metadata.get('title'),
+ 'description': tp_metadata.get('description'),
+ 'duration': float_or_none(tp_metadata.get('duration'), 1000),
+ 'season_number': int_or_none(
+ dict_get(tp_metadata, ('pl1$seasonNumber', 'nbcu$seasonNumber'))),
+ 'episode_number': int_or_none(
+ dict_get(tp_metadata, ('pl1$episodeNumber', 'nbcu$episodeNumber'))),
+ # For some reason the series is sometimes wrapped into a single element array.
+ 'series': try_get(
+ dict_get(tp_metadata, ('pl1$show', 'nbcu$show')),
+ lambda x: x[0] if isinstance(x, list) else x,
+ expected_type=str),
+ 'episode': dict_get(
+ tp_metadata, ('pl1$episodeName', 'nbcu$episodeName', 'title')),
+ })
+
info.update({
'_type': 'url_transparent',
'id': release_pid,
- 'url': smuggle_url(update_url_query(
- 'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path),
- query), {'force_smil_url': True}),
+ 'url': smuggle_url(update_url_query(tp_url, query), {'force_smil_url': True}),
'ie_key': 'ThePlatform',
})
return info
diff --git a/hypervideo_dl/extractor/breakcom.py b/hypervideo_dl/extractor/breakcom.py
index 68c7cf2..f38789f 100644
--- a/hypervideo_dl/extractor/breakcom.py
+++ b/hypervideo_dl/extractor/breakcom.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from .youtube import YoutubeIE
@@ -41,7 +40,7 @@ class BreakIE(InfoExtractor):
}]
def _real_extract(self, url):
- display_id, video_id = re.match(self._VALID_URL, url).groups()
+ display_id, video_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, display_id)
diff --git a/hypervideo_dl/extractor/brightcove.py b/hypervideo_dl/extractor/brightcove.py
index 6022076..cd1c3f0 100644
--- a/hypervideo_dl/extractor/brightcove.py
+++ b/hypervideo_dl/extractor/brightcove.py
@@ -11,7 +11,6 @@ from ..compat import (
compat_etree_fromstring,
compat_HTTPError,
compat_parse_qs,
- compat_urllib_parse_urlparse,
compat_urlparse,
compat_xml_parse_error,
)
@@ -26,6 +25,7 @@ from ..utils import (
js_to_json,
mimetype2ext,
parse_iso8601,
+ parse_qs,
smuggle_url,
str_or_none,
try_get,
@@ -177,7 +177,7 @@ class BrightcoveLegacyIE(InfoExtractor):
flashvars = {}
data_url = object_doc.attrib.get('data', '')
- data_url_params = compat_parse_qs(compat_urllib_parse_urlparse(data_url).query)
+ data_url_params = parse_qs(data_url)
def find_param(name):
if name in flashvars:
@@ -290,7 +290,7 @@ class BrightcoveLegacyIE(InfoExtractor):
url = re.sub(r'(?<=[?&])(videoI(d|D)|idVideo|bctid)', '%40videoPlayer', url)
# Change bckey (used by bcove.me urls) to playerKey
url = re.sub(r'(?<=[?&])bckey', 'playerKey', url)
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
query_str = mobj.group('query')
query = compat_urlparse.parse_qs(query_str)
@@ -472,27 +472,32 @@ class BrightcoveNewIE(AdobePassIE):
title = json_data['name'].strip()
num_drm_sources = 0
- formats = []
+ formats, subtitles = [], {}
sources = json_data.get('sources') or []
for source in sources:
container = source.get('container')
ext = mimetype2ext(source.get('type'))
src = source.get('src')
+ skip_unplayable = not self.get_param('allow_unplayable_formats')
# https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
- if container == 'WVM' or source.get('key_systems'):
+ if skip_unplayable and (container == 'WVM' or source.get('key_systems')):
num_drm_sources += 1
continue
- elif ext == 'ism':
+ elif ext == 'ism' and skip_unplayable:
continue
elif ext == 'm3u8' or container == 'M2TS':
if not src:
continue
- formats.extend(self._extract_m3u8_formats(
- src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+ f, subs = self._extract_m3u8_formats_and_subtitles(
+ src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+ formats.extend(f)
+ subtitles = self._merge_subtitles(subtitles, subs)
elif ext == 'mpd':
if not src:
continue
- formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False))
+ f, subs = self._extract_mpd_formats_and_subtitles(src, video_id, 'dash', fatal=False)
+ formats.extend(f)
+ subtitles = self._merge_subtitles(subtitles, subs)
else:
streaming_src = source.get('streaming_src')
stream_name, app_name = source.get('stream_name'), source.get('app_name')
@@ -544,17 +549,17 @@ class BrightcoveNewIE(AdobePassIE):
errors = json_data.get('errors')
if errors:
error = errors[0]
- raise ExtractorError(
+ self.raise_no_formats(
error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
- if sources and num_drm_sources == len(sources):
- raise ExtractorError('This video is DRM protected.', expected=True)
+ elif (not self.get_param('allow_unplayable_formats')
+ and sources and num_drm_sources == len(sources)):
+ self.report_drm(video_id)
self._sort_formats(formats)
for f in formats:
f.setdefault('http_headers', {}).update(headers)
- subtitles = {}
for text_track in json_data.get('text_tracks', []):
if text_track.get('kind') != 'captions':
continue
@@ -593,7 +598,7 @@ class BrightcoveNewIE(AdobePassIE):
'ip_blocks': smuggled_data.get('geo_ip_blocks'),
})
- account_id, player_id, embed, content_type, video_id = re.match(self._VALID_URL, url).groups()
+ account_id, player_id, embed, content_type, video_id = self._match_valid_url(url).groups()
policy_key_id = '%s_%s' % (account_id, player_id)
policy_key = self._downloader.cache.load('brightcove', policy_key_id)
diff --git a/hypervideo_dl/extractor/byutv.py b/hypervideo_dl/extractor/byutv.py
index 0b11bf1..f4d5086 100644
--- a/hypervideo_dl/extractor/byutv.py
+++ b/hypervideo_dl/extractor/byutv.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -52,7 +51,7 @@ class BYUtvIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
@@ -82,6 +81,7 @@ class BYUtvIE(InfoExtractor):
info = {}
formats = []
+ subtitles = {}
for format_id, ep in video.items():
if not isinstance(ep, dict):
continue
@@ -90,12 +90,16 @@ class BYUtvIE(InfoExtractor):
continue
ext = determine_ext(video_url)
if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
+ m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
video_url, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls', fatal=False))
+ m3u8_id='hls', fatal=False)
+ formats.extend(m3u8_fmts)
+ subtitles = self._merge_subtitles(subtitles, m3u8_subs)
elif ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- video_url, video_id, mpd_id='dash', fatal=False))
+ mpd_fmts, mpd_subs = self._extract_mpd_formats_and_subtitles(
+ video_url, video_id, mpd_id='dash', fatal=False)
+ formats.extend(mpd_fmts)
+ subtitles = self._merge_subtitles(subtitles, mpd_subs)
else:
formats.append({
'url': video_url,
@@ -114,4 +118,5 @@ class BYUtvIE(InfoExtractor):
'display_id': display_id,
'title': display_id,
'formats': formats,
+ 'subtitles': subtitles,
})
diff --git a/hypervideo_dl/extractor/c56.py b/hypervideo_dl/extractor/c56.py
index cac8fdc..a853c53 100644
--- a/hypervideo_dl/extractor/c56.py
+++ b/hypervideo_dl/extractor/c56.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import js_to_json
@@ -31,7 +30,7 @@ class C56IE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
+ mobj = self._match_valid_url(url)
text_id = mobj.group('textid')
webpage = self._download_webpage(url, text_id)
diff --git a/hypervideo_dl/extractor/cam4.py b/hypervideo_dl/extractor/cam4.py
new file mode 100644
index 0000000..30daf2b
--- /dev/null
+++ b/hypervideo_dl/extractor/cam4.py
@@ -0,0 +1,32 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
class CAM4IE(InfoExtractor):
    """Extract the live HLS stream of a CAM4 channel (adult content)."""
    _VALID_URL = r'https?://(?:[^/]+\.)?cam4\.com/(?P<id>[a-z0-9_]+)'
    _TEST = {
        'url': 'https://www.cam4.com/foxynesss',
        'info_dict': {
            'id': 'foxynesss',
            'ext': 'mp4',
            'title': 're:^foxynesss [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            'age_limit': 18,
        }
    }

    def _real_extract(self, url):
        channel_id = self._match_id(url)
        stream_info = self._download_json(
            'https://www.cam4.com/rest/v1.0/profile/{}/streamInfo'.format(channel_id),
            channel_id)
        # `cdnURL` points at the live m3u8 master playlist.
        m3u8_playlist = stream_info.get('cdnURL')

        formats = self._extract_m3u8_formats(
            m3u8_playlist, channel_id, 'mp4', m3u8_id='hls', live=True)
        self._sort_formats(formats)

        return {
            'id': channel_id,
            'title': self._live_title(channel_id),
            'is_live': True,
            # Site-wide adult content.
            'age_limit': 18,
            'formats': formats,
        }
diff --git a/hypervideo_dl/extractor/cammodels.py b/hypervideo_dl/extractor/cammodels.py
index 1eb81b7..eb2a8b4 100644
--- a/hypervideo_dl/extractor/cammodels.py
+++ b/hypervideo_dl/extractor/cammodels.py
@@ -82,7 +82,7 @@ class CamModelsIE(InfoExtractor):
f.update({
'ext': 'mp4',
# hls skips fragments, preferring rtmp
- 'preference': -1,
+ 'quality': -10,
})
else:
continue
diff --git a/hypervideo_dl/extractor/canalplus.py b/hypervideo_dl/extractor/canalplus.py
index 51c11cb..211ea26 100644
--- a/hypervideo_dl/extractor/canalplus.py
+++ b/hypervideo_dl/extractor/canalplus.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -50,7 +49,7 @@ class CanalplusIE(InfoExtractor):
}]
def _real_extract(self, url):
- site, display_id, video_id = re.match(self._VALID_URL, url).groups()
+ site, display_id, video_id = self._match_valid_url(url).groups()
site_id = self._SITE_ID_MAP[site]
@@ -89,7 +88,7 @@ class CanalplusIE(InfoExtractor):
# the secret extracted from ya function in http://player.canalplus.fr/common/js/canalPlayer.js
'url': format_url + '?secret=pqzerjlsmdkjfoiuerhsdlfknaes',
'format_id': format_id,
- 'preference': preference(format_id),
+ 'quality': preference(format_id),
})
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/canvas.py b/hypervideo_dl/extractor/canvas.py
index eefbab2..49e7e4e 100644
--- a/hypervideo_dl/extractor/canvas.py
+++ b/hypervideo_dl/extractor/canvas.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import re
-import json
from .common import InfoExtractor
from .gigya import GigyaBaseIE
@@ -17,6 +15,7 @@ from ..utils import (
str_or_none,
strip_or_none,
url_or_none,
+ urlencode_postdata
)
@@ -24,7 +23,7 @@ class CanvasIE(InfoExtractor):
_VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
- 'md5': '68993eda72ef62386a15ea2cf3c93107',
+ 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9',
'info_dict': {
'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
@@ -32,9 +31,9 @@ class CanvasIE(InfoExtractor):
'title': 'Nachtwacht: De Greystook',
'description': 'Nachtwacht: De Greystook',
'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 1468.04,
+ 'duration': 1468.02,
},
- 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'],
+ 'expected_warnings': ['is not a supported codec'],
}, {
'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
'only_matching': True,
@@ -47,7 +46,7 @@ class CanvasIE(InfoExtractor):
_REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1'
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
site_id, video_id = mobj.group('site_id'), mobj.group('id')
data = None
@@ -83,24 +82,31 @@ class CanvasIE(InfoExtractor):
description = data.get('description')
formats = []
+ subtitles = {}
for target in data['targetUrls']:
format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type'))
if not format_url or not format_type:
continue
format_type = format_type.upper()
if format_type in self._HLS_ENTRY_PROTOCOLS_MAP:
- formats.extend(self._extract_m3u8_formats(
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type],
- m3u8_id=format_type, fatal=False))
+ m3u8_id=format_type, fatal=False)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
elif format_type == 'HDS':
formats.extend(self._extract_f4m_formats(
format_url, video_id, f4m_id=format_type, fatal=False))
elif format_type == 'MPEG_DASH':
- formats.extend(self._extract_mpd_formats(
- format_url, video_id, mpd_id=format_type, fatal=False))
+ fmts, subs = self._extract_mpd_formats_and_subtitles(
+ format_url, video_id, mpd_id=format_type, fatal=False)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
elif format_type == 'HSS':
- formats.extend(self._extract_ism_formats(
- format_url, video_id, ism_id='mss', fatal=False))
+ fmts, subs = self._extract_ism_formats_and_subtitles(
+ format_url, video_id, ism_id='mss', fatal=False)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
else:
formats.append({
'format_id': format_type,
@@ -108,7 +114,6 @@ class CanvasIE(InfoExtractor):
})
self._sort_formats(formats)
- subtitles = {}
subtitle_urls = data.get('subtitleUrls')
if isinstance(subtitle_urls, list):
for subtitle in subtitle_urls:
@@ -186,7 +191,7 @@ class CanvasEenIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
site_id, display_id = mobj.group('site_id'), mobj.group('id')
webpage = self._download_webpage(url, display_id)
@@ -259,7 +264,7 @@ class VrtNUIE(GigyaBaseIE):
'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'],
}]
_NETRC_MACHINE = 'vrtnu'
- _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy'
+ _APIKEY = '3_qhEcPa5JGFROVwu5SWKqJ4mVOIkwlFNMSKwzPDAh8QZOtHqu6L4nD5Q7lk0eXOOG'
_CONTEXT_ID = 'R3595707040'
def _real_initialize(self):
@@ -270,35 +275,41 @@ class VrtNUIE(GigyaBaseIE):
if username is None:
return
- auth_data = {
- 'APIKey': self._APIKEY,
- 'targetEnv': 'jssdk',
- 'loginID': username,
- 'password': password,
- 'authMode': 'cookie',
- }
+ auth_info = self._download_json(
+ 'https://accounts.vrt.be/accounts.login', None,
+ note='Login data', errnote='Could not get Login data',
+ headers={}, data=urlencode_postdata({
+ 'loginID': username,
+ 'password': password,
+ 'sessionExpiration': '-2',
+ 'APIKey': self._APIKEY,
+ 'targetEnv': 'jssdk',
+ }))
- auth_info = self._gigya_login(auth_data)
+ if auth_info.get('errorDetails'):
+ raise ExtractorError('Unable to login: VrtNU said: ' + auth_info.get('errorDetails'), expected=True)
# Sometimes authentication fails for no good reason, retry
login_attempt = 1
while login_attempt <= 3:
try:
- # When requesting a token, no actual token is returned, but the
- # necessary cookies are set.
+ self._request_webpage('https://token.vrt.be/vrtnuinitlogin',
+ None, note='Requesting XSRF Token', errnote='Could not get XSRF Token',
+ query={'provider': 'site', 'destination': 'https://www.vrt.be/vrtnu/'})
+
+ post_data = {
+ 'UID': auth_info['UID'],
+ 'UIDSignature': auth_info['UIDSignature'],
+ 'signatureTimestamp': auth_info['signatureTimestamp'],
+ 'client_id': 'vrtnu-site',
+ '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value,
+ }
+
self._request_webpage(
- 'https://token.vrt.be',
+ 'https://login.vrt.be/perform_login',
None, note='Requesting a token', errnote='Could not get a token',
- headers={
- 'Content-Type': 'application/json',
- 'Referer': 'https://www.vrt.be/vrtnu/',
- },
- data=json.dumps({
- 'uid': auth_info['UID'],
- 'uidsig': auth_info['UIDSignature'],
- 'ts': auth_info['signatureTimestamp'],
- 'email': auth_info['profile']['email'],
- }).encode('utf-8'))
+ headers={}, data=urlencode_postdata(post_data))
+
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
login_attempt += 1
diff --git a/hypervideo_dl/extractor/cbc.py b/hypervideo_dl/extractor/cbc.py
index fd5ec60..2429521 100644
--- a/hypervideo_dl/extractor/cbc.py
+++ b/hypervideo_dl/extractor/cbc.py
@@ -1,30 +1,18 @@
# coding: utf-8
from __future__ import unicode_literals
-import hashlib
-import json
import re
-from xml.sax.saxutils import escape
from .common import InfoExtractor
from ..compat import (
compat_str,
- compat_HTTPError,
)
from ..utils import (
js_to_json,
smuggle_url,
try_get,
- xpath_text,
- xpath_element,
- xpath_with_ns,
- find_xpath_attr,
orderedSet,
- parse_duration,
- parse_iso8601,
- parse_age_limit,
strip_or_none,
- int_or_none,
ExtractorError,
)
@@ -59,6 +47,7 @@ class CBCIE(InfoExtractor):
'uploader': 'CBCC-NEW',
'timestamp': 1382717907,
},
+ 'skip': 'No longer available',
}, {
# with clipId, feed only available via tpfeed.cbc.ca
'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live',
@@ -209,289 +198,228 @@ class CBCPlayerIE(InfoExtractor):
}
-class CBCWatchBaseIE(InfoExtractor):
- _device_id = None
- _device_token = None
- _API_BASE_URL = 'https://api-cbc.cloud.clearleap.com/cloffice/client/'
- _NS_MAP = {
- 'media': 'http://search.yahoo.com/mrss/',
- 'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/',
- }
- _GEO_COUNTRIES = ['CA']
- _LOGIN_URL = 'https://api.loginradius.com/identity/v2/auth/login'
- _TOKEN_URL = 'https://cloud-api.loginradius.com/sso/jwt/api/token'
- _API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37'
- _NETRC_MACHINE = 'cbcwatch'
-
- def _signature(self, email, password):
- data = json.dumps({
- 'email': email,
- 'password': password,
- }).encode()
- headers = {'content-type': 'application/json'}
- query = {'apikey': self._API_KEY}
- resp = self._download_json(self._LOGIN_URL, None, data=data, headers=headers, query=query)
- access_token = resp['access_token']
-
- # token
- query = {
- 'access_token': access_token,
- 'apikey': self._API_KEY,
- 'jwtapp': 'jwt',
- }
- resp = self._download_json(self._TOKEN_URL, None, headers=headers, query=query)
- return resp['signature']
-
- def _call_api(self, path, video_id):
- url = path if path.startswith('http') else self._API_BASE_URL + path
- for _ in range(2):
- try:
- result = self._download_xml(url, video_id, headers={
- 'X-Clearleap-DeviceId': self._device_id,
- 'X-Clearleap-DeviceToken': self._device_token,
- })
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
- # Device token has expired, re-acquiring device token
- self._register_device()
- continue
- raise
- error_message = xpath_text(result, 'userMessage') or xpath_text(result, 'systemMessage')
- if error_message:
- raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message))
- return result
-
- def _real_initialize(self):
- if self._valid_device_token():
- return
- device = self._downloader.cache.load(
- 'cbcwatch', self._cache_device_key()) or {}
- self._device_id, self._device_token = device.get('id'), device.get('token')
- if self._valid_device_token():
- return
- self._register_device()
-
- def _valid_device_token(self):
- return self._device_id and self._device_token
-
- def _cache_device_key(self):
- email, _ = self._get_login_info()
- return '%s_device' % hashlib.sha256(email.encode()).hexdigest() if email else 'device'
-
- def _register_device(self):
- result = self._download_xml(
- self._API_BASE_URL + 'device/register',
- None, 'Acquiring device token',
- data=b'<device><type>web</type></device>')
- self._device_id = xpath_text(result, 'deviceId', fatal=True)
- email, password = self._get_login_info()
- if email and password:
- signature = self._signature(email, password)
- data = '<login><token>{0}</token><device><deviceId>{1}</deviceId><type>web</type></device></login>'.format(
- escape(signature), escape(self._device_id)).encode()
- url = self._API_BASE_URL + 'device/login'
- result = self._download_xml(
- url, None, data=data,
- headers={'content-type': 'application/xml'})
- self._device_token = xpath_text(result, 'token', fatal=True)
- else:
- self._device_token = xpath_text(result, 'deviceToken', fatal=True)
- self._downloader.cache.store(
- 'cbcwatch', self._cache_device_key(), {
- 'id': self._device_id,
- 'token': self._device_token,
- })
-
- def _parse_rss_feed(self, rss):
- channel = xpath_element(rss, 'channel', fatal=True)
-
- def _add_ns(path):
- return xpath_with_ns(path, self._NS_MAP)
-
- entries = []
- for item in channel.findall('item'):
- guid = xpath_text(item, 'guid', fatal=True)
- title = xpath_text(item, 'title', fatal=True)
-
- media_group = xpath_element(item, _add_ns('media:group'), fatal=True)
- content = xpath_element(media_group, _add_ns('media:content'), fatal=True)
- content_url = content.attrib['url']
-
- thumbnails = []
- for thumbnail in media_group.findall(_add_ns('media:thumbnail')):
- thumbnail_url = thumbnail.get('url')
- if not thumbnail_url:
- continue
- thumbnails.append({
- 'id': thumbnail.get('profile'),
- 'url': thumbnail_url,
- 'width': int_or_none(thumbnail.get('width')),
- 'height': int_or_none(thumbnail.get('height')),
- })
-
- timestamp = None
- release_date = find_xpath_attr(
- item, _add_ns('media:credit'), 'role', 'releaseDate')
- if release_date is not None:
- timestamp = parse_iso8601(release_date.text)
-
- entries.append({
- '_type': 'url_transparent',
- 'url': content_url,
- 'id': guid,
- 'title': title,
- 'description': xpath_text(item, 'description'),
- 'timestamp': timestamp,
- 'duration': int_or_none(content.get('duration')),
- 'age_limit': parse_age_limit(xpath_text(item, _add_ns('media:rating'))),
- 'episode': xpath_text(item, _add_ns('clearleap:episode')),
- 'episode_number': int_or_none(xpath_text(item, _add_ns('clearleap:episodeInSeason'))),
- 'series': xpath_text(item, _add_ns('clearleap:series')),
- 'season_number': int_or_none(xpath_text(item, _add_ns('clearleap:season'))),
- 'thumbnails': thumbnails,
- 'ie_key': 'CBCWatchVideo',
- })
-
- return self.playlist_result(
- entries, xpath_text(channel, 'guid'),
- xpath_text(channel, 'title'),
- xpath_text(channel, 'description'))
-
-
-class CBCWatchVideoIE(CBCWatchBaseIE):
- IE_NAME = 'cbc.ca:watch:video'
- _VALID_URL = r'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
- _TEST = {
- # geo-restricted to Canada, bypassable
- 'url': 'https://api-cbc.cloud.clearleap.com/cloffice/client/web/play/?contentId=3c84472a-1eea-4dee-9267-2655d5055dcf&categoryId=ebc258f5-ee40-4cca-b66b-ba6bd55b7235',
- 'only_matching': True,
- }
+class CBCGemIE(InfoExtractor):
+ IE_NAME = 'gem.cbc.ca'
+ _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P<id>[0-9a-z-]+/s[0-9]+[a-z][0-9]+)'
+ _TESTS = [{
+ # This is a normal, public, TV show video
+ 'url': 'https://gem.cbc.ca/media/schitts-creek/s06e01',
+ 'md5': '93dbb31c74a8e45b378cf13bd3f6f11e',
+ 'info_dict': {
+ 'id': 'schitts-creek/s06e01',
+ 'ext': 'mp4',
+ 'title': 'Smoke Signals',
+ 'description': 'md5:929868d20021c924020641769eb3e7f1',
+ 'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/episode/perso/cbc_schitts_creek_season_06e01_thumbnail_v01.jpg?im=Resize=(Size)',
+ 'duration': 1314,
+ 'categories': ['comedy'],
+ 'series': 'Schitt\'s Creek',
+ 'season': 'Season 6',
+ 'season_number': 6,
+ 'episode': 'Smoke Signals',
+ 'episode_number': 1,
+ 'episode_id': 'schitts-creek/s06e01',
+ },
+ 'params': {'format': 'bv'},
+ 'skip': 'Geo-restricted to Canada',
+ }, {
+ # This video requires an account in the browser, but works fine in hypervideo
+ 'url': 'https://gem.cbc.ca/media/schitts-creek/s01e01',
+ 'md5': '297a9600f554f2258aed01514226a697',
+ 'info_dict': {
+ 'id': 'schitts-creek/s01e01',
+ 'ext': 'mp4',
+ 'title': 'The Cup Runneth Over',
+ 'description': 'md5:9bca14ea49ab808097530eb05a29e797',
+ 'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/episode/perso/cbc_schitts_creek_season_01e01_thumbnail_v01.jpg?im=Resize=(Size)',
+ 'series': 'Schitt\'s Creek',
+ 'season_number': 1,
+ 'season': 'Season 1',
+ 'episode_number': 1,
+ 'episode': 'The Cup Runneth Over',
+ 'episode_id': 'schitts-creek/s01e01',
+ 'duration': 1309,
+ 'categories': ['comedy'],
+ },
+ 'params': {'format': 'bv'},
+ 'skip': 'Geo-restricted to Canada',
+ }]
+ _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/assets/'
def _real_extract(self, url):
video_id = self._match_id(url)
- result = self._call_api(url, video_id)
-
- m3u8_url = xpath_text(result, 'url', fatal=True)
- formats = self._extract_m3u8_formats(re.sub(r'/([^/]+)/[^/?]+\.m3u8', r'/\1/\1.m3u8', m3u8_url), video_id, 'mp4', fatal=False)
- if len(formats) < 2:
- formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
- for f in formats:
- format_id = f.get('format_id')
- if format_id.startswith('AAC'):
- f['acodec'] = 'aac'
- elif format_id.startswith('AC3'):
- f['acodec'] = 'ac-3'
+ video_info = self._download_json(self._API_BASE + video_id, video_id)
+
+ last_error = None
+ attempt = -1
+ retries = self.get_param('extractor_retries', 15)
+ while attempt < retries:
+ attempt += 1
+ if last_error:
+ self.report_warning('%s. Retrying ...' % last_error)
+ m3u8_info = self._download_json(
+ video_info['playSession']['url'], video_id,
+ note='Downloading JSON metadata%s' % f' (attempt {attempt})')
+ m3u8_url = m3u8_info.get('url')
+ if m3u8_url:
+ break
+ elif m3u8_info.get('errorCode') == 1:
+ self.raise_geo_restricted(countries=['CA'])
+ else:
+ last_error = f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}'
+ # 35 means media unavailable, but retries work
+ if m3u8_info.get('errorCode') != 35 or attempt >= retries:
+ raise ExtractorError(last_error)
+
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls')
+ self._remove_duplicate_formats(formats)
+
+ for i, format in enumerate(formats):
+ if format.get('vcodec') == 'none':
+ if format.get('ext') is None:
+ format['ext'] = 'm4a'
+ if format.get('acodec') is None:
+ format['acodec'] = 'mp4a.40.2'
+
+ # Put described audio at the beginning of the list, so that it
+ # isn't chosen by default, as most people won't want it.
+ if 'descriptive' in format['format_id'].lower():
+ format['preference'] = -2
+
self._sort_formats(formats)
- info = {
+ return {
'id': video_id,
- 'title': video_id,
+ 'title': video_info['title'],
+ 'description': video_info.get('description'),
+ 'thumbnail': video_info.get('image'),
+ 'series': video_info.get('series'),
+ 'season_number': video_info.get('season'),
+ 'season': f'Season {video_info.get("season")}',
+ 'episode_number': video_info.get('episode'),
+ 'episode': video_info.get('title'),
+ 'episode_id': video_id,
+ 'duration': video_info.get('duration'),
+ 'categories': [video_info.get('category')],
'formats': formats,
+ 'release_timestamp': video_info.get('airDate'),
+ 'timestamp': video_info.get('availableDate'),
}
- rss = xpath_element(result, 'rss')
- if rss:
- info.update(self._parse_rss_feed(rss)['entries'][0])
- del info['url']
- del info['_type']
- del info['ie_key']
- return info
-
-class CBCWatchIE(CBCWatchBaseIE):
- IE_NAME = 'cbc.ca:watch'
- _VALID_URL = r'https?://(?:gem|watch)\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)'
+class CBCGemPlaylistIE(InfoExtractor):
+ IE_NAME = 'gem.cbc.ca:playlist'
+ _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P<id>(?P<show>[0-9a-z-]+)/s(?P<season>[0-9]+))/?(?:[?#]|$)'
_TESTS = [{
- # geo-restricted to Canada, bypassable
- 'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4',
+ # TV show playlist, all public videos
+ 'url': 'https://gem.cbc.ca/media/schitts-creek/s06',
+ 'playlist_count': 16,
'info_dict': {
- 'id': '9673749a-5e77-484c-8b62-a1092a6b5168',
- 'ext': 'mp4',
- 'title': 'Customer (Dis)Service',
- 'description': 'md5:8bdd6913a0fe03d4b2a17ebe169c7c87',
- 'upload_date': '20160219',
- 'timestamp': 1455840000,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- 'format': 'bestvideo',
+ 'id': 'schitts-creek/s06',
+ 'title': 'Season 6',
+ 'description': 'md5:6a92104a56cbeb5818cc47884d4326a2',
},
- }, {
- # geo-restricted to Canada, bypassable
- 'url': 'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057',
- 'info_dict': {
- 'id': '1ed4b385-cd84-49cf-95f0-80f004680057',
- 'title': 'Arthur',
- 'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.',
- },
- 'playlist_mincount': 30,
- }, {
- 'url': 'https://gem.cbc.ca/media/this-hour-has-22-minutes/season-26/episode-20/38e815a-0108c6c6a42',
- 'only_matching': True,
}]
+ _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/shows/'
def _real_extract(self, url):
- video_id = self._match_id(url)
- rss = self._call_api('web/browse/' + video_id, video_id)
- return self._parse_rss_feed(rss)
+ match = self._match_valid_url(url)
+ season_id = match.group('id')
+ show = match.group('show')
+ show_info = self._download_json(self._API_BASE + show, season_id)
+ season = int(match.group('season'))
+ season_info = try_get(show_info, lambda x: x['seasons'][season - 1])
+
+ if season_info is None:
+ raise ExtractorError(f'Couldn\'t find season {season} of {show}')
+
+ episodes = []
+ for episode in season_info['assets']:
+ episodes.append({
+ '_type': 'url_transparent',
+ 'ie_key': 'CBCGem',
+ 'url': 'https://gem.cbc.ca/media/' + episode['id'],
+ 'id': episode['id'],
+ 'title': episode.get('title'),
+ 'description': episode.get('description'),
+ 'thumbnail': episode.get('image'),
+ 'series': episode.get('series'),
+ 'season_number': episode.get('season'),
+ 'season': season_info['title'],
+ 'season_id': season_info.get('id'),
+ 'episode_number': episode.get('episode'),
+ 'episode': episode.get('title'),
+ 'episode_id': episode['id'],
+ 'duration': episode.get('duration'),
+ 'categories': [episode.get('category')],
+ })
+ thumbnail = None
+ tn_uri = season_info.get('image')
+ # the-national was observed to use a "data:image/png;base64"
+ # URI for their 'image' value. The image was 1x1, and is
+ # probably just a placeholder, so it is ignored.
+ if tn_uri is not None and not tn_uri.startswith('data:'):
+ thumbnail = tn_uri
-class CBCOlympicsIE(InfoExtractor):
- IE_NAME = 'cbc.ca:olympics'
- _VALID_URL = r'https?://olympics\.cbc\.ca/video/[^/]+/(?P<id>[^/?#]+)'
- _TESTS = [{
- 'url': 'https://olympics.cbc.ca/video/whats-on-tv/olympic-morning-featuring-the-opening-ceremony/',
- 'only_matching': True,
- }]
+ return {
+ '_type': 'playlist',
+ 'entries': episodes,
+ 'id': season_id,
+ 'title': season_info['title'],
+ 'description': season_info.get('description'),
+ 'thumbnail': thumbnail,
+ 'series': show_info.get('title'),
+ 'season_number': season_info.get('season'),
+ 'season': season_info['title'],
+ }
+
+
+class CBCGemLiveIE(InfoExtractor):
+ IE_NAME = 'gem.cbc.ca:live'
+ _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>[0-9]{12})'
+ _TEST = {
+ 'url': 'https://gem.cbc.ca/live/920604739687',
+ 'info_dict': {
+ 'title': 'Ottawa',
+ 'description': 'The live TV channel and local programming from Ottawa',
+ 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg',
+ 'is_live': True,
+ 'id': 'AyqZwxRqh8EH',
+ 'ext': 'mp4',
+ 'timestamp': 1492106160,
+ 'upload_date': '20170413',
+ 'uploader': 'CBCC-NEW',
+ },
+ 'skip': 'Live might have ended',
+ }
+
+ # It's unclear where the chars at the end come from, but they appear to be
+ # constant. Might need updating in the future.
+ _API = 'https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT'
def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
- video_id = self._hidden_inputs(webpage)['videoId']
- video_doc = self._download_xml(
- 'https://olympics.cbc.ca/videodata/%s.xml' % video_id, video_id)
- title = xpath_text(video_doc, 'title', fatal=True)
- is_live = xpath_text(video_doc, 'kind') == 'Live'
- if is_live:
- title = self._live_title(title)
-
- formats = []
- for video_source in video_doc.findall('videoSources/videoSource'):
- uri = xpath_text(video_source, 'uri')
- if not uri:
- continue
- tokenize = self._download_json(
- 'https://olympics.cbc.ca/api/api-akamai/tokenize',
- video_id, data=json.dumps({
- 'VideoSource': uri,
- }).encode(), headers={
- 'Content-Type': 'application/json',
- 'Referer': url,
- # d3.VideoPlayer._init in https://olympics.cbc.ca/components/script/base.js
- 'Cookie': '_dvp=TK:C0ObxjerU', # AKAMAI CDN cookie
- }, fatal=False)
- if not tokenize:
- continue
- content_url = tokenize['ContentUrl']
- video_source_format = video_source.get('format')
- if video_source_format == 'IIS':
- formats.extend(self._extract_ism_formats(
- content_url, video_id, ism_id=video_source_format, fatal=False))
- else:
- formats.extend(self._extract_m3u8_formats(
- content_url, video_id, 'mp4',
- 'm3u8' if is_live else 'm3u8_native',
- m3u8_id=video_source_format, fatal=False))
- self._sort_formats(formats)
+ video_id = self._match_id(url)
+ live_info = self._download_json(self._API, video_id)['entries']
+
+ video_info = None
+ for stream in live_info:
+ if stream.get('guid') == video_id:
+ video_info = stream
+
+ if video_info is None:
+ raise ExtractorError(
+ 'Couldn\'t find video metadata, maybe this livestream is now offline',
+ expected=True)
return {
+ '_type': 'url_transparent',
+ 'ie_key': 'ThePlatform',
+ 'url': video_info['content'][0]['url'],
'id': video_id,
- 'display_id': display_id,
- 'title': title,
- 'description': xpath_text(video_doc, 'description'),
- 'thumbnail': xpath_text(video_doc, 'thumbnailUrl'),
- 'duration': parse_duration(xpath_text(video_doc, 'duration')),
- 'formats': formats,
- 'is_live': is_live,
+ 'title': video_info.get('title'),
+ 'description': video_info.get('description'),
+ 'tags': try_get(video_info, lambda x: x['keywords'].split(', ')),
+ 'thumbnail': video_info.get('cbc$staticImage'),
+ 'is_live': True,
}
diff --git a/hypervideo_dl/extractor/cbs.py b/hypervideo_dl/extractor/cbs.py
index c79e55a..ae9ce58 100644
--- a/hypervideo_dl/extractor/cbs.py
+++ b/hypervideo_dl/extractor/cbs.py
@@ -8,6 +8,7 @@ from ..utils import (
xpath_element,
xpath_text,
update_url_query,
+ url_or_none,
)
@@ -25,12 +26,64 @@ class CBSBaseIE(ThePlatformFeedIE):
})
return subtitles
+ def _extract_common_video_info(self, content_id, asset_types, mpx_acc, extra_info):
+ tp_path = 'dJ5BDC/media/guid/%d/%s' % (mpx_acc, content_id)
+ tp_release_url = f'https://link.theplatform.com/s/{tp_path}'
+ info = self._extract_theplatform_metadata(tp_path, content_id)
+
+ formats, subtitles = [], {}
+ last_e = None
+ for asset_type, query in asset_types.items():
+ try:
+ tp_formats, tp_subtitles = self._extract_theplatform_smil(
+ update_url_query(tp_release_url, query), content_id,
+ 'Downloading %s SMIL data' % asset_type)
+ except ExtractorError as e:
+ last_e = e
+ if asset_type != 'fallback':
+ continue
+ query['formats'] = '' # blank query to check if expired
+ try:
+ tp_formats, tp_subtitles = self._extract_theplatform_smil(
+ update_url_query(tp_release_url, query), content_id,
+ 'Downloading %s SMIL data, trying again with another format' % asset_type)
+ except ExtractorError as e:
+ last_e = e
+ continue
+ formats.extend(tp_formats)
+ subtitles = self._merge_subtitles(subtitles, tp_subtitles)
+ if last_e and not formats:
+ self.raise_no_formats(last_e, True, content_id)
+ self._sort_formats(formats)
+
+ extra_info.update({
+ 'id': content_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ })
+ info.update({k: v for k, v in extra_info.items() if v is not None})
+ return info
+
+ def _extract_video_info(self, *args, **kwargs):
+ # Extract assets + metadata and call _extract_common_video_info
+ raise NotImplementedError('This method must be implemented by subclasses')
+
+ def _real_extract(self, url):
+ return self._extract_video_info(self._match_id(url))
+
class CBSIE(CBSBaseIE):
- _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:(?:cbs|paramountplus)\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)'
+ _VALID_URL = r'''(?x)
+ (?:
+ cbs:|
+ https?://(?:www\.)?(?:
+ cbs\.com/(?:shows/[^/]+/video|movies/[^/]+)/|
+ colbertlateshow\.com/(?:video|podcasts)/)
+ )(?P<id>[\w-]+)'''
+ # All tests are blocked outside US
_TESTS = [{
- 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
+ 'url': 'https://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
'info_dict': {
'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_',
'ext': 'mp4',
@@ -45,71 +98,70 @@ class CBSIE(CBSBaseIE):
# m3u8 download
'skip_download': True,
},
- '_skip': 'Blocked outside the US',
+ }, {
+ 'url': 'https://www.cbs.com/shows/the-late-show-with-stephen-colbert/video/60icOhMb9NcjbcWnF_gub9XXHdeBcNk2/the-late-show-6-23-21-christine-baranski-joy-oladokun-',
+ 'info_dict': {
+ 'id': '60icOhMb9NcjbcWnF_gub9XXHdeBcNk2',
+ 'title': 'The Late Show - 6/23/21 (Christine Baranski, Joy Oladokun)',
+ 'timestamp': 1624507140,
+ 'description': 'md5:e01af24e95c74d55e8775aef86117b95',
+ 'uploader': 'CBSI-NEW',
+ 'upload_date': '20210624',
+ },
+ 'params': {
+ 'ignore_no_formats_error': True,
+ 'skip_download': True,
+ },
+ 'expected_warnings': [
+ 'This content expired on', 'No video formats found', 'Requested format is not available'],
}, {
'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/',
'only_matching': True,
}, {
'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/',
'only_matching': True,
- }, {
- 'url': 'https://www.paramountplus.com/shows/all-rise/video/QmR1WhNkh1a_IrdHZrbcRklm176X_rVc/all-rise-space/',
- 'only_matching': True,
}]
def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517):
items_data = self._download_xml(
- 'http://can.cbs.com/thunder/player/videoPlayerService.php',
+ 'https://can.cbs.com/thunder/player/videoPlayerService.php',
content_id, query={'partner': site, 'contentId': content_id})
video_data = xpath_element(items_data, './/item')
- title = xpath_text(video_data, 'videoTitle', 'title', True)
- tp_path = 'dJ5BDC/media/guid/%d/%s' % (mpx_acc, content_id)
- tp_release_url = 'http://link.theplatform.com/s/' + tp_path
+ title = xpath_text(video_data, 'videoTitle', 'title') or xpath_text(video_data, 'videotitle', 'title')
- asset_types = []
- subtitles = {}
- formats = []
- last_e = None
+ asset_types = {}
+ has_drm = False
for item in items_data.findall('.//item'):
asset_type = xpath_text(item, 'assetType')
- if not asset_type or asset_type in asset_types or 'HLS_FPS' in asset_type or 'DASH_CENC' in asset_type:
- continue
- asset_types.append(asset_type)
query = {
'mbr': 'true',
'assetTypes': asset_type,
}
- if asset_type.startswith('HLS') or asset_type in ('OnceURL', 'StreamPack'):
+ if not asset_type:
+ # fallback for content_ids that videoPlayerService doesn't return anything for
+ asset_type = 'fallback'
+ query['formats'] = 'M3U+none,MPEG4,M3U+appleHlsEncryption,MP3'
+ del query['assetTypes']
+ if asset_type in asset_types:
+ continue
+ elif any(excluded in asset_type for excluded in ('HLS_FPS', 'DASH_CENC', 'OnceURL')):
+ if 'DASH_CENC' in asset_type:
+ has_drm = True
+ continue
+ if asset_type.startswith('HLS') or 'StreamPack' in asset_type:
query['formats'] = 'MPEG4,M3U'
elif asset_type in ('RTMP', 'WIFI', '3G'):
query['formats'] = 'MPEG4,FLV'
- try:
- tp_formats, tp_subtitles = self._extract_theplatform_smil(
- update_url_query(tp_release_url, query), content_id,
- 'Downloading %s SMIL data' % asset_type)
- except ExtractorError as e:
- last_e = e
- continue
- formats.extend(tp_formats)
- subtitles = self._merge_subtitles(subtitles, tp_subtitles)
- if last_e and not formats:
- raise last_e
- self._sort_formats(formats)
+ asset_types[asset_type] = query
- info = self._extract_theplatform_metadata(tp_path, content_id)
- info.update({
- 'id': content_id,
+ if not asset_types and has_drm:
+ self.report_drm(content_id)
+
+ return self._extract_common_video_info(content_id, asset_types, mpx_acc, extra_info={
'title': title,
'series': xpath_text(video_data, 'seriesTitle'),
'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')),
'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')),
'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000),
- 'thumbnail': xpath_text(video_data, 'previewImageURL'),
- 'formats': formats,
- 'subtitles': subtitles,
+ 'thumbnail': url_or_none(xpath_text(video_data, 'previewImageURL')),
})
- return info
-
- def _real_extract(self, url):
- content_id = self._match_id(url)
- return self._extract_video_info(content_id)
diff --git a/hypervideo_dl/extractor/cbsinteractive.py b/hypervideo_dl/extractor/cbsinteractive.py
index 6596e98..9d4f754 100644
--- a/hypervideo_dl/extractor/cbsinteractive.py
+++ b/hypervideo_dl/extractor/cbsinteractive.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .cbs import CBSIE
from ..utils import int_or_none
@@ -71,7 +70,7 @@ class CBSInteractiveIE(CBSIE):
}
def _real_extract(self, url):
- site, display_id = re.match(self._VALID_URL, url).groups()
+ site, display_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, display_id)
data_json = self._html_search_regex(
diff --git a/hypervideo_dl/extractor/cbssports.py b/hypervideo_dl/extractor/cbssports.py
index a891c9a..b8a6e59 100644
--- a/hypervideo_dl/extractor/cbssports.py
+++ b/hypervideo_dl/extractor/cbssports.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
# from .cbs import CBSBaseIE
from .common import InfoExtractor
@@ -30,7 +29,7 @@ class CBSSportsEmbedIE(InfoExtractor):
# return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id)
def _real_extract(self, url):
- uuid, pcid = re.match(self._VALID_URL, url).groups()
+ uuid, pcid = self._match_valid_url(url).groups()
query = {'id': uuid} if uuid else {'pcid': pcid}
video = self._download_json(
'https://www.cbssports.com/api/content/video/',
diff --git a/hypervideo_dl/extractor/ccma.py b/hypervideo_dl/extractor/ccma.py
index e6ae493..ea98f86 100644
--- a/hypervideo_dl/extractor/ccma.py
+++ b/hypervideo_dl/extractor/ccma.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
import calendar
import datetime
-import re
from .common import InfoExtractor
from ..utils import (
@@ -61,7 +60,7 @@ class CCMAIE(InfoExtractor):
}]
def _real_extract(self, url):
- media_type, media_id = re.match(self._VALID_URL, url).groups()
+ media_type, media_id = self._match_valid_url(url).groups()
media = self._download_json(
'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={
diff --git a/hypervideo_dl/extractor/cctv.py b/hypervideo_dl/extractor/cctv.py
index c76f361..9b86121 100644
--- a/hypervideo_dl/extractor/cctv.py
+++ b/hypervideo_dl/extractor/cctv.py
@@ -162,7 +162,7 @@ class CCTVIE(InfoExtractor):
'url': video_url,
'format_id': 'http',
'quality': quality,
- 'preference': -1,
+ 'source_preference': -10
})
hls_url = try_get(data, lambda x: x['hls_url'], compat_str)
diff --git a/hypervideo_dl/extractor/cda.py b/hypervideo_dl/extractor/cda.py
index e1b3919..72c4705 100644
--- a/hypervideo_dl/extractor/cda.py
+++ b/hypervideo_dl/extractor/cda.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
import codecs
import re
+import json
from .common import InfoExtractor
from ..compat import (
@@ -19,6 +20,7 @@ from ..utils import (
parse_duration,
random_birthday,
urljoin,
+ try_get,
)
@@ -38,6 +40,8 @@ class CDAIE(InfoExtractor):
'average_rating': float,
'duration': 39,
'age_limit': 0,
+ 'upload_date': '20160221',
+ 'timestamp': 1456078244,
}
}, {
'url': 'http://www.cda.pl/video/57413289',
@@ -143,7 +147,7 @@ class CDAIE(InfoExtractor):
b = []
for c in a:
f = compat_ord(c)
- b.append(compat_chr(33 + (f + 14) % 94) if 33 <= f and 126 >= f else compat_chr(f))
+ b.append(compat_chr(33 + (f + 14) % 94) if 33 <= f <= 126 else compat_chr(f))
a = ''.join(b)
a = a.replace('.cda.mp4', '')
for p in ('.2cda.pl', '.3cda.pl'):
@@ -173,18 +177,34 @@ class CDAIE(InfoExtractor):
video['file'] = video['file'].replace('adc.mp4', '.mp4')
elif not video['file'].startswith('http'):
video['file'] = decrypt_file(video['file'])
- f = {
+ video_quality = video.get('quality')
+ qualities = video.get('qualities', {})
+ video_quality = next((k for k, v in qualities.items() if v == video_quality), video_quality)
+ info_dict['formats'].append({
'url': video['file'],
- }
- m = re.search(
- r'<a[^>]+data-quality="(?P<format_id>[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P<height>[0-9]+)p',
- page)
- if m:
- f.update({
- 'format_id': m.group('format_id'),
- 'height': int(m.group('height')),
- })
- info_dict['formats'].append(f)
+ 'format_id': video_quality,
+ 'height': int_or_none(video_quality[:-1]),
+ })
+ for quality, cda_quality in qualities.items():
+ if quality == video_quality:
+ continue
+ data = {'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2,
+ 'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}]}
+ data = json.dumps(data).encode('utf-8')
+ video_url = self._download_json(
+ f'https://www.cda.pl/video/{video_id}', video_id, headers={
+ 'Content-Type': 'application/json',
+ 'X-Requested-With': 'XMLHttpRequest'
+ }, data=data, note=f'Fetching {quality} url',
+ errnote=f'Failed to fetch {quality} url', fatal=False)
+ if try_get(video_url, lambda x: x['result']['status']) == 'ok':
+ video_url = try_get(video_url, lambda x: x['result']['resp'])
+ info_dict['formats'].append({
+ 'url': video_url,
+ 'format_id': quality,
+ 'height': int_or_none(quality[:-1])
+ })
+
if not info_dict['duration']:
info_dict['duration'] = parse_duration(video.get('duration'))
diff --git a/hypervideo_dl/extractor/ceskatelevize.py b/hypervideo_dl/extractor/ceskatelevize.py
index 7cb4efb..5e04d38 100644
--- a/hypervideo_dl/extractor/ceskatelevize.py
+++ b/hypervideo_dl/extractor/ceskatelevize.py
@@ -147,8 +147,6 @@ class CeskaTelevizeIE(InfoExtractor):
is_live = item.get('type') == 'LIVE'
formats = []
for format_id, stream_url in item.get('streamUrls', {}).items():
- if 'drmOnly=true' in stream_url:
- continue
if 'playerType=flash' in stream_url:
stream_formats = self._extract_m3u8_formats(
stream_url, playlist_id, 'mp4', 'm3u8_native',
@@ -157,6 +155,9 @@ class CeskaTelevizeIE(InfoExtractor):
stream_formats = self._extract_mpd_formats(
stream_url, playlist_id,
mpd_id='dash-%s' % format_id, fatal=False)
+ if 'drmOnly=true' in stream_url:
+ for f in stream_formats:
+ f['has_drm'] = True
# See https://github.com/ytdl-org/youtube-dl/issues/12119#issuecomment-280037031
if format_id == 'audioDescription':
for f in stream_formats:
diff --git a/hypervideo_dl/extractor/cgtn.py b/hypervideo_dl/extractor/cgtn.py
new file mode 100644
index 0000000..89f1738
--- /dev/null
+++ b/hypervideo_dl/extractor/cgtn.py
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ try_get,
+ unified_timestamp,
+)
+
+
+class CGTNIE(InfoExtractor):
+ _VALID_URL = r'https?://news\.cgtn\.com/news/[0-9]{4}-[0-9]{2}-[0-9]{2}/[a-zA-Z0-9-]+-(?P<id>[a-zA-Z0-9-]+)/index\.html'
+ _TESTS = [
+ {
+ 'url': 'https://news.cgtn.com/news/2021-03-09/Up-and-Out-of-Poverty-Ep-1-A-solemn-promise-YuOUaOzGQU/index.html',
+ 'info_dict': {
+ 'id': 'YuOUaOzGQU',
+ 'ext': 'mp4',
+ 'title': 'Up and Out of Poverty Ep. 1: A solemn promise',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'timestamp': 1615295940,
+ 'upload_date': '20210309',
+ },
+ 'params': {
+ 'skip_download': True
+ }
+ }, {
+ 'url': 'https://news.cgtn.com/news/2021-06-06/China-Indonesia-vow-to-further-deepen-maritime-cooperation-10REvJCewCY/index.html',
+ 'info_dict': {
+ 'id': '10REvJCewCY',
+ 'ext': 'mp4',
+ 'title': 'China, Indonesia vow to further deepen maritime cooperation',
+ 'thumbnail': r're:^https?://.*\.png$',
+ 'description': 'China and Indonesia vowed to upgrade their cooperation into the maritime sector and also for political security, economy, and cultural and people-to-people exchanges.',
+ 'author': 'CGTN',
+ 'category': 'China',
+ 'timestamp': 1622950200,
+ 'upload_date': '20210606',
+ },
+ 'params': {
+ 'skip_download': False
+ }
+ }
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ download_url = self._html_search_regex(r'data-video ="(?P<url>.+m3u8)"', webpage, 'download_url')
+ datetime_str = self._html_search_regex(r'<span class="date">\s*(.+?)\s*</span>', webpage, 'datetime_str', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage, default=None),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'formats': self._extract_m3u8_formats(download_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'),
+ 'category': self._html_search_regex(r'<span class="section">\s*(.+?)\s*</span>',
+ webpage, 'category', fatal=False),
+ 'author': self._html_search_regex(r'<div class="news-author-name">\s*(.+?)\s*</div>',
+ webpage, 'author', default=None, fatal=False),
+ 'timestamp': try_get(unified_timestamp(datetime_str), lambda x: x - 8 * 3600),
+ }
diff --git a/hypervideo_dl/extractor/channel9.py b/hypervideo_dl/extractor/channel9.py
index 09cacf6..90024db 100644
--- a/hypervideo_dl/extractor/channel9.py
+++ b/hypervideo_dl/extractor/channel9.py
@@ -5,7 +5,6 @@ import re
from .common import InfoExtractor
from ..utils import (
clean_html,
- ExtractorError,
int_or_none,
parse_iso8601,
qualities,
@@ -97,7 +96,7 @@ class Channel9IE(InfoExtractor):
return self.playlist_result(entries, video_id, title_text)
def _real_extract(self, url):
- content_path, rss = re.match(self._VALID_URL, url).groups()
+ content_path, rss = self._match_valid_url(url).groups()
if rss:
return self._extract_list(content_path, url)
@@ -187,14 +186,13 @@ class Channel9IE(InfoExtractor):
'quality': quality(q, q_url),
})
- self._sort_formats(formats)
-
slides = content_data.get('Slides')
zip_file = content_data.get('ZipFile')
if not formats and not slides and not zip_file:
- raise ExtractorError(
+ self.raise_no_formats(
'None of recording, slides or zip are available for %s' % content_path)
+ self._sort_formats(formats)
subtitles = {}
for caption in content_data.get('Captions', []):
diff --git a/hypervideo_dl/extractor/chilloutzone.py b/hypervideo_dl/extractor/chilloutzone.py
index 5aac212..fd5202b 100644
--- a/hypervideo_dl/extractor/chilloutzone.py
+++ b/hypervideo_dl/extractor/chilloutzone.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
import json
from .common import InfoExtractor
@@ -51,7 +50,7 @@ class ChilloutzoneIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
diff --git a/hypervideo_dl/extractor/chingari.py b/hypervideo_dl/extractor/chingari.py
new file mode 100644
index 0000000..6bdc4f6
--- /dev/null
+++ b/hypervideo_dl/extractor/chingari.py
@@ -0,0 +1,209 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+import json
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote_plus
+from ..utils import (
+ clean_html,
+ ExtractorError,
+ int_or_none,
+ str_to_int,
+ url_or_none,
+)
+
+
+class ChingariBaseIE(InfoExtractor):
+ def _get_post(self, id, post_data):
+ media_data = post_data['mediaLocation']
+ base_url = media_data['base']
+ author_data = post_data.get('authorData', {})
+    song_data = post_data.get('song', {})  # revisit this in the future for differentiating between 'art' and 'author'
+
+ formats = [{
+ 'format_id': frmt,
+ 'width': str_to_int(frmt[1:]),
+ 'url': base_url + frmt_path,
+ } for frmt, frmt_path in media_data.get('transcoded', {}).items()]
+
+ if media_data.get('path'):
+ formats.append({
+ 'format_id': 'original',
+ 'format_note': 'Direct video.',
+ 'url': base_url + '/apipublic' + media_data['path'],
+ 'quality': 10,
+ })
+ self._sort_formats(formats)
+ timestamp = str_to_int(post_data.get('created_at'))
+ if timestamp:
+ timestamp = int_or_none(timestamp, 1000)
+
+ thumbnail, uploader_url = None, None
+ if media_data.get('thumbnail'):
+ thumbnail = base_url + media_data.get('thumbnail')
+ if author_data.get('username'):
+ uploader_url = 'https://chingari.io/' + author_data.get('username')
+
+ return {
+ 'id': id,
+ 'title': compat_urllib_parse_unquote_plus(clean_html(post_data.get('caption'))),
+ 'description': compat_urllib_parse_unquote_plus(clean_html(post_data.get('caption'))),
+ 'duration': media_data.get('duration'),
+ 'thumbnail': url_or_none(thumbnail),
+ 'like_count': post_data.get('likeCount'),
+ 'view_count': post_data.get('viewsCount'),
+ 'comment_count': post_data.get('commentCount'),
+ 'repost_count': post_data.get('shareCount'),
+ 'timestamp': timestamp,
+ 'uploader_id': post_data.get('userId') or author_data.get('_id'),
+ 'uploader': author_data.get('name'),
+ 'uploader_url': url_or_none(uploader_url),
+ 'track': song_data.get('title'),
+ 'artist': song_data.get('author'),
+ 'formats': formats,
+ }
+
+
+class ChingariIE(ChingariBaseIE):
+ _VALID_URL = r'(?:https?://)(?:www\.)?chingari\.io/share/post\?id=(?P<id>[^&/#?]+)'
+ _TESTS = [{
+ 'url': 'https://chingari.io/share/post?id=612f8f4ce1dc57090e8a7beb',
+ 'info_dict': {
+ 'id': '612f8f4ce1dc57090e8a7beb',
+ 'ext': 'mp4',
+ 'title': 'Happy birthday Srila Prabhupada',
+ 'description': 'md5:c7080ebfdfeb06016e638c286d6bc3fa',
+ 'duration': 0,
+ 'thumbnail': 'https://media.chingari.io/uploads/c41d30e2-06b6-4e3b-9b4b-edbb929cec06-1630506826911/thumbnail/198f993f-ce87-4623-82c6-cd071bd6d4f4-1630506828016.jpg',
+ 'like_count': int,
+ 'view_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
+ 'timestamp': 1630506828,
+ 'upload_date': '20210901',
+ 'uploader_id': '5f0403982c8bd344f4813f8c',
+ 'uploader': 'ISKCON,Inc.',
+ 'uploader_url': 'https://chingari.io/iskcon,inc',
+ 'track': None,
+ 'artist': None,
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ post_json = self._download_json(f'https://api.chingari.io/post/post_details/{id}', id)
+ if post_json['code'] != 200:
+ raise ExtractorError(post_json['message'], expected=True)
+ post_data = post_json['data']
+ return self._get_post(id, post_data)
+
+
+class ChingariUserIE(ChingariBaseIE):
+ _VALID_URL = r'(?:https?://)(?:www\.)?chingari\.io/(?!share/post)(?P<id>[^/?]+)'
+ _TESTS = [{
+ 'url': 'https://chingari.io/dada1023',
+ 'playlist_mincount': 3,
+ 'info_dict': {
+ 'id': 'dada1023',
+ },
+ 'entries': [{
+ 'url': 'https://chingari.io/share/post?id=614781f3ade60b3a0bfff42a',
+ 'info_dict': {
+ 'id': '614781f3ade60b3a0bfff42a',
+ 'ext': 'mp4',
+ 'title': '#chingaribappa ',
+ 'description': 'md5:d1df21d84088770468fa63afe3b17857',
+ 'duration': 7,
+ 'thumbnail': 'https://media.chingari.io/uploads/346d86d4-abb2-474e-a164-ffccf2bbcb72-1632076273717/thumbnail/b0b3aac2-2b86-4dd1-909d-9ed6e57cf77c-1632076275552.jpg',
+ 'like_count': int,
+ 'view_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
+ 'timestamp': 1632076275,
+ 'upload_date': '20210919',
+ 'uploader_id': '5efc4b12cca35c3d1794c2d3',
+ 'uploader': 'dada (girish) dhawale',
+ 'uploader_url': 'https://chingari.io/dada1023',
+ 'track': None,
+ 'artist': None
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://chingari.io/share/post?id=6146b132bcbf860959e12cba',
+ 'info_dict': {
+ 'id': '6146b132bcbf860959e12cba',
+ 'ext': 'mp4',
+ 'title': 'Tactor harvesting',
+ 'description': 'md5:8403f12dce68828b77ecee7eb7e887b7',
+ 'duration': 59.3,
+ 'thumbnail': 'https://media.chingari.io/uploads/b353ca70-7a87-400d-93a6-fa561afaec86-1632022814584/thumbnail/c09302e3-2043-41b1-a2fe-77d97e5bd676-1632022834260.jpg',
+ 'like_count': int,
+ 'view_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
+ 'timestamp': 1632022834,
+ 'upload_date': '20210919',
+ 'uploader_id': '5efc4b12cca35c3d1794c2d3',
+ 'uploader': 'dada (girish) dhawale',
+ 'uploader_url': 'https://chingari.io/dada1023',
+ 'track': None,
+ 'artist': None
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://chingari.io/share/post?id=6145651b74cb030a64c40b82',
+ 'info_dict': {
+ 'id': '6145651b74cb030a64c40b82',
+ 'ext': 'mp4',
+ 'title': '#odiabhajan ',
+ 'description': 'md5:687ea36835b9276cf2af90f25e7654cb',
+ 'duration': 56.67,
+ 'thumbnail': 'https://media.chingari.io/uploads/6cbf216b-babc-4cce-87fe-ceaac8d706ac-1631937782708/thumbnail/8855754f-6669-48ce-b269-8cc0699ed6da-1631937819522.jpg',
+ 'like_count': int,
+ 'view_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
+ 'timestamp': 1631937819,
+ 'upload_date': '20210918',
+ 'uploader_id': '5efc4b12cca35c3d1794c2d3',
+ 'uploader': 'dada (girish) dhawale',
+ 'uploader_url': 'https://chingari.io/dada1023',
+ 'track': None,
+ 'artist': None
+ },
+ 'params': {'skip_download': True}
+ }],
+ }, {
+ 'url': 'https://chingari.io/iskcon%2Cinc',
+ 'playlist_mincount': 1025,
+ 'info_dict': {
+ 'id': 'iskcon%2Cinc',
+ },
+ }]
+
+ def _entries(self, id):
+ skip = 0
+ has_more = True
+ for page in itertools.count():
+ posts = self._download_json('https://api.chingari.io/users/getPosts', id,
+ data=json.dumps({'userId': id, 'ownerId': id, 'skip': skip, 'limit': 20}).encode(),
+ headers={'content-type': 'application/json;charset=UTF-8'},
+ note='Downloading page %s' % page)
+ for post in posts.get('data', []):
+ post_data = post['post']
+ yield self._get_post(post_data['_id'], post_data)
+ skip += 20
+ has_more = posts['hasMoreData']
+ if not has_more:
+ break
+
+ def _real_extract(self, url):
+ alt_id = self._match_id(url)
+ post_json = self._download_json(f'https://api.chingari.io/user/{alt_id}', alt_id)
+ if post_json['code'] != 200:
+ raise ExtractorError(post_json['message'], expected=True)
+ id = post_json['data']['_id']
+ return self.playlist_result(self._entries(id), playlist_id=alt_id)
diff --git a/hypervideo_dl/extractor/cinemax.py b/hypervideo_dl/extractor/cinemax.py
index 7f89d33..2c3ff8d 100644
--- a/hypervideo_dl/extractor/cinemax.py
+++ b/hypervideo_dl/extractor/cinemax.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .hbo import HBOBaseIE
@@ -23,7 +22,7 @@ class CinemaxIE(HBOBaseIE):
}]
def _real_extract(self, url):
- path, video_id = re.match(self._VALID_URL, url).groups()
+ path, video_id = self._match_valid_url(url).groups()
info = self._extract_info('https://www.cinemax.com/%s.xml' % path, video_id)
info['id'] = video_id
return info
diff --git a/hypervideo_dl/extractor/ciscolive.py b/hypervideo_dl/extractor/ciscolive.py
index da404e4..349c5eb 100644
--- a/hypervideo_dl/extractor/ciscolive.py
+++ b/hypervideo_dl/extractor/ciscolive.py
@@ -4,14 +4,11 @@ from __future__ import unicode_literals
import itertools
from .common import InfoExtractor
-from ..compat import (
- compat_parse_qs,
- compat_urllib_parse_urlparse,
-)
from ..utils import (
clean_html,
float_or_none,
int_or_none,
+ parse_qs,
try_get,
urlencode_postdata,
)
@@ -145,7 +142,7 @@ class CiscoLiveSearchIE(CiscoLiveBaseIE):
query['from'] += query['size']
def _real_extract(self, url):
- query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ query = parse_qs(url)
query['type'] = 'session'
return self.playlist_result(
self._entries(query, url), playlist_title='Search query')
diff --git a/hypervideo_dl/extractor/ciscowebex.py b/hypervideo_dl/extractor/ciscowebex.py
new file mode 100644
index 0000000..882dae9
--- /dev/null
+++ b/hypervideo_dl/extractor/ciscowebex.py
@@ -0,0 +1,90 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ try_get,
+ unified_timestamp,
+)
+
+
+class CiscoWebexIE(InfoExtractor):
+ IE_NAME = 'ciscowebex'
+ IE_DESC = 'Cisco Webex'
+ _VALID_URL = r'''(?x)
+ (?P<url>https?://(?P<subdomain>[^/#?]*)\.webex\.com/(?:
+ (?P<siteurl_1>[^/#?]*)/(?:ldr|lsr).php\?(?:[^#]*&)*RCID=(?P<rcid>[0-9a-f]{32})|
+ (?:recordingservice|webappng)/sites/(?P<siteurl_2>[^/#?]*)/recording/(?:playback/|play/)?(?P<id>[0-9a-f]{32})
+ ))'''
+
+ _TESTS = [{
+ 'url': 'https://demosubdomain.webex.com/demositeurl/ldr.php?RCID=e58e803bc0f766bb5f6376d2e86adb5b',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://demosubdomain.webex.com/demositeurl/lsr.php?RCID=bc04b4a7b5ea2cc3a493d5ae6aaff5d7',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://demosubdomain.webex.com/recordingservice/sites/demositeurl/recording/88e7a42f7b19f5b423c54754aecc2ce9/playback',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = self._match_valid_url(url)
+ rcid = mobj.group('rcid')
+ if rcid:
+ webpage = self._download_webpage(url, None, note='Getting video ID')
+ url = self._search_regex(self._VALID_URL, webpage, 'redirection url', group='url')
+ url = self._request_webpage(url, None, note='Resolving final URL').geturl()
+ mobj = self._match_valid_url(url)
+ subdomain = mobj.group('subdomain')
+ siteurl = mobj.group('siteurl_1') or mobj.group('siteurl_2')
+ video_id = mobj.group('id')
+
+ stream = self._download_json(
+ 'https://%s.webex.com/webappng/api/v1/recordings/%s/stream' % (subdomain, video_id),
+ video_id, fatal=False, query={'siteurl': siteurl})
+ if not stream:
+ self.raise_login_required(method='cookies')
+
+ video_id = stream.get('recordUUID') or video_id
+
+ formats = [{
+ 'format_id': 'video',
+ 'url': stream['fallbackPlaySrc'],
+ 'ext': 'mp4',
+ 'vcodec': 'avc1.640028',
+ 'acodec': 'mp4a.40.2',
+ }]
+ if stream.get('preventDownload') is False:
+ mp4url = try_get(stream, lambda x: x['downloadRecordingInfo']['downloadInfo']['mp4URL'])
+ if mp4url:
+ formats.append({
+ 'format_id': 'video',
+ 'url': mp4url,
+ 'ext': 'mp4',
+ 'vcodec': 'avc1.640028',
+ 'acodec': 'mp4a.40.2',
+ })
+ audiourl = try_get(stream, lambda x: x['downloadRecordingInfo']['downloadInfo']['audioURL'])
+ if audiourl:
+ formats.append({
+ 'format_id': 'audio',
+ 'url': audiourl,
+ 'ext': 'mp3',
+ 'vcodec': 'none',
+ 'acodec': 'mp3',
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': stream['recordName'],
+ 'description': stream.get('description'),
+ 'uploader': stream.get('ownerDisplayName'),
+ 'uploader_id': stream.get('ownerUserName') or stream.get('ownerId'), # mail or id
+ 'timestamp': unified_timestamp(stream.get('createTime')),
+ 'duration': int_or_none(stream.get('duration'), 1000),
+ 'webpage_url': 'https://%s.webex.com/recordingservice/sites/%s/recording/playback/%s' % (subdomain, siteurl, video_id),
+ 'formats': formats,
+ }
diff --git a/hypervideo_dl/extractor/cjsw.py b/hypervideo_dl/extractor/cjsw.py
index 505bdbe..1dea0d7 100644
--- a/hypervideo_dl/extractor/cjsw.py
+++ b/hypervideo_dl/extractor/cjsw.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -30,7 +29,7 @@ class CJSWIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
program, episode_id = mobj.group('program', 'id')
audio_id = '%s/%s' % (program, episode_id)
diff --git a/hypervideo_dl/extractor/clyp.py b/hypervideo_dl/extractor/clyp.py
index 06d04de..e6b2ac4 100644
--- a/hypervideo_dl/extractor/clyp.py
+++ b/hypervideo_dl/extractor/clyp.py
@@ -1,12 +1,9 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import (
- compat_parse_qs,
- compat_urllib_parse_urlparse,
-)
from ..utils import (
float_or_none,
+ parse_qs,
unified_timestamp,
)
@@ -44,7 +41,7 @@ class ClypIE(InfoExtractor):
def _real_extract(self, url):
audio_id = self._match_id(url)
- qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ qs = parse_qs(url)
token = qs.get('token', [None])[0]
query = {}
diff --git a/hypervideo_dl/extractor/cmt.py b/hypervideo_dl/extractor/cmt.py
index e701fbe..a4ddb91 100644
--- a/hypervideo_dl/extractor/cmt.py
+++ b/hypervideo_dl/extractor/cmt.py
@@ -2,6 +2,8 @@ from __future__ import unicode_literals
from .mtv import MTVIE
+# TODO Remove - Reason: Outdated Site
+
class CMTIE(MTVIE):
IE_NAME = 'cmt.com'
@@ -39,7 +41,7 @@ class CMTIE(MTVIE):
'only_matching': True,
}]
- def _extract_mgid(self, webpage):
+ def _extract_mgid(self, webpage, url):
mgid = self._search_regex(
r'MTVN\.VIDEO\.contentUri\s*=\s*([\'"])(?P<mgid>.+?)\1',
webpage, 'mgid', group='mgid', default=None)
@@ -50,5 +52,5 @@ class CMTIE(MTVIE):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- mgid = self._extract_mgid(webpage)
+ mgid = self._extract_mgid(webpage, url)
return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid)
diff --git a/hypervideo_dl/extractor/cnbc.py b/hypervideo_dl/extractor/cnbc.py
index 7b9f453..da3730c 100644
--- a/hypervideo_dl/extractor/cnbc.py
+++ b/hypervideo_dl/extractor/cnbc.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import smuggle_url
@@ -57,7 +56,7 @@ class CNBCVideoIE(InfoExtractor):
}
def _real_extract(self, url):
- path, display_id = re.match(self._VALID_URL, url).groups()
+ path, display_id = self._match_valid_url(url).groups()
video_id = self._download_json(
'https://webql-redesign.cnbcfm.com/graphql', display_id, query={
'query': '''{
diff --git a/hypervideo_dl/extractor/cnn.py b/hypervideo_dl/extractor/cnn.py
index 2d950fa..af11d95 100644
--- a/hypervideo_dl/extractor/cnn.py
+++ b/hypervideo_dl/extractor/cnn.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from .turner import TurnerBaseIE
@@ -88,7 +87,7 @@ class CNNIE(TurnerBaseIE):
return None
def _real_extract(self, url):
- sub_domain, path, page_title = re.match(self._VALID_URL, url).groups()
+ sub_domain, path, page_title = self._match_valid_url(url).groups()
if sub_domain not in ('money', 'edition'):
sub_domain = 'edition'
config = self._CONFIG[sub_domain]
diff --git a/hypervideo_dl/extractor/comedycentral.py b/hypervideo_dl/extractor/comedycentral.py
index 1bfa912..5a12ab5 100644
--- a/hypervideo_dl/extractor/comedycentral.py
+++ b/hypervideo_dl/extractor/comedycentral.py
@@ -4,7 +4,7 @@ from .mtv import MTVServicesInfoExtractor
class ComedyCentralIE(MTVServicesInfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?)/(?P<id>[0-9a-z]{6})'
+ _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?|collection-playlist)/(?P<id>[0-9a-z]{6})'
_FEED_URL = 'http://comedycentral.com/feeds/mrss/'
_TESTS = [{
@@ -24,6 +24,9 @@ class ComedyCentralIE(MTVServicesInfoExtractor):
}, {
'url': 'https://www.cc.com/video/k3sdvm/the-daily-show-with-jon-stewart-exclusive-the-fourth-estate',
'only_matching': True,
+ }, {
+ 'url': 'https://www.cc.com/collection-playlist/cosnej/stand-up-specials/t6vtjb',
+ 'only_matching': True,
}]
diff --git a/hypervideo_dl/extractor/common.py b/hypervideo_dl/extractor/common.py
index 8b622be..df74c75 100644
--- a/hypervideo_dl/extractor/common.py
+++ b/hypervideo_dl/extractor/common.py
@@ -4,13 +4,12 @@ from __future__ import unicode_literals
import base64
import datetime
import hashlib
+import itertools
import json
import netrc
import os
import random
import re
-import socket
-import ssl
import sys
import time
import math
@@ -20,8 +19,8 @@ from ..compat import (
compat_cookies_SimpleCookie,
compat_etree_Element,
compat_etree_fromstring,
+ compat_expanduser,
compat_getpass,
- compat_integer_types,
compat_http_client,
compat_os_name,
compat_str,
@@ -32,12 +31,12 @@ from ..compat import (
compat_urlparse,
compat_xml_parse_error,
)
+from ..downloader import FileDownloader
from ..downloader.f4m import (
get_base_url,
remove_encrypted_media,
)
from ..utils import (
- NO_DEFAULT,
age_restricted,
base_url,
bug_reports_message,
@@ -47,16 +46,19 @@ from ..utils import (
determine_protocol,
dict_get,
error_to_compat_str,
- ExtractorError,
extract_attributes,
+ ExtractorError,
fix_xml_ampersands,
float_or_none,
+ format_field,
GeoRestrictedError,
GeoUtils,
int_or_none,
js_to_json,
JSON_LD_RE,
mimetype2ext,
+ network_exceptions,
+ NO_DEFAULT,
orderedSet,
parse_bitrate,
parse_codecs,
@@ -65,19 +67,21 @@ from ..utils import (
parse_m3u8_attributes,
parse_resolution,
RegexNotFoundError,
- sanitized_Request,
sanitize_filename,
+ sanitized_Request,
str_or_none,
str_to_int,
strip_or_none,
+ traverse_obj,
unescapeHTML,
unified_strdate,
unified_timestamp,
update_Request,
update_url_query,
- urljoin,
url_basename,
url_or_none,
+ urljoin,
+ variadic,
xpath_element,
xpath_text,
xpath_with_ns,
@@ -143,6 +147,8 @@ class InfoExtractor(object):
* width Width of the video, if known
* height Height of the video, if known
* resolution Textual description of width and height
+ * dynamic_range The dynamic range of the video. One of:
+ "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
* tbr Average bitrate of audio and video in KBit/s
* abr Average audio bitrate in KBit/s
* acodec Name of the audio codec in use
@@ -156,7 +162,7 @@ class InfoExtractor(object):
* player_url SWF Player URL (used for rtmpdump).
* protocol The protocol that will be used for the actual
download, lower-case.
- "http", "https", "rtsp", "rtmp", "rtmpe",
+ "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
"m3u8", "m3u8_native" or "http_dash_segments".
* fragment_base_url
Base URL for fragments. Each fragment's path
@@ -201,8 +207,12 @@ class InfoExtractor(object):
width : height ratio as float.
* no_resume The server does not support resuming the
(HTTP or RTMP) download. Boolean.
+ * has_drm The format has DRM and cannot be downloaded. Boolean
* downloader_options A dictionary of downloader options as
described in FileDownloader
+ RTMP formats can also have the additional fields: page_url,
+ app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
+ rtmp_protocol, rtmp_real_time
url: Final video URL.
ext: Video filename extension.
@@ -232,8 +242,7 @@ class InfoExtractor(object):
creator: The creator of the video.
release_timestamp: UNIX timestamp of the moment the video was released.
release_date: The date (YYYYMMDD) when the video was released.
- timestamp: UNIX timestamp of the moment the video became available
- (uploaded).
+ timestamp: UNIX timestamp of the moment the video was uploaded
upload_date: Video upload date (YYYYMMDD).
If not explicitly set, calculated from timestamp.
uploader_id: Nickname or id of the video uploader.
@@ -251,9 +260,11 @@ class InfoExtractor(object):
entry and one of:
* "data": The subtitles file contents
* "url": A URL pointing to the subtitles file
+ It can optionally also have:
+ * "name": Name or description of the subtitles
"ext" will be calculated from URL if missing
- automatic_captions: Like 'subtitles', used by the YoutubeIE for
- automatically generated captions
+ automatic_captions: Like 'subtitles'; contains automatically generated
+ captions instead of normal subtitles
duration: Length of the video in seconds, as an integer or float.
view_count: How many users have watched the video on the platform.
like_count: Number of positive ratings of the video
@@ -265,6 +276,7 @@ class InfoExtractor(object):
properties (all but one of text or html optional):
* "author" - human-readable name of the comment author
* "author_id" - user ID of the comment author
+ * "author_thumbnail" - The thumbnail of the comment author
* "id" - Comment ID
* "html" - Comment as HTML
* "text" - Plain text of the comment
@@ -272,6 +284,12 @@ class InfoExtractor(object):
* "parent" - ID of the comment this one is replying to.
Set to "root" to indicate that this is a
comment to the original video.
+ * "like_count" - Number of positive ratings of the comment
+ * "dislike_count" - Number of negative ratings of the comment
+ * "is_favorited" - Whether the comment is marked as
+ favorite by the video uploader
+ * "author_is_uploader" - Whether the comment is made by
+ the video uploader
age_limit: Age restriction for the video, as an integer (years)
webpage_url: The URL to the video webpage, if given to hypervideo it
should allow to get the same result again. (It will be set
@@ -279,8 +297,13 @@ class InfoExtractor(object):
categories: A list of categories that the video falls in, for example
["Sports", "Berlin"]
tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
+ cast: A list of the video cast
is_live: True, False, or None (=unknown). Whether this video is a
live stream that goes on instead of a fixed-length video.
+ was_live: True, False, or None (=unknown). Whether this video was
+ originally a live stream.
+ live_status: 'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
+ If absent, automatically set from is_live, was_live
start_time: Time in seconds where the reproduction should start, as
specified in the URL.
end_time: Time in seconds where the reproduction should end, as
@@ -289,6 +312,22 @@ class InfoExtractor(object):
* "start_time" - The start time of the chapter in seconds
* "end_time" - The end time of the chapter in seconds
* "title" (optional, string)
+ playable_in_embed: Whether this video is allowed to play in embedded
+ players on other sites. Can be True (=always allowed),
+ False (=never allowed), None (=unknown), or a string
+ specifying the criteria for embedability (Eg: 'whitelist')
+ availability: Under what condition the video is available. One of
+ 'private', 'premium_only', 'subscriber_only', 'needs_auth',
+ 'unlisted' or 'public'. Use 'InfoExtractor._availability'
+ to set it
+ __post_extractor: A function to be called just before the metadata is
+ written to either disk, logger or console. The function
+ must return a dict which will be added to the info_dict.
+ This is useful for additional information that is
+ time-consuming to extract. Note that the fields thus
+ extracted will not be available to output template and
+ match_filter. So, only "comments" and "comment_count" are
+ currently allowed to be extracted via this method.
The following fields should only be used when the video belongs to some logical
chapter or section:
@@ -337,9 +376,8 @@ class InfoExtractor(object):
There must be a key "entries", which is a list, an iterable, or a PagedList
object, each element of which is a valid dictionary by this specification.
- Additionally, playlists can have "id", "title", "description", "uploader",
- "uploader_id", "uploader_url", "duration" attributes with the same semantics
- as videos (see above).
+ Additionally, playlists can have "id", "title", and any other relevant
+ attributes with the same semantics as videos (see above).
_type "multi_video" indicates that there are multiple videos that
@@ -370,6 +408,10 @@ class InfoExtractor(object):
_real_extract() methods and define a _VALID_URL regexp.
Probably, they should also be added to the list of extractors.
+ Subclasses may also override suitable() if necessary, but ensure the function
+ signature is preserved and that this function imports everything it needs
+ (except other extractors), so that lazy_extractors works correctly
+
_GEO_BYPASS attribute may be set to False in order to disable
geo restriction bypass mechanisms for a particular extractor.
Though it won't disable explicit geo restriction bypass based on
@@ -385,7 +427,7 @@ class InfoExtractor(object):
will be used by geo restriction bypass mechanism similarly
to _GEO_COUNTRIES.
- Finally, the _WORKING attribute should be set to False for broken IEs
+ The _WORKING attribute should be set to False for broken IEs
in order to warn the users and skip the tests.
"""
@@ -397,30 +439,47 @@ class InfoExtractor(object):
_GEO_IP_BLOCKS = None
_WORKING = True
+ _LOGIN_HINTS = {
+ 'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
+ 'cookies': (
+ 'Use --cookies-from-browser or --cookies for the authentication. '
+ 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
+ 'password': 'Use --username and --password or --netrc to provide account credentials',
+ }
+
def __init__(self, downloader=None):
"""Constructor. Receives an optional downloader."""
self._ready = False
self._x_forwarded_for_ip = None
+ self._printed_messages = set()
self.set_downloader(downloader)
@classmethod
- def suitable(cls, url):
- """Receives a URL and returns True if suitable for this IE."""
-
+ def _match_valid_url(cls, url):
# This does not use has/getattr intentionally - we want to know whether
# we have cached the regexp for *this* class, whereas getattr would also
# match the superclass
if '_VALID_URL_RE' not in cls.__dict__:
cls._VALID_URL_RE = re.compile(cls._VALID_URL)
- return cls._VALID_URL_RE.match(url) is not None
+ return cls._VALID_URL_RE.match(url)
+
+ @classmethod
+ def suitable(cls, url):
+ """Receives a URL and returns True if suitable for this IE."""
+ # This function must import everything it needs (except other extractors),
+ # so that lazy_extractors works correctly
+ return cls._match_valid_url(url) is not None
@classmethod
def _match_id(cls, url):
- if '_VALID_URL_RE' not in cls.__dict__:
- cls._VALID_URL_RE = re.compile(cls._VALID_URL)
- m = cls._VALID_URL_RE.match(url)
- assert m
- return compat_str(m.group('id'))
+ return cls._match_valid_url(url).group('id')
+
+ @classmethod
+ def get_temp_id(cls, url):
+ try:
+ return cls._match_id(url)
+ except (IndexError, AttributeError):
+ return None
@classmethod
def working(cls):
@@ -429,6 +488,7 @@ class InfoExtractor(object):
def initialize(self):
"""Initializes an instance (authentication, etc)."""
+ self._printed_messages = set()
self._initialize_geo_bypass({
'countries': self._GEO_COUNTRIES,
'ip_blocks': self._GEO_IP_BLOCKS,
@@ -466,7 +526,7 @@ class InfoExtractor(object):
if not self._x_forwarded_for_ip:
# Geo bypass mechanism is explicitly disabled by user
- if not self._downloader.params.get('geo_bypass', True):
+ if not self.get_param('geo_bypass', True):
return
if not geo_bypass_context:
@@ -488,7 +548,7 @@ class InfoExtractor(object):
# Explicit IP block specified by user, use it right away
# regardless of whether extractor is geo bypassable or not
- ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
+ ip_block = self.get_param('geo_bypass_ip_block', None)
# Otherwise use random IP block from geo bypass context but only
# if extractor is known as geo bypassable
@@ -499,17 +559,15 @@ class InfoExtractor(object):
if ip_block:
self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
- if self._downloader.params.get('verbose', False):
- self._downloader.to_screen(
- '[debug] Using fake IP %s as X-Forwarded-For.'
- % self._x_forwarded_for_ip)
+ self._downloader.write_debug(
+ '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
return
# Path 2: bypassing based on country code
# Explicit country code specified by user, use it right away
# regardless of whether extractor is geo bypassable or not
- country = self._downloader.params.get('geo_bypass_country', None)
+ country = self.get_param('geo_bypass_country', None)
# Otherwise use random country code from geo bypass context but
# only if extractor is known as geo bypassable
@@ -520,10 +578,8 @@ class InfoExtractor(object):
if country:
self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
- if self._downloader.params.get('verbose', False):
- self._downloader.to_screen(
- '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
- % (self._x_forwarded_for_ip, country.upper()))
+ self._downloader.write_debug(
+ 'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
def extract(self, url):
"""Extracts URL information and returns it in list of dicts."""
@@ -531,25 +587,34 @@ class InfoExtractor(object):
for _ in range(2):
try:
self.initialize()
+ self.write_debug('Extracting URL: %s' % url)
ie_result = self._real_extract(url)
+ if ie_result is None:
+ return None
if self._x_forwarded_for_ip:
ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
+ subtitles = ie_result.get('subtitles')
+ if (subtitles and 'live_chat' in subtitles
+ and 'no-live-chat' in self.get_param('compat_opts', [])):
+ del subtitles['live_chat']
return ie_result
except GeoRestrictedError as e:
if self.__maybe_fake_ip_and_retry(e.countries):
continue
raise
- except ExtractorError:
- raise
+ except ExtractorError as e:
+ video_id = e.video_id or self.get_temp_id(url)
+ raise ExtractorError(
+ e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause)
except compat_http_client.IncompleteRead as e:
- raise ExtractorError('A network error has occurred.', cause=e, expected=True)
+ raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
except (KeyError, StopIteration) as e:
- raise ExtractorError('An extractor error has occurred.', cause=e)
+ raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
def __maybe_fake_ip_and_retry(self, countries):
- if (not self._downloader.params.get('geo_bypass_country', None)
+ if (not self.get_param('geo_bypass_country', None)
and self._GEO_BYPASS
- and self._downloader.params.get('geo_bypass', True)
+ and self.get_param('geo_bypass', True)
and not self._x_forwarded_for_ip
and countries):
country_code = random.choice(countries)
@@ -576,7 +641,7 @@ class InfoExtractor(object):
@classmethod
def ie_key(cls):
"""A string for getting the InfoExtractor with get_info_extractor"""
- return compat_str(cls.__name__[:-2])
+ return cls.__name__[:-2]
@property
def IE_NAME(self):
@@ -587,14 +652,10 @@ class InfoExtractor(object):
assert isinstance(err, compat_urllib_error.HTTPError)
if expected_status is None:
return False
- if isinstance(expected_status, compat_integer_types):
- return err.code == expected_status
- elif isinstance(expected_status, (list, tuple)):
- return err.code in expected_status
elif callable(expected_status):
return expected_status(err.code) is True
else:
- assert False
+ return err.code in variadic(expected_status)
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
"""
@@ -602,6 +663,14 @@ class InfoExtractor(object):
See _download_webpage docstring for arguments specification.
"""
+ if not self._downloader._first_webpage_request:
+ sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
+ if sleep_interval > 0:
+ self.to_screen('Sleeping %s seconds ...' % sleep_interval)
+ time.sleep(sleep_interval)
+ else:
+ self._downloader._first_webpage_request = False
+
if note is None:
self.report_download_webpage(video_id)
elif note is not False:
@@ -627,12 +696,9 @@ class InfoExtractor(object):
url_or_request = update_url_query(url_or_request, query)
if data is not None or headers:
url_or_request = sanitized_Request(url_or_request, data, headers)
- exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
- if hasattr(ssl, 'CertificateError'):
- exceptions.append(ssl.CertificateError)
try:
return self._downloader.urlopen(url_or_request)
- except tuple(exceptions) as err:
+ except network_exceptions as err:
if isinstance(err, compat_urllib_error.HTTPError):
if self.__can_accept_status_code(err, expected_status):
# Retain reference to error to prevent file object from
@@ -651,7 +717,7 @@ class InfoExtractor(object):
if fatal:
raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
else:
- self._downloader.report_warning(errmsg)
+ self.report_warning(errmsg)
return False
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
@@ -723,15 +789,16 @@ class InfoExtractor(object):
webpage_bytes = prefix + webpage_bytes
if not encoding:
encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
- if self._downloader.params.get('dump_intermediate_pages', False):
+ if self.get_param('dump_intermediate_pages', False):
self.to_screen('Dumping request to ' + urlh.geturl())
dump = base64.b64encode(webpage_bytes).decode('ascii')
self._downloader.to_screen(dump)
- if self._downloader.params.get('write_pages', False):
+ if self.get_param('write_pages', False):
basen = '%s_%s' % (video_id, urlh.geturl())
- if len(basen) > 240:
+ trim_length = self.get_param('trim_file_name') or 240
+ if len(basen) > trim_length:
h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
- basen = basen[:240 - len(h)] + h
+ basen = basen[:trim_length - len(h)] + h
raw_filename = basen + '.dump'
filename = sanitize_filename(raw_filename, restricted=True)
self.to_screen('Saving request to ' + filename)
@@ -911,14 +978,72 @@ class InfoExtractor(object):
else:
self.report_warning(errmsg + str(ve))
- def report_warning(self, msg, video_id=None):
- idstr = '' if video_id is None else '%s: ' % video_id
- self._downloader.report_warning(
- '[%s] %s%s' % (self.IE_NAME, idstr, msg))
+ def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
+ return self._parse_json(
+ data[data.find('{'):data.rfind('}') + 1],
+ video_id, transform_source, fatal)
+
+ def _download_socket_json_handle(
+ self, url_or_request, video_id, note='Polling socket',
+ errnote='Unable to poll socket', transform_source=None,
+ fatal=True, encoding=None, data=None, headers={}, query={},
+ expected_status=None):
+ """
+ Return a tuple (JSON object, URL handle).
+
+ See _download_webpage docstring for arguments specification.
+ """
+ res = self._download_webpage_handle(
+ url_or_request, video_id, note, errnote, fatal=fatal,
+ encoding=encoding, data=data, headers=headers, query=query,
+ expected_status=expected_status)
+ if res is False:
+ return res
+ webpage, urlh = res
+ return self._parse_socket_response_as_json(
+ webpage, video_id, transform_source=transform_source,
+ fatal=fatal), urlh
+
+ def _download_socket_json(
+ self, url_or_request, video_id, note='Polling socket',
+ errnote='Unable to poll socket', transform_source=None,
+ fatal=True, encoding=None, data=None, headers={}, query={},
+ expected_status=None):
+ """
+ Return the JSON object as a dict.
+
+ See _download_webpage docstring for arguments specification.
+ """
+ res = self._download_socket_json_handle(
+ url_or_request, video_id, note=note, errnote=errnote,
+ transform_source=transform_source, fatal=fatal, encoding=encoding,
+ data=data, headers=headers, query=query,
+ expected_status=expected_status)
+ return res if res is False else res[0]
+
+ def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
+ idstr = format_field(video_id, template='%s: ')
+ msg = f'[{self.IE_NAME}] {idstr}{msg}'
+ if only_once:
+ if f'WARNING: {msg}' in self._printed_messages:
+ return
+ self._printed_messages.add(f'WARNING: {msg}')
+ self._downloader.report_warning(msg, *args, **kwargs)
- def to_screen(self, msg):
+ def to_screen(self, msg, *args, **kwargs):
"""Print msg to screen, prefixing it with '[ie_name]'"""
- self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
+ self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
+
+ def write_debug(self, msg, *args, **kwargs):
+ self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
+
+ def get_param(self, name, default=None, *args, **kwargs):
+ if self._downloader:
+ return self._downloader.params.get(name, default, *args, **kwargs)
+ return default
+
+ def report_drm(self, video_id, partial=False):
+ self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
def report_extraction(self, id_or_name):
"""Report information extraction."""
@@ -936,24 +1061,40 @@ class InfoExtractor(object):
"""Report attempt to log in."""
self.to_screen('Logging in')
- @staticmethod
- def raise_login_required(msg='This video is only available for registered users'):
- raise ExtractorError(
- '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
- expected=True)
+ def raise_login_required(
+ self, msg='This video is only available for registered users',
+ metadata_available=False, method='any'):
+ if metadata_available and self.get_param('ignore_no_formats_error'):
+ self.report_warning(msg)
+ if method is not None:
+ msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
+ raise ExtractorError(msg, expected=True)
+
+ def raise_geo_restricted(
+ self, msg='This video is not available from your location due to geo restriction',
+ countries=None, metadata_available=False):
+ if metadata_available and self.get_param('ignore_no_formats_error'):
+ self.report_warning(msg)
+ else:
+ raise GeoRestrictedError(msg, countries=countries)
- @staticmethod
- def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
- raise GeoRestrictedError(msg, countries=countries)
+ def raise_no_formats(self, msg, expected=False, video_id=None):
+ if expected and self.get_param('ignore_no_formats_error'):
+ self.report_warning(msg, video_id)
+ elif isinstance(msg, ExtractorError):
+ raise msg
+ else:
+ raise ExtractorError(msg, expected=expected, video_id=video_id)
# Methods for following #608
@staticmethod
- def url_result(url, ie=None, video_id=None, video_title=None):
+ def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
"""Returns a URL that points to a page that should be processed"""
# TODO: ie should be the class used for getting the info
video_info = {'_type': 'url',
'url': url,
'ie_key': ie}
+ video_info.update(kwargs)
if video_id is not None:
video_info['id'] = video_id
if video_title is not None:
@@ -968,15 +1109,16 @@ class InfoExtractor(object):
urls, playlist_id=playlist_id, playlist_title=playlist_title)
@staticmethod
- def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
+ def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
"""Returns a playlist"""
video_info = {'_type': 'playlist',
'entries': entries}
+ video_info.update(kwargs)
if playlist_id:
video_info['id'] = playlist_id
if playlist_title:
video_info['title'] = playlist_title
- if playlist_description:
+ if playlist_description is not None:
video_info['description'] = playlist_description
return video_info
@@ -995,15 +1137,14 @@ class InfoExtractor(object):
if mobj:
break
- if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
- _name = '\033[0;34m%s\033[0m' % name
- else:
- _name = name
+ _name = self._downloader._color_text(name, 'blue')
if mobj:
if group is None:
# return the first matching group
return next(g for g in mobj.groups() if g is not None)
+ elif isinstance(group, (list, tuple)):
+ return tuple(mobj.group(g) for g in group)
else:
return mobj.group(group)
elif default is not NO_DEFAULT:
@@ -1011,7 +1152,7 @@ class InfoExtractor(object):
elif fatal:
raise RegexNotFoundError('Unable to extract %s' % _name)
else:
- self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
+ self.report_warning('unable to extract %s' % _name + bug_reports_message())
return None
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
@@ -1029,9 +1170,12 @@ class InfoExtractor(object):
password = None
netrc_machine = netrc_machine or self._NETRC_MACHINE
- if self._downloader.params.get('usenetrc', False):
+ if self.get_param('usenetrc', False):
try:
- info = netrc.netrc().authenticators(netrc_machine)
+ netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
+ if os.path.isdir(netrc_file):
+ netrc_file = os.path.join(netrc_file, '.netrc')
+ info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
if info is not None:
username = info[0]
password = info[2]
@@ -1039,7 +1183,7 @@ class InfoExtractor(object):
raise netrc.NetrcParseError(
'No authenticators for %s' % netrc_machine)
except (IOError, netrc.NetrcParseError) as err:
- self._downloader.report_warning(
+ self.report_warning(
'parsing .netrc: %s' % error_to_compat_str(err))
return username, password
@@ -1053,15 +1197,11 @@ class InfoExtractor(object):
value.
If there's no info available, return (None, None)
"""
- if self._downloader is None:
- return (None, None)
-
- downloader_params = self._downloader.params
# Attempt to use provided username and password or .netrc data
- if downloader_params.get(username_option) is not None:
- username = downloader_params[username_option]
- password = downloader_params[password_option]
+ username = self.get_param(username_option)
+ if username is not None:
+ password = self.get_param(password_option)
else:
username, password = self._get_netrc_login_info(netrc_machine)
@@ -1074,12 +1214,10 @@ class InfoExtractor(object):
currently just uses the command line option
If there's no info available, return None
"""
- if self._downloader is None:
- return None
- downloader_params = self._downloader.params
- if downloader_params.get('twofactor') is not None:
- return downloader_params['twofactor']
+ tfa = self.get_param('twofactor')
+ if tfa is not None:
+ return tfa
return compat_getpass('Type %s and press [Return]: ' % note)
@@ -1102,8 +1240,7 @@ class InfoExtractor(object):
[^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
def _og_search_property(self, prop, html, name=None, **kargs):
- if not isinstance(prop, (list, tuple)):
- prop = [prop]
+ prop = variadic(prop)
if name is None:
name = 'OpenGraph %s' % prop[0]
og_regexes = []
@@ -1133,8 +1270,7 @@ class InfoExtractor(object):
return self._og_search_property('url', html, **kargs)
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
- if not isinstance(name, (list, tuple)):
- name = [name]
+ name = variadic(name)
if display_name is None:
display_name = name[0]
return self._html_search_regex(
@@ -1194,7 +1330,7 @@ class InfoExtractor(object):
# JSON-LD may be malformed and thus `fatal` should be respected.
# At the same time `default` may be passed that assumes `fatal=False`
# for _search_regex. Let's simulate the same behavior here as well.
- fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
+ fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
json_ld = []
for mobj in json_ld_list:
json_ld_item = self._parse_json(
@@ -1214,7 +1350,7 @@ class InfoExtractor(object):
elif fatal:
raise RegexNotFoundError('Unable to extract JSON-LD')
else:
- self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
+ self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
return {}
def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
@@ -1369,81 +1505,283 @@ class InfoExtractor(object):
html, '%s form' % form_id, group='form')
return self._hidden_inputs(form)
- def _sort_formats(self, formats, field_preference=None):
- if not formats:
- raise ExtractorError('No video formats found')
+ class FormatSort:
+ regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
+
+ default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
+ 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
+ 'proto', 'ext', 'hasaud', 'source', 'format_id') # These must not be aliases
+ ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
+ 'height', 'width', 'proto', 'vext', 'abr', 'aext',
+ 'fps', 'fs_approx', 'source', 'format_id')
+
+ settings = {
+ 'vcodec': {'type': 'ordered', 'regex': True,
+ 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
+ 'acodec': {'type': 'ordered', 'regex': True,
+ 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
+ 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
+ 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
+ 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
+ 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
+ 'vext': {'type': 'ordered', 'field': 'video_ext',
+ 'order': ('mp4', 'webm', 'flv', '', 'none'),
+ 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
+ 'aext': {'type': 'ordered', 'field': 'audio_ext',
+ 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
+ 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
+ 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
+ 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
+ 'field': ('vcodec', 'acodec'),
+ 'function': lambda it: int(any(v != 'none' for v in it))},
+ 'ie_pref': {'priority': True, 'type': 'extractor'},
+ 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
+ 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
+ 'lang': {'convert': 'ignore', 'field': 'language_preference'},
+ 'quality': {'convert': 'float_none', 'default': -1},
+ 'filesize': {'convert': 'bytes'},
+ 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
+ 'id': {'convert': 'string', 'field': 'format_id'},
+ 'height': {'convert': 'float_none'},
+ 'width': {'convert': 'float_none'},
+ 'fps': {'convert': 'float_none'},
+ 'tbr': {'convert': 'float_none'},
+ 'vbr': {'convert': 'float_none'},
+ 'abr': {'convert': 'float_none'},
+ 'asr': {'convert': 'float_none'},
+ 'source': {'convert': 'ignore', 'field': 'source_preference'},
+
+ 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
+ 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
+ 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
+ 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
+ 'res': {'type': 'multiple', 'field': ('height', 'width'),
+ 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
+
+ # Most of these aliases exist only for backward compatibility with older sort-field names
+ 'dimension': {'type': 'alias', 'field': 'res'},
+ 'resolution': {'type': 'alias', 'field': 'res'},
+ 'extension': {'type': 'alias', 'field': 'ext'},
+ 'bitrate': {'type': 'alias', 'field': 'br'},
+ 'total_bitrate': {'type': 'alias', 'field': 'tbr'},
+ 'video_bitrate': {'type': 'alias', 'field': 'vbr'},
+ 'audio_bitrate': {'type': 'alias', 'field': 'abr'},
+ 'framerate': {'type': 'alias', 'field': 'fps'},
+ 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists
+ 'protocol': {'type': 'alias', 'field': 'proto'},
+ 'source_preference': {'type': 'alias', 'field': 'source'},
+ 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
+ 'filesize_estimate': {'type': 'alias', 'field': 'size'},
+ 'samplerate': {'type': 'alias', 'field': 'asr'},
+ 'video_ext': {'type': 'alias', 'field': 'vext'},
+ 'audio_ext': {'type': 'alias', 'field': 'aext'},
+ 'video_codec': {'type': 'alias', 'field': 'vcodec'},
+ 'audio_codec': {'type': 'alias', 'field': 'acodec'},
+ 'video': {'type': 'alias', 'field': 'hasvid'},
+ 'has_video': {'type': 'alias', 'field': 'hasvid'},
+ 'audio': {'type': 'alias', 'field': 'hasaud'},
+ 'has_audio': {'type': 'alias', 'field': 'hasaud'},
+ 'extractor': {'type': 'alias', 'field': 'ie_pref'},
+ 'preference': {'type': 'alias', 'field': 'ie_pref'},
+ 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
+ 'format_id': {'type': 'alias', 'field': 'id'},
+ }
- for f in formats:
- # Automatically determine tbr when missing based on abr and vbr (improves
- # formats sorting in some cases)
- if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
- f['tbr'] = f['abr'] + f['vbr']
-
- def _formats_key(f):
- # TODO remove the following workaround
- from ..utils import determine_ext
- if not f.get('ext') and 'url' in f:
- f['ext'] = determine_ext(f['url'])
-
- if isinstance(field_preference, (list, tuple)):
- return tuple(
- f.get(field)
- if f.get(field) is not None
- else ('' if field == 'format_id' else -1)
- for field in field_preference)
-
- preference = f.get('preference')
- if preference is None:
- preference = 0
- if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
- preference -= 0.5
-
- protocol = f.get('protocol') or determine_protocol(f)
- proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
-
- if f.get('vcodec') == 'none': # audio only
- preference -= 50
- if self._downloader.params.get('prefer_free_formats'):
- ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
+ _order = []
+
+ def _get_field_setting(self, field, key):
+ if field not in self.settings:
+ self.settings[field] = {}
+ propObj = self.settings[field]
+ if key not in propObj:
+ type = propObj.get('type')
+ if key == 'field':
+ default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
+ elif key == 'convert':
+ default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
else:
- ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
- ext_preference = 0
- try:
- audio_ext_preference = ORDER.index(f['ext'])
- except ValueError:
- audio_ext_preference = -1
+ default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
+ propObj[key] = default
+ return propObj[key]
+
+ def _resolve_field_value(self, field, value, convertNone=False):
+ if value is None:
+ if not convertNone:
+ return None
+ else:
+ value = value.lower()
+ conversion = self._get_field_setting(field, 'convert')
+ if conversion == 'ignore':
+ return None
+ if conversion == 'string':
+ return value
+ elif conversion == 'float_none':
+ return float_or_none(value)
+ elif conversion == 'bytes':
+ return FileDownloader.parse_bytes(value)
+ elif conversion == 'order':
+ order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
+ use_regex = self._get_field_setting(field, 'regex')
+ list_length = len(order_list)
+ empty_pos = order_list.index('') if '' in order_list else list_length + 1
+ if use_regex and value is not None:
+ for i, regex in enumerate(order_list):
+ if regex and re.match(regex, value):
+ return list_length - i
+ return list_length - empty_pos # not in list
+ else: # not regex or value = None
+ return list_length - (order_list.index(value) if value in order_list else empty_pos)
else:
- if f.get('acodec') == 'none': # video only
- preference -= 40
- if self._downloader.params.get('prefer_free_formats'):
- ORDER = ['flv', 'mp4', 'webm']
+ if value.isnumeric():
+ return float(value)
else:
- ORDER = ['webm', 'flv', 'mp4']
- try:
- ext_preference = ORDER.index(f['ext'])
- except ValueError:
- ext_preference = -1
- audio_ext_preference = 0
-
- return (
- preference,
- f.get('language_preference') if f.get('language_preference') is not None else -1,
- f.get('quality') if f.get('quality') is not None else -1,
- f.get('tbr') if f.get('tbr') is not None else -1,
- f.get('filesize') if f.get('filesize') is not None else -1,
- f.get('vbr') if f.get('vbr') is not None else -1,
- f.get('height') if f.get('height') is not None else -1,
- f.get('width') if f.get('width') is not None else -1,
- proto_preference,
- ext_preference,
- f.get('abr') if f.get('abr') is not None else -1,
- audio_ext_preference,
- f.get('fps') if f.get('fps') is not None else -1,
- f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
- f.get('source_preference') if f.get('source_preference') is not None else -1,
- f.get('format_id') if f.get('format_id') is not None else '',
- )
- formats.sort(key=_formats_key)
+ self.settings[field]['convert'] = 'string'
+ return value
+
+ def evaluate_params(self, params, sort_extractor):
+ self._use_free_order = params.get('prefer_free_formats', False)
+ self._sort_user = params.get('format_sort', [])
+ self._sort_extractor = sort_extractor
+
+ def add_item(field, reverse, closest, limit_text):
+ field = field.lower()
+ if field in self._order:
+ return
+ self._order.append(field)
+ limit = self._resolve_field_value(field, limit_text)
+ data = {
+ 'reverse': reverse,
+ 'closest': False if limit is None else closest,
+ 'limit_text': limit_text,
+ 'limit': limit}
+ if field in self.settings:
+ self.settings[field].update(data)
+ else:
+ self.settings[field] = data
+
+ sort_list = (
+ tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
+ + (tuple() if params.get('format_sort_force', False)
+ else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
+ + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
+
+ for item in sort_list:
+ match = re.match(self.regex, item)
+ if match is None:
+ raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
+ field = match.group('field')
+ if field is None:
+ continue
+ if self._get_field_setting(field, 'type') == 'alias':
+ field = self._get_field_setting(field, 'field')
+ reverse = match.group('reverse') is not None
+ closest = match.group('separator') == '~'
+ limit_text = match.group('limit')
+
+ has_limit = limit_text is not None
+ has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
+ has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
+
+ fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
+ limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
+ limit_count = len(limits)
+ for (i, f) in enumerate(fields):
+ add_item(f, reverse, closest,
+ limits[i] if i < limit_count
+ else limits[0] if has_limit and not has_multiple_limits
+ else None)
+
+ def print_verbose_info(self, write_debug):
+ if self._sort_user:
+ write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
+ if self._sort_extractor:
+ write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
+ write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
+ '+' if self._get_field_setting(field, 'reverse') else '', field,
+ '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
+ self._get_field_setting(field, 'limit_text'),
+ self._get_field_setting(field, 'limit'))
+ if self._get_field_setting(field, 'limit_text') is not None else '')
+ for field in self._order if self._get_field_setting(field, 'visible')]))
+
+ def _calculate_field_preference_from_value(self, format, field, type, value):
+ reverse = self._get_field_setting(field, 'reverse')
+ closest = self._get_field_setting(field, 'closest')
+ limit = self._get_field_setting(field, 'limit')
+
+ if type == 'extractor':
+ maximum = self._get_field_setting(field, 'max')
+ if value is None or (maximum is not None and value >= maximum):
+ value = -1
+ elif type == 'boolean':
+ in_list = self._get_field_setting(field, 'in_list')
+ not_in_list = self._get_field_setting(field, 'not_in_list')
+ value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
+ elif type == 'ordered':
+ value = self._resolve_field_value(field, value, True)
+
+ # try to convert to number
+ val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
+ is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
+ if is_num:
+ value = val_num
+
+ return ((-10, 0) if value is None
+ else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
+ else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
+ else (0, value, 0) if not reverse and (limit is None or value <= limit)
+ else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
+ else (-1, value, 0))
+
+ def _calculate_field_preference(self, format, field):
+ type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
+ get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
+ if type == 'multiple':
+ type = 'field' # Only 'field' is allowed in multiple for now
+ actual_fields = self._get_field_setting(field, 'field')
+
+ value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
+ else:
+ value = get_value(field)
+ return self._calculate_field_preference_from_value(format, field, type, value)
+
+ def calculate_preference(self, format):
+ # Determine missing protocol
+ if not format.get('protocol'):
+ format['protocol'] = determine_protocol(format)
+
+ # Determine missing ext
+ if not format.get('ext') and 'url' in format:
+ format['ext'] = determine_ext(format['url'])
+ if format.get('vcodec') == 'none':
+ format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
+ format['video_ext'] = 'none'
+ else:
+ format['video_ext'] = format['ext']
+ format['audio_ext'] = 'none'
+ # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
+ # format['preference'] = -1000
+
+ # Determine missing bitrates
+ if format.get('tbr') is None:
+ if format.get('vbr') is not None and format.get('abr') is not None:
+ format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
+ else:
+ if format.get('vcodec') != 'none' and format.get('vbr') is None:
+ format['vbr'] = format.get('tbr') - format.get('abr', 0)
+ if format.get('acodec') != 'none' and format.get('abr') is None:
+ format['abr'] = format.get('tbr') - format.get('vbr', 0)
+
+ return tuple(self._calculate_field_preference(format, field) for field in self._order)
+
+ def _sort_formats(self, formats, field_preference=[]):
+ if not formats:
+ return
+ format_sort = self.FormatSort() # params and to_screen are taken from the downloader
+ format_sort.evaluate_params(self._downloader.params, field_preference)
+ if self.get_param('verbose', False):
+ format_sort.print_verbose_info(self._downloader.write_debug)
+ formats.sort(key=lambda f: format_sort.calculate_preference(f))
def _check_formats(self, formats, video_id):
if formats:
@@ -1481,7 +1819,7 @@ class InfoExtractor(object):
""" Either "http:" or "https:", depending on the user's preferences """
return (
'http:'
- if self._downloader.params.get('prefer_insecure', False)
+ if self.get_param('prefer_insecure', False)
else 'https:')
def _proto_relative_url(self, url, scheme=None):
@@ -1501,7 +1839,7 @@ class InfoExtractor(object):
self.to_screen(msg)
time.sleep(timeout)
- def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
+ def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
transform_source=lambda s: fix_xml_ampersands(s).strip(),
fatal=True, m3u8_id=None, data=None, headers={}, query={}):
manifest = self._download_xml(
@@ -1516,10 +1854,10 @@ class InfoExtractor(object):
return []
return self._parse_f4m_formats(
- manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
+ manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
- def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
+ def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
transform_source=lambda s: fix_xml_ampersands(s).strip(),
fatal=True, m3u8_id=None):
if not isinstance(manifest, compat_etree_Element) and not fatal:
@@ -1584,7 +1922,7 @@ class InfoExtractor(object):
ext = determine_ext(manifest_url)
if ext == 'f4m':
f4m_formats = self._extract_f4m_formats(
- manifest_url, video_id, preference=preference, f4m_id=f4m_id,
+ manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
transform_source=transform_source, fatal=fatal)
# Sometimes stream-level manifest contains single media entry that
# does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
@@ -1604,7 +1942,7 @@ class InfoExtractor(object):
elif ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
manifest_url, video_id, 'mp4', preference=preference,
- m3u8_id=m3u8_id, fatal=fatal))
+ quality=quality, m3u8_id=m3u8_id, fatal=fatal))
continue
formats.append({
'format_id': format_id,
@@ -1617,56 +1955,88 @@ class InfoExtractor(object):
'height': height,
'vcodec': vcodec,
'preference': preference,
+ 'quality': quality,
})
return formats
- def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
+ def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
return {
'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
'url': m3u8_url,
'ext': ext,
'protocol': 'm3u8',
'preference': preference - 100 if preference else -100,
+ 'quality': quality,
'resolution': 'multiple',
'format_note': 'Quality selection URL',
}
- def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
- entry_protocol='m3u8', preference=None,
- m3u8_id=None, note=None, errnote=None,
- fatal=True, live=False, data=None, headers={},
- query={}):
+ def _report_ignoring_subs(self, name):
+ self.report_warning(bug_reports_message(
+ f'Ignoring subtitle tracks found in the {name} manifest; '
+ 'if any subtitle tracks are missing,'
+ ), only_once=True)
+
+ def _extract_m3u8_formats(self, *args, **kwargs):
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
+ if subs:
+ self._report_ignoring_subs('HLS')
+ return fmts
+
+ def _extract_m3u8_formats_and_subtitles(
+ self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
+ preference=None, quality=None, m3u8_id=None, note=None,
+ errnote=None, fatal=True, live=False, data=None, headers={},
+ query={}):
+
res = self._download_webpage_handle(
m3u8_url, video_id,
- note=note or 'Downloading m3u8 information',
- errnote=errnote or 'Failed to download m3u8 information',
+ note='Downloading m3u8 information' if note is None else note,
+ errnote='Failed to download m3u8 information' if errnote is None else errnote,
fatal=fatal, data=data, headers=headers, query=query)
if res is False:
- return []
+ return [], {}
m3u8_doc, urlh = res
m3u8_url = urlh.geturl()
- return self._parse_m3u8_formats(
+ return self._parse_m3u8_formats_and_subtitles(
m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
- preference=preference, m3u8_id=m3u8_id, live=live)
+ preference=preference, quality=quality, m3u8_id=m3u8_id,
+ note=note, errnote=errnote, fatal=fatal, live=live, data=data,
+ headers=headers, query=query, video_id=video_id)
+
+ def _parse_m3u8_formats_and_subtitles(
+ self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
+ preference=None, quality=None, m3u8_id=None, live=False, note=None,
+ errnote=None, fatal=True, data=None, headers={}, query={},
+ video_id=None):
+ formats, subtitles = [], {}
- def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
- entry_protocol='m3u8', preference=None,
- m3u8_id=None, live=False):
if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
- return []
+ return formats, subtitles
- if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay
- return []
+ has_drm = re.search(r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', m3u8_doc)
- formats = []
+ def format_url(url):
+ return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
- format_url = lambda u: (
- u
- if re.match(r'^https?://', u)
- else compat_urlparse.urljoin(m3u8_url, u))
+ if self.get_param('hls_split_discontinuity', False):
+ def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
+ if not m3u8_doc:
+ if not manifest_url:
+ return []
+ m3u8_doc = self._download_webpage(
+ manifest_url, video_id, fatal=fatal, data=data, headers=headers,
+ note=False, errnote='Failed to download m3u8 playlist information')
+ if m3u8_doc is False:
+ return []
+ return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
+
+ else:
+ def _extract_m3u8_playlist_indices(*args, **kwargs):
+ return [None]
# References:
# 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
@@ -1685,13 +2055,18 @@ class InfoExtractor(object):
# clearly detect media playlist with this criterion.
if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
- return [{
+ formats = [{
+ 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
+ 'format_index': idx,
'url': m3u8_url,
- 'format_id': m3u8_id,
'ext': ext,
'protocol': entry_protocol,
'preference': preference,
- }]
+ 'quality': quality,
+ 'has_drm': has_drm,
+ } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
+
+ return formats, subtitles
groups = {}
last_stream_inf = {}
@@ -1703,26 +2078,45 @@ class InfoExtractor(object):
if not (media_type and group_id and name):
return
groups.setdefault(group_id, []).append(media)
+ # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
+ if media_type == 'SUBTITLES':
+ # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
+ # EXT-X-MEDIA tag if the media type is SUBTITLES.
+ # However, EXT-X-MEDIA tags lacking the URI attribute have been spotted in the wild.
+ # e.g. NebulaIE; see https://github.com/hypervideo/hypervideo/issues/339
+ if not media.get('URI'):
+ return
+ url = format_url(media['URI'])
+ sub_info = {
+ 'url': url,
+ 'ext': determine_ext(url),
+ }
+ if sub_info['ext'] == 'm3u8':
+ # Per RFC 8216 §3.1, WebVTT is the only subtitle format
+ # that an m3u8 file may contain:
+ # <https://tools.ietf.org/html/rfc8216#section-3.1>
+ sub_info['ext'] = 'vtt'
+ sub_info['protocol'] = 'm3u8_native'
+ lang = media.get('LANGUAGE') or 'und'
+ subtitles.setdefault(lang, []).append(sub_info)
if media_type not in ('VIDEO', 'AUDIO'):
return
media_url = media.get('URI')
if media_url:
- format_id = []
- for v in (m3u8_id, group_id, name):
- if v:
- format_id.append(v)
- f = {
- 'format_id': '-'.join(format_id),
- 'url': format_url(media_url),
+ manifest_url = format_url(media_url)
+ formats.extend({
+ 'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
+ 'format_note': name,
+ 'format_index': idx,
+ 'url': manifest_url,
'manifest_url': m3u8_url,
'language': media.get('LANGUAGE'),
'ext': ext,
'protocol': entry_protocol,
'preference': preference,
- }
- if media_type == 'AUDIO':
- f['vcodec'] = 'none'
- formats.append(f)
+ 'quality': quality,
+ 'vcodec': 'none' if media_type == 'AUDIO' else None,
+ } for idx in _extract_m3u8_playlist_indices(manifest_url))
def build_stream_name():
# Despite specification does not mention NAME attribute for
@@ -1759,76 +2153,99 @@ class InfoExtractor(object):
tbr = float_or_none(
last_stream_inf.get('AVERAGE-BANDWIDTH')
or last_stream_inf.get('BANDWIDTH'), scale=1000)
- format_id = []
- if m3u8_id:
- format_id.append(m3u8_id)
- stream_name = build_stream_name()
- # Bandwidth of live streams may differ over time thus making
- # format_id unpredictable. So it's better to keep provided
- # format_id intact.
- if not live:
- format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
manifest_url = format_url(line.strip())
- f = {
- 'format_id': '-'.join(format_id),
- 'url': manifest_url,
- 'manifest_url': m3u8_url,
- 'tbr': tbr,
- 'ext': ext,
- 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
- 'protocol': entry_protocol,
- 'preference': preference,
- }
- resolution = last_stream_inf.get('RESOLUTION')
- if resolution:
- mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
+
+ for idx in _extract_m3u8_playlist_indices(manifest_url):
+ format_id = [m3u8_id, None, idx]
+ # Bandwidth of live streams may differ over time thus making
+ # format_id unpredictable. So it's better to keep provided
+ # format_id intact.
+ if not live:
+ stream_name = build_stream_name()
+ format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
+ f = {
+ 'format_id': '-'.join(map(str, filter(None, format_id))),
+ 'format_index': idx,
+ 'url': manifest_url,
+ 'manifest_url': m3u8_url,
+ 'tbr': tbr,
+ 'ext': ext,
+ 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
+ 'protocol': entry_protocol,
+ 'preference': preference,
+ 'quality': quality,
+ }
+ resolution = last_stream_inf.get('RESOLUTION')
+ if resolution:
+ mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
+ if mobj:
+ f['width'] = int(mobj.group('width'))
+ f['height'] = int(mobj.group('height'))
+ # Unified Streaming Platform
+ mobj = re.search(
+ r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
if mobj:
- f['width'] = int(mobj.group('width'))
- f['height'] = int(mobj.group('height'))
- # Unified Streaming Platform
- mobj = re.search(
- r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
- if mobj:
- abr, vbr = mobj.groups()
- abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
- f.update({
- 'vbr': vbr,
- 'abr': abr,
- })
- codecs = parse_codecs(last_stream_inf.get('CODECS'))
- f.update(codecs)
- audio_group_id = last_stream_inf.get('AUDIO')
- # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
- # references a rendition group MUST have a CODECS attribute.
- # However, this is not always respected, for example, [2]
- # contains EXT-X-STREAM-INF tag which references AUDIO
- # rendition group but does not have CODECS and despite
- # referencing an audio group it represents a complete
- # (with audio and video) format. So, for such cases we will
- # ignore references to rendition groups and treat them
- # as complete formats.
- if audio_group_id and codecs and f.get('vcodec') != 'none':
- audio_group = groups.get(audio_group_id)
- if audio_group and audio_group[0].get('URI'):
- # TODO: update acodec for audio only formats with
- # the same GROUP-ID
- f['acodec'] = 'none'
- formats.append(f)
-
- # for DailyMotion
- progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
- if progressive_uri:
- http_f = f.copy()
- del http_f['manifest_url']
- http_f.update({
- 'format_id': f['format_id'].replace('hls-', 'http-'),
- 'protocol': 'http',
- 'url': progressive_uri,
- })
- formats.append(http_f)
+ abr, vbr = mobj.groups()
+ abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
+ f.update({
+ 'vbr': vbr,
+ 'abr': abr,
+ })
+ codecs = parse_codecs(last_stream_inf.get('CODECS'))
+ f.update(codecs)
+ audio_group_id = last_stream_inf.get('AUDIO')
+ # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
+ # references a rendition group MUST have a CODECS attribute.
+ # However, this is not always respected, for example, [2]
+ # contains EXT-X-STREAM-INF tag which references AUDIO
+ # rendition group but does not have CODECS and despite
+ # referencing an audio group it represents a complete
+ # (with audio and video) format. So, for such cases we will
+ # ignore references to rendition groups and treat them
+ # as complete formats.
+ if audio_group_id and codecs and f.get('vcodec') != 'none':
+ audio_group = groups.get(audio_group_id)
+ if audio_group and audio_group[0].get('URI'):
+ # TODO: update acodec for audio only formats with
+ # the same GROUP-ID
+ f['acodec'] = 'none'
+ if not f.get('ext'):
+ f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
+ formats.append(f)
+
+ # for DailyMotion
+ progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
+ if progressive_uri:
+ http_f = f.copy()
+ del http_f['manifest_url']
+ http_f.update({
+ 'format_id': f['format_id'].replace('hls-', 'http-'),
+ 'protocol': 'http',
+ 'url': progressive_uri,
+ })
+ formats.append(http_f)
last_stream_inf = {}
- return formats
+ return formats, subtitles
+
+ def _extract_m3u8_vod_duration(
+ self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
+
+ m3u8_vod = self._download_webpage(
+ m3u8_vod_url, video_id,
+ note='Downloading m3u8 VOD manifest' if note is None else note,
+ errnote='Failed to download VOD manifest' if errnote is None else errnote,
+ fatal=False, data=data, headers=headers, query=query)
+
+ return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
+
+ def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
+ if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
+ return None
+
+ return int(sum(
+ float(line[len('#EXTINF:'):].split(',')[0])
+ for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
@staticmethod
def _xpath_ns(path, namespace=None):
@@ -1842,7 +2259,7 @@ class InfoExtractor(object):
out.append('{%s}%s' % (namespace, c))
return '/'.join(out)
- def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
+ def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
if smil is False:
@@ -1851,8 +2268,18 @@ class InfoExtractor(object):
namespace = self._parse_smil_namespace(smil)
- return self._parse_smil_formats(
+ fmts = self._parse_smil_formats(
smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+ subs = self._parse_smil_subtitles(
+ smil, namespace=namespace)
+
+ return fmts, subs
+
+ def _extract_smil_formats(self, *args, **kwargs):
+ fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
+ if subs:
+ self._report_ignoring_subs('SMIL')
+ return fmts
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
smil = self._download_smil(smil_url, video_id, fatal=fatal)
@@ -1921,14 +2348,15 @@ class InfoExtractor(object):
rtmp_count = 0
http_count = 0
m3u8_count = 0
+ imgs_count = 0
- srcs = []
+ srcs = set()
media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
for medium in media:
src = medium.get('src')
if not src or src in srcs:
continue
- srcs.append(src)
+ srcs.add(src)
bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
@@ -2002,6 +2430,24 @@ class InfoExtractor(object):
'height': height,
})
+ for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
+ src = medium.get('src')
+ if not src or src in srcs:
+ continue
+ srcs.add(src)
+
+ imgs_count += 1
+ formats.append({
+ 'format_id': 'imagestream-%d' % (imgs_count),
+ 'url': src,
+ 'ext': mimetype2ext(medium.get('type')),
+ 'acodec': 'none',
+ 'vcodec': 'none',
+ 'width': int_or_none(medium.get('width')),
+ 'height': int_or_none(medium.get('height')),
+ 'format_note': 'SMIL storyboards',
+ })
+
return formats
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
@@ -2071,23 +2517,38 @@ class InfoExtractor(object):
})
return entries
- def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
+ def _extract_mpd_formats(self, *args, **kwargs):
+ fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
+ if subs:
+ self._report_ignoring_subs('DASH')
+ return fmts
+
+ def _extract_mpd_formats_and_subtitles(
+ self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
+ fatal=True, data=None, headers={}, query={}):
res = self._download_xml_handle(
mpd_url, video_id,
- note=note or 'Downloading MPD manifest',
- errnote=errnote or 'Failed to download MPD manifest',
+ note='Downloading MPD manifest' if note is None else note,
+ errnote='Failed to download MPD manifest' if errnote is None else errnote,
fatal=fatal, data=data, headers=headers, query=query)
if res is False:
- return []
+ return [], {}
mpd_doc, urlh = res
if mpd_doc is None:
- return []
+ return [], {}
mpd_base_url = base_url(urlh.geturl())
- return self._parse_mpd_formats(
+ return self._parse_mpd_formats_and_subtitles(
mpd_doc, mpd_id, mpd_base_url, mpd_url)
- def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
+ def _parse_mpd_formats(self, *args, **kwargs):
+ fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
+ if subs:
+ self._report_ignoring_subs('DASH')
+ return fmts
+
+ def _parse_mpd_formats_and_subtitles(
+ self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
"""
Parse formats from MPD manifest.
References:
@@ -2095,8 +2556,9 @@ class InfoExtractor(object):
http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
"""
- if mpd_doc.get('type') == 'dynamic':
- return []
+ if not self.get_param('dynamic_mpd', True):
+ if mpd_doc.get('type') == 'dynamic':
+ return [], {}
namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
@@ -2165,7 +2627,8 @@ class InfoExtractor(object):
return ms_info
mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
- formats = []
+ formats, subtitles = [], {}
+ stream_numbers = {'audio': 0, 'video': 0}
for period in mpd_doc.findall(_add_ns('Period')):
period_duration = parse_duration(period.get('duration')) or mpd_duration
period_ms_info = extract_multisegment_info(period, {
@@ -2173,39 +2636,53 @@ class InfoExtractor(object):
'timescale': 1,
})
for adaptation_set in period.findall(_add_ns('AdaptationSet')):
- if is_drm_protected(adaptation_set):
- continue
adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
for representation in adaptation_set.findall(_add_ns('Representation')):
- if is_drm_protected(representation):
- continue
representation_attrib = adaptation_set.attrib.copy()
representation_attrib.update(representation.attrib)
# According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
mime_type = representation_attrib['mimeType']
- content_type = mime_type.split('/')[0]
- if content_type == 'text':
- # TODO implement WebVTT downloading
- pass
- elif content_type in ('video', 'audio'):
- base_url = ''
- for element in (representation, adaptation_set, period, mpd_doc):
- base_url_e = element.find(_add_ns('BaseURL'))
- if base_url_e is not None:
- base_url = base_url_e.text + base_url
- if re.match(r'^https?://', base_url):
- break
- if mpd_base_url and not re.match(r'^https?://', base_url):
- if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
- mpd_base_url += '/'
- base_url = mpd_base_url + base_url
- representation_id = representation_attrib.get('id')
- lang = representation_attrib.get('lang')
- url_el = representation.find(_add_ns('BaseURL'))
- filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
- bandwidth = int_or_none(representation_attrib.get('bandwidth'))
+ content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
+
+ codecs = representation_attrib.get('codecs', '')
+ if content_type not in ('video', 'audio', 'text'):
+ if mime_type == 'image/jpeg':
+ content_type = mime_type
+ elif codecs.split('.')[0] == 'stpp':
+ content_type = 'text'
+ elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
+ content_type = 'text'
+ else:
+ self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
+ continue
+
+ base_url = ''
+ for element in (representation, adaptation_set, period, mpd_doc):
+ base_url_e = element.find(_add_ns('BaseURL'))
+ if base_url_e is not None:
+ base_url = base_url_e.text + base_url
+ if re.match(r'^https?://', base_url):
+ break
+ if mpd_base_url and base_url.startswith('/'):
+ base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
+ elif mpd_base_url and not re.match(r'^https?://', base_url):
+ if not mpd_base_url.endswith('/'):
+ mpd_base_url += '/'
+ base_url = mpd_base_url + base_url
+ representation_id = representation_attrib.get('id')
+ lang = representation_attrib.get('lang')
+ url_el = representation.find(_add_ns('BaseURL'))
+ filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
+ bandwidth = int_or_none(representation_attrib.get('bandwidth'))
+ if representation_id is not None:
+ format_id = representation_id
+ else:
+ format_id = content_type
+ if mpd_id:
+ format_id = mpd_id + '-' + format_id
+ if content_type in ('video', 'audio'):
f = {
- 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
+ 'format_id': format_id,
'manifest_url': mpd_url,
'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')),
@@ -2217,198 +2694,230 @@ class InfoExtractor(object):
'format_note': 'DASH %s' % content_type,
'filesize': filesize,
'container': mimetype2ext(mime_type) + '_dash',
+ 'manifest_stream_number': stream_numbers[content_type]
+ }
+ f.update(parse_codecs(codecs))
+ stream_numbers[content_type] += 1
+ elif content_type == 'text':
+ f = {
+ 'ext': mimetype2ext(mime_type),
+ 'manifest_url': mpd_url,
+ 'filesize': filesize,
}
- f.update(parse_codecs(representation_attrib.get('codecs')))
- representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
-
- def prepare_template(template_name, identifiers):
- tmpl = representation_ms_info[template_name]
- # First of, % characters outside $...$ templates
- # must be escaped by doubling for proper processing
- # by % operator string formatting used further (see
- # https://github.com/ytdl-org/youtube-dl/issues/16867).
- t = ''
- in_template = False
- for c in tmpl:
+ elif content_type == 'image/jpeg':
+ # See test case in VikiIE
+ # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
+ f = {
+ 'format_id': format_id,
+ 'ext': 'mhtml',
+ 'manifest_url': mpd_url,
+ 'format_note': 'DASH storyboards (jpeg)',
+ 'acodec': 'none',
+ 'vcodec': 'none',
+ }
+ if is_drm_protected(adaptation_set) or is_drm_protected(representation):
+ f['has_drm'] = True
+ representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
+
+ def prepare_template(template_name, identifiers):
+ tmpl = representation_ms_info[template_name]
+ # First of, % characters outside $...$ templates
+ # must be escaped by doubling for proper processing
+ # by % operator string formatting used further (see
+ # https://github.com/ytdl-org/youtube-dl/issues/16867).
+ t = ''
+ in_template = False
+ for c in tmpl:
+ t += c
+ if c == '$':
+ in_template = not in_template
+ elif c == '%' and not in_template:
t += c
- if c == '$':
- in_template = not in_template
- elif c == '%' and not in_template:
- t += c
- # Next, $...$ templates are translated to their
- # %(...) counterparts to be used with % operator
+ # Next, $...$ templates are translated to their
+ # %(...) counterparts to be used with % operator
+ if representation_id is not None:
t = t.replace('$RepresentationID$', representation_id)
- t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
- t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
- t.replace('$$', '$')
- return t
-
- # @initialization is a regular template like @media one
- # so it should be handled just the same way (see
- # https://github.com/ytdl-org/youtube-dl/issues/11605)
- if 'initialization' in representation_ms_info:
- initialization_template = prepare_template(
- 'initialization',
- # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
- # $Time$ shall not be included for @initialization thus
- # only $Bandwidth$ remains
- ('Bandwidth', ))
- representation_ms_info['initialization_url'] = initialization_template % {
- 'Bandwidth': bandwidth,
- }
+ t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
+ t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
+ t.replace('$$', '$')
+ return t
+
+ # @initialization is a regular template like @media one
+ # so it should be handled just the same way (see
+ # https://github.com/ytdl-org/youtube-dl/issues/11605)
+ if 'initialization' in representation_ms_info:
+ initialization_template = prepare_template(
+ 'initialization',
+ # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
+ # $Time$ shall not be included for @initialization thus
+ # only $Bandwidth$ remains
+ ('Bandwidth', ))
+ representation_ms_info['initialization_url'] = initialization_template % {
+ 'Bandwidth': bandwidth,
+ }
- def location_key(location):
- return 'url' if re.match(r'^https?://', location) else 'path'
-
- if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
-
- media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
- media_location_key = location_key(media_template)
-
- # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
- # can't be used at the same time
- if '%(Number' in media_template and 's' not in representation_ms_info:
- segment_duration = None
- if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
- segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
- representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
- representation_ms_info['fragments'] = [{
- media_location_key: media_template % {
- 'Number': segment_number,
- 'Bandwidth': bandwidth,
- },
- 'duration': segment_duration,
- } for segment_number in range(
- representation_ms_info['start_number'],
- representation_ms_info['total_number'] + representation_ms_info['start_number'])]
- else:
- # $Number*$ or $Time$ in media template with S list available
- # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
- # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
- representation_ms_info['fragments'] = []
- segment_time = 0
- segment_d = None
- segment_number = representation_ms_info['start_number']
-
- def add_segment_url():
- segment_url = media_template % {
- 'Time': segment_time,
- 'Bandwidth': bandwidth,
- 'Number': segment_number,
- }
- representation_ms_info['fragments'].append({
- media_location_key: segment_url,
- 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
- })
-
- for num, s in enumerate(representation_ms_info['s']):
- segment_time = s.get('t') or segment_time
- segment_d = s['d']
+ def location_key(location):
+ return 'url' if re.match(r'^https?://', location) else 'path'
+
+ if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
+
+ media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
+ media_location_key = location_key(media_template)
+
+ # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
+ # can't be used at the same time
+ if '%(Number' in media_template and 's' not in representation_ms_info:
+ segment_duration = None
+ if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
+ segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
+ representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
+ representation_ms_info['fragments'] = [{
+ media_location_key: media_template % {
+ 'Number': segment_number,
+ 'Bandwidth': bandwidth,
+ },
+ 'duration': segment_duration,
+ } for segment_number in range(
+ representation_ms_info['start_number'],
+ representation_ms_info['total_number'] + representation_ms_info['start_number'])]
+ else:
+ # $Number*$ or $Time$ in media template with S list available
+ # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
+ # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
+ representation_ms_info['fragments'] = []
+ segment_time = 0
+ segment_d = None
+ segment_number = representation_ms_info['start_number']
+
+ def add_segment_url():
+ segment_url = media_template % {
+ 'Time': segment_time,
+ 'Bandwidth': bandwidth,
+ 'Number': segment_number,
+ }
+ representation_ms_info['fragments'].append({
+ media_location_key: segment_url,
+ 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
+ })
+
+ for num, s in enumerate(representation_ms_info['s']):
+ segment_time = s.get('t') or segment_time
+ segment_d = s['d']
+ add_segment_url()
+ segment_number += 1
+ for r in range(s.get('r', 0)):
+ segment_time += segment_d
add_segment_url()
segment_number += 1
- for r in range(s.get('r', 0)):
- segment_time += segment_d
- add_segment_url()
- segment_number += 1
- segment_time += segment_d
- elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
- # No media template
- # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
- # or any YouTube dashsegments video
- fragments = []
- segment_index = 0
- timescale = representation_ms_info['timescale']
- for s in representation_ms_info['s']:
- duration = float_or_none(s['d'], timescale)
- for r in range(s.get('r', 0) + 1):
- segment_uri = representation_ms_info['segment_urls'][segment_index]
- fragments.append({
- location_key(segment_uri): segment_uri,
- 'duration': duration,
- })
- segment_index += 1
- representation_ms_info['fragments'] = fragments
- elif 'segment_urls' in representation_ms_info:
- # Segment URLs with no SegmentTimeline
- # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
- # https://github.com/ytdl-org/youtube-dl/pull/14844
- fragments = []
- segment_duration = float_or_none(
- representation_ms_info['segment_duration'],
- representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
- for segment_url in representation_ms_info['segment_urls']:
- fragment = {
- location_key(segment_url): segment_url,
- }
- if segment_duration:
- fragment['duration'] = segment_duration
- fragments.append(fragment)
- representation_ms_info['fragments'] = fragments
- # If there is a fragments key available then we correctly recognized fragmented media.
- # Otherwise we will assume unfragmented media with direct access. Technically, such
- # assumption is not necessarily correct since we may simply have no support for
- # some forms of fragmented media renditions yet, but for now we'll use this fallback.
- if 'fragments' in representation_ms_info:
- f.update({
- # NB: mpd_url may be empty when MPD manifest is parsed from a string
- 'url': mpd_url or base_url,
- 'fragment_base_url': base_url,
- 'fragments': [],
- 'protocol': 'http_dash_segments',
- })
- if 'initialization_url' in representation_ms_info:
- initialization_url = representation_ms_info['initialization_url']
- if not f.get('url'):
- f['url'] = initialization_url
- f['fragments'].append({location_key(initialization_url): initialization_url})
- f['fragments'].extend(representation_ms_info['fragments'])
- else:
- # Assuming direct URL to unfragmented media.
- f['url'] = base_url
- formats.append(f)
+ segment_time += segment_d
+ elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
+ # No media template
+ # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
+ # or any YouTube dashsegments video
+ fragments = []
+ segment_index = 0
+ timescale = representation_ms_info['timescale']
+ for s in representation_ms_info['s']:
+ duration = float_or_none(s['d'], timescale)
+ for r in range(s.get('r', 0) + 1):
+ segment_uri = representation_ms_info['segment_urls'][segment_index]
+ fragments.append({
+ location_key(segment_uri): segment_uri,
+ 'duration': duration,
+ })
+ segment_index += 1
+ representation_ms_info['fragments'] = fragments
+ elif 'segment_urls' in representation_ms_info:
+ # Segment URLs with no SegmentTimeline
+ # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
+ # https://github.com/ytdl-org/youtube-dl/pull/14844
+ fragments = []
+ segment_duration = float_or_none(
+ representation_ms_info['segment_duration'],
+ representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
+ for segment_url in representation_ms_info['segment_urls']:
+ fragment = {
+ location_key(segment_url): segment_url,
+ }
+ if segment_duration:
+ fragment['duration'] = segment_duration
+ fragments.append(fragment)
+ representation_ms_info['fragments'] = fragments
+ # If there is a fragments key available then we correctly recognized fragmented media.
+ # Otherwise we will assume unfragmented media with direct access. Technically, such
+ # assumption is not necessarily correct since we may simply have no support for
+ # some forms of fragmented media renditions yet, but for now we'll use this fallback.
+ if 'fragments' in representation_ms_info:
+ f.update({
+ # NB: mpd_url may be empty when MPD manifest is parsed from a string
+ 'url': mpd_url or base_url,
+ 'fragment_base_url': base_url,
+ 'fragments': [],
+ 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
+ })
+ if 'initialization_url' in representation_ms_info:
+ initialization_url = representation_ms_info['initialization_url']
+ if not f.get('url'):
+ f['url'] = initialization_url
+ f['fragments'].append({location_key(initialization_url): initialization_url})
+ f['fragments'].extend(representation_ms_info['fragments'])
else:
- self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
- return formats
+ # Assuming direct URL to unfragmented media.
+ f['url'] = base_url
+ if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
+ formats.append(f)
+ elif content_type == 'text':
+ subtitles.setdefault(lang or 'und', []).append(f)
- def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
+ return formats, subtitles
+
+ def _extract_ism_formats(self, *args, **kwargs):
+ fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
+ if subs:
+ self._report_ignoring_subs('ISM')
+ return fmts
+
+ def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
res = self._download_xml_handle(
ism_url, video_id,
- note=note or 'Downloading ISM manifest',
- errnote=errnote or 'Failed to download ISM manifest',
+ note='Downloading ISM manifest' if note is None else note,
+ errnote='Failed to download ISM manifest' if errnote is None else errnote,
fatal=fatal, data=data, headers=headers, query=query)
if res is False:
- return []
+ return [], {}
ism_doc, urlh = res
if ism_doc is None:
- return []
+ return [], {}
- return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
+ return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
- def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
+ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
"""
Parse formats from ISM manifest.
References:
1. [MS-SSTR]: Smooth Streaming Protocol,
https://msdn.microsoft.com/en-us/library/ff469518.aspx
"""
- if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
- return []
+ if ism_doc.get('IsLive') == 'TRUE':
+ return [], {}
duration = int(ism_doc.attrib['Duration'])
timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
formats = []
+ subtitles = {}
for stream in ism_doc.findall('StreamIndex'):
stream_type = stream.get('Type')
- if stream_type not in ('video', 'audio'):
+ if stream_type not in ('video', 'audio', 'text'):
continue
url_pattern = stream.attrib['Url']
stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
stream_name = stream.get('Name')
+ stream_language = stream.get('Language', 'und')
for track in stream.findall('QualityLevel'):
- fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
+ fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
# TODO: add support for WVC1 and WMAP
- if fourcc not in ('H264', 'AVC1', 'AACL'):
+ if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
self.report_warning('%s is not a supported codec' % fourcc)
continue
tbr = int(track.attrib['Bitrate']) // 1000
@@ -2451,35 +2960,55 @@ class InfoExtractor(object):
format_id.append(stream_name)
format_id.append(compat_str(tbr))
- formats.append({
- 'format_id': '-'.join(format_id),
- 'url': ism_url,
- 'manifest_url': ism_url,
- 'ext': 'ismv' if stream_type == 'video' else 'isma',
- 'width': width,
- 'height': height,
- 'tbr': tbr,
- 'asr': sampling_rate,
- 'vcodec': 'none' if stream_type == 'audio' else fourcc,
- 'acodec': 'none' if stream_type == 'video' else fourcc,
- 'protocol': 'ism',
- 'fragments': fragments,
- '_download_params': {
- 'duration': duration,
- 'timescale': stream_timescale,
- 'width': width or 0,
- 'height': height or 0,
- 'fourcc': fourcc,
- 'codec_private_data': track.get('CodecPrivateData'),
- 'sampling_rate': sampling_rate,
- 'channels': int_or_none(track.get('Channels', 2)),
- 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
- 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
- },
- })
- return formats
+ if stream_type == 'text':
+ subtitles.setdefault(stream_language, []).append({
+ 'ext': 'ismt',
+ 'protocol': 'ism',
+ 'url': ism_url,
+ 'manifest_url': ism_url,
+ 'fragments': fragments,
+ '_download_params': {
+ 'stream_type': stream_type,
+ 'duration': duration,
+ 'timescale': stream_timescale,
+ 'fourcc': fourcc,
+ 'language': stream_language,
+ 'codec_private_data': track.get('CodecPrivateData'),
+ }
+ })
+ elif stream_type in ('video', 'audio'):
+ formats.append({
+ 'format_id': '-'.join(format_id),
+ 'url': ism_url,
+ 'manifest_url': ism_url,
+ 'ext': 'ismv' if stream_type == 'video' else 'isma',
+ 'width': width,
+ 'height': height,
+ 'tbr': tbr,
+ 'asr': sampling_rate,
+ 'vcodec': 'none' if stream_type == 'audio' else fourcc,
+ 'acodec': 'none' if stream_type == 'video' else fourcc,
+ 'protocol': 'ism',
+ 'fragments': fragments,
+ 'has_drm': ism_doc.find('Protection') is not None,
+ '_download_params': {
+ 'stream_type': stream_type,
+ 'duration': duration,
+ 'timescale': stream_timescale,
+ 'width': width or 0,
+ 'height': height or 0,
+ 'fourcc': fourcc,
+ 'language': stream_language,
+ 'codec_private_data': track.get('CodecPrivateData'),
+ 'sampling_rate': sampling_rate,
+ 'channels': int_or_none(track.get('Channels', 2)),
+ 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
+ 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
+ },
+ })
+ return formats, subtitles
- def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
+ def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
def absolute_url(item_url):
return urljoin(base_url, item_url)
@@ -2502,7 +3031,7 @@ class InfoExtractor(object):
formats = self._extract_m3u8_formats(
full_url, video_id, ext='mp4',
entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
- preference=preference, fatal=False)
+ preference=preference, quality=quality, fatal=False)
elif ext == 'mpd':
is_plain_url = False
formats = self._extract_mpd_formats(
@@ -2602,7 +3131,13 @@ class InfoExtractor(object):
entries.append(media_info)
return entries
- def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
+ def _extract_akamai_formats(self, *args, **kwargs):
+ fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
+ if subs:
+ self._report_ignoring_subs('akamai')
+ return fmts
+
+ def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
signed = 'hdnea=' in manifest_url
if not signed:
# https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
@@ -2611,6 +3146,7 @@ class InfoExtractor(object):
'', manifest_url).strip('?')
formats = []
+ subtitles = {}
hdcore_sign = 'hdcore=3.7.0'
f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
@@ -2629,10 +3165,11 @@ class InfoExtractor(object):
hls_host = hosts.get('hls')
if hls_host:
m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
- m3u8_formats = self._extract_m3u8_formats(
+ m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False)
formats.extend(m3u8_formats)
+ subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
http_host = hosts.get('http')
if http_host and m3u8_formats and not signed:
@@ -2656,7 +3193,7 @@ class InfoExtractor(object):
formats.append(http_f)
i += 1
- return formats
+ return formats, subtitles
def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
query = compat_urlparse.urlparse(url).query
@@ -2879,7 +3416,7 @@ class InfoExtractor(object):
if fatal:
raise ExtractorError(msg)
else:
- self._downloader.report_warning(msg)
+ self.report_warning(msg)
return res
def _float(self, v, name, fatal=False, **kwargs):
@@ -2889,7 +3426,7 @@ class InfoExtractor(object):
if fatal:
raise ExtractorError(msg)
else:
- self._downloader.report_warning(msg)
+ self.report_warning(msg)
return res
def _set_cookie(self, domain, name, value, expire_time=None, port=None,
@@ -2963,14 +3500,40 @@ class InfoExtractor(object):
return not any_restricted
def extract_subtitles(self, *args, **kwargs):
- if (self._downloader.params.get('writesubtitles', False)
- or self._downloader.params.get('listsubtitles')):
+ if (self.get_param('writesubtitles', False)
+ or self.get_param('listsubtitles')):
return self._get_subtitles(*args, **kwargs)
return {}
def _get_subtitles(self, *args, **kwargs):
raise NotImplementedError('This method must be implemented by subclasses')
+ def extract_comments(self, *args, **kwargs):
+ if not self.get_param('getcomments'):
+ return None
+ generator = self._get_comments(*args, **kwargs)
+
+ def extractor():
+ comments = []
+ try:
+ while True:
+ comments.append(next(generator))
+ except KeyboardInterrupt:
+ interrupted = True
+ self.to_screen('Interrupted by user')
+ except StopIteration:
+ interrupted = False
+ comment_count = len(comments)
+ self.to_screen(f'Extracted {comment_count} comments')
+ return {
+ 'comments': comments,
+ 'comment_count': None if interrupted else comment_count
+ }
+ return extractor
+
+ def _get_comments(self, *args, **kwargs):
+ raise NotImplementedError('This method must be implemented by subclasses')
+
@staticmethod
def _merge_subtitle_items(subtitle_list1, subtitle_list2):
""" Merge subtitle items for one language. Items with duplicated URLs
@@ -2981,16 +3544,18 @@ class InfoExtractor(object):
return ret
@classmethod
- def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
- """ Merge two subtitle dictionaries, language by language. """
- ret = dict(subtitle_dict1)
- for lang in subtitle_dict2:
- ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
- return ret
+ def _merge_subtitles(cls, *dicts, target=None):
+ """ Merge subtitle dictionaries, language by language. """
+ if target is None:
+ target = {}
+ for d in dicts:
+ for lang, subs in d.items():
+ target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
+ return target
def extract_automatic_captions(self, *args, **kwargs):
- if (self._downloader.params.get('writeautomaticsub', False)
- or self._downloader.params.get('listsubtitles')):
+ if (self.get_param('writeautomaticsub', False)
+ or self.get_param('listsubtitles')):
return self._get_automatic_captions(*args, **kwargs)
return {}
@@ -2998,9 +3563,11 @@ class InfoExtractor(object):
raise NotImplementedError('This method must be implemented by subclasses')
def mark_watched(self, *args, **kwargs):
- if (self._downloader.params.get('mark_watched', False)
- and (self._get_login_info()[0] is not None
- or self._downloader.params.get('cookiefile') is not None)):
+ if not self.get_param('mark_watched', False):
+ return
+ if (self._get_login_info()[0] is not None
+ or self.get_param('cookiefile')
+ or self.get_param('cookiesfrombrowser')):
self._mark_watched(*args, **kwargs)
def _mark_watched(self, *args, **kwargs):
@@ -3008,7 +3575,7 @@ class InfoExtractor(object):
def geo_verification_headers(self):
headers = {}
- geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
+ geo_verification_proxy = self.get_param('geo_verification_proxy')
if geo_verification_proxy:
headers['Ytdl-request-proxy'] = geo_verification_proxy
return headers
@@ -3019,6 +3586,33 @@ class InfoExtractor(object):
def _generic_title(self, url):
return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
+ @staticmethod
+ def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
+ all_known = all(map(
+ lambda x: x is not None,
+ (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
+ return (
+ 'private' if is_private
+ else 'premium_only' if needs_premium
+ else 'subscriber_only' if needs_subscription
+ else 'needs_auth' if needs_auth
+ else 'unlisted' if is_unlisted
+ else 'public' if all_known
+ else None)
+
+ def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
+ '''
+ @returns A list of values for the extractor argument given by "key"
+ or "default" if no such key is present
+ @param default The default value to return when the key is not present (default: [])
+ @param casesense When false, the values are converted to lower case
+ '''
+ val = traverse_obj(
+ self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
+ if val is None:
+ return [] if default is NO_DEFAULT else default
+ return list(val) if casesense else [x.lower() for x in val]
+
class SearchInfoExtractor(InfoExtractor):
"""
@@ -3051,12 +3645,19 @@ class SearchInfoExtractor(InfoExtractor):
if n <= 0:
raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
elif n > self._MAX_RESULTS:
- self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
+ self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
n = self._MAX_RESULTS
return self._get_n_results(query, n)
def _get_n_results(self, query, n):
- """Get a specified number of results for a query"""
+ """Get a specified number of results for a query.
+ Either this function or _search_results must be overridden by subclasses """
+ return self.playlist_result(
+ itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
+ query, query)
+
+ def _search_results(self, query):
+ """Returns an iterator of search results"""
raise NotImplementedError('This method must be implemented by subclasses')
@property
diff --git a/hypervideo_dl/extractor/commonmistakes.py b/hypervideo_dl/extractor/commonmistakes.py
index ed9d26e..eb76fe5 100644
--- a/hypervideo_dl/extractor/commonmistakes.py
+++ b/hypervideo_dl/extractor/commonmistakes.py
@@ -26,8 +26,8 @@ class CommonMistakesIE(InfoExtractor):
'That doesn\'t make any sense. '
'Simply remove the parameter in your command or configuration.'
) % url
- if not self._downloader.params.get('verbose'):
- msg += ' Add -v to the command line to see what arguments and configuration hypervideo got.'
+ if not self.get_param('verbose'):
+ msg += ' Add -v to the command line to see what arguments and configuration hypervideo has'
raise ExtractorError(msg, expected=True)
diff --git a/hypervideo_dl/extractor/commonprotocols.py b/hypervideo_dl/extractor/commonprotocols.py
index d98331a..3708c6a 100644
--- a/hypervideo_dl/extractor/commonprotocols.py
+++ b/hypervideo_dl/extractor/commonprotocols.py
@@ -1,5 +1,6 @@
from __future__ import unicode_literals
+
from .common import InfoExtractor
from ..compat import (
compat_urlparse,
@@ -58,3 +59,16 @@ class MmsIE(InfoExtractor):
'title': title,
'url': url,
}
+
+
+class ViewSourceIE(InfoExtractor):
+ IE_DESC = False
+ _VALID_URL = r'view-source:(?P<url>.+)'
+
+ _TEST = {
+ 'url': 'view-source:https://www.youtube.com/watch?v=BaW_jenozKc',
+ 'only_matching': True
+ }
+
+ def _real_extract(self, url):
+ return self.url_result(self._match_valid_url(url).group('url'))
diff --git a/hypervideo_dl/extractor/condenast.py b/hypervideo_dl/extractor/condenast.py
index d5e77af..54e7af8 100644
--- a/hypervideo_dl/extractor/condenast.py
+++ b/hypervideo_dl/extractor/condenast.py
@@ -222,7 +222,7 @@ class CondeNastIE(InfoExtractor):
}
def _real_extract(self, url):
- video_id, player_id, target, url_type, display_id = re.match(self._VALID_URL, url).groups()
+ video_id, player_id, target, url_type, display_id = self._match_valid_url(url).groups()
if video_id:
return self._extract_video({
diff --git a/hypervideo_dl/extractor/corus.py b/hypervideo_dl/extractor/corus.py
index e11aadf..352951e 100644
--- a/hypervideo_dl/extractor/corus.py
+++ b/hypervideo_dl/extractor/corus.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .theplatform import ThePlatformFeedIE
from ..utils import (
@@ -96,7 +95,7 @@ class CorusIE(ThePlatformFeedIE):
}
def _real_extract(self, url):
- domain, video_id = re.match(self._VALID_URL, url).groups()
+ domain, video_id = self._match_valid_url(url).groups()
site = domain.split('.')[0]
path = self._SITE_MAP.get(site, site)
if path != 'series':
@@ -131,7 +130,7 @@ class CorusIE(ThePlatformFeedIE):
formats.extend(self._parse_smil_formats(
smil, smil_url, video_id, namespace))
if not formats and video.get('drm'):
- raise ExtractorError('This video is DRM protected.', expected=True)
+ self.report_drm(video_id)
self._sort_formats(formats)
subtitles = {}
diff --git a/hypervideo_dl/extractor/coub.py b/hypervideo_dl/extractor/coub.py
index 6ea03e6..eba6b73 100644
--- a/hypervideo_dl/extractor/coub.py
+++ b/hypervideo_dl/extractor/coub.py
@@ -87,7 +87,7 @@ class CoubIE(InfoExtractor):
'filesize': int_or_none(item.get('size')),
'vcodec': 'none' if kind == 'audio' else None,
'quality': quality_key(quality),
- 'preference': preference_key(HTML5),
+ 'source_preference': preference_key(HTML5),
})
iphone_url = file_versions.get(IPHONE, {}).get('url')
@@ -95,7 +95,7 @@ class CoubIE(InfoExtractor):
formats.append({
'url': iphone_url,
'format_id': IPHONE,
- 'preference': preference_key(IPHONE),
+ 'source_preference': preference_key(IPHONE),
})
mobile_url = file_versions.get(MOBILE, {}).get('audio_url')
@@ -103,7 +103,7 @@ class CoubIE(InfoExtractor):
formats.append({
'url': mobile_url,
'format_id': '%s-audio' % MOBILE,
- 'preference': preference_key(MOBILE),
+ 'source_preference': preference_key(MOBILE),
})
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/crackle.py b/hypervideo_dl/extractor/crackle.py
index 49bf3a4..2c9d28d 100644
--- a/hypervideo_dl/extractor/crackle.py
+++ b/hypervideo_dl/extractor/crackle.py
@@ -12,6 +12,7 @@ from ..utils import (
determine_ext,
float_or_none,
int_or_none,
+ orderedSet,
parse_age_limit,
parse_duration,
url_or_none,
@@ -66,135 +67,179 @@ class CrackleIE(InfoExtractor):
},
}
+ def _download_json(self, url, *args, **kwargs):
+ # Authorization generation algorithm is reverse engineered from:
+ # https://www.sonycrackle.com/static/js/main.ea93451f.chunk.js
+ timestamp = time.strftime('%Y%m%d%H%M', time.gmtime())
+ h = hmac.new(b'IGSLUQCBDFHEOIFM', '|'.join([url, timestamp]).encode(), hashlib.sha1).hexdigest().upper()
+ headers = {
+ 'Accept': 'application/json',
+ 'Authorization': '|'.join([h, timestamp, '117', '1']),
+ }
+ return InfoExtractor._download_json(self, url, *args, headers=headers, **kwargs)
+
def _real_extract(self, url):
video_id = self._match_id(url)
- country_code = self._downloader.params.get('geo_bypass_country', None)
- countries = [country_code] if country_code else (
- 'US', 'AU', 'CA', 'AS', 'FM', 'GU', 'MP', 'PR', 'PW', 'MH', 'VI')
-
- last_e = None
+ geo_bypass_country = self.get_param('geo_bypass_country', None)
+ countries = orderedSet((geo_bypass_country, 'US', 'AU', 'CA', 'AS', 'FM', 'GU', 'MP', 'PR', 'PW', 'MH', 'VI', ''))
+ num_countries, num = len(countries) - 1, 0
+
+ media = {}
+ for num, country in enumerate(countries):
+ if num == 1: # start hard-coded list
+ self.report_warning('%s. Trying with a list of known countries' % (
+ 'Unable to obtain video formats from %s API' % geo_bypass_country if geo_bypass_country
+ else 'No country code was given using --geo-bypass-country'))
+ elif num == num_countries: # end of list
+ geo_info = self._download_json(
+ 'https://web-api-us.crackle.com/Service.svc/geo/country',
+ video_id, fatal=False, note='Downloading geo-location information from crackle API',
+ errnote='Unable to fetch geo-location information from crackle') or {}
+ country = geo_info.get('CountryCode')
+ if country is None:
+ continue
+ self.to_screen('%s identified country as %s' % (self.IE_NAME, country))
+ if country in countries:
+ self.to_screen('Downloading from %s API was already attempted. Skipping...' % country)
+ continue
- for country in countries:
+ if country is None:
+ continue
try:
- # Authorization generation algorithm is reverse engineered from:
- # https://www.sonycrackle.com/static/js/main.ea93451f.chunk.js
- media_detail_url = 'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s?disableProtocols=true' % (video_id, country)
- timestamp = time.strftime('%Y%m%d%H%M', time.gmtime())
- h = hmac.new(b'IGSLUQCBDFHEOIFM', '|'.join([media_detail_url, timestamp]).encode(), hashlib.sha1).hexdigest().upper()
media = self._download_json(
- media_detail_url, video_id, 'Downloading media JSON as %s' % country,
- 'Unable to download media JSON', headers={
- 'Accept': 'application/json',
- 'Authorization': '|'.join([h, timestamp, '117', '1']),
- })
+ 'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s?disableProtocols=true' % (video_id, country),
+ video_id, note='Downloading media JSON from %s API' % country,
+ errnote='Unable to download media JSON')
except ExtractorError as e:
# 401 means geo restriction, trying next country
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
- last_e = e
continue
raise
- media_urls = media.get('MediaURLs')
- if not media_urls or not isinstance(media_urls, list):
+ status = media.get('status')
+ if status.get('messageCode') != '0':
+ raise ExtractorError(
+ '%s said: %s %s - %s' % (
+ self.IE_NAME, status.get('messageCodeDescription'), status.get('messageCode'), status.get('message')),
+ expected=True)
+
+ # Found video formats
+ if isinstance(media.get('MediaURLs'), list):
+ break
+
+ ignore_no_formats = self.get_param('ignore_no_formats_error')
+ allow_unplayable_formats = self.get_param('allow_unplayable_formats')
+
+ if not media or (not media.get('MediaURLs') and not ignore_no_formats):
+ raise ExtractorError(
+ 'Unable to access the crackle API. Try passing your country code '
+ 'to --geo-bypass-country. If it still does not work and the '
+ 'video is available in your country')
+ title = media['Title']
+
+ formats, subtitles = [], {}
+ has_drm = False
+ for e in media.get('MediaURLs') or []:
+ if e.get('UseDRM'):
+ has_drm = True
+ if not allow_unplayable_formats:
+ continue
+ format_url = url_or_none(e.get('Path'))
+ if not format_url:
continue
-
- title = media['Title']
-
- formats = []
- for e in media['MediaURLs']:
- if e.get('UseDRM') is True:
+ ext = determine_ext(format_url)
+ if ext == 'm3u8':
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
+ format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ elif ext == 'mpd':
+ fmts, subs = self._extract_mpd_formats_and_subtitles(
+ format_url, video_id, mpd_id='dash', fatal=False)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ elif format_url.endswith('.ism/Manifest'):
+ fmts, subs = self._extract_ism_formats_and_subtitles(
+ format_url, video_id, ism_id='mss', fatal=False)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ else:
+ mfs_path = e.get('Type')
+ mfs_info = self._MEDIA_FILE_SLOTS.get(mfs_path)
+ if not mfs_info:
continue
- format_url = url_or_none(e.get('Path'))
- if not format_url:
+ formats.append({
+ 'url': format_url,
+ 'format_id': 'http-' + mfs_path.split('.')[0],
+ 'width': mfs_info['width'],
+ 'height': mfs_info['height'],
+ })
+ if not formats and has_drm:
+ self.report_drm(video_id)
+ self._sort_formats(formats)
+
+ description = media.get('Description')
+ duration = int_or_none(media.get(
+ 'DurationInSeconds')) or parse_duration(media.get('Duration'))
+ view_count = int_or_none(media.get('CountViews'))
+ average_rating = float_or_none(media.get('UserRating'))
+ age_limit = parse_age_limit(media.get('Rating'))
+ genre = media.get('Genre')
+ release_year = int_or_none(media.get('ReleaseYear'))
+ creator = media.get('Directors')
+ artist = media.get('Cast')
+
+ if media.get('MediaTypeDisplayValue') == 'Full Episode':
+ series = media.get('ShowName')
+ episode = title
+ season_number = int_or_none(media.get('Season'))
+ episode_number = int_or_none(media.get('Episode'))
+ else:
+ series = episode = season_number = episode_number = None
+
+ cc_files = media.get('ClosedCaptionFiles')
+ if isinstance(cc_files, list):
+ for cc_file in cc_files:
+ if not isinstance(cc_file, dict):
continue
- ext = determine_ext(format_url)
- if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- format_url, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls', fatal=False))
- elif ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- format_url, video_id, mpd_id='dash', fatal=False))
- elif format_url.endswith('.ism/Manifest'):
- formats.extend(self._extract_ism_formats(
- format_url, video_id, ism_id='mss', fatal=False))
- else:
- mfs_path = e.get('Type')
- mfs_info = self._MEDIA_FILE_SLOTS.get(mfs_path)
- if not mfs_info:
- continue
- formats.append({
- 'url': format_url,
- 'format_id': 'http-' + mfs_path.split('.')[0],
- 'width': mfs_info['width'],
- 'height': mfs_info['height'],
- })
- self._sort_formats(formats)
-
- description = media.get('Description')
- duration = int_or_none(media.get(
- 'DurationInSeconds')) or parse_duration(media.get('Duration'))
- view_count = int_or_none(media.get('CountViews'))
- average_rating = float_or_none(media.get('UserRating'))
- age_limit = parse_age_limit(media.get('Rating'))
- genre = media.get('Genre')
- release_year = int_or_none(media.get('ReleaseYear'))
- creator = media.get('Directors')
- artist = media.get('Cast')
-
- if media.get('MediaTypeDisplayValue') == 'Full Episode':
- series = media.get('ShowName')
- episode = title
- season_number = int_or_none(media.get('Season'))
- episode_number = int_or_none(media.get('Episode'))
- else:
- series = episode = season_number = episode_number = None
-
- subtitles = {}
- cc_files = media.get('ClosedCaptionFiles')
- if isinstance(cc_files, list):
- for cc_file in cc_files:
- if not isinstance(cc_file, dict):
- continue
- cc_url = url_or_none(cc_file.get('Path'))
- if not cc_url:
- continue
- lang = cc_file.get('Locale') or 'en'
- subtitles.setdefault(lang, []).append({'url': cc_url})
-
- thumbnails = []
- images = media.get('Images')
- if isinstance(images, list):
- for image_key, image_url in images.items():
- mobj = re.search(r'Img_(\d+)[xX](\d+)', image_key)
- if not mobj:
- continue
- thumbnails.append({
- 'url': image_url,
- 'width': int(mobj.group(1)),
- 'height': int(mobj.group(2)),
- })
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'duration': duration,
- 'view_count': view_count,
- 'average_rating': average_rating,
- 'age_limit': age_limit,
- 'genre': genre,
- 'creator': creator,
- 'artist': artist,
- 'release_year': release_year,
- 'series': series,
- 'episode': episode,
- 'season_number': season_number,
- 'episode_number': episode_number,
- 'thumbnails': thumbnails,
- 'subtitles': subtitles,
- 'formats': formats,
- }
-
- raise last_e
+ cc_url = url_or_none(cc_file.get('Path'))
+ if not cc_url:
+ continue
+ lang = cc_file.get('Locale') or 'en'
+ subtitles.setdefault(lang, []).append({'url': cc_url})
+
+ thumbnails = []
+ images = media.get('Images')
+ if isinstance(images, list):
+ for image_key, image_url in images.items():
+ mobj = re.search(r'Img_(\d+)[xX](\d+)', image_key)
+ if not mobj:
+ continue
+ thumbnails.append({
+ 'url': image_url,
+ 'width': int(mobj.group(1)),
+ 'height': int(mobj.group(2)),
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'average_rating': average_rating,
+ 'age_limit': age_limit,
+ 'genre': genre,
+ 'creator': creator,
+ 'artist': artist,
+ 'release_year': release_year,
+ 'series': series,
+ 'episode': episode,
+ 'season_number': season_number,
+ 'episode_number': episode_number,
+ 'thumbnails': thumbnails,
+ 'subtitles': subtitles,
+ 'formats': formats,
+ }
diff --git a/hypervideo_dl/extractor/crunchyroll.py b/hypervideo_dl/extractor/crunchyroll.py
index bc2d1fa..511ac1b 100644
--- a/hypervideo_dl/extractor/crunchyroll.py
+++ b/hypervideo_dl/extractor/crunchyroll.py
@@ -29,6 +29,7 @@ from ..utils import (
merge_dicts,
remove_end,
sanitized_Request,
+ try_get,
urlencode_postdata,
xpath_text,
)
@@ -120,7 +121,7 @@ class CrunchyrollBaseIE(InfoExtractor):
class CrunchyrollIE(CrunchyrollBaseIE, VRVIE):
IE_NAME = 'crunchyroll'
- _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
+ _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<id>[0-9]+))(?:[/?&]|$)'
_TESTS = [{
'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
'info_dict': {
@@ -412,8 +413,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
return subtitles
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('video_id')
+ mobj = self._match_valid_url(url)
+ video_id = mobj.group('id')
if mobj.group('prefix') == 'm':
mobile_webpage = self._download_webpage(url, video_id, 'Downloading mobile webpage')
@@ -428,7 +429,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
r'<div class="showmedia-trailer-notice">(.+?)</div>',
webpage, 'trailer-notice', default='')
if note_m:
- raise ExtractorError(note_m)
+ raise ExtractorError(note_m, expected=True)
mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage)
if mobj:
@@ -458,6 +459,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
video_description = (self._parse_json(self._html_search_regex(
r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id,
webpage, 'description', default='{}'), video_id) or media_metadata).get('description')
+
+ thumbnails = []
+ thumbnail_url = (self._parse_json(self._html_search_regex(
+ r'<script type="application\/ld\+json">\n\s*(.+?)<\/script>',
+ webpage, 'thumbnail_url', default='{}'), video_id)).get('image')
+ if thumbnail_url:
+ thumbnails.append({
+ 'url': thumbnail_url,
+ 'width': 1920,
+ 'height': 1080
+ })
+
if video_description:
video_description = lowercase_escape(video_description.replace(r'\r\n', '\n'))
video_uploader = self._html_search_regex(
@@ -473,15 +486,11 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
stream.get('url'), video_id, stream.get('format'),
audio_lang, hardsub_lang)
for f in vrv_formats:
- if not hardsub_lang:
- f['preference'] = 1
- language_preference = 0
- if audio_lang == language:
- language_preference += 1
- if hardsub_lang == language:
- language_preference += 1
- if language_preference:
- f['language_preference'] = language_preference
+ f['language_preference'] = 1 if audio_lang == language else 0
+ f['quality'] = (
+ 1 if not hardsub_lang
+ else 0 if hardsub_lang == language
+ else -1)
formats.extend(vrv_formats)
if not formats:
available_fmts = []
@@ -571,7 +580,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
'ext': 'flv',
})
formats.append(format_info)
- self._sort_formats(formats, ('preference', 'language_preference', 'height', 'width', 'tbr', 'fps'))
+ self._sort_formats(formats)
metadata = self._call_rpc_api(
'VideoPlayer_GetMediaMetadata', video_id,
@@ -596,21 +605,25 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
r'(?s)<h\d[^>]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)</h\d',
webpage, 'series', fatal=False)
- season = episode = episode_number = duration = thumbnail = None
+ season = episode = episode_number = duration = None
if isinstance(metadata, compat_etree_Element):
season = xpath_text(metadata, 'series_title')
episode = xpath_text(metadata, 'episode_title')
episode_number = int_or_none(xpath_text(metadata, 'episode_number'))
duration = float_or_none(media_metadata.get('duration'), 1000)
- thumbnail = xpath_text(metadata, 'episode_image_url')
if not episode:
episode = media_metadata.get('title')
if not episode_number:
episode_number = int_or_none(media_metadata.get('episode_number'))
- if not thumbnail:
- thumbnail = media_metadata.get('thumbnail', {}).get('url')
+ thumbnail_url = try_get(media, lambda x: x['thumbnail']['url'])
+ if thumbnail_url:
+ thumbnails.append({
+ 'url': thumbnail_url,
+ 'width': 640,
+ 'height': 360
+ })
season_number = int_or_none(self._search_regex(
r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)',
@@ -623,7 +636,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
'title': video_title,
'description': video_description,
'duration': duration,
- 'thumbnail': thumbnail,
+ 'thumbnails': thumbnails,
'uploader': video_uploader,
'series': series,
'season': season,
@@ -637,10 +650,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
IE_NAME = 'crunchyroll:playlist'
- _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)'
+ _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:\w{1,2}/)?(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)'
_TESTS = [{
- 'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
+ 'url': 'https://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
'info_dict': {
'id': 'a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi'
@@ -659,28 +672,86 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
# geo-restricted (US), 18+ maturity wall, non-premium will be available since 2015.11.14
'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1',
'only_matching': True,
+ }, {
+ 'url': 'http://www.crunchyroll.com/fr/ladies-versus-butlers',
+ 'only_matching': True,
}]
def _real_extract(self, url):
show_id = self._match_id(url)
webpage = self._download_webpage(
- self._add_skip_wall(url), show_id,
+ # https:// gives a 403, but http:// does not
+ self._add_skip_wall(url).replace('https://', 'http://'), show_id,
headers=self.geo_verification_headers())
title = self._html_search_meta('name', webpage, default=None)
- episode_paths = re.findall(
- r'(?s)<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"',
- webpage)
- entries = [
- self.url_result('http://www.crunchyroll.com' + ep, 'Crunchyroll', ep_id)
- for ep_id, ep in episode_paths
- ]
- entries.reverse()
+ episode_re = r'<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"'
+ season_re = r'<a [^>]+season-dropdown[^>]+>([^<]+)'
+ paths = re.findall(f'(?s){episode_re}|{season_re}', webpage)
+
+ entries, current_season = [], None
+ for ep_id, ep, season in paths:
+ if season:
+ current_season = season
+ continue
+ entries.append(self.url_result(
+ f'http://www.crunchyroll.com{ep}', CrunchyrollIE.ie_key(), ep_id, season=current_season))
return {
'_type': 'playlist',
'id': show_id,
'title': title,
- 'entries': entries,
+ 'entries': reversed(entries),
}
+
+
+class CrunchyrollBetaIE(CrunchyrollBaseIE):
+ IE_NAME = 'crunchyroll:beta'
+ _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<internal_id>\w+)/(?P<id>[\w\-]+)/?(?:\?|$)'
+ _TESTS = [{
+ 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/to-the-future',
+ 'info_dict': {
+ 'id': '696363',
+ 'ext': 'mp4',
+ 'timestamp': 1459610100,
+ 'description': 'md5:a022fbec4fbb023d43631032c91ed64b',
+ 'uploader': 'Toei Animation',
+ 'title': 'World Trigger Episode 73 – To the Future',
+ 'upload_date': '20160402',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ 'expected_warnings': ['Unable to download XML']
+ }]
+
+ def _real_extract(self, url):
+ lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'internal_id', 'id')
+ webpage = self._download_webpage(url, display_id)
+ episode_data = self._parse_json(
+ self._search_regex(r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'episode data'),
+ display_id)['content']['byId'][internal_id]
+ video_id = episode_data['external_id'].split('.')[1]
+ series_id = episode_data['episode_metadata']['series_slug_title']
+ return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id}/{display_id}-{video_id}',
+ CrunchyrollIE.ie_key(), video_id)
+
+
+class CrunchyrollBetaShowIE(CrunchyrollBaseIE):
+ IE_NAME = 'crunchyroll:playlist:beta'
+ _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/\w+/(?P<id>[\w\-]+)/?(?:\?|$)'
+ _TESTS = [{
+ 'url': 'https://beta.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA',
+ 'info_dict': {
+ 'id': 'girl-friend-beta',
+ 'title': 'Girl Friend BETA',
+ },
+ 'playlist_mincount': 10,
+ }, {
+ 'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR/Girl-Friend-BETA',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ lang, series_id = self._match_valid_url(url).group('lang', 'id')
+ return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id.lower()}',
+ CrunchyrollShowPlaylistIE.ie_key(), series_id)
diff --git a/hypervideo_dl/extractor/cultureunplugged.py b/hypervideo_dl/extractor/cultureunplugged.py
index bcdf273..9002e4c 100644
--- a/hypervideo_dl/extractor/cultureunplugged.py
+++ b/hypervideo_dl/extractor/cultureunplugged.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
import time
from .common import InfoExtractor
@@ -32,7 +31,7 @@ class CultureUnpluggedIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
diff --git a/hypervideo_dl/extractor/curiositystream.py b/hypervideo_dl/extractor/curiositystream.py
index ae64a07..034a5c9 100644
--- a/hypervideo_dl/extractor/curiositystream.py
+++ b/hypervideo_dl/extractor/curiositystream.py
@@ -145,8 +145,17 @@ class CuriosityStreamIE(CuriosityStreamBaseIE):
class CuriosityStreamCollectionIE(CuriosityStreamBaseIE):
IE_NAME = 'curiositystream:collection'
- _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collection|series)/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collections?|series)/(?P<id>\d+)'
+ _API_BASE_URL = 'https://api.curiositystream.com/v2/collections/'
_TESTS = [{
+ 'url': 'https://curiositystream.com/collections/86',
+ 'info_dict': {
+ 'id': '86',
+ 'title': 'Staff Picks',
+ 'description': 'Wondering where to start? Here are a few of our favorite series and films... from our couch to yours.',
+ },
+ 'playlist_mincount': 7,
+ }, {
'url': 'https://app.curiositystream.com/collection/2',
'info_dict': {
'id': '2',
@@ -157,18 +166,21 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE):
}, {
'url': 'https://curiositystream.com/series/2',
'only_matching': True,
+ }, {
+ 'url': 'https://curiositystream.com/collections/36',
+ 'only_matching': True,
}]
def _real_extract(self, url):
collection_id = self._match_id(url)
- collection = self._call_api(
- 'collections/' + collection_id, collection_id)
+ collection = self._call_api(collection_id, collection_id)
entries = []
for media in collection.get('media', []):
media_id = compat_str(media.get('id'))
+ media_type, ie = ('series', CuriosityStreamCollectionIE) if media.get('is_collection') else ('video', CuriosityStreamIE)
entries.append(self.url_result(
- 'https://curiositystream.com/video/' + media_id,
- CuriosityStreamIE.ie_key(), media_id))
+ 'https://curiositystream.com/%s/%s' % (media_type, media_id),
+ ie=ie.ie_key(), video_id=media_id))
return self.playlist_result(
entries, collection_id,
collection.get('title'), collection.get('description'))
diff --git a/hypervideo_dl/extractor/dailymotion.py b/hypervideo_dl/extractor/dailymotion.py
index b852905..e04e10b 100644
--- a/hypervideo_dl/extractor/dailymotion.py
+++ b/hypervideo_dl/extractor/dailymotion.py
@@ -42,7 +42,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
def _real_initialize(self):
cookies = self._get_dailymotion_cookies()
ff = self._get_cookie_value(cookies, 'ff')
- self._FAMILY_FILTER = ff == 'on' if ff else age_restricted(18, self._downloader.params.get('age_limit'))
+ self._FAMILY_FILTER = ff == 'on' if ff else age_restricted(18, self.get_param('age_limit'))
self._set_dailymotion_cookie('ff', 'on' if self._FAMILY_FILTER else 'off')
def _call_api(self, object_type, xid, object_fields, note, filter_extra=None):
@@ -204,17 +204,17 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
return urls
def _real_extract(self, url):
- video_id, playlist_id = re.match(self._VALID_URL, url).groups()
+ video_id, playlist_id = self._match_valid_url(url).groups()
if playlist_id:
- if not self._downloader.params.get('noplaylist'):
+ if not self.get_param('noplaylist'):
self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id)
return self.url_result(
'http://www.dailymotion.com/playlist/' + playlist_id,
'DailymotionPlaylist', playlist_id)
self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
- password = self._downloader.params.get('videopassword')
+ password = self.get_param('videopassword')
media = self._call_api(
'media', video_id, '''... on Video {
%s
@@ -232,7 +232,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
audienceCount
isOnAir
}''' % (self._COMMON_MEDIA_FIELDS, self._COMMON_MEDIA_FIELDS), 'Downloading media JSON metadata',
- 'password: "%s"' % self._downloader.params.get('videopassword') if password else None)
+ 'password: "%s"' % self.get_param('videopassword') if password else None)
xid = media['xid']
metadata = self._download_json(
diff --git a/hypervideo_dl/extractor/damtomo.py b/hypervideo_dl/extractor/damtomo.py
new file mode 100644
index 0000000..456cd35
--- /dev/null
+++ b/hypervideo_dl/extractor/damtomo.py
@@ -0,0 +1,113 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError, clean_html, int_or_none, try_get, unified_strdate
+from ..compat import compat_str
+
+
+class DamtomoBaseIE(InfoExtractor):
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage, handle = self._download_webpage_handle(self._WEBPAGE_URL_TMPL % video_id, video_id, encoding='sjis')
+
+ if handle.url == 'https://www.clubdam.com/sorry/':
+ raise ExtractorError('You are rate-limited. Try again later.', expected=True)
+ if '<h2>予期せぬエラーが発生しました。</h2>' in webpage:
+ raise ExtractorError('There is an error on server-side. Try again later.', expected=True)
+
+ description = self._search_regex(r'(?m)<div id="public_comment">\s*<p>\s*([^<]*?)\s*</p>', webpage, 'description', default=None)
+ uploader_id = self._search_regex(r'<a href="https://www\.clubdam\.com/app/damtomo/member/info/Profile\.do\?damtomoId=([^"]+)"', webpage, 'uploader_id', default=None)
+
+ data_dict = {
+ mobj.group('class'): re.sub(r'\s+', ' ', clean_html(mobj.group('value')))
+ for mobj in re.finditer(r'(?s)<(p|div)\s+class="(?P<class>[^" ]+?)">(?P<value>.+?)</\1>', webpage)}
+
+ # since videos do not have title, give the name of song instead
+ data_dict['user_name'] = re.sub(r'\s*さん\s*$', '', data_dict['user_name'])
+ title = data_dict.get('song_title')
+
+ stream_tree = self._download_xml(
+ self._DKML_XML_URL % video_id, video_id, note='Requesting stream information', encoding='sjis',
+ # doing this has no problem since there is no character outside ASCII,
+ # and never likely to happen in the future
+ transform_source=lambda x: re.sub(r'\s*encoding="[^"]+?"', '', x))
+ m3u8_url = try_get(stream_tree, lambda x: x.find(
+ './/d:streamingUrl', {'d': self._DKML_XML_NS}).text.strip(), compat_str)
+ if not m3u8_url:
+ raise ExtractorError('Failed to obtain m3u8 URL')
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'uploader_id': uploader_id,
+ 'description': description,
+ 'uploader': data_dict.get('user_name'),
+ 'upload_date': unified_strdate(self._search_regex(r'(\d{4}/\d{2}/\d{2})', data_dict.get('date'), 'upload_date', default=None)),
+ 'view_count': int_or_none(self._search_regex(r'(\d+)', data_dict['audience'], 'view_count', default=None)),
+ 'like_count': int_or_none(self._search_regex(r'(\d+)', data_dict['nice'], 'like_count', default=None)),
+ 'track': title,
+ 'artist': data_dict.get('song_artist'),
+ 'formats': formats,
+ }
+
+
+class DamtomoVideoIE(DamtomoBaseIE):
+ IE_NAME = 'damtomo:video'
+ _VALID_URL = r'https?://(?:www\.)?clubdam\.com/app/damtomo/(?:SP/)?karaokeMovie/StreamingDkm\.do\?karaokeMovieId=(?P<id>\d+)'
+ _WEBPAGE_URL_TMPL = 'https://www.clubdam.com/app/damtomo/karaokeMovie/StreamingDkm.do?karaokeMovieId=%s'
+ _DKML_XML_URL = 'https://www.clubdam.com/app/damtomo/karaokeMovie/GetStreamingDkmUrlXML.do?movieSelectFlg=2&karaokeMovieId=%s'
+ _DKML_XML_NS = 'https://www.clubdam.com/app/damtomo/karaokeMovie/GetStreamingDkmUrlXML'
+ _TESTS = [{
+ 'url': 'https://www.clubdam.com/app/damtomo/karaokeMovie/StreamingDkm.do?karaokeMovieId=2414316',
+ 'info_dict': {
+ 'id': '2414316',
+ 'title': 'Get Wild',
+ 'uploader': 'Kドロン',
+ 'uploader_id': 'ODk5NTQwMzQ',
+ 'track': 'Get Wild',
+ 'artist': 'TM NETWORK(TMN)',
+ 'upload_date': '20201226',
+ }
+ }]
+
+
+class DamtomoRecordIE(DamtomoBaseIE):
+ IE_NAME = 'damtomo:record'
+ _VALID_URL = r'https?://(?:www\.)?clubdam\.com/app/damtomo/(?:SP/)?karaokePost/StreamingKrk\.do\?karaokeContributeId=(?P<id>\d+)'
+ _WEBPAGE_URL_TMPL = 'https://www.clubdam.com/app/damtomo/karaokePost/StreamingKrk.do?karaokeContributeId=%s'
+ _DKML_XML_URL = 'https://www.clubdam.com/app/damtomo/karaokePost/GetStreamingKrkUrlXML.do?karaokeContributeId=%s'
+ _DKML_XML_NS = 'https://www.clubdam.com/app/damtomo/karaokePost/GetStreamingKrkUrlXML'
+ _TESTS = [{
+ 'url': 'https://www.clubdam.com/app/damtomo/karaokePost/StreamingKrk.do?karaokeContributeId=27376862',
+ 'info_dict': {
+ 'id': '27376862',
+ 'title': 'イカSUMMER [良音]',
+ 'description': None,
+ 'uploader': 'NANA',
+ 'uploader_id': 'MzAyMDExNTY',
+ 'upload_date': '20210721',
+ 'view_count': 4,
+ 'like_count': 1,
+ 'track': 'イカSUMMER [良音]',
+ 'artist': 'ORANGE RANGE',
+ }
+ }, {
+ 'url': 'https://www.clubdam.com/app/damtomo/karaokePost/StreamingKrk.do?karaokeContributeId=27489418',
+ 'info_dict': {
+ 'id': '27489418',
+ 'title': '心みだれて〜say it with flowers〜(生音)',
+ 'uploader_id': 'NjI1MjI2MjU',
+ 'description': 'やっぱりキーを下げて正解だった感じ。リベンジ成功ということで。',
+ 'uploader': '箱の「中の人」',
+ 'upload_date': '20210815',
+ 'view_count': 5,
+ 'like_count': 3,
+ 'track': '心みだれて〜say it with flowers〜(生音)',
+ 'artist': '小林明子',
+ }
+ }]
diff --git a/hypervideo_dl/extractor/daum.py b/hypervideo_dl/extractor/daum.py
index 1370955..8aa2af9 100644
--- a/hypervideo_dl/extractor/daum.py
+++ b/hypervideo_dl/extractor/daum.py
@@ -6,10 +6,9 @@ import itertools
from .common import InfoExtractor
from ..compat import (
- compat_parse_qs,
compat_urllib_parse_unquote,
- compat_urlparse,
)
+from ..utils import parse_qs
class DaumBaseIE(InfoExtractor):
@@ -155,10 +154,10 @@ class DaumListIE(InfoExtractor):
return name, entries
def _check_clip(self, url, list_id):
- query_dict = compat_parse_qs(compat_urlparse.urlparse(url).query)
+ query_dict = parse_qs(url)
if 'clipid' in query_dict:
clip_id = query_dict['clipid'][0]
- if self._downloader.params.get('noplaylist'):
+ if self.get_param('noplaylist'):
self.to_screen('Downloading just video %s because of --no-playlist' % clip_id)
return self.url_result(DaumClipIE._URL_TEMPLATE % clip_id, 'DaumClip')
else:
@@ -256,7 +255,7 @@ class DaumUserIE(DaumListIE):
if clip_result:
return clip_result
- query_dict = compat_parse_qs(compat_urlparse.urlparse(url).query)
+ query_dict = parse_qs(url)
if 'playlistid' in query_dict:
playlist_id = query_dict['playlistid'][0]
return self.url_result(DaumPlaylistIE._URL_TEMPLATE % playlist_id, 'DaumPlaylist')
diff --git a/hypervideo_dl/extractor/dbtv.py b/hypervideo_dl/extractor/dbtv.py
index aaedf2e..8e73176 100644
--- a/hypervideo_dl/extractor/dbtv.py
+++ b/hypervideo_dl/extractor/dbtv.py
@@ -38,7 +38,7 @@ class DBTVIE(InfoExtractor):
webpage)]
def _real_extract(self, url):
- display_id, video_id = re.match(self._VALID_URL, url).groups()
+ display_id, video_id = self._match_valid_url(url).groups()
info = {
'_type': 'url_transparent',
'id': video_id,
diff --git a/hypervideo_dl/extractor/deezer.py b/hypervideo_dl/extractor/deezer.py
index a38b268..7ba02e5 100644
--- a/hypervideo_dl/extractor/deezer.py
+++ b/hypervideo_dl/extractor/deezer.py
@@ -1,7 +1,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..utils import (
@@ -11,28 +10,15 @@ from ..utils import (
)
-class DeezerPlaylistIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?deezer\.com/playlist/(?P<id>[0-9]+)'
- _TEST = {
- 'url': 'http://www.deezer.com/playlist/176747451',
- 'info_dict': {
- 'id': '176747451',
- 'title': 'Best!',
- 'uploader': 'Anonymous',
- 'thumbnail': r're:^https?://cdn-images\.deezer\.com/images/cover/.*\.jpg$',
- },
- 'playlist_count': 30,
- 'skip': 'Only available in .de',
- }
-
- def _real_extract(self, url):
- if 'test' not in self._downloader.params:
- self._downloader.report_warning('For now, this extractor only supports the 30 second previews. Patches welcome!')
+class DeezerBaseInfoExtractor(InfoExtractor):
+ def get_data(self, url):
+ if not self.get_param('test'):
+ self.report_warning('For now, this extractor only supports the 30 second previews. Patches welcome!')
- mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('id')
+ mobj = self._match_valid_url(url)
+ data_id = mobj.group('id')
- webpage = self._download_webpage(url, playlist_id)
+ webpage = self._download_webpage(url, data_id)
geoblocking_msg = self._html_search_regex(
r'<p class="soon-txt">(.*?)</p>', webpage, 'geoblocking message',
default=None)
@@ -45,6 +31,24 @@ class DeezerPlaylistIE(InfoExtractor):
r'naboo\.display\(\'[^\']+\',\s*(.*?)\);\n'),
webpage, 'data JSON')
data = json.loads(data_json)
+ return data_id, webpage, data
+
+
+class DeezerPlaylistIE(DeezerBaseInfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?deezer\.com/(../)?playlist/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.deezer.com/playlist/176747451',
+ 'info_dict': {
+ 'id': '176747451',
+ 'title': 'Best!',
+ 'uploader': 'anonymous',
+ 'thumbnail': r're:^https?://(e-)?cdns-images\.dzcdn\.net/images/cover/.*\.jpg$',
+ },
+ 'playlist_count': 29,
+ }
+
+ def _real_extract(self, url):
+ playlist_id, webpage, data = self.get_data(url)
playlist_title = data.get('DATA', {}).get('TITLE')
playlist_uploader = data.get('DATA', {}).get('PARENT_USERNAME')
@@ -52,31 +56,23 @@ class DeezerPlaylistIE(InfoExtractor):
r'<img id="naboo_playlist_image".*?src="([^"]+)"', webpage,
'playlist thumbnail')
- preview_pattern = self._search_regex(
- r"var SOUND_PREVIEW_GATEWAY\s*=\s*'([^']+)';", webpage,
- 'preview URL pattern', fatal=False)
entries = []
- for s in data['SONGS']['data']:
- puid = s['MD5_ORIGIN']
- preview_video_url = preview_pattern.\
- replace('{0}', puid[0]).\
- replace('{1}', puid).\
- replace('{2}', s['MEDIA_VERSION'])
+ for s in data.get('SONGS', {}).get('data'):
formats = [{
'format_id': 'preview',
- 'url': preview_video_url,
+ 'url': s.get('MEDIA', [{}])[0].get('HREF'),
'preference': -100, # Only the first 30 seconds
'ext': 'mp3',
}]
self._sort_formats(formats)
artists = ', '.join(
- orderedSet(a['ART_NAME'] for a in s['ARTISTS']))
+ orderedSet(a.get('ART_NAME') for a in s.get('ARTISTS')))
entries.append({
- 'id': s['SNG_ID'],
+ 'id': s.get('SNG_ID'),
'duration': int_or_none(s.get('DURATION')),
- 'title': '%s - %s' % (artists, s['SNG_TITLE']),
- 'uploader': s['ART_NAME'],
- 'uploader_id': s['ART_ID'],
+ 'title': '%s - %s' % (artists, s.get('SNG_TITLE')),
+ 'uploader': s.get('ART_NAME'),
+ 'uploader_id': s.get('ART_ID'),
'age_limit': 16 if s.get('EXPLICIT_LYRICS') == '1' else 0,
'formats': formats,
})
@@ -89,3 +85,62 @@ class DeezerPlaylistIE(InfoExtractor):
'thumbnail': playlist_thumbnail,
'entries': entries,
}
+
+
+class DeezerAlbumIE(DeezerBaseInfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?deezer\.com/(../)?album/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'https://www.deezer.com/fr/album/67505622',
+ 'info_dict': {
+ 'id': '67505622',
+ 'title': 'Last Week',
+ 'uploader': 'Home Brew',
+ 'thumbnail': r're:^https?://(e-)?cdns-images\.dzcdn\.net/images/cover/.*\.jpg$',
+ },
+ 'playlist_count': 7,
+ }
+
+ def _real_extract(self, url):
+ album_id, webpage, data = self.get_data(url)
+
+ album_title = data.get('DATA', {}).get('ALB_TITLE')
+ album_uploader = data.get('DATA', {}).get('ART_NAME')
+ album_thumbnail = self._search_regex(
+ r'<img id="naboo_album_image".*?src="([^"]+)"', webpage,
+ 'album thumbnail')
+
+ entries = []
+ for s in data.get('SONGS', {}).get('data'):
+ formats = [{
+ 'format_id': 'preview',
+ 'url': s.get('MEDIA', [{}])[0].get('HREF'),
+ 'preference': -100, # Only the first 30 seconds
+ 'ext': 'mp3',
+ }]
+ self._sort_formats(formats)
+ artists = ', '.join(
+ orderedSet(a.get('ART_NAME') for a in s.get('ARTISTS')))
+ entries.append({
+ 'id': s.get('SNG_ID'),
+ 'duration': int_or_none(s.get('DURATION')),
+ 'title': '%s - %s' % (artists, s.get('SNG_TITLE')),
+ 'uploader': s.get('ART_NAME'),
+ 'uploader_id': s.get('ART_ID'),
+ 'age_limit': 16 if s.get('EXPLICIT_LYRICS') == '1' else 0,
+ 'formats': formats,
+ 'track': s.get('SNG_TITLE'),
+ 'track_number': int_or_none(s.get('TRACK_NUMBER')),
+ 'track_id': s.get('SNG_ID'),
+ 'artist': album_uploader,
+ 'album': album_title,
+ 'album_artist': album_uploader,
+ })
+
+ return {
+ '_type': 'playlist',
+ 'id': album_id,
+ 'title': album_title,
+ 'uploader': album_uploader,
+ 'thumbnail': album_thumbnail,
+ 'entries': entries,
+ }
diff --git a/hypervideo_dl/extractor/dfb.py b/hypervideo_dl/extractor/dfb.py
index a4d0448..97f70fc 100644
--- a/hypervideo_dl/extractor/dfb.py
+++ b/hypervideo_dl/extractor/dfb.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import unified_strdate
@@ -23,7 +22,7 @@ class DFBIE(InfoExtractor):
}
def _real_extract(self, url):
- display_id, video_id = re.match(self._VALID_URL, url).groups()
+ display_id, video_id = self._match_valid_url(url).groups()
player_info = self._download_xml(
'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id,
diff --git a/hypervideo_dl/extractor/digiteka.py b/hypervideo_dl/extractor/digiteka.py
index 3dfde0d..d632047 100644
--- a/hypervideo_dl/extractor/digiteka.py
+++ b/hypervideo_dl/extractor/digiteka.py
@@ -70,7 +70,7 @@ class DigitekaIE(InfoExtractor):
return mobj.group('url')
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
video_type = mobj.group('embed_type') or mobj.group('site_type')
if video_type == 'music':
diff --git a/hypervideo_dl/extractor/discovery.py b/hypervideo_dl/extractor/discovery.py
index e0139cc..fd3ad75 100644
--- a/hypervideo_dl/extractor/discovery.py
+++ b/hypervideo_dl/extractor/discovery.py
@@ -1,7 +1,6 @@
from __future__ import unicode_literals
import random
-import re
import string
from .discoverygo import DiscoveryGoBaseIE
@@ -62,7 +61,7 @@ class DiscoveryIE(DiscoveryGoBaseIE):
_API_BASE_URL = 'https://api.discovery.com/v1/'
def _real_extract(self, url):
- site, show_slug, display_id = re.match(self._VALID_URL, url).groups()
+ site, show_slug, display_id = self._match_valid_url(url).groups()
access_token = None
cookies = self._get_cookies(url)
diff --git a/hypervideo_dl/extractor/discoverynetworks.py b/hypervideo_dl/extractor/discoverynetworks.py
index c512b95..f43c871 100644
--- a/hypervideo_dl/extractor/discoverynetworks.py
+++ b/hypervideo_dl/extractor/discoverynetworks.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .dplay import DPlayIE
@@ -35,7 +34,7 @@ class DiscoveryNetworksDeIE(DPlayIE):
}]
def _real_extract(self, url):
- domain, programme, alternate_id = re.match(self._VALID_URL, url).groups()
+ domain, programme, alternate_id = self._match_valid_url(url).groups()
country = 'GB' if domain == 'dplay.co.uk' else 'DE'
realm = 'questuk' if country == 'GB' else domain.replace('.', '')
return self._get_disco_api_info(
diff --git a/hypervideo_dl/extractor/discoveryplusindia.py b/hypervideo_dl/extractor/discoveryplusindia.py
new file mode 100644
index 0000000..5180140
--- /dev/null
+++ b/hypervideo_dl/extractor/discoveryplusindia.py
@@ -0,0 +1,98 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from ..compat import compat_str
+from ..utils import try_get
+from .common import InfoExtractor
+from .dplay import DPlayIE
+
+
+class DiscoveryPlusIndiaIE(DPlayIE):
+ _VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/videos?' + DPlayIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://www.discoveryplus.in/videos/how-do-they-do-it/fugu-and-more?seasonId=8&type=EPISODE',
+ 'info_dict': {
+ 'id': '27104',
+ 'ext': 'mp4',
+ 'display_id': 'how-do-they-do-it/fugu-and-more',
+ 'title': 'Fugu and More',
+ 'description': 'The Japanese catch, prepare and eat the deadliest fish on the planet.',
+ 'duration': 1319,
+ 'timestamp': 1582309800,
+ 'upload_date': '20200221',
+ 'series': 'How Do They Do It?',
+ 'season_number': 8,
+ 'episode_number': 2,
+ 'creator': 'Discovery Channel',
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
+ },
+ 'skip': 'Cookies (not necessarily logged in) are needed'
+ }]
+
+ def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
+ headers['x-disco-params'] = 'realm=%s' % realm
+ headers['x-disco-client'] = 'WEB:UNKNOWN:dplus-india:17.0.0'
+
+ def _download_video_playback_info(self, disco_base, video_id, headers):
+ return self._download_json(
+ disco_base + 'playback/v3/videoPlaybackInfo',
+ video_id, headers=headers, data=json.dumps({
+ 'deviceInfo': {
+ 'adBlocker': False,
+ },
+ 'videoId': video_id,
+ }).encode('utf-8'))['data']['attributes']['streaming']
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ return self._get_disco_api_info(
+ url, display_id, 'ap2-prod-direct.discoveryplus.in', 'dplusindia', 'in')
+
+
+class DiscoveryPlusIndiaShowIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/show/(?P<show_name>[^/]+)/?(?:[?#]|$)'
+ _TESTS = [{
+ 'url': 'https://www.discoveryplus.in/show/how-do-they-do-it',
+ 'playlist_mincount': 140,
+ 'info_dict': {
+ 'id': 'how-do-they-do-it',
+ },
+ }]
+
+ def _entries(self, show_name):
+ headers = {
+ 'x-disco-client': 'WEB:UNKNOWN:dplus-india:prod',
+ 'x-disco-params': 'realm=dplusindia',
+ 'referer': 'https://www.discoveryplus.in/',
+ }
+ show_url = 'https://ap2-prod-direct.discoveryplus.in/cms/routes/show/{}?include=default'.format(show_name)
+ show_json = self._download_json(show_url,
+ video_id=show_name,
+ headers=headers)['included'][4]['attributes']['component']
+ show_id = show_json['mandatoryParams'].split('=')[-1]
+ season_url = 'https://ap2-prod-direct.discoveryplus.in/content/videos?sort=episodeNumber&filter[seasonNumber]={}&filter[show.id]={}&page[size]=100&page[number]={}'
+ for season in show_json['filters'][0]['options']:
+ season_id = season['id']
+ total_pages, page_num = 1, 0
+ while page_num < total_pages:
+ season_json = self._download_json(season_url.format(season_id, show_id, compat_str(page_num + 1)),
+ video_id=show_id, headers=headers,
+ note='Downloading JSON metadata%s' % (' page %d' % page_num if page_num else ''))
+ if page_num == 0:
+ total_pages = try_get(season_json, lambda x: x['meta']['totalPages'], int) or 1
+ episodes_json = season_json['data']
+ for episode in episodes_json:
+ video_id = episode['attributes']['path']
+ yield self.url_result(
+ 'https://discoveryplus.in/videos/%s' % video_id,
+ ie=DiscoveryPlusIndiaIE.ie_key(), video_id=video_id)
+ page_num += 1
+
+ def _real_extract(self, url):
+ show_name = self._match_valid_url(url).group('show_name')
+ return self.playlist_result(self._entries(show_name), playlist_id=show_name)
diff --git a/hypervideo_dl/extractor/disney.py b/hypervideo_dl/extractor/disney.py
index 0eee82f..f018cbe 100644
--- a/hypervideo_dl/extractor/disney.py
+++ b/hypervideo_dl/extractor/disney.py
@@ -9,7 +9,6 @@ from ..utils import (
unified_strdate,
compat_str,
determine_ext,
- ExtractorError,
update_url_query,
)
@@ -78,7 +77,7 @@ class DisneyIE(InfoExtractor):
}]
def _real_extract(self, url):
- domain, video_id, display_id = re.match(self._VALID_URL, url).groups()
+ domain, video_id, display_id = self._match_valid_url(url).groups()
if not video_id:
webpage = self._download_webpage(url, display_id)
grill = re.sub(r'"\s*\+\s*"', '', self._search_regex(
@@ -140,7 +139,7 @@ class DisneyIE(InfoExtractor):
'vcodec': 'none' if (width == 0 and height == 0) else None,
})
if not formats and video_data.get('expired'):
- raise ExtractorError(
+ self.raise_no_formats(
'%s said: %s' % (self.IE_NAME, page_data['translations']['video_expired']),
expected=True)
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/dispeak.py b/hypervideo_dl/extractor/dispeak.py
index 276fd4b..be7ad12 100644
--- a/hypervideo_dl/extractor/dispeak.py
+++ b/hypervideo_dl/extractor/dispeak.py
@@ -94,6 +94,7 @@ class DigitallySpeakingIE(InfoExtractor):
'play_path': remove_end(audio.get('url'), '.flv'),
'ext': 'flv',
'vcodec': 'none',
+ 'quality': 1,
'format_id': audio.get('code'),
})
for video_key, format_id, preference in (
@@ -107,7 +108,6 @@ class DigitallySpeakingIE(InfoExtractor):
'ext': 'flv',
'format_note': '%s video' % video_key,
'quality': preference,
- 'preference': preference,
'format_id': format_id,
})
return formats
diff --git a/hypervideo_dl/extractor/dlive.py b/hypervideo_dl/extractor/dlive.py
index d95c67a..90462c0 100644
--- a/hypervideo_dl/extractor/dlive.py
+++ b/hypervideo_dl/extractor/dlive.py
@@ -1,7 +1,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..utils import int_or_none
@@ -26,7 +25,7 @@ class DLiveVODIE(InfoExtractor):
}]
def _real_extract(self, url):
- uploader_id, vod_id = re.match(self._VALID_URL, url).groups()
+ uploader_id, vod_id = self._match_valid_url(url).groups()
broadcast = self._download_json(
'https://graphigo.prd.dlive.tv/', vod_id,
data=json.dumps({'query': '''query {
diff --git a/hypervideo_dl/extractor/doodstream.py b/hypervideo_dl/extractor/doodstream.py
new file mode 100644
index 0000000..2c9ea68
--- /dev/null
+++ b/hypervideo_dl/extractor/doodstream.py
@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import string
+import random
+import time
+
+from .common import InfoExtractor
+
+
+class DoodStreamIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?dood\.(?:to|watch)/[ed]/(?P<id>[a-z0-9]+)'
+ _TESTS = [{
+ 'url': 'http://dood.to/e/5s1wmbdacezb',
+ 'md5': '4568b83b31e13242b3f1ff96c55f0595',
+ 'info_dict': {
+ 'id': '5s1wmbdacezb',
+ 'ext': 'mp4',
+ 'title': 'Kat Wonders - Monthly May 2020',
+ 'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com',
+ 'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg',
+ }
+ }, {
+ 'url': 'https://dood.to/d/jzrxn12t2s7n',
+ 'md5': '3207e199426eca7c2aa23c2872e6728a',
+ 'info_dict': {
+ 'id': 'jzrxn12t2s7n',
+ 'ext': 'mp4',
+ 'title': 'Stacy Cruz Cute ALLWAYSWELL',
+ 'description': 'Stacy Cruz Cute ALLWAYSWELL | DoodStream.com',
+ 'thumbnail': 'https://img.doodcdn.com/snaps/8edqd5nppkac3x8u.jpg',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ if '/d/' in url:
+ url = "https://dood.to" + self._html_search_regex(
+ r'<iframe src="(/e/[a-z0-9]+)"', webpage, 'embed')
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_meta(['og:title', 'twitter:title'],
+ webpage, default=None)
+ thumb = self._html_search_meta(['og:image', 'twitter:image'],
+ webpage, default=None)
+ token = self._html_search_regex(r'[?&]token=([a-z0-9]+)[&\']', webpage, 'token')
+ description = self._html_search_meta(
+ ['og:description', 'description', 'twitter:description'],
+ webpage, default=None)
+ auth_url = 'https://dood.to' + self._html_search_regex(
+ r'(/pass_md5.*?)\'', webpage, 'pass_md5')
+ headers = {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/66.0',
+ 'referer': url
+ }
+
+ webpage = self._download_webpage(auth_url, video_id, headers=headers)
+ final_url = webpage + ''.join([random.choice(string.ascii_letters + string.digits) for _ in range(10)]) + "?token=" + token + "&expiry=" + str(int(time.time() * 1000))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': final_url,
+ 'http_headers': headers,
+ 'ext': 'mp4',
+ 'description': description,
+ 'thumbnail': thumb,
+ }
diff --git a/hypervideo_dl/extractor/dplay.py b/hypervideo_dl/extractor/dplay.py
index bbb1990..e0e446b 100644
--- a/hypervideo_dl/extractor/dplay.py
+++ b/hypervideo_dl/extractor/dplay.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..compat import compat_HTTPError
@@ -287,7 +286,7 @@ class DPlayIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
display_id = mobj.group('id')
domain = mobj.group('domain').lstrip('www.')
country = mobj.group('country') or mobj.group('subdomain_country') or mobj.group('plus_country')
@@ -296,6 +295,35 @@ class DPlayIE(InfoExtractor):
url, display_id, host, 'dplay' + country, country)
+class HGTVDeIE(DPlayIE):
+ _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/',
+ 'info_dict': {
+ 'id': '151205',
+ 'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette',
+ 'ext': 'mp4',
+ 'title': 'Wer braucht schon eine Toilette',
+ 'description': 'md5:05b40a27e7aed2c9172de34d459134e2',
+ 'duration': 1177.024,
+ 'timestamp': 1595705400,
+ 'upload_date': '20200725',
+ 'creator': 'HGTV',
+ 'series': 'Tiny House - klein, aber oho',
+ 'season_number': 3,
+ 'episode_number': 3,
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ return self._get_disco_api_info(
+ url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de')
+
+
class DiscoveryPlusIE(DPlayIE):
_VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video' + DPlayIE._PATH_REGEX
_TESTS = [{
@@ -317,8 +345,11 @@ class DiscoveryPlusIE(DPlayIE):
'skip': 'Available for Premium users',
}]
+ _PRODUCT = 'dplus_us'
+ _API_URL = 'us1-prod-direct.discoveryplus.com'
+
def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
- headers['x-disco-client'] = 'WEB:UNKNOWN:dplus_us:15.0.0'
+ headers['x-disco-client'] = f'WEB:UNKNOWN:{self._PRODUCT}:25.2.6'
def _download_video_playback_info(self, disco_base, video_id, headers):
return self._download_json(
@@ -330,40 +361,71 @@ class DiscoveryPlusIE(DPlayIE):
'videoId': video_id,
'wisteriaProperties': {
'platform': 'desktop',
- 'product': 'dplus_us',
+ 'product': self._PRODUCT,
},
}).encode('utf-8'))['data']['attributes']['streaming']
def _real_extract(self, url):
display_id = self._match_id(url)
return self._get_disco_api_info(
- url, display_id, 'us1-prod-direct.discoveryplus.com', 'go', 'us')
+ url, display_id, self._API_URL, 'go', 'us')
-class HGTVDeIE(DPlayIE):
- _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayIE._PATH_REGEX
+class ScienceChannelIE(DiscoveryPlusIE):
+ _VALID_URL = r'https?://(?:www\.)?sciencechannel\.com/video' + DPlayIE._PATH_REGEX
_TESTS = [{
- 'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/',
+ 'url': 'https://www.sciencechannel.com/video/strangest-things-science-atve-us/nazi-mystery-machine',
'info_dict': {
- 'id': '151205',
- 'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette',
+ 'id': '2842849',
+ 'display_id': 'strangest-things-science-atve-us/nazi-mystery-machine',
'ext': 'mp4',
- 'title': 'Wer braucht schon eine Toilette',
- 'description': 'md5:05b40a27e7aed2c9172de34d459134e2',
- 'duration': 1177.024,
- 'timestamp': 1595705400,
- 'upload_date': '20200725',
- 'creator': 'HGTV',
- 'series': 'Tiny House - klein, aber oho',
- 'season_number': 3,
- 'episode_number': 3,
+ 'title': 'Nazi Mystery Machine',
+ 'description': 'Experts investigate the secrets of a revolutionary encryption machine.',
+ 'season_number': 1,
+ 'episode_number': 1,
},
- 'params': {
- 'format': 'bestvideo',
+ 'skip': 'Available for Premium users',
+ }]
+
+ _PRODUCT = 'sci'
+ _API_URL = 'us1-prod-direct.sciencechannel.com'
+
+
+class DIYNetworkIE(DiscoveryPlusIE):
+ _VALID_URL = r'https?://(?:watch\.)?diynetwork\.com/video' + DPlayIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://watch.diynetwork.com/video/pool-kings-diy-network/bringing-beach-life-to-texas',
+ 'info_dict': {
+ 'id': '2309730',
+ 'display_id': 'pool-kings-diy-network/bringing-beach-life-to-texas',
+ 'ext': 'mp4',
+ 'title': 'Bringing Beach Life to Texas',
+ 'description': 'The Pool Kings give a family a day at the beach in their own backyard.',
+ 'season_number': 10,
+ 'episode_number': 2,
},
+ 'skip': 'Available for Premium users',
}]
- def _real_extract(self, url):
- display_id = self._match_id(url)
- return self._get_disco_api_info(
- url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de')
+ _PRODUCT = 'diy'
+ _API_URL = 'us1-prod-direct.watch.diynetwork.com'
+
+
+class AnimalPlanetIE(DiscoveryPlusIE):
+ _VALID_URL = r'https?://(?:www\.)?animalplanet\.com/video' + DPlayIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://www.animalplanet.com/video/north-woods-law-animal-planet/squirrel-showdown',
+ 'info_dict': {
+ 'id': '3338923',
+ 'display_id': 'north-woods-law-animal-planet/squirrel-showdown',
+ 'ext': 'mp4',
+ 'title': 'Squirrel Showdown',
+ 'description': 'A woman is suspected of being in possession of flying squirrel kits.',
+ 'season_number': 16,
+ 'episode_number': 11,
+ },
+ 'skip': 'Available for Premium users',
+ }]
+
+ _PRODUCT = 'apl'
+ _API_URL = 'us1-prod-direct.animalplanet.com'
diff --git a/hypervideo_dl/extractor/drbonanza.py b/hypervideo_dl/extractor/drbonanza.py
index 164e97c..ea0f06d 100644
--- a/hypervideo_dl/extractor/drbonanza.py
+++ b/hypervideo_dl/extractor/drbonanza.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -26,7 +25,7 @@ class DRBonanzaIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id, display_id = mobj.group('id', 'display_id')
webpage = self._download_webpage(url, display_id)
diff --git a/hypervideo_dl/extractor/dropbox.py b/hypervideo_dl/extractor/dropbox.py
index 2bedcc1..6a7d050 100644
--- a/hypervideo_dl/extractor/dropbox.py
+++ b/hypervideo_dl/extractor/dropbox.py
@@ -17,7 +17,7 @@ class DropboxIE(InfoExtractor):
'info_dict': {
'id': 'nelirfsxnmcfbfh',
'ext': 'mp4',
- 'title': 'hypervideo test video \'ä"BaW_jenozKc'
+ 'title': 'youtube-dl test video \'ä"BaW_jenozKc'
}
}, {
'url': 'https://www.dropbox.com/sh/662glsejgzoj9sr/AAByil3FGH9KFNZ13e08eSa1a/Pregame%20Ceremony%20Program%20PA%2020140518.m4v',
@@ -26,7 +26,7 @@ class DropboxIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
fn = compat_urllib_parse_unquote(url_basename(url))
title = os.path.splitext(fn)[0]
diff --git a/hypervideo_dl/extractor/drtuber.py b/hypervideo_dl/extractor/drtuber.py
index 2baea58..540b86a 100644
--- a/hypervideo_dl/extractor/drtuber.py
+++ b/hypervideo_dl/extractor/drtuber.py
@@ -42,7 +42,7 @@ class DrTuberIE(InfoExtractor):
webpage)
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
diff --git a/hypervideo_dl/extractor/drtv.py b/hypervideo_dl/extractor/drtv.py
index c0036ad..7bb15f8 100644
--- a/hypervideo_dl/extractor/drtv.py
+++ b/hypervideo_dl/extractor/drtv.py
@@ -242,7 +242,7 @@ class DRTVIE(InfoExtractor):
elif target == 'HLS':
formats.extend(self._extract_m3u8_formats(
uri, video_id, 'mp4', entry_protocol='m3u8_native',
- preference=preference, m3u8_id=format_id,
+ quality=preference, m3u8_id=format_id,
fatal=False))
else:
bitrate = link.get('Bitrate')
@@ -254,7 +254,7 @@ class DRTVIE(InfoExtractor):
'tbr': int_or_none(bitrate),
'ext': link.get('FileFormat'),
'vcodec': 'none' if kind == 'AudioResource' else None,
- 'preference': preference,
+ 'quality': preference,
})
subtitles_list = asset.get('SubtitlesList') or asset.get('Subtitleslist')
if isinstance(subtitles_list, list):
diff --git a/hypervideo_dl/extractor/dtube.py b/hypervideo_dl/extractor/dtube.py
index 114d2db..ad247b7 100644
--- a/hypervideo_dl/extractor/dtube.py
+++ b/hypervideo_dl/extractor/dtube.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import json
-import re
from socket import timeout
from .common import InfoExtractor
@@ -32,7 +31,7 @@ class DTubeIE(InfoExtractor):
}
def _real_extract(self, url):
- uploader_id, video_id = re.match(self._VALID_URL, url).groups()
+ uploader_id, video_id = self._match_valid_url(url).groups()
result = self._download_json('https://api.steemit.com/', video_id, data=json.dumps({
'jsonrpc': '2.0',
'method': 'get_content',
diff --git a/hypervideo_dl/extractor/duboku.py b/hypervideo_dl/extractor/duboku.py
new file mode 100644
index 0000000..a875978
--- /dev/null
+++ b/hypervideo_dl/extractor/duboku.py
@@ -0,0 +1,242 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ clean_html,
+ extract_attributes,
+ ExtractorError,
+ get_elements_by_class,
+ int_or_none,
+ js_to_json,
+ smuggle_url,
+ unescapeHTML,
+)
+
+
+def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
+ """Return the content of the tag with the specified attribute in the passed HTML document"""
+
+ if tag is None:
+ tag = '[a-zA-Z0-9:._-]+'
+ if attribute is None:
+ attribute = ''
+ else:
+ attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute)
+ if value is None:
+ value = ''
+ else:
+ value = re.escape(value) if escape_value else value
+ value = '=[\'"]?(?P<value>%s)[\'"]?' % value
+
+ retlist = []
+ for m in re.finditer(r'''(?xs)
+ <(?P<tag>%s)
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
+ %s%s
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
+ \s*>
+ (?P<content>.*?)
+ </\1>
+ ''' % (tag, attribute, value), html):
+ retlist.append(m)
+
+ return retlist
+
+
+def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
+ retval = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
+ return retval[0] if retval else None
+
+
+class DubokuIE(InfoExtractor):
+ IE_NAME = 'duboku'
+ IE_DESC = 'www.duboku.co'
+
+ _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
+ _TESTS = [{
+ 'url': 'https://www.duboku.co/vodplay/1575-1-1.html',
+ 'info_dict': {
+ 'id': '1575-1-1',
+ 'ext': 'ts',
+ 'series': '白色月光',
+ 'title': 'contains:白色月光',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'params': {
+ 'skip_download': 'm3u8 download',
+ },
+ }, {
+ 'url': 'https://www.duboku.co/vodplay/1588-1-1.html',
+ 'info_dict': {
+ 'id': '1588-1-1',
+ 'ext': 'ts',
+ 'series': '亲爱的自己',
+ 'title': 'contains:预告片',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'params': {
+ 'skip_download': 'm3u8 download',
+ },
+ }]
+
+ _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ temp = video_id.split('-')
+ series_id = temp[0]
+ season_id = temp[1]
+ episode_id = temp[2]
+
+ webpage_url = 'https://www.duboku.co/vodplay/%s.html' % video_id
+ webpage_html = self._download_webpage(webpage_url, video_id)
+
+ # extract video url
+
+ player_data = self._search_regex(
+ self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
+ player_data = self._parse_json(player_data, video_id, js_to_json)
+
+ # extract title
+
+ temp = get_elements_by_class('title', webpage_html)
+ series_title = None
+ title = None
+ for html in temp:
+ mobj = re.search(r'<a\s+.*>(.*)</a>', html)
+ if mobj:
+ href = extract_attributes(mobj.group(0)).get('href')
+ if href:
+ mobj1 = re.search(r'/(\d+)\.html', href)
+ if mobj1 and mobj1.group(1) == series_id:
+ series_title = clean_html(mobj.group(0))
+ series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title)
+ title = clean_html(html)
+ title = re.sub(r'[\s\r\n\t]+', ' ', title)
+ break
+
+ data_url = player_data.get('url')
+ if not data_url:
+ raise ExtractorError('Cannot find url in player_data')
+ data_from = player_data.get('from')
+
+ # if it is an embedded iframe, maybe it's an external source
+ if data_from == 'iframe':
+ # use _type url_transparent to retain the meaningful details
+ # of the video.
+ return {
+ '_type': 'url_transparent',
+ 'url': smuggle_url(data_url, {'http_headers': {'Referer': webpage_url}}),
+ 'id': video_id,
+ 'title': title,
+ 'series': series_title,
+ 'season_number': int_or_none(season_id),
+ 'season_id': season_id,
+ 'episode_number': int_or_none(episode_id),
+ 'episode_id': episode_id,
+ }
+
+ formats = self._extract_m3u8_formats(data_url, video_id, 'mp4')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'series': series_title,
+ 'season_number': int_or_none(season_id),
+ 'season_id': season_id,
+ 'episode_number': int_or_none(episode_id),
+ 'episode_id': episode_id,
+ 'formats': formats,
+ 'http_headers': {'Referer': 'https://www.duboku.co/static/player/videojs.html'}
+ }
+
+
+class DubokuPlaylistIE(InfoExtractor):
+ IE_NAME = 'duboku:list'
+ IE_DESC = 'www.duboku.co entire series'
+
+ _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*'
+ _TESTS = [{
+ 'url': 'https://www.duboku.co/voddetail/1575.html',
+ 'info_dict': {
+ 'id': 'startswith:1575',
+ 'title': '白色月光',
+ },
+ 'playlist_count': 12,
+ }, {
+ 'url': 'https://www.duboku.co/voddetail/1554.html',
+ 'info_dict': {
+ 'id': 'startswith:1554',
+ 'title': '以家人之名',
+ },
+ 'playlist_mincount': 30,
+ }, {
+ 'url': 'https://www.duboku.co/voddetail/1554.html#playlist2',
+ 'info_dict': {
+ 'id': '1554#playlist2',
+ 'title': '以家人之名',
+ },
+ 'playlist_mincount': 27,
+ }]
+
+ def _real_extract(self, url):
+ mobj = self._match_valid_url(url)
+ if mobj is None:
+ raise ExtractorError('Invalid URL: %s' % url)
+ series_id = mobj.group('id')
+ fragment = compat_urlparse.urlparse(url).fragment
+
+ webpage_url = 'https://www.duboku.co/voddetail/%s.html' % series_id
+ webpage_html = self._download_webpage(webpage_url, series_id)
+
+ # extract title
+
+ title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
+ title = unescapeHTML(title.group('content')) if title else None
+ if not title:
+ title = self._html_search_meta('keywords', webpage_html)
+ if not title:
+ title = _get_element_by_tag_and_attrib(webpage_html, 'title')
+ title = unescapeHTML(title.group('content')) if title else None
+
+ # extract playlists
+
+ playlists = {}
+ for div in _get_elements_by_tag_and_attrib(
+ webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
+ playlist_id = div.group('value')
+ playlist = []
+ for a in _get_elements_by_tag_and_attrib(
+ div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False):
+ playlist.append({
+ 'href': unescapeHTML(a.group('value')),
+ 'title': unescapeHTML(a.group('content'))
+ })
+ playlists[playlist_id] = playlist
+
+ # select the specified playlist if url fragment exists
+ playlist = None
+ playlist_id = None
+ if fragment:
+ playlist = playlists.get(fragment)
+ playlist_id = fragment
+ else:
+ first = next(iter(playlists.items()), None)
+ if first:
+ (playlist_id, playlist) = first
+ if not playlist:
+ raise ExtractorError(
+ 'Cannot find %s' % fragment if fragment else 'Cannot extract playlist')
+
+ # return url results
+ return self.playlist_result([
+ self.url_result(
+ compat_urlparse.urljoin('https://www.duboku.co', x['href']),
+ ie=DubokuIE.ie_key(), video_title=x.get('title'))
+ for x in playlist], series_id + '#' + playlist_id, title)
diff --git a/hypervideo_dl/extractor/dw.py b/hypervideo_dl/extractor/dw.py
index d740652..6eaee07 100644
--- a/hypervideo_dl/extractor/dw.py
+++ b/hypervideo_dl/extractor/dw.py
@@ -5,6 +5,7 @@ from .common import InfoExtractor
from ..utils import (
int_or_none,
unified_strdate,
+ url_or_none,
)
from ..compat import compat_urlparse
@@ -15,13 +16,13 @@ class DWIE(InfoExtractor):
_TESTS = [{
# video
'url': 'http://www.dw.com/en/intelligent-light/av-19112290',
- 'md5': '7372046e1815c5a534b43f3c3c36e6e9',
+ 'md5': 'fb9dfd9520811d3ece80f04befd73428',
'info_dict': {
'id': '19112290',
'ext': 'mp4',
'title': 'Intelligent light',
'description': 'md5:90e00d5881719f2a6a5827cb74985af1',
- 'upload_date': '20160311',
+ 'upload_date': '20160605',
}
}, {
# audio
@@ -55,15 +56,16 @@ class DWIE(InfoExtractor):
title = hidden_inputs['media_title']
media_id = hidden_inputs.get('media_id') or media_id
- if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1':
+ direct_url = url_or_none(hidden_inputs.get('file_name'))
+ if direct_url:
+ formats = [{'url': direct_url}]
+ else:
formats = self._extract_smil_formats(
'http://www.dw.com/smil/v-%s' % media_id, media_id,
transform_source=lambda s: s.replace(
'rtmp://tv-od.dw.de/flash/',
'http://tv-download.dw.de/dwtv_video/flv/'))
- self._sort_formats(formats)
- else:
- formats = [{'url': hidden_inputs['file_name']}]
+ self._sort_formats(formats)
upload_date = hidden_inputs.get('display_date')
if not upload_date:
diff --git a/hypervideo_dl/extractor/eagleplatform.py b/hypervideo_dl/extractor/eagleplatform.py
index 36fef07..f86731a 100644
--- a/hypervideo_dl/extractor/eagleplatform.py
+++ b/hypervideo_dl/extractor/eagleplatform.py
@@ -123,7 +123,7 @@ class EaglePlatformIE(InfoExtractor):
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id')
headers = {}
diff --git a/hypervideo_dl/extractor/egghead.py b/hypervideo_dl/extractor/egghead.py
index aff9b88..f6b50e7 100644
--- a/hypervideo_dl/extractor/egghead.py
+++ b/hypervideo_dl/extractor/egghead.py
@@ -22,16 +22,19 @@ class EggheadBaseIE(InfoExtractor):
class EggheadCourseIE(EggheadBaseIE):
IE_DESC = 'egghead.io course'
IE_NAME = 'egghead:course'
- _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)'
- _TEST = {
+ _VALID_URL = r'https://(?:app\.)?egghead\.io/(?:course|playlist)s/(?P<id>[^/?#&]+)'
+ _TESTS = [{
'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript',
'playlist_count': 29,
'info_dict': {
- 'id': '72',
+ 'id': '432655',
'title': 'Professor Frisby Introduces Composable Functional JavaScript',
'description': 're:(?s)^This course teaches the ubiquitous.*You\'ll start composing functionality before you know it.$',
},
- }
+ }, {
+ 'url': 'https://app.egghead.io/playlists/professor-frisby-introduces-composable-functional-javascript',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
playlist_id = self._match_id(url)
@@ -65,7 +68,7 @@ class EggheadCourseIE(EggheadBaseIE):
class EggheadLessonIE(EggheadBaseIE):
IE_DESC = 'egghead.io lesson'
IE_NAME = 'egghead:lesson'
- _VALID_URL = r'https://egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https://(?:app\.)?egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box',
'info_dict': {
@@ -88,6 +91,9 @@ class EggheadLessonIE(EggheadBaseIE):
}, {
'url': 'https://egghead.io/api/v1/lessons/react-add-redux-to-a-react-application',
'only_matching': True,
+ }, {
+ 'url': 'https://app.egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -107,8 +113,7 @@ class EggheadLessonIE(EggheadBaseIE):
ext = determine_ext(format_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
- format_url, lesson_id, 'mp4', entry_protocol='m3u8',
- m3u8_id='hls', fatal=False))
+ format_url, lesson_id, 'mp4', m3u8_id='hls', fatal=False))
elif ext == 'mpd':
formats.extend(self._extract_mpd_formats(
format_url, lesson_id, mpd_id='dash', fatal=False))
diff --git a/hypervideo_dl/extractor/eighttracks.py b/hypervideo_dl/extractor/eighttracks.py
index 9b1e1ce..9a44f89 100644
--- a/hypervideo_dl/extractor/eighttracks.py
+++ b/hypervideo_dl/extractor/eighttracks.py
@@ -21,9 +21,9 @@ class EightTracksIE(InfoExtractor):
'url': 'http://8tracks.com/ytdl/youtube-dl-test-tracks-a',
'info_dict': {
'id': '1336550',
- 'display_id': 'hypervideo-test-tracks-a',
+ 'display_id': 'youtube-dl-test-tracks-a',
'description': "test chars: \"'/\\ä↭",
- 'title': "hypervideo test tracks \"'/\\ä↭<>",
+ 'title': "youtube-dl test tracks \"'/\\ä↭<>",
},
'playlist': [
{
@@ -31,7 +31,7 @@ class EightTracksIE(InfoExtractor):
'info_dict': {
'id': '11885610',
'ext': 'm4a',
- 'title': "youtue-dl project<>\"' - hypervideo test track 1 \"'/\\\u00e4\u21ad",
+ 'title': "youtue-dl project<>\"' - youtube-dl test track 1 \"'/\\\u00e4\u21ad",
'uploader_id': 'ytdl'
}
},
@@ -40,7 +40,7 @@ class EightTracksIE(InfoExtractor):
'info_dict': {
'id': '11885608',
'ext': 'm4a',
- 'title': "hypervideo project - hypervideo test track 2 \"'/\\\u00e4\u21ad",
+ 'title': "youtube-dl project - youtube-dl test track 2 \"'/\\\u00e4\u21ad",
'uploader_id': 'ytdl'
}
},
@@ -49,7 +49,7 @@ class EightTracksIE(InfoExtractor):
'info_dict': {
'id': '11885679',
'ext': 'm4a',
- 'title': "hypervideo project as well - hypervideo test track 3 \"'/\\\u00e4\u21ad",
+ 'title': "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad",
'uploader_id': 'ytdl'
}
},
@@ -58,7 +58,7 @@ class EightTracksIE(InfoExtractor):
'info_dict': {
'id': '11885680',
'ext': 'm4a',
- 'title': "hypervideo project as well - hypervideo test track 4 \"'/\\\u00e4\u21ad",
+ 'title': "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad",
'uploader_id': 'ytdl'
}
},
@@ -67,7 +67,7 @@ class EightTracksIE(InfoExtractor):
'info_dict': {
'id': '11885682',
'ext': 'm4a',
- 'title': "PH - hypervideo test track 5 \"'/\\\u00e4\u21ad",
+ 'title': "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad",
'uploader_id': 'ytdl'
}
},
@@ -76,7 +76,7 @@ class EightTracksIE(InfoExtractor):
'info_dict': {
'id': '11885683',
'ext': 'm4a',
- 'title': "PH - hypervideo test track 6 \"'/\\\u00e4\u21ad",
+ 'title': "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad",
'uploader_id': 'ytdl'
}
},
@@ -85,7 +85,7 @@ class EightTracksIE(InfoExtractor):
'info_dict': {
'id': '11885684',
'ext': 'm4a',
- 'title': "phihag - hypervideo test track 7 \"'/\\\u00e4\u21ad",
+ 'title': "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad",
'uploader_id': 'ytdl'
}
},
@@ -94,7 +94,7 @@ class EightTracksIE(InfoExtractor):
'info_dict': {
'id': '11885685',
'ext': 'm4a',
- 'title': "phihag - hypervideo test track 8 \"'/\\\u00e4\u21ad",
+ 'title': "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad",
'uploader_id': 'ytdl'
}
}
diff --git a/hypervideo_dl/extractor/einthusan.py b/hypervideo_dl/extractor/einthusan.py
index 4e0f8bc..7af279a 100644
--- a/hypervideo_dl/extractor/einthusan.py
+++ b/hypervideo_dl/extractor/einthusan.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..compat import (
@@ -48,7 +47,7 @@ class EinthusanIE(InfoExtractor):
)).decode('utf-8'), video_id)
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
host = mobj.group('host')
video_id = mobj.group('id')
diff --git a/hypervideo_dl/extractor/elonet.py b/hypervideo_dl/extractor/elonet.py
new file mode 100644
index 0000000..eefba4e
--- /dev/null
+++ b/hypervideo_dl/extractor/elonet.py
@@ -0,0 +1,89 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ base_url,
+ ExtractorError,
+ try_get,
+)
+from ..compat import compat_str
+
+
+class ElonetIE(InfoExtractor):
+ _VALID_URL = r'https?://elonet\.finna\.fi/Record/kavi\.elonet_elokuva_(?P<id>[0-9]+)'
+ _TESTS = [{
+ # m3u8 with subtitles
+ 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_107867',
+ 'md5': '8efc954b96c543711707f87de757caea',
+ 'info_dict': {
+ 'id': '107867',
+ 'ext': 'mp4',
+ 'title': 'Valkoinen peura',
+ 'description': 'Valkoinen peura (1952) on Erik Blombergin ohjaama ja yhdessä Mirjami Kuosmasen kanssa käsikirjoittama tarunomainen kertomus valkoisen peuran hahmossa lii...',
+ 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_107867&index=0&size=large',
+ },
+ }, {
+ # DASH with subtitles
+ 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_116539',
+ 'info_dict': {
+ 'id': '116539',
+ 'ext': 'mp4',
+ 'title': 'Minulla on tiikeri',
+ 'description': 'Pienellä pojalla, joka asuu kerrostalossa, on kotieläimenä tiikeri. Se on kuitenkin salaisuus. Kerrostalon räpätäti on Kotilaisen täti, joka on aina vali...',
+ 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_116539&index=0&size=large&source=Solr',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_regex(
+ r'<meta .*property="og&#x3A;title" .*content="(.+?)"', webpage, 'title')
+ description = self._html_search_regex(
+ r'<meta .*property="og&#x3A;description" .*content="(.+?)"', webpage, 'description')
+ thumbnail = self._html_search_regex(
+ r'<meta .*property="og&#x3A;image" .*content="(.+?)"', webpage, 'thumbnail')
+
+ json_s = self._html_search_regex(
+ r'data-video-sources="(.+?)"', webpage, 'json')
+ src = try_get(
+ self._parse_json(json_s, video_id),
+ lambda x: x[0]["src"], compat_str)
+ formats = []
+ subtitles = {}
+ if re.search(r'\.m3u8\??', src):
+ res = self._download_webpage_handle(
+ # elonet servers have certificate problems
+ src.replace('https:', 'http:'), video_id,
+ note='Downloading m3u8 information',
+ errnote='Failed to download m3u8 information')
+ if res:
+ doc, urlh = res
+ url = urlh.geturl()
+ formats, subtitles = self._parse_m3u8_formats_and_subtitles(doc, url)
+ for f in formats:
+ f['ext'] = 'mp4'
+ elif re.search(r'\.mpd\??', src):
+ res = self._download_xml_handle(
+ src, video_id,
+ note='Downloading MPD manifest',
+ errnote='Failed to download MPD manifest')
+ if res:
+ doc, urlh = res
+ url = base_url(urlh.geturl())
+ formats, subtitles = self._parse_mpd_formats_and_subtitles(doc, mpd_base_url=url)
+ else:
+ raise ExtractorError("Unknown streaming format")
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
diff --git a/hypervideo_dl/extractor/epicon.py b/hypervideo_dl/extractor/epicon.py
new file mode 100644
index 0000000..b4e544d
--- /dev/null
+++ b/hypervideo_dl/extractor/epicon.py
@@ -0,0 +1,119 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class EpiconIE(InfoExtractor):
+ _VALID_URL = r'(?:https?://)(?:www\.)?epicon\.in/(?:documentaries|movies|tv-shows/[^/?#]+/[^/?#]+)/(?P<id>[^/?#]+)'
+ _TESTS = [{
+ 'url': 'https://www.epicon.in/documentaries/air-battle-of-srinagar',
+ 'info_dict': {
+ 'id': 'air-battle-of-srinagar',
+ 'ext': 'mp4',
+ 'title': 'Air Battle of Srinagar',
+ 'description': 'md5:c4de2013af9bc05ae4392e4115d518d7',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ }
+ }, {
+ 'url': 'https://www.epicon.in/movies/krit',
+ 'info_dict': {
+ 'id': 'krit',
+ 'ext': 'mp4',
+ 'title': 'Krit',
+ 'description': 'md5:c12b35dad915d48ccff7f013c79bab4a',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ }
+ }, {
+ 'url': 'https://www.epicon.in/tv-shows/paapnaashini-ganga/season-1/vardaan',
+ 'info_dict': {
+ 'id': 'vardaan',
+ 'ext': 'mp4',
+ 'title': 'Paapnaashini Ganga - Season 1 - Ep 1 - VARDAAN',
+ 'description': 'md5:f517058c3d0402398eefa6242f4dd6ae',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ }
+ }, {
+ 'url': 'https://www.epicon.in/movies/jayadev',
+ 'info_dict': {
+ 'id': 'jayadev',
+ 'ext': 'mp4',
+ 'title': 'Jayadev',
+ 'description': 'md5:09e349eecd8e585a3b6466904f19df6c',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ }
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ cid = self._search_regex(r'class=\"mylist-icon\ iconclick\"\ id=\"(\d+)', webpage, 'cid')
+ headers = {'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'}
+ data = f'cid={cid}&action=st&type=video'.encode()
+ data_json = self._download_json('https://www.epicon.in/ajaxplayer/', id, headers=headers, data=data)
+
+ if not data_json['success']:
+ raise ExtractorError(data_json['message'], expected=True)
+
+ title = self._search_regex(r'setplaytitle=\"([^\"]+)', webpage, 'title')
+ description = self._og_search_description(webpage) or None
+ thumbnail = self._og_search_thumbnail(webpage) or None
+ formats = self._extract_m3u8_formats(data_json['url']['video_url'], id)
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for subtitle in data_json.get('subtitles', []):
+ sub_url = subtitle.get('file')
+ if not sub_url:
+ continue
+ subtitles.setdefault(subtitle.get('lang', 'English'), []).append({
+ 'url': self._proto_relative_url(sub_url),
+ })
+
+ return {
+ 'id': id,
+ 'formats': formats,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'subtitles': subtitles,
+ }
+
+
+class EpiconSeriesIE(InfoExtractor):
+ _VALID_URL = r'(?!.*season)(?:https?://)(?:www\.)?epicon\.in/tv-shows/(?P<id>[^/?#]+)'
+ _TESTS = [{
+ 'url': 'https://www.epicon.in/tv-shows/1-of-something',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'id': '1-of-something',
+ },
+ }, {
+ 'url': 'https://www.epicon.in/tv-shows/eco-india-english',
+ 'playlist_mincount': 76,
+ 'info_dict': {
+ 'id': 'eco-india-english',
+ },
+ }, {
+ 'url': 'https://www.epicon.in/tv-shows/s/',
+ 'playlist_mincount': 25,
+ 'info_dict': {
+ 'id': 's',
+ },
+ }, {
+ 'url': 'https://www.epicon.in/tv-shows/ekaant',
+ 'playlist_mincount': 38,
+ 'info_dict': {
+ 'id': 'ekaant',
+ },
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ episodes = re.findall(r'ct-tray-url=\"(tv-shows/%s/[^\"]+)' % id, webpage)
+ entries = [self.url_result('https://www.epicon.in/%s' % episode, ie=EpiconIE.ie_key()) for episode in episodes]
+ return self.playlist_result(entries, playlist_id=id)
diff --git a/hypervideo_dl/extractor/eporner.py b/hypervideo_dl/extractor/eporner.py
index bfecd3a..25a0d97 100644
--- a/hypervideo_dl/extractor/eporner.py
+++ b/hypervideo_dl/extractor/eporner.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -51,7 +50,7 @@ class EpornerIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
diff --git a/hypervideo_dl/extractor/eroprofile.py b/hypervideo_dl/extractor/eroprofile.py
index c460dc7..a8396f1 100644
--- a/hypervideo_dl/extractor/eroprofile.py
+++ b/hypervideo_dl/extractor/eroprofile.py
@@ -90,3 +90,42 @@ class EroProfileIE(InfoExtractor):
'title': title,
'age_limit': 18,
})
+
+
+class EroProfileAlbumIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?eroprofile\.com/m/videos/album/(?P<id>[^/]+)'
+ IE_NAME = 'EroProfile:album'
+
+ _TESTS = [{
+ 'url': 'https://www.eroprofile.com/m/videos/album/BBW-2-893',
+ 'info_dict': {
+ 'id': 'BBW-2-893',
+ 'title': 'BBW 2'
+ },
+ 'playlist_mincount': 486,
+ },
+ ]
+
+ def _extract_from_page(self, page):
+ for url in re.findall(r'href=".*?(/m/videos/view/[^"]+)"', page):
+ yield self.url_result(f'https://www.eroprofile.com{url}', EroProfileIE.ie_key())
+
+ def _entries(self, playlist_id, first_page):
+ yield from self._extract_from_page(first_page)
+
+ page_urls = re.findall(rf'href=".*?(/m/videos/album/{playlist_id}\?pnum=(\d+))"', first_page)
+ max_page = max(int(n) for _, n in page_urls)
+
+ for n in range(2, max_page + 1):
+ url = f'https://www.eroprofile.com/m/videos/album/{playlist_id}?pnum={n}'
+ yield from self._extract_from_page(
+ self._download_webpage(url, playlist_id,
+ note=f'Downloading playlist page {int(n) - 1}'))
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+ first_page = self._download_webpage(url, playlist_id, note='Downloading playlist')
+ playlist_title = self._search_regex(
+ r'<title>Album: (.*) - EroProfile</title>', first_page, 'playlist_title')
+
+ return self.playlist_result(self._entries(playlist_id, first_page), playlist_id, playlist_title)
diff --git a/hypervideo_dl/extractor/espn.py b/hypervideo_dl/extractor/espn.py
index 6cf05e6..d4a66c2 100644
--- a/hypervideo_dl/extractor/espn.py
+++ b/hypervideo_dl/extractor/espn.py
@@ -154,7 +154,7 @@ class ESPNIE(OnceIE):
'tbr': int(mobj.group(3)),
})
if source_id == 'mezzanine':
- f['preference'] = 1
+ f['quality'] = 1
formats.append(f)
links = clip.get('links', {})
diff --git a/hypervideo_dl/extractor/europa.py b/hypervideo_dl/extractor/europa.py
index 2c1c747..60ab2ce 100644
--- a/hypervideo_dl/extractor/europa.py
+++ b/hypervideo_dl/extractor/europa.py
@@ -2,11 +2,11 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urlparse
from ..utils import (
int_or_none,
orderedSet,
parse_duration,
+ parse_qs,
qualities,
unified_strdate,
xpath_text
@@ -53,7 +53,7 @@ class EuropaIE(InfoExtractor):
if items.get(p):
return items[p]
- query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ query = parse_qs(url)
preferred_lang = query.get('sitelang', ('en', ))[0]
preferred_langs = orderedSet((preferred_lang, 'en', 'int'))
diff --git a/hypervideo_dl/extractor/euscreen.py b/hypervideo_dl/extractor/euscreen.py
new file mode 100644
index 0000000..3980c23
--- /dev/null
+++ b/hypervideo_dl/extractor/euscreen.py
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+from ..utils import (
+ parse_duration,
+ js_to_json,
+)
+
+
+class EUScreenIE(InfoExtractor):
+ _VALID_URL = r'(?:https?://)(?:www\.)?euscreen\.eu/item.html\?id=(?P<id>[^&?$/]+)'
+
+ _TESTS = [{
+ 'url': 'https://euscreen.eu/item.html?id=EUS_0EBCBF356BFC4E12A014023BA41BD98C',
+ 'info_dict': {
+ 'id': 'EUS_0EBCBF356BFC4E12A014023BA41BD98C',
+ 'ext': 'mp4',
+ 'title': "L'effondrement du stade du Heysel",
+ 'alt_title': 'Collapse of the Heysel Stadium',
+ 'duration': 318.0,
+ 'description': 'md5:f0ffffdfce6821139357a1b8359d6152',
+ 'series': 'JA2 DERNIERE',
+ 'episode': '-',
+ 'uploader': 'INA / France',
+ 'thumbnail': 'http://images3.noterik.com/domain/euscreenxl/user/eu_ina/video/EUS_0EBCBF356BFC4E12A014023BA41BD98C/image.jpg'
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ _payload = b'<fsxml><screen><properties><screenId>-1</screenId></properties><capabilities id="1"><properties><platform>Win32</platform><appcodename>Mozilla</appcodename><appname>Netscape</appname><appversion>5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36</appversion><useragent>Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36</useragent><cookiesenabled>true</cookiesenabled><screenwidth>784</screenwidth><screenheight>758</screenheight><orientation>undefined</orientation><smt_browserid>Sat, 07 Oct 2021 08:56:50 GMT</smt_browserid><smt_sessionid>1633769810758</smt_sessionid></properties></capabilities></screen></fsxml>'
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ args_for_js_request = self._download_webpage(
+ 'https://euscreen.eu/lou/LouServlet/domain/euscreenxl/html5application/euscreenxlitem',
+ id, data=self._payload, query={'actionlist': 'itempage', 'id': id})
+ info_js = self._download_webpage(
+ 'https://euscreen.eu/lou/LouServlet/domain/euscreenxl/html5application/euscreenxlitem',
+ id, data=args_for_js_request.replace('screenid', 'screenId').encode())
+ video_json = self._parse_json(
+ self._search_regex(r'setVideo\(({.+})\)\(\$end\$\)put', info_js, 'Video JSON'),
+ id, transform_source=js_to_json)
+ meta_json = self._parse_json(
+ self._search_regex(r'setData\(({.+})\)\(\$end\$\)', info_js, 'Metadata JSON'),
+ id, transform_source=js_to_json)
+ formats = [{
+ 'url': source['src'],
+ } for source in video_json.get('sources', [])]
+ self._sort_formats(formats)
+
+ return {
+ 'id': id,
+ 'title': meta_json.get('originalTitle'),
+ 'alt_title': meta_json.get('title'),
+ 'duration': parse_duration(meta_json.get('duration')),
+ 'description': '%s\n%s' % (meta_json.get('summaryOriginal', ''), meta_json.get('summaryEnglish', '')),
+ 'series': meta_json.get('series') or meta_json.get('seriesEnglish'),
+ 'episode': meta_json.get('episodeNumber'),
+ 'uploader': meta_json.get('provider'),
+ 'thumbnail': meta_json.get('screenshot') or video_json.get('screenshot'),
+ 'formats': formats,
+ }
diff --git a/hypervideo_dl/extractor/everyonesmixtape.py b/hypervideo_dl/extractor/everyonesmixtape.py
new file mode 100644
index 0000000..80cb032
--- /dev/null
+++ b/hypervideo_dl/extractor/everyonesmixtape.py
@@ -0,0 +1,76 @@
+from __future__ import unicode_literals
+
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ sanitized_Request,
+)
+
+
+class EveryonesMixtapeIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?everyonesmixtape\.com/#/mix/(?P<id>[0-9a-zA-Z]+)(?:/(?P<songnr>[0-9]))?$'
+
+ _TESTS = [{
+ 'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi/5',
+ 'info_dict': {
+ 'id': '5bfseWNmlds',
+ 'ext': 'mp4',
+ 'title': "Passion Pit - \"Sleepyhead\" (Official Music Video)",
+ 'uploader': 'FKR.TV',
+ 'uploader_id': 'frenchkissrecords',
+ 'description': "Music video for \"Sleepyhead\" from Passion Pit's debut EP Chunk Of Change.\nBuy on iTunes: https://itunes.apple.com/us/album/chunk-of-change-ep/id300087641\n\nDirected by The Wilderness.\n\nhttp://www.passionpitmusic.com\nhttp://www.frenchkissrecords.com",
+ 'upload_date': '20081015'
+ },
+ 'params': {
+ 'skip_download': True, # This is simply YouTube
+ }
+ }, {
+ 'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi',
+ 'info_dict': {
+ 'id': 'm7m0jJAbMQi',
+ 'title': 'Driving',
+ },
+ 'playlist_count': 24
+ }]
+
+ def _real_extract(self, url):
+ mobj = self._match_valid_url(url)
+ playlist_id = mobj.group('id')
+
+ pllist_url = 'http://everyonesmixtape.com/mixtape.php?a=getMixes&u=-1&linked=%s&explore=' % playlist_id
+ pllist_req = sanitized_Request(pllist_url)
+ pllist_req.add_header('X-Requested-With', 'XMLHttpRequest')
+
+ playlist_list = self._download_json(
+ pllist_req, playlist_id, note='Downloading playlist metadata')
+ try:
+ playlist_no = next(playlist['id']
+ for playlist in playlist_list
+ if playlist['code'] == playlist_id)
+ except StopIteration:
+ raise ExtractorError('Playlist id not found')
+
+ pl_url = 'http://everyonesmixtape.com/mixtape.php?a=getMix&id=%s&userId=null&code=' % playlist_no
+ pl_req = sanitized_Request(pl_url)
+ pl_req.add_header('X-Requested-With', 'XMLHttpRequest')
+ playlist = self._download_json(
+ pl_req, playlist_id, note='Downloading playlist info')
+
+ entries = [{
+ '_type': 'url',
+ 'url': t['url'],
+ 'title': t['title'],
+ } for t in playlist['tracks']]
+
+ if mobj.group('songnr'):
+ songnr = int(mobj.group('songnr')) - 1
+ return entries[songnr]
+
+ playlist_title = playlist['mixData']['name']
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'title': playlist_title,
+ 'entries': entries,
+ }
diff --git a/hypervideo_dl/extractor/extractors.py b/hypervideo_dl/extractor/extractors.py
index 402e542..f4f817f 100644
--- a/hypervideo_dl/extractor/extractors.py
+++ b/hypervideo_dl/extractor/extractors.py
@@ -41,7 +41,15 @@ from .airmozilla import AirMozillaIE
from .aljazeera import AlJazeeraIE
from .alphaporno import AlphaPornoIE
from .amara import AmaraIE
+from .alura import (
+ AluraIE,
+ AluraCourseIE
+)
from .amcnetworks import AMCNetworksIE
+from .animelab import (
+ AnimeLabIE,
+ AnimeLabShowsIE,
+)
from .americastestkitchen import (
AmericasTestKitchenIE,
AmericasTestKitchenSeasonIE,
@@ -59,7 +67,10 @@ from .appletrailers import (
AppleTrailersSectionIE,
)
from .applepodcasts import ApplePodcastsIE
-from .archiveorg import ArchiveOrgIE
+from .archiveorg import (
+ ArchiveOrgIE,
+ YoutubeWebArchiveIE,
+)
from .arcpublishing import ArcPublishingIE
from .arkena import ArkenaIE
from .ard import (
@@ -83,6 +94,12 @@ from .atvat import ATVAtIE
from .audimedia import AudiMediaIE
from .audioboom import AudioBoomIE
from .audiomack import AudiomackIE, AudiomackAlbumIE
+from .audius import (
+ AudiusIE,
+ AudiusTrackIE,
+ AudiusPlaylistIE,
+ AudiusProfileIE,
+)
from .awaan import (
AWAANIE,
AWAANVideoIE,
@@ -92,7 +109,13 @@ from .awaan import (
from .azmedien import AZMedienIE
from .baidu import BaiduVideoIE
from .bandaichannel import BandaiChannelIE
-from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE
+from .bandcamp import (
+ BandcampIE,
+ BandcampAlbumIE,
+ BandcampWeeklyIE,
+ BandcampMusicIE,
+)
+from .bannedvideo import BannedVideoIE
from .bbc import (
BBCCoUkIE,
BBCCoUkArticleIE,
@@ -117,17 +140,27 @@ from .bigflix import BigflixIE
from .bild import BildIE
from .bilibili import (
BiliBiliIE,
+ BiliBiliSearchIE,
+ BilibiliCategoryIE,
BiliBiliBangumiIE,
BilibiliAudioIE,
BilibiliAudioAlbumIE,
BiliBiliPlayerIE,
+ BilibiliChannelIE,
+ BiliIntlIE,
+ BiliIntlSeriesIE,
)
from .biobiochiletv import BioBioChileTVIE
from .bitchute import (
BitChuteIE,
BitChuteChannelIE,
)
+from .bitwave import (
+ BitwaveReplayIE,
+ BitwaveStreamIE,
+)
from .biqle import BIQLEIE
+from .blackboardcollaborate import BlackboardCollaborateIE
from .bleacherreport import (
BleacherReportIE,
BleacherReportCMSIE,
@@ -152,12 +185,12 @@ from .businessinsider import BusinessInsiderIE
from .buzzfeed import BuzzFeedIE
from .byutv import BYUtvIE
from .c56 import C56IE
+from .cam4 import CAM4IE
from .camdemy import (
CamdemyIE,
CamdemyFolderIE
)
from .cammodels import CamModelsIE
-from .camtube import CamTubeIE
from .camwithher import CamWithHerIE
from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
@@ -175,9 +208,9 @@ from .cartoonnetwork import CartoonNetworkIE
from .cbc import (
CBCIE,
CBCPlayerIE,
- CBCWatchVideoIE,
- CBCWatchIE,
- CBCOlympicsIE,
+ CBCGemIE,
+ CBCGemPlaylistIE,
+ CBCGemLiveIE,
)
from .cbs import CBSIE
from .cbslocal import (
@@ -206,10 +239,15 @@ from .ceskatelevize import (
CeskaTelevizeIE,
CeskaTelevizePoradyIE,
)
+from .cgtn import CGTNIE
from .channel9 import Channel9IE
from .charlierose import CharlieRoseIE
from .chaturbate import ChaturbateIE
from .chilloutzone import ChilloutzoneIE
+from .chingari import (
+ ChingariIE,
+ ChingariUserIE,
+)
from .chirbit import (
ChirbitIE,
ChirbitProfileIE,
@@ -220,6 +258,7 @@ from .ciscolive import (
CiscoLiveSessionIE,
CiscoLiveSearchIE,
)
+from .ciscowebex import CiscoWebexIE
from .cjsw import CJSWIE
from .cliphunter import CliphunterIE
from .clippit import ClippitIE
@@ -249,6 +288,7 @@ from .commonmistakes import CommonMistakesIE, UnicodeBOMIE
from .commonprotocols import (
MmsIE,
RtmpIE,
+ ViewSourceIE,
)
from .condenast import CondeNastIE
from .contv import CONtvIE
@@ -258,7 +298,9 @@ from .crackle import CrackleIE
from .crooksandliars import CrooksAndLiarsIE
from .crunchyroll import (
CrunchyrollIE,
- CrunchyrollShowPlaylistIE
+ CrunchyrollShowPlaylistIE,
+ CrunchyrollBetaIE,
+ CrunchyrollBetaShowIE,
)
from .cspan import CSpanIE
from .ctsnews import CtsNewsIE
@@ -276,6 +318,10 @@ from .dailymotion import (
DailymotionPlaylistIE,
DailymotionUserIE,
)
+from .damtomo import (
+ DamtomoRecordIE,
+ DamtomoVideoIE,
+)
from .daum import (
DaumIE,
DaumClipIE,
@@ -284,11 +330,18 @@ from .daum import (
)
from .dbtv import DBTVIE
from .dctp import DctpTvIE
-from .deezer import DeezerPlaylistIE
+from .deezer import (
+ DeezerPlaylistIE,
+ DeezerAlbumIE,
+)
from .democracynow import DemocracynowIE
from .dfb import DFBIE
from .dhm import DHMIE
from .digg import DiggIE
+from .discoveryplusindia import (
+ DiscoveryPlusIndiaIE,
+ DiscoveryPlusIndiaShowIE,
+)
from .dotsub import DotsubIE
from .douyutv import (
DouyuShowIE,
@@ -298,6 +351,9 @@ from .dplay import (
DPlayIE,
DiscoveryPlusIE,
HGTVDeIE,
+ ScienceChannelIE,
+ DIYNetworkIE,
+ AnimalPlanetIE
)
from .dreisat import DreiSatIE
from .drbonanza import DRBonanzaIE
@@ -308,6 +364,10 @@ from .drtv import (
)
from .dtube import DTubeIE
from .dvtv import DVTVIE
+from .duboku import (
+ DubokuIE,
+ DubokuPlaylistIE
+)
from .dumpert import DumpertIE
from .defense import DefenseGouvFrIE
from .discovery import DiscoveryIE
@@ -319,6 +379,7 @@ from .discoverynetworks import DiscoveryNetworksDeIE
from .discoveryvr import DiscoveryVRIE
from .disney import DisneyIE
from .dispeak import DigitallySpeakingIE
+from .doodstream import DoodStreamIE
from .dropbox import DropboxIE
from .dw import (
DWIE,
@@ -340,11 +401,19 @@ from .ellentube import (
EllenTubeVideoIE,
EllenTubePlaylistIE,
)
+from .elonet import ElonetIE
from .elpais import ElPaisIE
from .embedly import EmbedlyIE
from .engadget import EngadgetIE
+from .epicon import (
+ EpiconIE,
+ EpiconSeriesIE,
+)
from .eporner import EpornerIE
-from .eroprofile import EroProfileIE
+from .eroprofile import (
+ EroProfileIE,
+ EroProfileAlbumIE,
+)
from .escapist import EscapistIE
from .espn import (
ESPNIE,
@@ -353,6 +422,7 @@ from .espn import (
)
from .esri import EsriVideoIE
from .europa import EuropaIE
+from .euscreen import EUScreenIE
from .expotv import ExpoTVIE
from .expressen import ExpressenIE
from .extremetube import ExtremeTubeIE
@@ -361,12 +431,18 @@ from .facebook import (
FacebookIE,
FacebookPluginsVideoIE,
)
+from .fancode import (
+ FancodeVodIE,
+ FancodeLiveIE
+)
+
from .faz import FazIE
from .fc2 import (
FC2IE,
FC2EmbedIE,
)
from .fczenit import FczenitIE
+from .filmmodu import FilmmoduIE
from .filmon import (
FilmOnIE,
FilmOnChannelIE,
@@ -401,12 +477,7 @@ from .franceinter import FranceInterIE
from .francetv import (
FranceTVIE,
FranceTVSiteIE,
- FranceTVEmbedIE,
FranceTVInfoIE,
- FranceTVInfoSportIE,
- FranceTVJeunesseIE,
- GenerationWhatIE,
- CultureboxIE,
)
from .freesound import FreesoundIE
from .freespeech import FreespeechIE
@@ -417,9 +488,14 @@ from .frontendmasters import (
FrontendMastersCourseIE
)
from .fujitv import FujiTVFODPlus7IE
-from .funimation import FunimationIE
+from .funimation import (
+ FunimationIE,
+ FunimationPageIE,
+ FunimationShowIE,
+)
from .funk import FunkIE
from .fusion import FusionIE
+from .gab import GabTVIE
from .gaia import GaiaIE
from .gameinformer import GameInformerIE
from .gamespot import GameSpotIE
@@ -429,6 +505,7 @@ from .gazeta import GazetaIE
from .gdcvault import GDCVaultIE
from .gedidigital import GediDigitalIE
from .generic import GenericIE
+from .gettr import GettrIE
from .gfycat import GfycatIE
from .giantbomb import GiantBombIE
from .giga import GigaIE
@@ -446,8 +523,11 @@ from .googlepodcasts import (
GooglePodcastsFeedIE,
)
from .googlesearch import GoogleSearchIE
+from .gopro import GoProIE
from .goshgay import GoshgayIE
+from .gotostage import GoToStageIE
from .gputechconf import GPUTechConfIE
+from .gronkh import GronkhIE
from .groupon import GrouponIE
from .hbo import HBOIE
from .hearthisat import HearThisAtIE
@@ -466,9 +546,11 @@ from .hotnewhiphop import HotNewHipHopIE
from .hotstar import (
HotStarIE,
HotStarPlaylistIE,
+ HotStarSeriesIE,
)
from .howcast import HowcastIE
from .howstuffworks import HowStuffWorksIE
+from .hrfensehen import HRFernsehenIE
from .hrti import (
HRTiIE,
HRTiPlaylistIE,
@@ -478,8 +560,13 @@ from .huffpost import HuffPostIE
from .hungama import (
HungamaIE,
HungamaSongIE,
+ HungamaAlbumPlaylistIE,
)
from .hypem import HypemIE
+from .ichinanalive import (
+ IchinanaLiveIE,
+ IchinanaLiveClipIE,
+)
from .ign import (
IGNIE,
IGNVideoIE,
@@ -546,6 +633,7 @@ from .kickstarter import KickStarterIE
from .kinja import KinjaEmbedIE
from .kinopoisk import KinoPoiskIE
from .konserthusetplay import KonserthusetPlayIE
+from .koo import KooIE
from .krasview import KrasViewIE
from .ku6 import Ku6IE
from .kusi import KUSIIE
@@ -557,7 +645,11 @@ from .kuwo import (
KuwoCategoryIE,
KuwoMvIE,
)
-from .la7 import LA7IE
+from .la7 import (
+ LA7IE,
+ LA7PodcastEpisodeIE,
+ LA7PodcastIE,
+)
from .laola1tv import (
Laola1TvEmbedIE,
Laola1TvIE,
@@ -610,10 +702,6 @@ from .linkedin import (
from .linuxacademy import LinuxAcademyIE
from .litv import LiTVIE
from .livejournal import LiveJournalIE
-from .liveleak import (
- LiveLeakIE,
- LiveLeakEmbedIE,
-)
from .livestream import (
LivestreamIE,
LivestreamOriginalIE,
@@ -628,6 +716,7 @@ from .lynda import (
LyndaCourseIE
)
from .m6 import M6IE
+from .magentamusik360 import MagentaMusik360IE
from .mailru import (
MailRuIE,
MailRuMusicIE,
@@ -638,6 +727,11 @@ from .mangomolo import (
MangomoloVideoIE,
MangomoloLiveIE,
)
+from .manoto import (
+ ManotoTVIE,
+ ManotoTVShowIE,
+ ManotoTVLiveIE,
+)
from .manyvids import ManyVidsIE
from .maoritv import MaoriTVIE
from .markiza import (
@@ -648,6 +742,8 @@ from .massengeschmacktv import MassengeschmackTVIE
from .matchtv import MatchTVIE
from .mdr import MDRIE
from .medaltv import MedalTVIE
+from .mediaite import MediaiteIE
+from .mediaklikk import MediaKlikkIE
from .mediaset import MediasetIE
from .mediasite import (
MediasiteIE,
@@ -668,6 +764,11 @@ from .microsoftvirtualacademy import (
MicrosoftVirtualAcademyIE,
MicrosoftVirtualAcademyCourseIE,
)
+from .mildom import (
+ MildomIE,
+ MildomVodIE,
+ MildomUserVodIE,
+)
from .minds import (
MindsIE,
MindsChannelIE,
@@ -676,6 +777,10 @@ from .minds import (
from .ministrygrid import MinistryGridIE
from .minoto import MinotoIE
from .miomio import MioMioIE
+from .mirrativ import (
+ MirrativIE,
+ MirrativUserIE,
+)
from .mit import TechTVMITIE, OCWMITIE
from .mitele import MiTeleIE
from .mixcloud import (
@@ -710,9 +815,16 @@ from .mtv import (
MTVServicesEmbeddedIE,
MTVDEIE,
MTVJapanIE,
+ MTVItaliaIE,
+ MTVItaliaProgrammaIE,
)
from .muenchentv import MuenchenTVIE
+from .musescore import MuseScoreIE
from .mwave import MwaveIE, MwaveMeetGreetIE
+from .mxplayer import (
+ MxplayerIE,
+ MxplayerShowIE,
+)
from .mychannels import MyChannelsIE
from .myspace import MySpaceIE, MySpaceAlbumIE
from .myspass import MySpassIE
@@ -720,12 +832,17 @@ from .myvi import (
MyviIE,
MyviEmbedIE,
)
+from .myvideoge import MyVideoGeIE
from .myvidster import MyVidsterIE
+from .n1 import N1InfoIIE, N1InfoAssetIE
from .nationalgeographic import (
NationalGeographicVideoIE,
NationalGeographicTVIE,
)
-from .naver import NaverIE
+from .naver import (
+ NaverIE,
+ NaverLiveIE,
+)
from .nba import (
NBAWatchEmbedIE,
NBAWatchIE,
@@ -751,8 +868,9 @@ from .ndr import (
NJoyEmbedIE,
)
from .ndtv import NDTVIE
-from .netzkino import NetzkinoIE
+from .nebula import NebulaIE
from .nerdcubed import NerdCubedFeedIE
+from .netzkino import NetzkinoIE
from .neteasemusic import (
NetEaseMusicIE,
NetEaseMusicAlbumIE,
@@ -765,6 +883,7 @@ from .neteasemusic import (
from .newgrounds import (
NewgroundsIE,
NewgroundsPlaylistIE,
+ NewgroundsUserIE,
)
from .newstube import NewstubeIE
from .nextmedia import (
@@ -777,6 +896,7 @@ from .nexx import (
NexxIE,
NexxEmbedIE,
)
+from .nfhsnetwork import NFHSNetworkIE
from .nfl import (
NFLIE,
NFLArticleIE,
@@ -793,11 +913,20 @@ from .nick import (
NickNightIE,
NickRuIE,
)
-from .niconico import NiconicoIE, NiconicoPlaylistIE
+
+from .niconico import (
+ NiconicoIE,
+ NiconicoPlaylistIE,
+ NiconicoUserIE,
+ NicovideoSearchDateIE,
+ NicovideoSearchIE,
+ NicovideoSearchURLIE,
+)
from .ninecninemedia import NineCNineMediaIE
from .ninegag import NineGagIE
from .ninenow import NineNowIE
from .nintendo import NintendoIE
+from .nitter import NitterIE
from .njpwworld import NJPWWorldIE
from .nobelprize import NobelPrizeIE
from .nonktube import NonkTubeIE
@@ -808,6 +937,7 @@ from .nova import (
NovaEmbedIE,
NovaIE,
)
+from .novaplay import NovaPlayIE
from .nowness import (
NownessIE,
NownessPlaylistIE,
@@ -848,10 +978,13 @@ from .nytimes import (
NYTimesCookingIE,
)
from .nuvid import NuvidIE
+from .nzherald import NZHeraldIE
from .nzz import NZZIE
from .odatv import OdaTVIE
from .odnoklassniki import OdnoklassnikiIE
from .oktoberfesttv import OktoberfestTVIE
+from .olympics import OlympicsReplayIE
+from .on24 import On24IE
from .ondemandkorea import OnDemandKoreaIE
from .onet import (
OnetIE,
@@ -864,6 +997,10 @@ from .ooyala import (
OoyalaIE,
OoyalaExternalIE,
)
+from .openrec import (
+ OpenRecIE,
+ OpenRecCaptureIE,
+)
from .ora import OraTVIE
from .orf import (
ORFTVthekIE,
@@ -893,11 +1030,26 @@ from .palcomp3 import (
PalcoMP3VideoIE,
)
from .pandoratv import PandoraTVIE
+from .paramountplus import (
+ ParamountPlusIE,
+ ParamountPlusSeriesIE,
+)
from .parliamentliveuk import ParliamentLiveUKIE
-from .patreon import PatreonIE
+from .parlview import ParlviewIE
+from .patreon import (
+ PatreonIE,
+ PatreonUserIE
+)
from .pbs import PBSIE
from .pearvideo import PearVideoIE
-from .peertube import PeerTubeIE
+from .peertube import (
+ PeerTubeIE,
+ PeerTubePlaylistIE,
+)
+from .peloton import (
+ PelotonIE,
+ PelotonLiveIE
+)
from .people import PeopleIE
from .performgroup import PerformGroupIE
from .periscope import (
@@ -929,12 +1081,16 @@ from .playstuff import PlayStuffIE
from .playtvak import PlaytvakIE
from .playvid import PlayvidIE
from .playwire import PlaywireIE
+from .plutotv import PlutoTVIE
from .pluralsight import (
PluralsightIE,
PluralsightCourseIE,
)
from .podomatic import PodomaticIE
-from .pokemon import PokemonIE
+from .pokemon import (
+ PokemonIE,
+ PokemonWatchIE,
+)
from .polskieradio import (
PolskieRadioIE,
PolskieRadioCategoryIE,
@@ -943,10 +1099,12 @@ from .popcorntimes import PopcorntimesIE
from .popcorntv import PopcornTVIE
from .porn91 import Porn91IE
from .porncom import PornComIE
+from .pornflip import PornFlipIE
from .pornhd import PornHdIE
from .pornhub import (
PornHubIE,
PornHubUserIE,
+ PornHubPlaylistIE,
PornHubPagedVideoListIE,
PornHubUserVideosUploadIE,
)
@@ -958,6 +1116,7 @@ from .puhutv import (
PuhuTVSerieIE,
)
from .presstv import PressTVIE
+from .projectveritas import ProjectVeritasIE
from .prosiebensat1 import ProSiebenSat1IE
from .puls4 import Puls4IE
from .pyvideo import PyvideoIE
@@ -972,6 +1131,7 @@ from .r7 import (
R7IE,
R7ArticleIE,
)
+from .radiko import RadikoIE, RadikoRadioIE
from .radiocanada import (
RadioCanadaIE,
RadioCanadaAudioVideoIE,
@@ -980,6 +1140,11 @@ from .radiode import RadioDeIE
from .radiojavan import RadioJavanIE
from .radiobremen import RadioBremenIE
from .radiofrance import RadioFranceIE
+from .radlive import (
+ RadLiveIE,
+ RadLiveChannelIE,
+ RadLiveSeasonIE,
+)
from .rai import (
RaiPlayIE,
RaiPlayLiveIE,
@@ -991,6 +1156,16 @@ from .raywenderlich import (
RayWenderlichCourseIE,
)
from .rbmaradio import RBMARadioIE
+from .rcs import (
+ RCSIE,
+ RCSEmbedsIE,
+ RCSVariousIE,
+)
+from .rcti import (
+ RCTIPlusIE,
+ RCTIPlusSeriesIE,
+ RCTIPlusTVIE,
+)
from .rds import RDSIE
from .redbulltv import (
RedBullTVIE,
@@ -1033,7 +1208,10 @@ from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETe
from .rtvnh import RTVNHIE
from .rtvs import RTVSIE
from .ruhd import RUHDIE
-from .rumble import RumbleEmbedIE
+from .rumble import (
+ RumbleEmbedIE,
+ RumbleChannelIE,
+)
from .rutube import (
RutubeIE,
RutubeChannelIE,
@@ -1050,6 +1228,7 @@ from .safari import (
SafariApiIE,
SafariCourseIE,
)
+from .saitosan import SaitosanIE
from .samplefocus import SampleFocusIE
from .sapo import SapoIE
from .savefrom import SaveFromIE
@@ -1082,6 +1261,7 @@ from .shared import (
SharedIE,
VivoIE,
)
+from .shemaroome import ShemarooMeIE
from .showroomlive import ShowRoomLiveIE
from .simplecast import (
SimplecastIE,
@@ -1105,6 +1285,7 @@ from .skynewsarabia import (
SkyNewsArabiaIE,
SkyNewsArabiaArticleIE,
)
+from .skynewsau import SkyNewsAUIE
from .sky import (
SkyNewsIE,
SkySportsIE,
@@ -1115,7 +1296,10 @@ from .slideslive import SlidesLiveIE
from .slutload import SlutloadIE
from .snotr import SnotrIE
from .sohu import SohuIE
-from .sonyliv import SonyLIVIE
+from .sonyliv import (
+ SonyLIVIE,
+ SonyLIVSeriesIE,
+)
from .soundcloud import (
SoundcloudEmbedIE,
SoundcloudIE,
@@ -1136,6 +1320,10 @@ from .southpark import (
SouthParkEsIE,
SouthParkNlIE
)
+from .sovietscloset import (
+ SovietsClosetIE,
+ SovietsClosetPlaylistIE
+)
from .spankbang import (
SpankBangIE,
SpankBangPlaylistIE,
@@ -1171,6 +1359,7 @@ from .srgssr import (
)
from .srmediathek import SRMediathekIE
from .stanfordoc import StanfordOpenClassroomIE
+from .startv import StarTVIE
from .steam import SteamIE
from .storyfire import (
StoryFireIE,
@@ -1178,6 +1367,7 @@ from .storyfire import (
StoryFireSeriesIE,
)
from .streamable import StreamableIE
+from .streamanity import StreamanityIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
from .streetvoice import StreetVoiceIE
@@ -1223,6 +1413,7 @@ from .telebruxelles import TeleBruxellesIE
from .telecinco import TelecincoIE
from .telegraaf import TelegraafIE
from .telemb import TeleMBIE
+from .telemundo import TelemundoIE
from .telequebec import (
TeleQuebecIE,
TeleQuebecSquatIE,
@@ -1245,6 +1436,10 @@ from .theplatform import (
from .thescene import TheSceneIE
from .thestar import TheStarIE
from .thesun import TheSunIE
+from .theta import (
+ ThetaVideoIE,
+ ThetaStreamIE,
+)
from .theweatherchannel import TheWeatherChannelIE
from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE
@@ -1253,12 +1448,10 @@ from .threeqsdn import ThreeQSDNIE
from .tiktok import (
TikTokIE,
TikTokUserIE,
+ DouyinIE,
)
from .tinypic import TinyPicIE
-from .tmz import (
- TMZIE,
- TMZArticleIE,
-)
+from .tmz import TMZIE
from .tnaflix import (
TNAFlixNetworkEmbedIE,
TNAFlixIE,
@@ -1269,6 +1462,10 @@ from .toggle import (
ToggleIE,
MeWatchIE,
)
+from .tokentube import (
+ TokentubeIE,
+ TokentubeChannelIE
+)
from .tonline import TOnlineIE
from .toongoggles import ToonGogglesIE
from .toutv import TouTvIE
@@ -1278,11 +1475,16 @@ from .trilulilu import TriluliluIE
from .trovo import (
TrovoIE,
TrovoVodIE,
+ TrovoChannelVodIE,
+ TrovoChannelClipIE,
)
from .trunews import TruNewsIE
from .trutv import TruTVIE
from .tube8 import Tube8IE
-from .tubitv import TubiTvIE
+from .tubitv import (
+ TubiTvIE,
+ TubiTvShowIE,
+)
from .tumblr import TumblrIE
from .tunein import (
TuneInClipIE,
@@ -1303,7 +1505,10 @@ from .tv2dk import (
TV2DKIE,
TV2DKBornholmPlayIE,
)
-from .tv2hu import TV2HuIE
+from .tv2hu import (
+ TV2HuIE,
+ TV2HuSeriesIE,
+)
from .tv4 import TV4IE
from .tv5mondeplus import TV5MondePlusIE
from .tv5unis import (
@@ -1330,6 +1535,7 @@ from .tvnet import TVNetIE
from .tvnoe import TVNoeIE
from .tvnow import (
TVNowIE,
+ TVNowFilmIE,
TVNowNewIE,
TVNowSeasonIE,
TVNowAnnualIE,
@@ -1350,7 +1556,11 @@ from .tweakers import TweakersIE
from .twentyfourvideo import TwentyFourVideoIE
from .twentymin import TwentyMinutenIE
from .twentythreevideo import TwentyThreeVideoIE
-from .twitcasting import TwitCastingIE
+from .twitcasting import (
+ TwitCastingIE,
+ TwitCastingLiveIE,
+ TwitCastingUserIE,
+)
from .twitch import (
TwitchVodIE,
TwitchCollectionIE,
@@ -1365,6 +1575,7 @@ from .twitter import (
TwitterIE,
TwitterAmplifyIE,
TwitterBroadcastIE,
+ TwitterShortenerIE,
)
from .udemy import (
UdemyIE,
@@ -1375,6 +1586,7 @@ from .ufctv import (
UFCTVIE,
UFCArabiaIE,
)
+from .ukcolumn import UkColumnIE
from .uktvplay import UKTVPlayIE
from .digiteka import DigitekaIE
from .dlive import (
@@ -1398,9 +1610,11 @@ from .ustudio import (
UstudioIE,
UstudioEmbedIE,
)
+from .utreon import UtreonIE
from .varzesh3 import Varzesh3IE
from .vbox7 import Vbox7IE
from .veehd import VeeHDIE
+from .veo import VeoIE
from .veoh import VeohIE
from .vesti import VestiIE
from .vevo import (
@@ -1429,13 +1643,12 @@ from .videomore import (
VideomoreSeasonIE,
)
from .videopress import VideoPressIE
-from .vidio import VidioIE
-from .vidlii import VidLiiIE
-from .vidme import (
- VidmeIE,
- VidmeUserIE,
- VidmeUserLikesIE,
+from .vidio import (
+ VidioIE,
+ VidioPremierIE,
+ VidioLiveIE
)
+from .vidlii import VidLiiIE
from .vier import VierIE, VierVideosIE
from .viewlift import (
ViewLiftIE,
@@ -1483,7 +1696,14 @@ from .vodlocker import VodlockerIE
from .vodpl import VODPlIE
from .vodplatform import VODPlatformIE
from .voicerepublic import VoiceRepublicIE
-from .voot import VootIE
+from .voicy import (
+ VoicyIE,
+ VoicyChannelIE,
+)
+from .voot import (
+ VootIE,
+ VootSeriesIE,
+)
from .voxmedia import (
VoxMediaVolumeIE,
VoxMediaIE,
@@ -1499,6 +1719,7 @@ from .vtm import VTMIE
from .medialaan import MedialaanIE
from .vube import VubeIE
from .vuclip import VuClipIE
+from .vupload import VuploadIE
from .vvvvid import (
VVVVIDIE,
VVVVIDShowIE,
@@ -1533,6 +1754,8 @@ from .weibo import (
WeiboMobileIE
)
from .weiqitv import WeiqiTVIE
+from .wimtv import WimTVIE
+from .whowatch import WhoWatchIE
from .wistia import (
WistiaIE,
WistiaPlaylistIE,
@@ -1583,7 +1806,11 @@ from .yandexmusic import (
YandexMusicArtistTracksIE,
YandexMusicArtistAlbumsIE,
)
-from .yandexvideo import YandexVideoIE
+from .yandexvideo import (
+ YandexVideoIE,
+ ZenYandexIE,
+ ZenYandexChannelIE,
+)
from .yapfiles import YapFilesIE
from .yesjapan import YesJapanIE
from .yinyuetai import YinYueTaiIE
@@ -1603,6 +1830,7 @@ from .yourporn import YourPornIE
from .yourupload import YourUploadIE
from .youtube import (
YoutubeIE,
+ YoutubeClipIE,
YoutubeFavouritesIE,
YoutubeHistoryIE,
YoutubeTabIE,
@@ -1610,7 +1838,7 @@ from .youtube import (
YoutubeRecommendedIE,
YoutubeSearchDateIE,
YoutubeSearchIE,
- #YoutubeSearchURLIE,
+ YoutubeSearchURLIE,
YoutubeSubscriptionsIE,
YoutubeTruncatedIDIE,
YoutubeTruncatedURLIE,
@@ -1639,6 +1867,10 @@ from .zattoo import (
ZattooLiveIE,
)
from .zdf import ZDFIE, ZDFChannelIE
+from .zee5 import (
+ Zee5IE,
+ Zee5SeriesIE,
+)
from .zhihu import ZhihuIE
from .zingmp3 import (
ZingMp3IE,
diff --git a/hypervideo_dl/extractor/facebook.py b/hypervideo_dl/extractor/facebook.py
index 04650af..f32700f 100644
--- a/hypervideo_dl/extractor/facebook.py
+++ b/hypervideo_dl/extractor/facebook.py
@@ -3,14 +3,11 @@ from __future__ import unicode_literals
import json
import re
-import socket
from .common import InfoExtractor
from ..compat import (
compat_etree_fromstring,
- compat_http_client,
compat_str,
- compat_urllib_error,
compat_urllib_parse_unquote,
compat_urllib_parse_unquote_plus,
)
@@ -23,6 +20,8 @@ from ..utils import (
int_or_none,
js_to_json,
limit_length,
+ merge_dicts,
+ network_exceptions,
parse_count,
qualities,
sanitized_Request,
@@ -36,7 +35,7 @@ class FacebookIE(InfoExtractor):
_VALID_URL = r'''(?x)
(?:
https?://
- (?:[\w-]+\.)?(?:facebook\.com|facebookcorewwwi\.onion)/
+ (?:[\w-]+\.)?(?:facebook\.com|facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd\.onion)/
(?:[^#]*?\#!/)?
(?:
(?:
@@ -82,7 +81,8 @@ class FacebookIE(InfoExtractor):
'info_dict': {
'id': '274175099429670',
'ext': 'mp4',
- 'title': 're:^Asif Nawab Butt posted a video',
+ 'title': 'Asif Nawab Butt',
+ 'description': 'Asif Nawab Butt',
'uploader': 'Asif Nawab Butt',
'upload_date': '20140506',
'timestamp': 1399398998,
@@ -137,15 +137,17 @@ class FacebookIE(InfoExtractor):
'upload_date': '20160223',
'uploader': 'Barack Obama',
},
+ 'skip': 'Gif on giphy.com gone',
}, {
# have 1080P, but only up to 720p in swf params
# data.video.story.attachments[].media
'url': 'https://www.facebook.com/cnn/videos/10155529876156509/',
- 'md5': '9571fae53d4165bbbadb17a94651dcdc',
+ 'md5': '3f3798adb2b73423263e59376f1f5eb7',
'info_dict': {
'id': '10155529876156509',
'ext': 'mp4',
- 'title': 'She survived the holocaust — and years later, she’s getting her citizenship s...',
+ 'title': 'Holocaust survivor becomes US citizen',
+ 'description': 'She survived the holocaust — and years later, she’s getting her citizenship so she can vote for Hillary Clinton http://cnn.it/2eERh5f',
'timestamp': 1477818095,
'upload_date': '20161030',
'uploader': 'CNN',
@@ -159,15 +161,18 @@ class FacebookIE(InfoExtractor):
'info_dict': {
'id': '1417995061575415',
'ext': 'mp4',
- 'title': 'md5:1db063d6a8c13faa8da727817339c857',
- 'timestamp': 1486648217,
+ 'title': 'Yaroslav Korpan - Довгоочікуване відео',
+ 'description': 'Довгоочікуване відео',
+ 'timestamp': 1486648771,
'upload_date': '20170209',
'uploader': 'Yaroslav Korpan',
+ 'uploader_id': '100000948048708',
},
'params': {
'skip_download': True,
},
}, {
+ # FIXME
'url': 'https://www.facebook.com/LaGuiaDelVaron/posts/1072691702860471',
'info_dict': {
'id': '1072691702860471',
@@ -185,12 +190,14 @@ class FacebookIE(InfoExtractor):
# data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/',
'info_dict': {
- 'id': '1396382447100162',
+ 'id': '202882990186699',
'ext': 'mp4',
- 'title': 'md5:19a428bbde91364e3de815383b54a235',
- 'timestamp': 1486035494,
+ 'title': 'Elisabeth Ahtn - Hello? Yes your uber ride is here\n* Jukin...',
+ 'description': 'Hello? Yes your uber ride is here\n* Jukin Media Verified *\nFind this video and others like it by visiting...',
+ 'timestamp': 1486035513,
'upload_date': '20170202',
'uploader': 'Elisabeth Ahtn',
+ 'uploader_id': '100013949973717',
},
'params': {
'skip_download': True,
@@ -219,7 +226,7 @@ class FacebookIE(InfoExtractor):
'only_matching': True,
}, {
# data.video
- 'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670',
+ 'url': 'https://www.facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd.onion/video.php?v=274175099429670',
'only_matching': True,
}, {
# no title
@@ -231,8 +238,12 @@ class FacebookIE(InfoExtractor):
'info_dict': {
'id': '359649331226507',
'ext': 'mp4',
- 'title': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses',
+ 'title': 'Fnatic vs. EG - Group A - Opening Match - ESL One Birmingham Day 1',
+ 'description': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses',
+ 'timestamp': 1527084179,
+ 'upload_date': '20180523',
'uploader': 'ESL One Dota 2',
+ 'uploader_id': '234218833769558',
},
'params': {
'skip_download': True,
@@ -249,6 +260,7 @@ class FacebookIE(InfoExtractor):
'url': 'https://www.facebook.com/watch/?v=647537299265662',
'only_matching': True,
}, {
+ # FIXME: https://github.com/hypervideo/hypervideo/issues/542
# data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media
'url': 'https://www.facebook.com/PankajShahLondon/posts/10157667649866271',
'info_dict': {
@@ -279,6 +291,7 @@ class FacebookIE(InfoExtractor):
'upload_date': '20161122',
'timestamp': 1479793574,
},
+ 'skip': 'No video',
}, {
# data.video.creation_story.attachments[].media
'url': 'https://www.facebook.com/watch/live/?v=1823658634322275',
@@ -348,7 +361,7 @@ class FacebookIE(InfoExtractor):
login_results, 'login error', default=None, group='error')
if error:
raise ExtractorError('Unable to login: %s' % error, expected=True)
- self._downloader.report_warning('unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
+ self.report_warning('unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
return
fb_dtsg = self._search_regex(
@@ -369,9 +382,9 @@ class FacebookIE(InfoExtractor):
check_response = self._download_webpage(check_req, None,
note='Confirming login')
if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
- self._downloader.report_warning('Unable to confirm login, you have to login in your browser and authorize the login.')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning('unable to log in: %s' % error_to_compat_str(err))
+ self.report_warning('Unable to confirm login, you have to login in your browser and authorize the login.')
+ except network_exceptions as err:
+ self.report_warning('unable to log in: %s' % error_to_compat_str(err))
return
def _real_initialize(self):
@@ -381,6 +394,56 @@ class FacebookIE(InfoExtractor):
webpage = self._download_webpage(
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
+ def extract_metadata(webpage):
+ video_title = self._html_search_regex(
+ r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage,
+ 'title', default=None)
+ if not video_title:
+ video_title = self._html_search_regex(
+ r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
+ webpage, 'alternative title', default=None)
+ if not video_title:
+ video_title = self._html_search_meta(
+ ['og:title', 'twitter:title', 'description'],
+ webpage, 'title', default=None)
+ if video_title:
+ video_title = limit_length(video_title, 80)
+ else:
+ video_title = 'Facebook video #%s' % video_id
+ description = self._html_search_meta(
+ ['description', 'og:description', 'twitter:description'],
+ webpage, 'description', default=None)
+ uploader = clean_html(get_element_by_id(
+ 'fbPhotoPageAuthorName', webpage)) or self._search_regex(
+ r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',
+ default=None) or self._og_search_title(webpage, fatal=False)
+ timestamp = int_or_none(self._search_regex(
+ r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
+ 'timestamp', default=None))
+ thumbnail = self._html_search_meta(
+ ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None)
+ # some webpages contain unretrievable thumbnail urls
+ # like https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=10155168902769113&get_thumbnail=1
+ # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/
+ if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail):
+ thumbnail = None
+ view_count = parse_count(self._search_regex(
+ r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
+ default=None))
+ info_dict = {
+ 'title': video_title,
+ 'description': description,
+ 'uploader': uploader,
+ 'timestamp': timestamp,
+ 'thumbnail': thumbnail,
+ 'view_count': view_count,
+ }
+ info_json_ld = self._search_json_ld(webpage, video_id, default={})
+ if info_json_ld.get('title'):
+ info_json_ld['title'] = limit_length(
+ re.sub(r'\s*\|\s*Facebook$', '', info_json_ld['title']), 80)
+ return merge_dicts(info_json_ld, info_dict)
+
video_data = None
def extract_video_data(instances):
@@ -416,7 +479,7 @@ class FacebookIE(InfoExtractor):
for f in formats:
f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
- self._sort_formats(formats)
+ self._sort_formats(formats, ('res', 'quality'))
def extract_relay_data(_filter):
return self._parse_json(self._search_regex(
@@ -513,7 +576,15 @@ class FacebookIE(InfoExtractor):
if not entries:
parse_graphql_video(video)
- return self.playlist_result(entries, video_id)
+ if len(entries) > 1:
+ return self.playlist_result(entries, video_id)
+
+ video_info = entries[0]
+ webpage_info = extract_metadata(webpage)
+ # honor precise duration in video info
+ if video_info.get('duration'):
+ webpage_info['duration'] = video_info['duration']
+ return merge_dicts(webpage_info, video_info)
if not video_data:
m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
@@ -616,60 +687,28 @@ class FacebookIE(InfoExtractor):
for src_type in ('src', 'src_no_ratelimit'):
src = f[0].get('%s_%s' % (quality, src_type))
if src:
- preference = -10 if format_id == 'progressive' else 0
+ preference = -10 if format_id == 'progressive' else -1
if quality == 'hd':
preference += 5
formats.append({
'format_id': '%s_%s_%s' % (format_id, quality, src_type),
'url': src,
- 'preference': preference,
+ 'quality': preference,
+ 'height': 720 if quality == 'hd' else None
})
extract_dash_manifest(f[0], formats)
subtitles_src = f[0].get('subtitles_src')
if subtitles_src:
subtitles.setdefault('en', []).append({'url': subtitles_src})
- if not formats:
- raise ExtractorError('Cannot find video formats')
process_formats(formats)
- video_title = self._html_search_regex(
- r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage,
- 'title', default=None)
- if not video_title:
- video_title = self._html_search_regex(
- r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
- webpage, 'alternative title', default=None)
- if not video_title:
- video_title = self._html_search_meta(
- 'description', webpage, 'title', default=None)
- if video_title:
- video_title = limit_length(video_title, 80)
- else:
- video_title = 'Facebook video #%s' % video_id
- uploader = clean_html(get_element_by_id(
- 'fbPhotoPageAuthorName', webpage)) or self._search_regex(
- r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',
- default=None) or self._og_search_title(webpage, fatal=False)
- timestamp = int_or_none(self._search_regex(
- r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
- 'timestamp', default=None))
- thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage)
-
- view_count = parse_count(self._search_regex(
- r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
- default=None))
-
info_dict = {
'id': video_id,
- 'title': video_title,
'formats': formats,
- 'uploader': uploader,
- 'timestamp': timestamp,
- 'thumbnail': thumbnail,
- 'view_count': view_count,
'subtitles': subtitles,
}
+ info_dict.update(extract_metadata(webpage))
return info_dict
diff --git a/hypervideo_dl/extractor/fancode.py b/hypervideo_dl/extractor/fancode.py
new file mode 100644
index 0000000..912feb7
--- /dev/null
+++ b/hypervideo_dl/extractor/fancode.py
@@ -0,0 +1,187 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+from ..compat import compat_str
+from ..utils import (
+ parse_iso8601,
+ ExtractorError,
+ try_get,
+ mimetype2ext
+)
+
+
+class FancodeVodIE(InfoExtractor):
+ IE_NAME = 'fancode:vod'
+
+ _VALID_URL = r'https?://(?:www\.)?fancode\.com/video/(?P<id>[0-9]+)\b'
+
+ _TESTS = [{
+ 'url': 'https://fancode.com/video/15043/match-preview-pbks-vs-mi',
+ 'params': {
+ 'skip_download': True,
+ 'format': 'bestvideo'
+ },
+ 'info_dict': {
+ 'id': '6249806281001',
+ 'ext': 'mp4',
+ 'title': 'Match Preview: PBKS vs MI',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ "timestamp": 1619081590,
+ 'view_count': int,
+ 'like_count': int,
+ 'upload_date': '20210422',
+ 'uploader_id': '6008340455001'
+ }
+ }, {
+ 'url': 'https://fancode.com/video/15043',
+ 'only_matching': True,
+ }]
+
+ _ACCESS_TOKEN = None
+ _NETRC_MACHINE = 'fancode'
+
+ _LOGIN_HINT = 'Use "--user refresh --password <refresh_token>" to login using a refresh token'
+
+ headers = {
+ 'content-type': 'application/json',
+ 'origin': 'https://fancode.com',
+ 'referer': 'https://fancode.com',
+ }
+
+ def _login(self):
+ # Access tokens are shortlived, so get them using the refresh token.
+ username, password = self._get_login_info()
+ if username == 'refresh' and password is not None:
+ self.report_login()
+ data = '''{
+ "query":"mutation RefreshToken($refreshToken: String\\u0021) { refreshToken(refreshToken: $refreshToken) { accessToken }}",
+ "variables":{
+ "refreshToken":"%s"
+ },
+ "operationName":"RefreshToken"
+ }''' % password
+
+ token_json = self.download_gql('refresh token', data, "Getting the Access token")
+ self._ACCESS_TOKEN = try_get(token_json, lambda x: x['data']['refreshToken']['accessToken'])
+ if self._ACCESS_TOKEN is None:
+ self.report_warning('Failed to get Access token')
+ else:
+ self.headers.update({'Authorization': 'Bearer %s' % self._ACCESS_TOKEN})
+ elif username is not None:
+ self.report_warning(f'Login using username and password is not currently supported. {self._LOGIN_HINT}')
+
+ def _real_initialize(self):
+ self._login()
+
+ def _check_login_required(self, is_available, is_premium):
+ msg = None
+ if is_premium and self._ACCESS_TOKEN is None:
+ msg = f'This video is only available for registered users. {self._LOGIN_HINT}'
+ elif not is_available and self._ACCESS_TOKEN is not None:
+ msg = 'This video isn\'t available to the current logged in account'
+ if msg:
+ self.raise_login_required(msg, metadata_available=True, method=None)
+
+ def download_gql(self, variable, data, note, fatal=False, headers=headers):
+ return self._download_json(
+ 'https://www.fancode.com/graphql', variable,
+ data=data.encode(), note=note,
+ headers=headers, fatal=fatal)
+
+ def _real_extract(self, url):
+
+ BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/%s/default_default/index.html?videoId=%s'
+ video_id = self._match_id(url)
+
+ brightcove_user_id = '6008340455001'
+ data = '''{
+ "query":"query Video($id: Int\\u0021, $filter: SegmentFilter) { media(id: $id, filter: $filter) { id contentId title contentId publishedTime totalViews totalUpvotes provider thumbnail { src } mediaSource {brightcove } duration isPremium isUserEntitled tags duration }}",
+ "variables":{
+ "id":%s,
+ "filter":{
+ "contentDataType":"DEFAULT"
+ }
+ },
+ "operationName":"Video"
+ }''' % video_id
+
+ metadata_json = self.download_gql(video_id, data, note='Downloading metadata')
+
+ media = try_get(metadata_json, lambda x: x['data']['media'], dict) or {}
+ brightcove_video_id = try_get(media, lambda x: x['mediaSource']['brightcove'], compat_str)
+
+ if brightcove_video_id is None:
+ raise ExtractorError('Unable to extract brightcove Video ID')
+
+ is_premium = media.get('isPremium')
+
+ self._check_login_required(media.get('isUserEntitled'), is_premium)
+
+ return {
+ '_type': 'url_transparent',
+ 'url': BRIGHTCOVE_URL_TEMPLATE % (brightcove_user_id, brightcove_video_id),
+ 'ie_key': 'BrightcoveNew',
+ 'id': video_id,
+ 'title': media['title'],
+ 'like_count': media.get('totalUpvotes'),
+ 'view_count': media.get('totalViews'),
+ 'tags': media.get('tags'),
+ 'release_timestamp': parse_iso8601(media.get('publishedTime')),
+ 'availability': self._availability(needs_premium=is_premium),
+ }
+
+
+class FancodeLiveIE(FancodeVodIE):
+ IE_NAME = 'fancode:live'
+
+ _VALID_URL = r'https?://(www\.)?fancode\.com/match/(?P<id>[0-9]+).+'
+
+ _TESTS = [{
+ 'url': 'https://fancode.com/match/35328/cricket-fancode-ecs-hungary-2021-bub-vs-blb?slug=commentary',
+ 'info_dict': {
+ 'id': '35328',
+ 'ext': 'mp4',
+ 'title': 'BUB vs BLB',
+ "timestamp": 1624863600,
+ 'is_live': True,
+ 'upload_date': '20210628',
+ },
+ 'skip': 'Ended'
+ }, {
+ 'url': 'https://fancode.com/match/35328/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://fancode.com/match/35567?slug=scorecard',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+
+ id = self._match_id(url)
+ data = '''{
+ "query":"query MatchResponse($id: Int\\u0021, $isLoggedIn: Boolean\\u0021) { match: matchWithScores(id: $id) { id matchDesc mediaId videoStreamId videoStreamUrl { ...VideoSource } liveStreams { videoStreamId videoStreamUrl { ...VideoSource } contentId } name startTime streamingStatus isPremium isUserEntitled @include(if: $isLoggedIn) status metaTags bgImage { src } sport { name slug } tour { id name } squads { name shortName } liveStreams { contentId } mediaId }}fragment VideoSource on VideoSource { title description posterUrl url deliveryType playerType}",
+ "variables":{
+ "id":%s,
+ "isLoggedIn":true
+ },
+ "operationName":"MatchResponse"
+ }''' % id
+
+ info_json = self.download_gql(id, data, "Info json")
+
+ match_info = try_get(info_json, lambda x: x['data']['match'])
+
+ if match_info.get('streamingStatus') != "STARTED":
+ raise ExtractorError('The stream can\'t be accessed', expected=True)
+ self._check_login_required(match_info.get('isUserEntitled'), True) # all live streams are premium only
+
+ return {
+ 'id': id,
+ 'title': match_info.get('name'),
+ 'formats': self._extract_akamai_formats(try_get(match_info, lambda x: x['videoStreamUrl']['url']), id),
+ 'ext': mimetype2ext(try_get(match_info, lambda x: x['videoStreamUrl']['deliveryType'])),
+ 'is_live': True,
+ 'release_timestamp': parse_iso8601(match_info.get('startTime'))
+ }
diff --git a/hypervideo_dl/extractor/fc2.py b/hypervideo_dl/extractor/fc2.py
index 4355611..4d85e62 100644
--- a/hypervideo_dl/extractor/fc2.py
+++ b/hypervideo_dl/extractor/fc2.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import hashlib
-import re
from .common import InfoExtractor
from ..compat import (
@@ -138,7 +137,7 @@ class FC2EmbedIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
query = compat_parse_qs(mobj.group('query'))
video_id = query['i'][-1]
diff --git a/hypervideo_dl/extractor/filmmodu.py b/hypervideo_dl/extractor/filmmodu.py
new file mode 100644
index 0000000..2746876
--- /dev/null
+++ b/hypervideo_dl/extractor/filmmodu.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class FilmmoduIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www.)?filmmodu.org/(?P<id>[^/]+-(?:turkce-dublaj-izle|altyazili-izle))'
+ _TESTS = [{
+ 'url': 'https://www.filmmodu.org/f9-altyazili-izle',
+ 'md5': 'aeefd955c2a508a5bdaa3bcec8eeb0d4',
+ 'info_dict': {
+ 'id': '10804',
+ 'ext': 'mp4',
+ 'title': 'F9',
+ 'description': 'md5:2713f584a4d65afa2611e2948d0b953c',
+ 'subtitles': {
+ 'tr': [{
+ 'ext': 'vtt',
+ }],
+ },
+ 'thumbnail': r're:https://s[0-9]+.filmmodu.org/uploads/movie/cover/10804/xXHZeb1yhJvnSHPzZDqee0zfMb6.jpg',
+ },
+ }, {
+ 'url': 'https://www.filmmodu.org/the-godfather-turkce-dublaj-izle',
+ 'md5': '109f2fcb9c941330eed133971c035c00',
+ 'info_dict': {
+ 'id': '3646',
+ 'ext': 'mp4',
+ 'title': 'Baba',
+ 'description': 'md5:d43fd651937cd75cc650883ebd8d8461',
+ 'thumbnail': r're:https://s[0-9]+.filmmodu.org/uploads/movie/cover/3646/6xKCYgH16UuwEGAyroLU6p8HLIn.jpg',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ title = self._og_search_title(webpage, fatal=True)
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+ real_video_id = self._search_regex(r'var\s*videoId\s*=\s*\'([0-9]+)\'', webpage, 'video_id')
+ video_type = self._search_regex(r'var\s*videoType\s*=\s*\'([a-z]+)\'', webpage, 'video_type')
+ data = self._download_json('https://www.filmmodu.org/get-source', real_video_id, query={
+ 'movie_id': real_video_id,
+ 'type': video_type,
+ })
+ formats = [{
+ 'url': source['src'],
+ 'ext': 'mp4',
+ 'format_id': source['label'],
+ 'height': int_or_none(source.get('res')),
+ 'protocol': 'm3u8_native',
+ } for source in data['sources']]
+
+ self._sort_formats(formats)
+
+ subtitles = {}
+
+ if data.get('subtitle'):
+ subtitles['tr'] = [{
+ 'url': data['subtitle'],
+ }]
+
+ return {
+ 'id': real_video_id,
+ 'display_id': video_id,
+ 'title': title,
+ 'description': description,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'thumbnail': thumbnail,
+ }
diff --git a/hypervideo_dl/extractor/filmweb.py b/hypervideo_dl/extractor/filmweb.py
index 56000bc..5e323b4 100644
--- a/hypervideo_dl/extractor/filmweb.py
+++ b/hypervideo_dl/extractor/filmweb.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
@@ -22,7 +21,7 @@ class FilmwebIE(InfoExtractor):
}
def _real_extract(self, url):
- article_type, article_id = re.match(self._VALID_URL, url).groups()
+ article_type, article_id = self._match_valid_url(url).groups()
if article_type == 'filmnytt':
webpage = self._download_webpage(url, article_id)
article_id = self._search_regex(r'data-videoid="(\d+)"', webpage, 'article id')
diff --git a/hypervideo_dl/extractor/firsttv.py b/hypervideo_dl/extractor/firsttv.py
index 28617d8..ccad173 100644
--- a/hypervideo_dl/extractor/firsttv.py
+++ b/hypervideo_dl/extractor/firsttv.py
@@ -104,7 +104,7 @@ class FirstTVIE(InfoExtractor):
'tbr': tbr,
'source_preference': quality(f.get('name')),
# quality metadata of http formats may be incorrect
- 'preference': -1,
+ 'preference': -10,
})
# m3u8 URL format is reverse engineered from [1] (search for
# master.m3u8). dashEdges (that is currently balancer-vod.1tv.ru)
diff --git a/hypervideo_dl/extractor/fivetv.py b/hypervideo_dl/extractor/fivetv.py
index c4c0f1b..be81fcc 100644
--- a/hypervideo_dl/extractor/fivetv.py
+++ b/hypervideo_dl/extractor/fivetv.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import int_or_none
@@ -66,7 +65,7 @@ class FiveTVIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id') or mobj.group('path')
webpage = self._download_webpage(url, video_id)
diff --git a/hypervideo_dl/extractor/flickr.py b/hypervideo_dl/extractor/flickr.py
index 9f166ef..6c82fae 100644
--- a/hypervideo_dl/extractor/flickr.py
+++ b/hypervideo_dl/extractor/flickr.py
@@ -88,7 +88,7 @@ class FlickrIE(InfoExtractor):
formats.append({
'format_id': stream_type,
'url': stream['_content'],
- 'preference': preference(stream_type),
+ 'quality': preference(stream_type),
})
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/fourtube.py b/hypervideo_dl/extractor/fourtube.py
index be4e813..d4d955b 100644
--- a/hypervideo_dl/extractor/fourtube.py
+++ b/hypervideo_dl/extractor/fourtube.py
@@ -41,7 +41,7 @@ class FourTubeBaseIE(InfoExtractor):
return formats
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
kind, video_id, display_id = mobj.group('kind', 'id', 'display_id')
if kind == 'm' or not display_id:
@@ -228,7 +228,7 @@ class PornTubeIE(FourTubeBaseIE):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id, display_id = mobj.group('id', 'display_id')
webpage = self._download_webpage(url, display_id)
diff --git a/hypervideo_dl/extractor/foxnews.py b/hypervideo_dl/extractor/foxnews.py
index 63613cb..18fa0a5 100644
--- a/hypervideo_dl/extractor/foxnews.py
+++ b/hypervideo_dl/extractor/foxnews.py
@@ -67,7 +67,7 @@ class FoxNewsIE(AMPIE):
webpage)]
def _real_extract(self, url):
- host, video_id = re.match(self._VALID_URL, url).groups()
+ host, video_id = self._match_valid_url(url).groups()
info = self._extract_feed_info(
'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id))
diff --git a/hypervideo_dl/extractor/francetv.py b/hypervideo_dl/extractor/francetv.py
index e4ec2e2..3bbab69 100644
--- a/hypervideo_dl/extractor/francetv.py
+++ b/hypervideo_dl/extractor/francetv.py
@@ -2,22 +2,14 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
-from ..compat import (
- compat_str,
- compat_urlparse,
-)
from ..utils import (
- clean_html,
determine_ext,
ExtractorError,
- int_or_none,
- parse_duration,
- try_get,
- url_or_none,
- urljoin,
+ format_field,
+ parse_iso8601,
+ parse_qs,
)
from .dailymotion import DailymotionIE
@@ -90,94 +82,81 @@ class FranceTVIE(InfoExtractor):
# Videos are identified by idDiffusion so catalogue part is optional.
# However when provided, some extra formats may be returned so we pass
# it if available.
- info = self._download_json(
- 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/',
- video_id, 'Downloading video JSON', query={
- 'idDiffusion': video_id,
- 'catalogue': catalogue or '',
- })
-
- if info.get('status') == 'NOK':
- raise ExtractorError(
- '%s returned error: %s' % (self.IE_NAME, info['message']),
- expected=True)
- allowed_countries = info['videos'][0].get('geoblocage')
- if allowed_countries:
- georestricted = True
- geo_info = self._download_json(
- 'http://geo.francetv.fr/ws/edgescape.json', video_id,
- 'Downloading geo restriction info')
- country = geo_info['reponse']['geo_info']['country_code']
- if country not in allowed_countries:
- raise ExtractorError(
- 'The video is not available from your location',
- expected=True)
- else:
- georestricted = False
-
- def sign(manifest_url, manifest_id):
- for host in ('hdfauthftv-a.akamaihd.net', 'hdfauth.francetv.fr'):
- signed_url = url_or_none(self._download_webpage(
- 'https://%s/esi/TA' % host, video_id,
- 'Downloading signed %s manifest URL' % manifest_id,
- fatal=False, query={
- 'url': manifest_url,
- }))
- if signed_url:
- return signed_url
- return manifest_url
-
is_live = None
-
videos = []
-
- for video in (info.get('videos') or []):
- if video.get('statut') != 'ONLINE':
+ title = None
+ subtitle = None
+ image = None
+ duration = None
+ timestamp = None
+ spritesheets = None
+
+ for device_type in ('desktop', 'mobile'):
+ dinfo = self._download_json(
+ 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id,
+ video_id, 'Downloading %s video JSON' % device_type, query={
+ 'device_type': device_type,
+ 'browser': 'chrome',
+ }, fatal=False)
+
+ if not dinfo:
continue
- if not video.get('url'):
- continue
- videos.append(video)
-
- if not videos:
- for device_type in ['desktop', 'mobile']:
- fallback_info = self._download_json(
- 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id,
- video_id, 'Downloading fallback %s video JSON' % device_type, query={
- 'device_type': device_type,
- 'browser': 'chrome',
- }, fatal=False)
- if fallback_info and fallback_info.get('video'):
- videos.append(fallback_info['video'])
+ video = dinfo.get('video')
+ if video:
+ videos.append(video)
+ if duration is None:
+ duration = video.get('duration')
+ if is_live is None:
+ is_live = video.get('is_live')
+ if spritesheets is None:
+ spritesheets = video.get('spritesheets')
+
+ meta = dinfo.get('meta')
+ if meta:
+ if title is None:
+ title = meta.get('title')
+ # XXX: what is meta['pre_title']?
+ if subtitle is None:
+ subtitle = meta.get('additional_title')
+ if image is None:
+ image = meta.get('image_url')
+ if timestamp is None:
+ timestamp = parse_iso8601(meta.get('broadcasted_at'))
formats = []
+ subtitles = {}
for video in videos:
- video_url = video.get('url')
- if not video_url:
- continue
- if is_live is None:
- is_live = (try_get(
- video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True
- or video.get('is_live') is True
- or '/live.francetv.fr/' in video_url)
format_id = video.get('format')
+
+ video_url = None
+ if video.get('workflow') == 'token-akamai':
+ token_url = video.get('token')
+ if token_url:
+ token_json = self._download_json(
+ token_url, video_id,
+ 'Downloading signed %s manifest URL' % format_id)
+ if token_json:
+ video_url = token_json.get('url')
+ if not video_url:
+ video_url = video.get('url')
+
ext = determine_ext(video_url)
if ext == 'f4m':
- if georestricted:
- # See https://github.com/ytdl-org/youtube-dl/issues/3963
- # m3u8 urls work fine
- continue
formats.extend(self._extract_f4m_formats(
- sign(video_url, format_id) + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44',
- video_id, f4m_id=format_id, fatal=False))
+ video_url, video_id, f4m_id=format_id, fatal=False))
elif ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- sign(video_url, format_id), video_id, 'mp4',
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
+ video_url, video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id=format_id,
- fatal=False))
+ fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
elif ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False))
+ fmts, subs = self._extract_mpd_formats_and_subtitles(
+ video_url, video_id, mpd_id=format_id, fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
elif video_url.startswith('rtmp'):
formats.append({
'url': video_url,
@@ -191,41 +170,55 @@ class FranceTVIE(InfoExtractor):
'format_id': format_id,
})
+ # XXX: what is video['captions']?
+
+ for f in formats:
+ if f.get('acodec') != 'none' and f.get('language') in ('qtz', 'qad'):
+ f['language_preference'] = -10
+ f['format_note'] = 'audio description%s' % format_field(f, 'format_note', ', %s')
+
+ if spritesheets:
+ formats.append({
+ 'format_id': 'spritesheets',
+ 'format_note': 'storyboard',
+ 'acodec': 'none',
+ 'vcodec': 'none',
+ 'ext': 'mhtml',
+ 'protocol': 'mhtml',
+ 'url': 'about:dummy',
+ 'fragments': [{
+ 'path': sheet,
+ # XXX: not entirely accurate; each spritesheet seems to be
+ # a 10×10 grid of thumbnails corresponding to approximately
+ # 2 seconds of the video; the last spritesheet may be shorter
+ 'duration': 200,
+ } for sheet in spritesheets]
+ })
+
self._sort_formats(formats)
- title = info['titre']
- subtitle = info.get('sous_titre')
if subtitle:
title += ' - %s' % subtitle
title = title.strip()
- subtitles = {}
- subtitles_list = [{
- 'url': subformat['url'],
- 'ext': subformat.get('format'),
- } for subformat in info.get('subtitles', []) if subformat.get('url')]
- if subtitles_list:
- subtitles['fr'] = subtitles_list
-
return {
'id': video_id,
'title': self._live_title(title) if is_live else title,
- 'description': clean_html(info.get('synopsis')),
- 'thumbnail': urljoin('https://sivideo.webservices.francetelevisions.fr', info.get('image')),
- 'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')),
- 'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])),
+ 'thumbnail': image,
+ 'duration': duration,
+ 'timestamp': timestamp,
'is_live': is_live,
'formats': formats,
'subtitles': subtitles,
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
catalog = mobj.group('catalog')
if not video_id:
- qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ qs = parse_qs(url)
video_id = qs.get('idDiffusion', [None])[0]
catalog = qs.get('catalogue', [None])[0]
if not video_id:
@@ -307,47 +300,19 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
return self._make_url_result(video_id, catalogue)
-class FranceTVEmbedIE(FranceTVBaseInfoExtractor):
- _VALID_URL = r'https?://embed\.francetv\.fr/*\?.*?\bue=(?P<id>[^&]+)'
-
- _TESTS = [{
- 'url': 'http://embed.francetv.fr/?ue=7fd581a2ccf59d2fc5719c5c13cf6961',
- 'info_dict': {
- 'id': 'NI_983319',
- 'ext': 'mp4',
- 'title': 'Le Pen Reims',
- 'upload_date': '20170505',
- 'timestamp': 1493981780,
- 'duration': 16,
- },
- 'params': {
- 'skip_download': True,
- },
- 'add_ie': [FranceTVIE.ie_key()],
- }]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- video = self._download_json(
- 'http://api-embed.webservices.francetelevisions.fr/key/%s' % video_id,
- video_id)
-
- return self._make_url_result(video['video_id'], video.get('catalog'))
-
-
class FranceTVInfoIE(FranceTVBaseInfoExtractor):
IE_NAME = 'francetvinfo.fr'
_VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&.]+)'
_TESTS = [{
- 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html',
+ 'url': 'https://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-jeudi-22-aout-2019_3561461.html',
'info_dict': {
- 'id': '84981923',
+ 'id': 'd12458ee-5062-48fe-bfdd-a30d6a01b793',
'ext': 'mp4',
'title': 'Soir 3',
- 'upload_date': '20130826',
- 'timestamp': 1377548400,
+ 'upload_date': '20190822',
+ 'timestamp': 1566510900,
+ 'description': 'md5:72d167097237701d6e8452ff03b83c00',
'subtitles': {
'fr': 'mincount:2',
},
@@ -357,6 +322,22 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
},
'add_ie': [FranceTVIE.ie_key()],
}, {
+ 'note': 'Only an image exists in initial webpage instead of the video',
+ 'url': 'https://www.francetvinfo.fr/sante/maladie/coronavirus/covid-19-en-inde-une-situation-catastrophique-a-new-dehli_4381095.html',
+ 'info_dict': {
+ 'id': '7d204c9e-a2d3-11eb-9e4c-000d3a23d482',
+ 'ext': 'mp4',
+ 'title': 'Covid-19 : une situation catastrophique à New Dehli',
+ 'thumbnail': str,
+ 'duration': 76,
+ 'timestamp': 1619028518,
+ 'upload_date': '20210421',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': [FranceTVIE.ie_key()],
+ }, {
'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html',
'only_matching': True,
}, {
@@ -408,139 +389,3 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
webpage, 'video id')
return self._make_url_result(video_id)
-
-
-class FranceTVInfoSportIE(FranceTVBaseInfoExtractor):
- IE_NAME = 'sport.francetvinfo.fr'
- _VALID_URL = r'https?://sport\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&]+)'
- _TESTS = [{
- 'url': 'https://sport.francetvinfo.fr/les-jeux-olympiques/retour-sur-les-meilleurs-moments-de-pyeongchang-2018',
- 'info_dict': {
- 'id': '6e49080e-3f45-11e8-b459-000d3a2439ea',
- 'ext': 'mp4',
- 'title': 'Retour sur les meilleurs moments de Pyeongchang 2018',
- 'timestamp': 1523639962,
- 'upload_date': '20180413',
- },
- 'params': {
- 'skip_download': True,
- },
- 'add_ie': [FranceTVIE.ie_key()],
- }]
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
- video_id = self._search_regex(r'data-video="([^"]+)"', webpage, 'video_id')
- return self._make_url_result(video_id, 'Sport-web')
-
-
-class GenerationWhatIE(InfoExtractor):
- IE_NAME = 'france2.fr:generation-what'
- _VALID_URL = r'https?://generation-what\.francetv\.fr/[^/]+/video/(?P<id>[^/?#&]+)'
-
- _TESTS = [{
- 'url': 'http://generation-what.francetv.fr/portrait/video/present-arms',
- 'info_dict': {
- 'id': 'wtvKYUG45iw',
- 'ext': 'mp4',
- 'title': 'Generation What - Garde à vous - FRA',
- 'uploader': 'Generation What',
- 'uploader_id': 'UCHH9p1eetWCgt4kXBYCb3_w',
- 'upload_date': '20160411',
- },
- 'params': {
- 'skip_download': True,
- },
- 'add_ie': ['Youtube'],
- }, {
- 'url': 'http://generation-what.francetv.fr/europe/video/present-arms',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
-
- webpage = self._download_webpage(url, display_id)
-
- youtube_id = self._search_regex(
- r"window\.videoURL\s*=\s*'([0-9A-Za-z_-]{11})';",
- webpage, 'youtube id')
-
- return self.url_result(youtube_id, ie='Youtube', video_id=youtube_id)
-
-
-class CultureboxIE(FranceTVBaseInfoExtractor):
- _VALID_URL = r'https?://(?:m\.)?culturebox\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&]+)'
-
- _TESTS = [{
- 'url': 'https://culturebox.francetvinfo.fr/opera-classique/musique-classique/c-est-baroque/concerts/cantates-bwv-4-106-et-131-de-bach-par-raphael-pichon-57-268689',
- 'info_dict': {
- 'id': 'EV_134885',
- 'ext': 'mp4',
- 'title': 'Cantates BWV 4, 106 et 131 de Bach par Raphaël Pichon 5/7',
- 'description': 'md5:19c44af004b88219f4daa50fa9a351d4',
- 'upload_date': '20180206',
- 'timestamp': 1517945220,
- 'duration': 5981,
- },
- 'params': {
- 'skip_download': True,
- },
- 'add_ie': [FranceTVIE.ie_key()],
- }]
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
-
- webpage = self._download_webpage(url, display_id)
-
- if ">Ce live n'est plus disponible en replay<" in webpage:
- raise ExtractorError(
- 'Video %s is not available' % display_id, expected=True)
-
- video_id, catalogue = self._search_regex(
- r'["\'>]https?://videos\.francetv\.fr/video/([^@]+@.+?)["\'<]',
- webpage, 'video id').split('@')
-
- return self._make_url_result(video_id, catalogue)
-
-
-class FranceTVJeunesseIE(FranceTVBaseInfoExtractor):
- _VALID_URL = r'(?P<url>https?://(?:www\.)?(?:zouzous|ludo)\.fr/heros/(?P<id>[^/?#&]+))'
-
- _TESTS = [{
- 'url': 'https://www.zouzous.fr/heros/simon',
- 'info_dict': {
- 'id': 'simon',
- },
- 'playlist_count': 9,
- }, {
- 'url': 'https://www.ludo.fr/heros/ninjago',
- 'info_dict': {
- 'id': 'ninjago',
- },
- 'playlist_count': 10,
- }, {
- 'url': 'https://www.zouzous.fr/heros/simon?abc',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('id')
-
- playlist = self._download_json(
- '%s/%s' % (mobj.group('url'), 'playlist'), playlist_id)
-
- if not playlist.get('count'):
- raise ExtractorError(
- '%s is not available' % playlist_id, expected=True)
-
- entries = []
- for item in playlist['items']:
- identity = item.get('identity')
- if identity and isinstance(identity, compat_str):
- entries.append(self._make_url_result(identity))
-
- return self.playlist_result(entries, playlist_id)
diff --git a/hypervideo_dl/extractor/frontendmasters.py b/hypervideo_dl/extractor/frontendmasters.py
index f1db33f..40b8cb0 100644
--- a/hypervideo_dl/extractor/frontendmasters.py
+++ b/hypervideo_dl/extractor/frontendmasters.py
@@ -207,7 +207,7 @@ class FrontendMastersLessonIE(FrontendMastersPageBaseIE):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
course_name, lesson_name = mobj.group('course_name', 'lesson_name')
course = self._download_course(course_name, url)
diff --git a/hypervideo_dl/extractor/funimation.py b/hypervideo_dl/extractor/funimation.py
index d8f1e16..382cbe1 100644
--- a/hypervideo_dl/extractor/funimation.py
+++ b/hypervideo_dl/extractor/funimation.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
import random
+import re
import string
from .common import InfoExtractor
@@ -10,52 +11,29 @@ from ..utils import (
determine_ext,
int_or_none,
js_to_json,
+ orderedSet,
+ qualities,
+ str_or_none,
+ traverse_obj,
+ try_get,
+ urlencode_postdata,
ExtractorError,
- urlencode_postdata
)
-class FunimationIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/(?:[^/]+/)?shows/[^/]+/(?P<id>[^/?#&]+)'
-
+class FunimationBaseIE(InfoExtractor):
_NETRC_MACHINE = 'funimation'
+ _REGION = None
_TOKEN = None
- _TESTS = [{
- 'url': 'https://www.funimation.com/shows/hacksign/role-play/',
- 'info_dict': {
- 'id': '91144',
- 'display_id': 'role-play',
- 'ext': 'mp4',
- 'title': '.hack//SIGN - Role Play',
- 'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd',
- 'thumbnail': r're:https?://.*\.jpg',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- }, {
- 'url': 'https://www.funimation.com/shows/attack-on-titan-junior-high/broadcast-dub-preview/',
- 'info_dict': {
- 'id': '210051',
- 'display_id': 'broadcast-dub-preview',
- 'ext': 'mp4',
- 'title': 'Attack on Titan: Junior High - Broadcast Dub Preview',
- 'thumbnail': r're:https?://.*\.(?:jpg|png)',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- }, {
- 'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/',
- 'only_matching': True,
- }, {
- # with lang code
- 'url': 'https://www.funimation.com/en/shows/hacksign/role-play/',
- 'only_matching': True,
- }]
+ def _get_region(self):
+ region_cookie = self._get_cookies('https://www.funimation.com').get('region')
+ region = region_cookie.value if region_cookie else self.get_param('geo_bypass_country')
+ return region or traverse_obj(
+ self._download_json(
+ 'https://geo-service.prd.funimationsvc.com/geo/v1/region/check', None, fatal=False,
+ note='Checking geo-location', errnote='Unable to fetch geo-location information'),
+ 'region') or 'US'
def _login(self):
username, password = self._get_login_info()
@@ -68,91 +46,307 @@ class FunimationIE(InfoExtractor):
'username': username,
'password': password,
}))
- self._TOKEN = data['token']
+ return data['token']
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
error = self._parse_json(e.cause.read().decode(), None)['error']
raise ExtractorError(error, expected=True)
raise
+
+class FunimationPageIE(FunimationBaseIE):
+ IE_NAME = 'funimation:page'
+ _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/(?:(?P<lang>[^/]+)/)?(?:shows|v)/(?P<show>[^/]+)/(?P<episode>[^/?#&]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.funimation.com/shows/attack-on-titan-junior-high/broadcast-dub-preview/',
+ 'info_dict': {
+ 'id': '210050',
+ 'ext': 'mp4',
+ 'title': 'Broadcast Dub Preview',
+ # Other metadata is tested in FunimationIE
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ 'add_ie': ['Funimation'],
+ }, {
+ # Not available in US
+ 'url': 'https://www.funimation.com/shows/hacksign/role-play/',
+ 'only_matching': True,
+ }, {
+ # with lang code
+ 'url': 'https://www.funimation.com/en/shows/hacksign/role-play/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.funimation.com/v/a-certain-scientific-railgun/super-powered-level-5',
+ 'only_matching': True,
+ }]
+
def _real_initialize(self):
- self._login()
+ if not self._REGION:
+ FunimationBaseIE._REGION = self._get_region()
+ if not self._TOKEN:
+ FunimationBaseIE._TOKEN = self._login()
def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
-
- def _search_kane(name):
- return self._search_regex(
- r"KANE_customdimensions\.%s\s*=\s*'([^']+)';" % name,
- webpage, name, default=None)
-
- title_data = self._parse_json(self._search_regex(
- r'TITLE_DATA\s*=\s*({[^}]+})',
- webpage, 'title data', default=''),
- display_id, js_to_json, fatal=False) or {}
-
- video_id = title_data.get('id') or self._search_regex([
- r"KANE_customdimensions.videoID\s*=\s*'(\d+)';",
- r'<iframe[^>]+src="/player/(\d+)',
- ], webpage, 'video_id', default=None)
- if not video_id:
- player_url = self._html_search_meta([
- 'al:web:url',
- 'og:video:url',
- 'og:video:secure_url',
- ], webpage, fatal=True)
- video_id = self._search_regex(r'/player/(\d+)', player_url, 'video id')
-
- title = episode = title_data.get('title') or _search_kane('videoTitle') or self._og_search_title(webpage)
- series = _search_kane('showName')
- if series:
- title = '%s - %s' % (series, title)
- description = self._html_search_meta(['description', 'og:description'], webpage, fatal=True)
+ locale, show, episode = self._match_valid_url(url).group('lang', 'show', 'episode')
+
+ video_id = traverse_obj(self._download_json(
+ f'https://title-api.prd.funimationsvc.com/v1/shows/{show}/episodes/{episode}',
+ f'{show}_{episode}', query={
+ 'deviceType': 'web',
+ 'region': self._REGION,
+ 'locale': locale or 'en'
+ }), ('videoList', ..., 'id'), get_all=False)
+
+ return self.url_result(f'https://www.funimation.com/player/{video_id}', FunimationIE.ie_key(), video_id)
+
+
+class FunimationIE(FunimationBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?funimation\.com/player/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://www.funimation.com/player/210051',
+ 'info_dict': {
+ 'id': '210050',
+ 'display_id': 'broadcast-dub-preview',
+ 'ext': 'mp4',
+ 'title': 'Broadcast Dub Preview',
+ 'thumbnail': r're:https?://.*\.(?:jpg|png)',
+ 'episode': 'Broadcast Dub Preview',
+ 'episode_id': '210050',
+ 'season': 'Extras',
+ 'season_id': '166038',
+ 'season_number': 99,
+ 'series': 'Attack on Titan: Junior High',
+ 'description': '',
+ 'duration': 155,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }, {
+        'note': 'player_id should be extracted with the relevant compat-opt',
+ 'url': 'https://www.funimation.com/player/210051',
+ 'info_dict': {
+ 'id': '210051',
+ 'display_id': 'broadcast-dub-preview',
+ 'ext': 'mp4',
+ 'title': 'Broadcast Dub Preview',
+ 'thumbnail': r're:https?://.*\.(?:jpg|png)',
+ 'episode': 'Broadcast Dub Preview',
+ 'episode_id': '210050',
+ 'season': 'Extras',
+ 'season_id': '166038',
+ 'season_number': 99,
+ 'series': 'Attack on Titan: Junior High',
+ 'description': '',
+ 'duration': 155,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ 'compat_opts': ['seperate-video-versions'],
+ },
+ }]
+
+ def _real_initialize(self):
+ if not self._TOKEN:
+ FunimationBaseIE._TOKEN = self._login()
+
+ @staticmethod
+ def _get_experiences(episode):
+ for lang, lang_data in episode.get('languages', {}).items():
+ for video_data in lang_data.values():
+ for version, f in video_data.items():
+ yield lang, version.title(), f
+
+ def _get_episode(self, webpage, experience_id=None, episode_id=None, fatal=True):
+        ''' Extract the episode, season and show objects, given either an episode id or an experience id '''
+ show = self._parse_json(
+ self._search_regex(
+ r'show\s*=\s*({.+?})\s*;', webpage, 'show data', fatal=fatal),
+ experience_id, transform_source=js_to_json, fatal=fatal) or []
+ for season in show.get('seasons', []):
+ for episode in season.get('episodes', []):
+ if episode_id is not None:
+ if str(episode.get('episodePk')) == episode_id:
+ return episode, season, show
+ continue
+ for _, _, f in self._get_experiences(episode):
+ if f.get('experienceId') == experience_id:
+ return episode, season, show
+ if fatal:
+ raise ExtractorError('Unable to find episode information')
+ else:
+ self.report_warning('Unable to find episode information')
+ return {}, {}, {}
+
+ def _real_extract(self, url):
+ initial_experience_id = self._match_id(url)
+ webpage = self._download_webpage(
+ url, initial_experience_id, note=f'Downloading player webpage for {initial_experience_id}')
+ episode, season, show = self._get_episode(webpage, experience_id=int(initial_experience_id))
+ episode_id = str(episode['episodePk'])
+ display_id = episode.get('slug') or episode_id
+
+ formats, subtitles, thumbnails, duration = [], {}, [], 0
+ requested_languages, requested_versions = self._configuration_arg('language'), self._configuration_arg('version')
+ language_preference = qualities((requested_languages or [''])[::-1])
+ source_preference = qualities((requested_versions or ['uncut', 'simulcast'])[::-1])
+ only_initial_experience = 'seperate-video-versions' in self.get_param('compat_opts', [])
+
+ for lang, version, fmt in self._get_experiences(episode):
+ experience_id = str(fmt['experienceId'])
+ if (only_initial_experience and experience_id != initial_experience_id
+ or requested_languages and lang.lower() not in requested_languages
+ or requested_versions and version.lower() not in requested_versions):
+ continue
+ thumbnails.append({'url': fmt.get('poster')})
+ duration = max(duration, fmt.get('duration', 0))
+ format_name = '%s %s (%s)' % (version, lang, experience_id)
+ self.extract_subtitles(
+ subtitles, experience_id, display_id=display_id, format_name=format_name,
+ episode=episode if experience_id == initial_experience_id else episode_id)
- try:
headers = {}
if self._TOKEN:
headers['Authorization'] = 'Token %s' % self._TOKEN
- sources = self._download_json(
- 'https://www.funimation.com/api/showexperience/%s/' % video_id,
- video_id, headers=headers, query={
+ page = self._download_json(
+ 'https://www.funimation.com/api/showexperience/%s/' % experience_id,
+ display_id, headers=headers, expected_status=403, query={
'pinst_id': ''.join([random.choice(string.digits + string.ascii_letters) for _ in range(8)]),
- })['items']
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- error = self._parse_json(e.cause.read(), video_id)['errors'][0]
- raise ExtractorError('%s said: %s' % (
- self.IE_NAME, error.get('detail') or error.get('title')), expected=True)
- raise
+ }, note=f'Downloading {format_name} JSON')
+ sources = page.get('items') or []
+ if not sources:
+ error = try_get(page, lambda x: x['errors'][0], dict)
+ if error:
+ self.report_warning('%s said: Error %s - %s' % (
+ self.IE_NAME, error.get('code'), error.get('detail') or error.get('title')))
+ else:
+ self.report_warning('No sources found for format')
- formats = []
- for source in sources:
- source_url = source.get('src')
- if not source_url:
- continue
- source_type = source.get('videoType') or determine_ext(source_url)
- if source_type == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- source_url, video_id, 'mp4',
- m3u8_id='hls', fatal=False))
- else:
- formats.append({
- 'format_id': source_type,
- 'url': source_url,
- })
- self._sort_formats(formats)
+ current_formats = []
+ for source in sources:
+ source_url = source.get('src')
+ source_type = source.get('videoType') or determine_ext(source_url)
+ if source_type == 'm3u8':
+ current_formats.extend(self._extract_m3u8_formats(
+ source_url, display_id, 'mp4', m3u8_id='%s-%s' % (experience_id, 'hls'), fatal=False,
+ note=f'Downloading {format_name} m3u8 information'))
+ else:
+ current_formats.append({
+ 'format_id': '%s-%s' % (experience_id, source_type),
+ 'url': source_url,
+ })
+ for f in current_formats:
+ # TODO: Convert language to code
+ f.update({
+ 'language': lang,
+ 'format_note': version,
+ 'source_preference': source_preference(version.lower()),
+ 'language_preference': language_preference(lang.lower()),
+ })
+ formats.extend(current_formats)
+ self._remove_duplicate_formats(formats)
+ self._sort_formats(formats, ('lang', 'source'))
return {
- 'id': video_id,
+ 'id': initial_experience_id if only_initial_experience else episode_id,
'display_id': display_id,
- 'title': title,
- 'description': description,
- 'thumbnail': self._og_search_thumbnail(webpage),
- 'series': series,
- 'season_number': int_or_none(title_data.get('seasonNum') or _search_kane('season')),
- 'episode_number': int_or_none(title_data.get('episodeNum')),
- 'episode': episode,
- 'season_id': title_data.get('seriesId'),
+ 'duration': duration,
+ 'title': episode['episodeTitle'],
+ 'description': episode.get('episodeSummary'),
+ 'episode': episode.get('episodeTitle'),
+ 'episode_number': int_or_none(episode.get('episodeId')),
+ 'episode_id': episode_id,
+ 'season': season.get('seasonTitle'),
+ 'season_number': int_or_none(season.get('seasonId')),
+ 'season_id': str_or_none(season.get('seasonPk')),
+ 'series': show.get('showTitle'),
'formats': formats,
+ 'thumbnails': thumbnails,
+ 'subtitles': subtitles,
+ }
+
+ def _get_subtitles(self, subtitles, experience_id, episode, display_id, format_name):
+ if isinstance(episode, str):
+ webpage = self._download_webpage(
+ f'https://www.funimation.com/player/{experience_id}', display_id,
+ fatal=False, note=f'Downloading player webpage for {format_name}')
+ episode, _, _ = self._get_episode(webpage, episode_id=episode, fatal=False)
+
+ for _, version, f in self._get_experiences(episode):
+ for source in f.get('sources'):
+ for text_track in source.get('textTracks'):
+ if not text_track.get('src'):
+ continue
+ sub_type = text_track.get('type').upper()
+ sub_type = sub_type if sub_type != 'FULL' else None
+ current_sub = {
+ 'url': text_track['src'],
+ 'name': ' '.join(filter(None, (version, text_track.get('label'), sub_type)))
+ }
+ lang = '_'.join(filter(None, (
+ text_track.get('language', 'und'), version if version != 'Simulcast' else None, sub_type)))
+ if current_sub not in subtitles.get(lang, []):
+ subtitles.setdefault(lang, []).append(current_sub)
+ return subtitles
+
+
+class FunimationShowIE(FunimationBaseIE):
+ IE_NAME = 'funimation:show'
+ _VALID_URL = r'(?P<url>https?://(?:www\.)?funimation(?:\.com|now\.uk)/(?P<locale>[^/]+)?/?shows/(?P<id>[^/?#&]+))/?(?:[?#]|$)'
+
+ _TESTS = [{
+ 'url': 'https://www.funimation.com/en/shows/sk8-the-infinity',
+ 'info_dict': {
+ 'id': 1315000,
+ 'title': 'SK8 the Infinity'
+ },
+ 'playlist_count': 13,
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # without lang code
+ 'url': 'https://www.funimation.com/shows/ouran-high-school-host-club/',
+ 'info_dict': {
+ 'id': 39643,
+ 'title': 'Ouran High School Host Club'
+ },
+ 'playlist_count': 26,
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_initialize(self):
+ if not self._REGION:
+ FunimationBaseIE._REGION = self._get_region()
+
+ def _real_extract(self, url):
+ base_url, locale, display_id = self._match_valid_url(url).groups()
+
+ show_info = self._download_json(
+ 'https://title-api.prd.funimationsvc.com/v2/shows/%s?region=%s&deviceType=web&locale=%s'
+ % (display_id, self._REGION, locale or 'en'), display_id)
+ items_info = self._download_json(
+ 'https://prod-api-funimationnow.dadcdigital.com/api/funimation/episodes/?limit=99999&title_id=%s'
+ % show_info.get('id'), display_id)
+
+ vod_items = traverse_obj(items_info, ('items', ..., re.compile('(?i)mostRecent[AS]vod').match, 'item'))
+
+ return {
+ '_type': 'playlist',
+ 'id': show_info['id'],
+ 'title': show_info['name'],
+ 'entries': orderedSet(
+ self.url_result(
+ '%s/%s' % (base_url, vod_item.get('episodeSlug')), FunimationPageIE.ie_key(),
+ vod_item.get('episodeId'), vod_item.get('episodeName'))
+ for vod_item in sorted(vod_items, key=lambda x: x.get('episodeOrder', -1))),
}
diff --git a/hypervideo_dl/extractor/funk.py b/hypervideo_dl/extractor/funk.py
index 81d1949..e5e3260 100644
--- a/hypervideo_dl/extractor/funk.py
+++ b/hypervideo_dl/extractor/funk.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from .nexx import NexxIE
@@ -31,7 +30,7 @@ class FunkIE(InfoExtractor):
}]
def _real_extract(self, url):
- display_id, nexx_id = re.match(self._VALID_URL, url).groups()
+ display_id, nexx_id = self._match_valid_url(url).groups()
video = self._download_json(
'https://www.funk.net/api/v4.0/videos/' + nexx_id, nexx_id)
return {
diff --git a/hypervideo_dl/extractor/fxnetworks.py b/hypervideo_dl/extractor/fxnetworks.py
new file mode 100644
index 0000000..00e6742
--- /dev/null
+++ b/hypervideo_dl/extractor/fxnetworks.py
@@ -0,0 +1,77 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .adobepass import AdobePassIE
+from ..utils import (
+ extract_attributes,
+ int_or_none,
+ parse_age_limit,
+ smuggle_url,
+ update_url_query,
+)
+
+
+class FXNetworksIE(AdobePassIE):
+ _VALID_URL = r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://www.fxnetworks.com/video/1032565827847',
+ 'md5': '8d99b97b4aa7a202f55b6ed47ea7e703',
+ 'info_dict': {
+ 'id': 'dRzwHC_MMqIv',
+ 'ext': 'mp4',
+ 'title': 'First Look: Better Things - Season 2',
+ 'description': 'Because real life is like a fart. Watch this FIRST LOOK to see what inspired the new season of Better Things.',
+ 'age_limit': 14,
+ 'uploader': 'NEWA-FNG-FX',
+ 'upload_date': '20170825',
+ 'timestamp': 1503686274,
+ 'episode_number': 0,
+ 'season_number': 2,
+ 'series': 'Better Things',
+ },
+ 'add_ie': ['ThePlatform'],
+ }, {
+ 'url': 'http://www.simpsonsworld.com/video/716094019682',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ if 'The content you are trying to access is not available in your region.' in webpage:
+ self.raise_geo_restricted()
+ video_data = extract_attributes(self._search_regex(
+ r'(<a.+?rel="https?://link\.theplatform\.com/s/.+?</a>)', webpage, 'video data'))
+ player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None)
+ release_url = video_data['rel']
+ title = video_data['data-title']
+ rating = video_data.get('data-rating')
+ query = {
+ 'mbr': 'true',
+ }
+ if player_type == 'movies':
+ query.update({
+ 'manifest': 'm3u',
+ })
+ else:
+ query.update({
+ 'switch': 'http',
+ })
+ if video_data.get('data-req-auth') == '1':
+ resource = self._get_mvpd_resource(
+ video_data['data-channel'], title,
+ video_data.get('data-guid'), rating)
+ query['auth'] = self._extract_mvpd_auth(url, video_id, 'fx', resource)
+
+ return {
+ '_type': 'url_transparent',
+ 'id': video_id,
+ 'title': title,
+ 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}),
+ 'series': video_data.get('data-show-title'),
+ 'episode_number': int_or_none(video_data.get('data-episode')),
+ 'season_number': int_or_none(video_data.get('data-season')),
+ 'thumbnail': video_data.get('data-large-thumb'),
+ 'age_limit': parse_age_limit(rating),
+ 'ie_key': 'ThePlatform',
+ }
diff --git a/hypervideo_dl/extractor/gab.py b/hypervideo_dl/extractor/gab.py
new file mode 100644
index 0000000..25b5cb0
--- /dev/null
+++ b/hypervideo_dl/extractor/gab.py
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ str_to_int,
+)
+
+
+class GabTVIE(InfoExtractor):
+ _VALID_URL = r'(?:https?://)tv.gab.com/channel/[^/]+/view/(?P<id>[a-z0-9-]+)'
+ _TESTS = [{
+ 'url': 'https://tv.gab.com/channel/wurzelroot/view/why-was-america-in-afghanistan-61217eacea5665de450d0488',
+ 'info_dict': {
+ 'id': '61217eacea5665de450d0488',
+ 'ext': 'mp4',
+ 'title': 'WHY WAS AMERICA IN AFGHANISTAN - AMERICA FIRST AGAINST AMERICAN OLIGARCHY',
+ 'description': None,
+ 'uploader': 'Wurzelroot',
+ 'uploader_id': '608fb0a85738fd1974984f7d',
+ 'thumbnail': 'https://tv.gab.com/image/61217eacea5665de450d0488',
+ }
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url).split('-')[-1]
+ webpage = self._download_webpage(url, id)
+ channel_id = self._search_regex(r'data-channel-id=\"(?P<channel_id>[^\"]+)', webpage, 'channel_id')
+ channel_name = self._search_regex(r'data-channel-name=\"(?P<channel_id>[^\"]+)', webpage, 'channel_name')
+ title = self._search_regex(r'data-episode-title=\"(?P<channel_id>[^\"]+)', webpage, 'title')
+ view_key = self._search_regex(r'data-view-key=\"(?P<channel_id>[^\"]+)', webpage, 'view_key')
+ description = clean_html(self._html_search_regex(self._meta_regex('description'), webpage, 'description', group='content')) or None
+ available_resolutions = re.findall(r'<a\ data-episode-id=\"%s\"\ data-resolution=\"(?P<resolution>[^\"]+)' % id, webpage)
+
+ formats = []
+ for resolution in available_resolutions:
+ frmt = {
+ 'url': f'https://tv.gab.com/media/{id}?viewKey={view_key}&r={resolution}',
+ 'format_id': resolution,
+ 'vcodec': 'h264',
+ 'acodec': 'aac',
+ 'ext': 'mp4'
+ }
+ if 'audio-' in resolution:
+ frmt['abr'] = str_to_int(resolution.replace('audio-', ''))
+ frmt['height'] = 144
+ frmt['quality'] = -10
+ else:
+ frmt['height'] = str_to_int(resolution.replace('p', ''))
+ formats.append(frmt)
+ self._sort_formats(formats)
+
+ return {
+ 'id': id,
+ 'title': title,
+ 'formats': formats,
+ 'description': description,
+ 'uploader': channel_name,
+ 'uploader_id': channel_id,
+ 'thumbnail': f'https://tv.gab.com/image/{id}',
+ }
diff --git a/hypervideo_dl/extractor/gaia.py b/hypervideo_dl/extractor/gaia.py
index e952775..7821fb7 100644
--- a/hypervideo_dl/extractor/gaia.py
+++ b/hypervideo_dl/extractor/gaia.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import (
@@ -76,7 +75,7 @@ class GaiaIE(InfoExtractor):
self._jwt = auth.get('jwt')
def _real_extract(self, url):
- display_id, vtype = re.search(self._VALID_URL, url).groups()
+ display_id, vtype = self._match_valid_url(url).groups()
node_id = self._download_json(
'https://brooklyn.gaia.com/pathinfo', display_id, query={
'path': 'video/' + display_id,
diff --git a/hypervideo_dl/extractor/gamestar.py b/hypervideo_dl/extractor/gamestar.py
index f00dab2..e882fa6 100644
--- a/hypervideo_dl/extractor/gamestar.py
+++ b/hypervideo_dl/extractor/gamestar.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -34,7 +33,7 @@ class GameStarIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
site = mobj.group('site')
video_id = mobj.group('id')
diff --git a/hypervideo_dl/extractor/gaskrank.py b/hypervideo_dl/extractor/gaskrank.py
index 1726a67..03acd2a 100644
--- a/hypervideo_dl/extractor/gaskrank.py
+++ b/hypervideo_dl/extractor/gaskrank.py
@@ -51,7 +51,7 @@ class GaskrankIE(InfoExtractor):
webpage, default=None) or self._html_search_meta(
'title', webpage, fatal=True)
- categories = [re.match(self._VALID_URL, url).group('categories')]
+ categories = [self._match_valid_url(url).group('categories')]
mobj = re.search(
r'Video von:\s*(?P<uploader_id>[^|]*?)\s*\|\s*vom:\s*(?P<upload_date>[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9][0-9][0-9])',
diff --git a/hypervideo_dl/extractor/gazeta.py b/hypervideo_dl/extractor/gazeta.py
index 57c67a4..3671870 100644
--- a/hypervideo_dl/extractor/gazeta.py
+++ b/hypervideo_dl/extractor/gazeta.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
@@ -34,7 +33,7 @@ class GazetaIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
display_id = mobj.group('id')
embed_url = '%s?p=embed' % mobj.group('url')
diff --git a/hypervideo_dl/extractor/gdcvault.py b/hypervideo_dl/extractor/gdcvault.py
index acc6478..c3ad6b4 100644
--- a/hypervideo_dl/extractor/gdcvault.py
+++ b/hypervideo_dl/extractor/gdcvault.py
@@ -149,7 +149,7 @@ class GDCVaultIE(InfoExtractor):
return start_page
def _real_extract(self, url):
- video_id, name = re.match(self._VALID_URL, url).groups()
+ video_id, name = self._match_valid_url(url).groups()
display_id = name or video_id
webpage_url = 'http://www.gdcvault.com/play/' + video_id
diff --git a/hypervideo_dl/extractor/gedidigital.py b/hypervideo_dl/extractor/gedidigital.py
index 6c4153b..ec386c2 100644
--- a/hypervideo_dl/extractor/gedidigital.py
+++ b/hypervideo_dl/extractor/gedidigital.py
@@ -5,18 +5,22 @@ import re
from .common import InfoExtractor
from ..utils import (
+ base_url,
determine_ext,
int_or_none,
+ url_basename,
+ urljoin,
)
class GediDigitalIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://video\.
+ _VALID_URL = r'''(?x)(?P<url>(?:https?:)//video\.
(?:
(?:
(?:espresso\.)?repubblica
|lastampa
|ilsecoloxix
+ |huffingtonpost
)|
(?:
iltirreno
@@ -32,12 +36,12 @@ class GediDigitalIE(InfoExtractor):
|corrierealpi
|lasentinella
)\.gelocal
- )\.it(?:/[^/]+){2,3}?/(?P<id>\d+)(?:[/?&#]|$)'''
+ )\.it(?:/[^/]+){2,4}/(?P<id>\d+))(?:$|[?&].*)'''
_TESTS = [{
'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683',
'md5': '84658d7fb9e55a6e57ecc77b73137494',
'info_dict': {
- 'id': '121559',
+ 'id': '121683',
'ext': 'mp4',
'title': 'Il paradosso delle Regionali: ecco perché la Lega vince ma sembra aver perso',
'description': 'md5:de7f4d6eaaaf36c153b599b10f8ce7ca',
@@ -45,6 +49,9 @@ class GediDigitalIE(InfoExtractor):
'duration': 125,
},
}, {
+ 'url': 'https://video.huffingtonpost.it/embed/politica/cotticelli-non-so-cosa-mi-sia-successo-sto-cercando-di-capire-se-ho-avuto-un-malore/29312/29276?responsive=true&el=video971040871621586700',
+ 'only_matching': True,
+ }, {
'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360',
'only_matching': True,
}, {
@@ -94,9 +101,49 @@ class GediDigitalIE(InfoExtractor):
'only_matching': True,
}]
+ @staticmethod
+ def _sanitize_urls(urls):
+ # add protocol if missing
+ for i, e in enumerate(urls):
+ if e.startswith('//'):
+ urls[i] = 'https:%s' % e
+ # clean iframes urls
+ for i, e in enumerate(urls):
+ urls[i] = urljoin(base_url(e), url_basename(e))
+ return urls
+
+ @staticmethod
+ def _extract_urls(webpage):
+ entries = [
+ mobj.group('eurl')
+ for mobj in re.finditer(r'''(?x)
+ (?:
+ data-frame-src=|
+ <iframe[^\n]+src=
+ )
+ (["'])(?P<eurl>%s)\1''' % GediDigitalIE._VALID_URL, webpage)]
+ return GediDigitalIE._sanitize_urls(entries)
+
+ @staticmethod
+ def _extract_url(webpage):
+ urls = GediDigitalIE._extract_urls(webpage)
+ return urls[0] if urls else None
+
+ @staticmethod
+ def _clean_formats(formats):
+ format_urls = set()
+ clean_formats = []
+ for f in formats:
+ if f['url'] not in format_urls:
+ if f.get('audio_ext') != 'none' and not f.get('acodec'):
+ continue
+ format_urls.add(f['url'])
+ clean_formats.append(f)
+ formats[:] = clean_formats
+
def _real_extract(self, url):
video_id = self._match_id(url)
-
+ url = self._match_valid_url(url).group('url')
webpage = self._download_webpage(url, video_id)
title = self._html_search_meta(
['twitter:title', 'og:title'], webpage, fatal=True)
@@ -129,6 +176,7 @@ class GediDigitalIE(InfoExtractor):
f.update({
'abr': abr,
'tbr': abr,
+ 'acodec': ext,
'vcodec': 'none'
})
else:
@@ -148,6 +196,7 @@ class GediDigitalIE(InfoExtractor):
elif n == 'videoDuration':
duration = int_or_none(v)
+ self._clean_formats(formats)
self._sort_formats(formats)
return {
diff --git a/hypervideo_dl/extractor/generic.py b/hypervideo_dl/extractor/generic.py
index 7da038a..8387646 100644
--- a/hypervideo_dl/extractor/generic.py
+++ b/hypervideo_dl/extractor/generic.py
@@ -84,7 +84,6 @@ from .jwplatform import JWPlatformIE
from .digiteka import DigitekaIE
from .arkena import ArkenaIE
from .instagram import InstagramIE
-from .liveleak import LiveLeakIE
from .threeqsdn import ThreeQSDNIE
from .theplatform import ThePlatformIE
from .kaltura import KalturaIE
@@ -128,9 +127,14 @@ from .zype import ZypeIE
from .odnoklassniki import OdnoklassnikiIE
from .vk import VKIE
from .kinja import KinjaEmbedIE
+from .gedidigital import GediDigitalIE
+from .rcs import RCSEmbedsIE
+from .bitchute import BitChuteIE
+from .rumble import RumbleEmbedIE
from .arcpublishing import ArcPublishingIE
from .medialaan import MedialaanIE
from .simplecast import SimplecastIE
+from .wimtv import WimTVIE
class GenericIE(InfoExtractor):
@@ -216,12 +220,10 @@ class GenericIE(InfoExtractor):
'playlist': [{
'info_dict': {
'ext': 'mov',
- 'id': 'pdv_maddow_netcast_mov-12-04-2020-224335',
- 'title': 're:MSNBC Rachel Maddow',
+ 'id': 'pdv_maddow_netcast_mov-12-03-2020-223726',
+ 'title': 'MSNBC Rachel Maddow (video) - 12-03-2020-223726',
'description': 're:.*her unique approach to storytelling.*',
- 'timestamp': int,
- 'upload_date': compat_str,
- 'duration': float,
+ 'upload_date': '20201204',
},
}],
},
@@ -1213,14 +1215,13 @@ class GenericIE(InfoExtractor):
},
{
# JWPlatform iframe
- 'url': 'https://www.mediaite.com/tv/dem-senator-claims-gary-cohn-faked-a-bad-connection-during-trump-call-to-get-him-off-the-phone/',
- 'md5': 'ca00a040364b5b439230e7ebfd02c4e9',
+ 'url': 'https://www.covermagazine.co.uk/feature/2465255/business-protection-involved',
'info_dict': {
- 'id': 'O0c5JcKT',
+ 'id': 'AG26UQXM',
'ext': 'mp4',
- 'upload_date': '20171122',
- 'timestamp': 1511366290,
- 'title': 'Dem Senator Claims Gary Cohn Faked a Bad Connection During Trump Call to Get Him Off the Phone',
+ 'upload_date': '20160719',
+ 'timestamp': 468923808,
+ 'title': '2016_05_18 Cover L&G Business Protection V1 FINAL.mp4',
},
'add_ie': [JWPlatformIE.ie_key()],
},
@@ -1629,31 +1630,6 @@ class GenericIE(InfoExtractor):
'upload_date': '20160409',
},
},
- # LiveLeak embed
- {
- 'url': 'http://www.wykop.pl/link/3088787/',
- 'md5': '7619da8c820e835bef21a1efa2a0fc71',
- 'info_dict': {
- 'id': '874_1459135191',
- 'ext': 'mp4',
- 'title': 'Man shows poor quality of new apartment building',
- 'description': 'The wall is like a sand pile.',
- 'uploader': 'Lake8737',
- },
- 'add_ie': [LiveLeakIE.ie_key()],
- },
- # Another LiveLeak embed pattern (#13336)
- {
- 'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/',
- 'info_dict': {
- 'id': '2eb_1496309988',
- 'ext': 'mp4',
- 'title': 'Thief robs place where everyone was armed',
- 'description': 'md5:694d73ee79e535953cf2488562288eee',
- 'uploader': 'brazilwtf',
- },
- 'add_ie': [LiveLeakIE.ie_key()],
- },
# Duplicated embedded video URLs
{
'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443',
@@ -2253,6 +2229,95 @@ class GenericIE(InfoExtractor):
# Sibnet embed (https://help.sibnet.ru/?sibnet_video_embed)
'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html',
'only_matching': True,
+ }, {
+ # WimTv embed player
+ 'url': 'http://www.msmotor.tv/wearefmi-pt-2-2021/',
+ 'info_dict': {
+ 'id': 'wearefmi-pt-2-2021',
+ 'title': '#WEAREFMI – PT.2 – 2021 – MsMotorTV',
+ },
+ 'playlist_count': 1,
+ }, {
+ # KVS Player
+ 'url': 'https://www.kvs-demo.com/videos/105/kelis-4th-of-july/',
+ 'info_dict': {
+ 'id': '105',
+ 'display_id': 'kelis-4th-of-july',
+ 'ext': 'mp4',
+ 'title': 'Kelis - 4th Of July',
+ 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # KVS Player
+ 'url': 'https://www.kvs-demo.com/embed/105/',
+ 'info_dict': {
+ 'id': '105',
+ 'display_id': 'kelis-4th-of-july',
+ 'ext': 'mp4',
+ 'title': 'Kelis - 4th Of July / Embed Player',
+ 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # KVS Player
+ 'url': 'https://thisvid.com/videos/french-boy-pantsed/',
+ 'md5': '3397979512c682f6b85b3b04989df224',
+ 'info_dict': {
+ 'id': '2400174',
+ 'display_id': 'french-boy-pantsed',
+ 'ext': 'mp4',
+ 'title': 'French Boy Pantsed - ThisVid.com',
+ 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
+ }
+ }, {
+ # KVS Player
+ 'url': 'https://thisvid.com/embed/2400174/',
+ 'md5': '3397979512c682f6b85b3b04989df224',
+ 'info_dict': {
+ 'id': '2400174',
+ 'display_id': 'french-boy-pantsed',
+ 'ext': 'mp4',
+ 'title': 'French Boy Pantsed - ThisVid.com',
+ 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
+ }
+ }, {
+ # KVS Player
+ 'url': 'https://youix.com/video/leningrad-zoj/',
+ 'md5': '94f96ba95706dc3880812b27b7d8a2b8',
+ 'info_dict': {
+ 'id': '18485',
+ 'display_id': 'leningrad-zoj',
+ 'ext': 'mp4',
+ 'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com',
+ 'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg',
+ }
+ }, {
+ # KVS Player
+ 'url': 'https://youix.com/embed/18485',
+ 'md5': '94f96ba95706dc3880812b27b7d8a2b8',
+ 'info_dict': {
+ 'id': '18485',
+ 'display_id': 'leningrad-zoj',
+ 'ext': 'mp4',
+ 'title': 'Ленинград - ЗОЖ',
+ 'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg',
+ }
+ }, {
+ # KVS Player
+ 'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/',
+ 'md5': '94166bdb26b4cb1fb9214319a629fc51',
+ 'info_dict': {
+ 'id': '21217',
+ 'display_id': '40-nochey-40-nights-2016',
+ 'ext': 'mp4',
+ 'title': '40 ночей (2016) - BogMedia.org',
+ 'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg',
+ }
},
]
@@ -2358,19 +2423,57 @@ class GenericIE(InfoExtractor):
'title': title,
}
+ def _kvs_getrealurl(self, video_url, license_code):
+ if not video_url.startswith('function/0/'):
+ return video_url # not obfuscated
+
+ url_path, _, url_query = video_url.partition('?')
+ urlparts = url_path.split('/')[2:]
+ license = self._kvs_getlicensetoken(license_code)
+ newmagic = urlparts[5][:32]
+
+ for o in range(len(newmagic) - 1, -1, -1):
+ new = ''
+ l = (o + sum([int(n) for n in license[o:]])) % 32
+
+ for i in range(0, len(newmagic)):
+ if i == o:
+ new += newmagic[l]
+ elif i == l:
+ new += newmagic[o]
+ else:
+ new += newmagic[i]
+ newmagic = new
+
+ urlparts[5] = newmagic + urlparts[5][32:]
+ return '/'.join(urlparts) + '?' + url_query
+
+ def _kvs_getlicensetoken(self, license):
+ modlicense = license.replace('$', '').replace('0', '1')
+ center = int(len(modlicense) / 2)
+ fronthalf = int(modlicense[:center + 1])
+ backhalf = int(modlicense[center:])
+
+ modlicense = str(4 * abs(fronthalf - backhalf))
+ retval = ''
+ for o in range(0, center + 1):
+ for i in range(1, 5):
+ retval += str((int(license[o + i]) + int(modlicense[o])) % 10)
+ return retval
+
def _real_extract(self, url):
if url.startswith('//'):
return self.url_result(self.http_scheme() + url)
parsed_url = compat_urlparse.urlparse(url)
if not parsed_url.scheme:
- default_search = self._downloader.params.get('default_search')
+ default_search = self.get_param('default_search')
if default_search is None:
default_search = 'fixup_error'
if default_search in ('auto', 'auto_warning', 'fixup_error'):
if re.match(r'^[^\s/]+\.[^\s/]+/', url):
- self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
+ self.report_warning('The url doesn\'t specify the protocol, trying with http')
return self.url_result('http://' + url)
elif default_search != 'fixup_error':
if default_search == 'auto_warning':
@@ -2379,7 +2482,7 @@ class GenericIE(InfoExtractor):
'Invalid URL: %r . Call hypervideo like this: hypervideo -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
expected=True)
else:
- self._downloader.report_warning(
+ self.report_warning(
'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
return self.url_result('ytsearch:' + url)
@@ -2438,8 +2541,9 @@ class GenericIE(InfoExtractor):
m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
if m:
format_id = compat_str(m.group('format_id'))
+ subtitles = {}
if format_id.endswith('mpegurl'):
- formats = self._extract_m3u8_formats(url, video_id, 'mp4')
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
elif format_id == 'f4m':
formats = self._extract_f4m_formats(url, video_id)
else:
@@ -2451,11 +2555,12 @@ class GenericIE(InfoExtractor):
info_dict['direct'] = True
self._sort_formats(formats)
info_dict['formats'] = formats
+ info_dict['subtitles'] = subtitles
return info_dict
- if not self._downloader.params.get('test', False) and not is_intentional:
- force = self._downloader.params.get('force_generic_extractor', False)
- self._downloader.report_warning(
+ if not self.get_param('test', False) and not is_intentional:
+ force = self.get_param('force_generic_extractor', False)
+ self.report_warning(
'%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
if not full_response:
@@ -2475,14 +2580,14 @@ class GenericIE(InfoExtractor):
# Is it an M3U playlist?
if first_bytes.startswith(b'#EXTM3U'):
- info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4')
+ info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
self._sort_formats(info_dict['formats'])
return info_dict
# Maybe it's a direct link to a video?
# Be careful not to download the whole thing!
if not is_html(first_bytes):
- self._downloader.report_warning(
+ self.report_warning(
'URL could be a direct video link, returning it as such.')
info_dict.update({
'direct': True,
@@ -2500,11 +2605,14 @@ class GenericIE(InfoExtractor):
# Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest?
try:
- doc = compat_etree_fromstring(webpage.encode('utf-8'))
+ try:
+ doc = compat_etree_fromstring(webpage)
+ except compat_xml_parse_error:
+ doc = compat_etree_fromstring(webpage.encode('utf-8'))
if doc.tag == 'rss':
return self._extract_rss(url, video_id, doc)
elif doc.tag == 'SmoothStreamingMedia':
- info_dict['formats'] = self._parse_ism_formats(doc, url)
+ info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url)
self._sort_formats(info_dict['formats'])
return info_dict
elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
@@ -2518,7 +2626,7 @@ class GenericIE(InfoExtractor):
xspf_base_url=full_response.geturl()),
video_id)
elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
- info_dict['formats'] = self._parse_mpd_formats(
+ info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles(
doc,
mpd_base_url=full_response.geturl().rpartition('/')[0],
mpd_url=url)
@@ -2647,11 +2755,14 @@ class GenericIE(InfoExtractor):
if vhx_url:
return self.url_result(vhx_url, VHXEmbedIE.ie_key())
- vid_me_embed_url = self._search_regex(
- r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
- webpage, 'vid.me embed', default=None)
- if vid_me_embed_url is not None:
- return self.url_result(vid_me_embed_url, 'Vidme')
+ # Invidious Instances
+ # https://github.com/hypervideo/hypervideo/issues/195
+ # https://github.com/iv-org/invidious/pull/1730
+ youtube_url = self._search_regex(
+ r'<link rel="alternate" href="(https://www\.youtube\.com/watch\?v=[0-9A-Za-z_-]{11})"',
+ webpage, 'youtube link', default=None)
+ if youtube_url:
+ return self.url_result(youtube_url, YoutubeIE.ie_key())
# Look for YouTube embeds
youtube_urls = YoutubeIE._extract_urls(webpage)
@@ -3179,11 +3290,6 @@ class GenericIE(InfoExtractor):
return self.url_result(
self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key())
- # Look for LiveLeak embeds
- liveleak_urls = LiveLeakIE._extract_urls(webpage)
- if liveleak_urls:
- return self.playlist_from_matches(liveleak_urls, video_id, video_title)
-
# Look for 3Q SDN embeds
threeqsdn_url = ThreeQSDNIE._extract_url(webpage)
if threeqsdn_url:
@@ -3348,6 +3454,34 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
zype_urls, video_id, video_title, ie=ZypeIE.ie_key())
+ gedi_urls = GediDigitalIE._extract_urls(webpage)
+ if gedi_urls:
+ return self.playlist_from_matches(
+ gedi_urls, video_id, video_title, ie=GediDigitalIE.ie_key())
+
+ # Look for RCS media group embeds
+ rcs_urls = RCSEmbedsIE._extract_urls(webpage)
+ if rcs_urls:
+ return self.playlist_from_matches(
+ rcs_urls, video_id, video_title, ie=RCSEmbedsIE.ie_key())
+
+ wimtv_urls = WimTVIE._extract_urls(webpage)
+ if wimtv_urls:
+ return self.playlist_from_matches(
+ wimtv_urls, video_id, video_title, ie=WimTVIE.ie_key())
+
+ bitchute_urls = BitChuteIE._extract_urls(webpage)
+ if bitchute_urls:
+ return self.playlist_from_matches(
+ bitchute_urls, video_id, video_title, ie=BitChuteIE.ie_key())
+
+ rumble_urls = RumbleEmbedIE._extract_urls(webpage)
+ if len(rumble_urls) == 1:
+ return self.url_result(rumble_urls[0], RumbleEmbedIE.ie_key())
+ if rumble_urls:
+ return self.playlist_from_matches(
+ rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key())
+
# Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries:
@@ -3388,6 +3522,7 @@ class GenericIE(InfoExtractor):
if not isinstance(sources, list):
sources = [sources]
formats = []
+ subtitles = {}
for source in sources:
src = source.get('src')
if not src or not isinstance(src, compat_str):
@@ -3400,12 +3535,16 @@ class GenericIE(InfoExtractor):
if src_type == 'video/youtube':
return self.url_result(src, YoutubeIE.ie_key())
if src_type == 'application/dash+xml' or ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- src, video_id, mpd_id='dash', fatal=False))
+ fmts, subs = self._extract_mpd_formats_and_subtitles(
+ src, video_id, mpd_id='dash', fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
elif src_type == 'application/x-mpegurl' or ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
src, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls', fatal=False))
+ m3u8_id='hls', fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
else:
formats.append({
'url': src,
@@ -3415,9 +3554,10 @@ class GenericIE(InfoExtractor):
'Referer': full_response.geturl(),
},
})
- if formats:
+ if formats or subtitles:
self._sort_formats(formats)
info_dict['formats'] = formats
+ info_dict['subtitles'] = subtitles
return info_dict
# Looking for http://schema.org/VideoObject
@@ -3451,6 +3591,52 @@ class GenericIE(InfoExtractor):
.*?
['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
if not found:
+ # Look for generic KVS player
+ found = re.search(r'<script [^>]*?src="https://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>', webpage)
+ if found:
+ if found.group('maj_ver') not in ['4', '5']:
+ self.report_warning('Untested major version (%s) in player engine--Download may fail.' % found.group('ver'))
+ flashvars = re.search(r'(?ms)<script.*?>.*?var\s+flashvars\s*=\s*(\{.*?\});.*?</script>', webpage)
+ flashvars = self._parse_json(flashvars.group(1), video_id, transform_source=js_to_json)
+
+ # extract the part after the last / as the display_id from the
+ # canonical URL.
+ display_id = self._search_regex(
+ r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>'
+ r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
+ webpage, 'display_id', fatal=False
+ )
+ title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')
+
+ thumbnail = flashvars['preview_url']
+ if thumbnail.startswith('//'):
+ protocol, _, _ = url.partition('/')
+ thumbnail = protocol + thumbnail
+
+ formats = []
+ for key in ('video_url', 'video_alt_url', 'video_alt_url2'):
+ if key in flashvars and '/get_file/' in flashvars[key]:
+ next_format = {
+ 'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']),
+ 'format_id': flashvars.get(key + '_text', key),
+ 'ext': 'mp4',
+ }
+ height = re.search(r'%s_(\d+)p\.mp4(?:/[?].*)?$' % flashvars['video_id'], flashvars[key])
+ if height:
+ next_format['height'] = int(height.group(1))
+ else:
+ next_format['quality'] = 1
+ formats.append(next_format)
+ self._sort_formats(formats)
+
+ return {
+ 'id': flashvars['video_id'],
+ 'display_id': display_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
+ if not found:
# Broaden the search a little bit
found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
if not found:
@@ -3552,13 +3738,13 @@ class GenericIE(InfoExtractor):
ext = determine_ext(video_url)
if ext == 'smil':
- entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id)
+ entry_info_dict = {**self._extract_smil_info(video_url, video_id), **entry_info_dict}
elif ext == 'xspf':
return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
elif ext == 'm3u8':
- entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
+ entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4')
elif ext == 'mpd':
- entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id)
+ entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id)
elif ext == 'f4m':
entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id)
elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url:
diff --git a/hypervideo_dl/extractor/gettr.py b/hypervideo_dl/extractor/gettr.py
new file mode 100644
index 0000000..aa50b2f
--- /dev/null
+++ b/hypervideo_dl/extractor/gettr.py
@@ -0,0 +1,110 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ dict_get,
+ float_or_none,
+ int_or_none,
+ remove_end,
+ str_or_none,
+ try_get,
+ url_or_none,
+ urljoin,
+)
+
+
+class GettrIE(InfoExtractor):
+ # Extractor for single video posts on gettr.com (/post/<id> URLs).
+ _VALID_URL = r'https?://(www\.)?gettr\.com/post/(?P<id>[a-z0-9]+)'
+ # Media paths returned by the API (HLS playlist, original mp4, thumbnail)
+ # are relative to this CDN base and joined with urljoin() below.
+ _MEDIA_BASE_URL = 'https://media.gettr.com/'
+
+ _TESTS = [{
+ 'url': 'https://www.gettr.com/post/pcf6uv838f',
+ 'info_dict': {
+ 'id': 'pcf6uv838f',
+ 'title': 'md5:9086a646bbd06c41c4fe8e52b3c93454',
+ 'description': 'md5:be0577f1e4caadc06de4a002da2bf287',
+ 'ext': 'mp4',
+ 'uploader': 'EpochTV',
+ 'uploader_id': 'epochtv',
+ 'thumbnail': r're:^https?://.+/out\.jpg',
+ 'timestamp': 1632782451058,
+ 'duration': 58.5585,
+ }
+ }, {
+ 'url': 'https://gettr.com/post/p4iahp',
+ 'info_dict': {
+ 'id': 'p4iahp',
+ 'title': 'md5:b03c07883db6fbc1aab88877a6c3b149',
+ 'description': 'md5:741b7419d991c403196ed2ea7749a39d',
+ 'ext': 'mp4',
+ 'uploader': 'Neues Forum Freiheit',
+ 'uploader_id': 'nf_freiheit',
+ 'thumbnail': r're:^https?://.+/out\.jpg',
+ 'timestamp': 1626594455017,
+ 'duration': 23,
+ }
+ }]
+
+ def _real_extract(self, url):
+ # Fetch post metadata from the GETTR JSON API; the webpage is only
+ # used for og:* fallbacks (description, title, thumbnail).
+ post_id = self._match_id(url)
+ webpage = self._download_webpage(url, post_id)
+
+ # incl="poststats|userinfo" makes the API also return uploader info
+ # under result.aux.uinf, keyed by the post's uid.
+ api_data = self._download_json(
+ 'https://api.gettr.com/u/post/%s?incl="poststats|userinfo"' % post_id, post_id)
+
+ post_data = try_get(api_data, lambda x: x['result']['data'])
+ # NOTE(review): post_data['uid'] is dereferenced here before the
+ # 'nfound' (post-not-found) check below runs -- confirm the API
+ # always returns a dict with 'uid' even for missing posts.
+ user_data = try_get(api_data, lambda x: x['result']['aux']['uinf'][post_data['uid']]) or {}
+
+ if post_data.get('nfound'):
+ # For missing posts, 'txt' carries the API's error message.
+ raise ExtractorError(post_data.get('txt'), expected=True)
+
+ # The post text ('txt') doubles as both title and description.
+ title = description = str_or_none(
+ post_data.get('txt') or self._og_search_description(webpage))
+
+ uploader = str_or_none(
+ user_data.get('nickname')
+ or remove_end(self._og_search_title(webpage), ' on GETTR'))
+ if uploader:
+ title = '%s - %s' % (uploader, title)
+
+ # 'vid' is an HLS playlist path, 'ovid' the original upload;
+ # a post with neither contains no video.
+ if not dict_get(post_data, ['vid', 'ovid']):
+ raise ExtractorError('There\'s no video in this post.')
+
+ vid = post_data.get('vid')
+ ovid = post_data.get('ovid')
+
+ formats = self._extract_m3u8_formats(
+ urljoin(self._MEDIA_BASE_URL, vid), post_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls') if vid else []
+
+ if ovid:
+ # Rank the original mp4 above the HLS renditions
+ # (source_preference/quality = 1).
+ formats.append({
+ 'url': urljoin(self._MEDIA_BASE_URL, ovid),
+ 'format_id': 'ovid',
+ 'ext': 'mp4',
+ 'width': int_or_none(post_data.get('vid_wid')),
+ 'height': int_or_none(post_data.get('vid_hgt')),
+ 'source_preference': 1,
+ 'quality': 1,
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': post_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': url_or_none(
+ urljoin(self._MEDIA_BASE_URL, post_data.get('main'))
+ or self._og_search_thumbnail(webpage)),
+ # NOTE(review): 'cdate' in the test data (1632782451058) looks like
+ # epoch *milliseconds*, not seconds -- confirm against the API.
+ 'timestamp': int_or_none(post_data.get('cdate')),
+ 'uploader_id': str_or_none(
+ dict_get(user_data, ['_id', 'username'])
+ or post_data.get('uid')),
+ 'uploader': uploader,
+ 'formats': formats,
+ 'duration': float_or_none(post_data.get('vid_dur')),
+ 'tags': post_data.get('htgs'),
+ }
diff --git a/hypervideo_dl/extractor/giantbomb.py b/hypervideo_dl/extractor/giantbomb.py
index c647795..1920923 100644
--- a/hypervideo_dl/extractor/giantbomb.py
+++ b/hypervideo_dl/extractor/giantbomb.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
import json
from .common import InfoExtractor
@@ -32,7 +31,7 @@ class GiantBombIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id')
diff --git a/hypervideo_dl/extractor/globo.py b/hypervideo_dl/extractor/globo.py
index 60d842d..a3f0241 100644
--- a/hypervideo_dl/extractor/globo.py
+++ b/hypervideo_dl/extractor/globo.py
@@ -9,15 +9,14 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_HTTPError,
compat_str,
)
from ..utils import (
ExtractorError,
float_or_none,
- int_or_none,
orderedSet,
str_or_none,
+ try_get,
)
@@ -26,18 +25,19 @@ class GloboIE(InfoExtractor):
_NETRC_MACHINE = 'globo'
_TESTS = [{
'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/',
- 'md5': 'b3ccc801f75cd04a914d51dadb83a78d',
'info_dict': {
'id': '3607726',
'ext': 'mp4',
'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa',
'duration': 103.204,
- 'uploader': 'Globo.com',
- 'uploader_id': '265',
+ 'uploader': 'G1',
+ 'uploader_id': '2015',
+ },
+ 'params': {
+ 'skip_download': True,
},
}, {
'url': 'http://globoplay.globo.com/v/4581987/',
- 'md5': 'f36a1ecd6a50da1577eee6dd17f67eff',
'info_dict': {
'id': '4581987',
'ext': 'mp4',
@@ -46,6 +46,9 @@ class GloboIE(InfoExtractor):
'uploader': 'Rede Globo',
'uploader_id': '196',
},
+ 'params': {
+ 'skip_download': True,
+ },
}, {
'url': 'http://canalbrasil.globo.com/programas/sangue-latino/videos/3928201.html',
'only_matching': True,
@@ -66,109 +69,79 @@ class GloboIE(InfoExtractor):
'only_matching': True,
}]
- def _real_initialize(self):
- email, password = self._get_login_info()
- if email is None:
- return
-
- try:
- glb_id = (self._download_json(
- 'https://login.globo.com/api/authentication', None, data=json.dumps({
- 'payload': {
- 'email': email,
- 'password': password,
- 'serviceId': 4654,
- },
- }).encode(), headers={
- 'Content-Type': 'application/json; charset=utf-8',
- }) or {}).get('glbId')
- if glb_id:
- self._set_cookie('.globo.com', 'GLBID', glb_id)
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
- resp = self._parse_json(e.cause.read(), None)
- raise ExtractorError(resp.get('userMessage') or resp['id'], expected=True)
- raise
-
def _real_extract(self, url):
video_id = self._match_id(url)
video = self._download_json(
'http://api.globovideos.com/videos/%s/playlist' % video_id,
video_id)['videos'][0]
- if video.get('encrypted') is True:
- raise ExtractorError('This video is DRM protected.', expected=True)
+ if not self.get_param('allow_unplayable_formats') and video.get('encrypted') is True:
+ self.report_drm(video_id)
title = video['title']
formats = []
+ security = self._download_json(
+ 'https://playback.video.globo.com/v1/video-session', video_id, 'Downloading security hash for %s' % video_id,
+ headers={'content-type': 'application/json'}, data=json.dumps({
+ "player_type": "desktop",
+ "video_id": video_id,
+ "quality": "max",
+ "content_protection": "widevine",
+ "vsid": "581b986b-4c40-71f0-5a58-803e579d5fa2",
+ "tz": "-3.0:00"
+ }).encode())
+
+ security_hash = security['source']['token']
+ if not security_hash:
+ message = security.get('message')
+ if message:
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, message), expected=True)
+
+ hash_code = security_hash[:2]
+ padding = '%010d' % random.randint(1, 10000000000)
+ if hash_code in ('04', '14'):
+ received_time = security_hash[3:13]
+ received_md5 = security_hash[24:]
+ hash_prefix = security_hash[:23]
+ elif hash_code in ('02', '12', '03', '13'):
+ received_time = security_hash[2:12]
+ received_md5 = security_hash[22:]
+ padding += '1'
+ hash_prefix = '05' + security_hash[:22]
+
+ padded_sign_time = compat_str(int(received_time) + 86400) + padding
+ md5_data = (received_md5 + padded_sign_time + '0xAC10FD').encode()
+ signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=')
+ signed_hash = hash_prefix + padded_sign_time + signed_md5
+ source = security['source']['url_parts']
+ resource_url = source['scheme'] + '://' + source['domain'] + source['path']
+ signed_url = '%s?h=%s&k=html5&a=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A')
+
+ formats.extend(self._extract_m3u8_formats(
+ signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+ self._sort_formats(formats)
+
subtitles = {}
for resource in video['resources']:
- resource_id = resource.get('_id')
- resource_url = resource.get('url')
- resource_type = resource.get('type')
- if not resource_url or (resource_type == 'media' and not resource_id) or resource_type not in ('subtitle', 'media'):
- continue
-
- if resource_type == 'subtitle':
+ if resource.get('type') == 'subtitle':
subtitles.setdefault(resource.get('language') or 'por', []).append({
- 'url': resource_url,
+ 'url': resource.get('url'),
})
- continue
-
- security = self._download_json(
- 'http://security.video.globo.com/videos/%s/hash' % video_id,
- video_id, 'Downloading security hash for %s' % resource_id, query={
- 'player': 'desktop',
- 'version': '5.19.1',
- 'resource_id': resource_id,
+ subs = try_get(security, lambda x: x['source']['subtitles'], expected_type=dict) or {}
+ for sub_lang, sub_url in subs.items():
+ if sub_url:
+ subtitles.setdefault(sub_lang or 'por', []).append({
+ 'url': sub_url,
})
-
- security_hash = security.get('hash')
- if not security_hash:
- message = security.get('message')
- if message:
- raise ExtractorError(
- '%s returned error: %s' % (self.IE_NAME, message), expected=True)
- continue
-
- hash_code = security_hash[:2]
- padding = '%010d' % random.randint(1, 10000000000)
- if hash_code in ('04', '14'):
- received_time = security_hash[3:13]
- received_md5 = security_hash[24:]
- hash_prefix = security_hash[:23]
- elif hash_code in ('02', '12', '03', '13'):
- received_time = security_hash[2:12]
- received_md5 = security_hash[22:]
- padding += '1'
- hash_prefix = '05' + security_hash[:22]
-
- padded_sign_time = compat_str(int(received_time) + 86400) + padding
- md5_data = (received_md5 + padded_sign_time + '0xAC10FD').encode()
- signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=')
- signed_hash = hash_prefix + padded_sign_time + signed_md5
- signed_url = '%s?h=%s&k=html5&a=%s&u=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A', security.get('user') or '')
-
- if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'):
- formats.extend(self._extract_m3u8_formats(
- signed_url, resource_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls', fatal=False))
- elif resource_id.endswith('mpd') or resource_url.endswith('.mpd'):
- formats.extend(self._extract_mpd_formats(
- signed_url, resource_id, mpd_id='dash', fatal=False))
- elif resource_id.endswith('manifest') or resource_url.endswith('/manifest'):
- formats.extend(self._extract_ism_formats(
- signed_url, resource_id, ism_id='mss', fatal=False))
- else:
- formats.append({
- 'url': signed_url,
- 'format_id': 'http-%s' % resource_id,
- 'height': int_or_none(resource.get('height')),
+ subs = try_get(security, lambda x: x['source']['subtitles_webvtt'], expected_type=dict) or {}
+ for sub_lang, sub_url in subs.items():
+ if sub_url:
+ subtitles.setdefault(sub_lang or 'por', []).append({
+ 'url': sub_url,
})
- self._sort_formats(formats)
-
duration = float_or_none(video.get('duration'), 1000)
uploader = video.get('channel')
uploader_id = str_or_none(video.get('channel_id'))
diff --git a/hypervideo_dl/extractor/go.py b/hypervideo_dl/extractor/go.py
index 878ba14..2ccc6df 100644
--- a/hypervideo_dl/extractor/go.py
+++ b/hypervideo_dl/extractor/go.py
@@ -9,6 +9,8 @@ from ..utils import (
int_or_none,
determine_ext,
parse_age_limit,
+ remove_start,
+ remove_end,
try_get,
urlencode_postdata,
ExtractorError,
@@ -48,15 +50,15 @@ class GoIE(AdobePassIE):
}
_VALID_URL = r'''(?x)
https?://
- (?:
- (?:(?P<sub_domain>%s)\.)?go|
- (?P<sub_domain_2>abc|freeform|disneynow|fxnow\.fxnetworks)
+ (?P<sub_domain>
+ (?:%s\.)?go|fxnow\.fxnetworks|
+ (?:www\.)?(?:abc|freeform|disneynow)
)\.com/
(?:
(?:[^/]+/)*(?P<id>[Vv][Dd][Kk][Aa]\w+)|
(?:[^/]+/)*(?P<display_id>[^/?\#]+)
)
- ''' % '|'.join(list(_SITE_INFO.keys()))
+ ''' % r'\.|'.join(list(_SITE_INFO.keys()))
_TESTS = [{
'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643',
'info_dict': {
@@ -147,6 +149,9 @@ class GoIE(AdobePassIE):
}, {
'url': 'https://disneynow.com/shows/minnies-bow-toons/video/happy-campers/vdka4872013',
'only_matching': True,
+ }, {
+ 'url': 'https://www.freeform.com/shows/cruel-summer/episode-guide/season-01/01-happy-birthday-jeanette-turner',
+ 'only_matching': True,
}]
def _extract_videos(self, brand, video_id='-1', show_id='-1'):
@@ -156,8 +161,8 @@ class GoIE(AdobePassIE):
display_id)['video']
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- sub_domain = mobj.group('sub_domain') or mobj.group('sub_domain_2')
+ mobj = self._match_valid_url(url)
+ sub_domain = remove_start(remove_end(mobj.group('sub_domain') or '', '.go'), 'www.')
video_id, display_id = mobj.group('id', 'display_id')
site_info = self._SITE_INFO.get(sub_domain, {})
brand = site_info.get('brand')
@@ -262,7 +267,7 @@ class GoIE(AdobePassIE):
if re.search(r'(?:/mp4/source/|_source\.mp4)', asset_url):
f.update({
'format_id': ('%s-' % format_id if format_id else '') + 'SOURCE',
- 'preference': 1,
+ 'quality': 1,
})
else:
mobj = re.search(r'/(\d+)x(\d+)/', asset_url)
diff --git a/hypervideo_dl/extractor/godtube.py b/hypervideo_dl/extractor/godtube.py
index 92efd16..96e68b4 100644
--- a/hypervideo_dl/extractor/godtube.py
+++ b/hypervideo_dl/extractor/godtube.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -29,7 +28,7 @@ class GodTubeIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
config = self._download_xml(
diff --git a/hypervideo_dl/extractor/googledrive.py b/hypervideo_dl/extractor/googledrive.py
index 3f2de00..7b5bf28 100644
--- a/hypervideo_dl/extractor/googledrive.py
+++ b/hypervideo_dl/extractor/googledrive.py
@@ -253,7 +253,7 @@ class GoogleDriveIE(InfoExtractor):
or 'unable to extract confirmation code')
if not formats and reason:
- raise ExtractorError(reason, expected=True)
+ self.raise_no_formats(reason, expected=True)
self._sort_formats(formats)
@@ -266,6 +266,8 @@ class GoogleDriveIE(InfoExtractor):
subtitles_id = ttsurl.encode('utf-8').decode(
'unicode_escape').split('=')[-1]
+ self._downloader.cookiejar.clear(domain='.google.com', path='/', name='NID')
+
return {
'id': video_id,
'title': title,
diff --git a/hypervideo_dl/extractor/googlepodcasts.py b/hypervideo_dl/extractor/googlepodcasts.py
index 31ad799..25631e2 100644
--- a/hypervideo_dl/extractor/googlepodcasts.py
+++ b/hypervideo_dl/extractor/googlepodcasts.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..utils import (
@@ -56,7 +55,7 @@ class GooglePodcastsIE(GooglePodcastsBaseIE):
}
def _real_extract(self, url):
- b64_feed_url, b64_guid = re.match(self._VALID_URL, url).groups()
+ b64_feed_url, b64_guid = self._match_valid_url(url).groups()
episode = self._batch_execute(
'oNjqVe', b64_guid, [b64_feed_url, b64_guid])[1]
return self._extract_episode(episode)
diff --git a/hypervideo_dl/extractor/googlesearch.py b/hypervideo_dl/extractor/googlesearch.py
index 5279fa8..f605c0c 100644
--- a/hypervideo_dl/extractor/googlesearch.py
+++ b/hypervideo_dl/extractor/googlesearch.py
@@ -11,6 +11,7 @@ class GoogleSearchIE(SearchInfoExtractor):
_MAX_RESULTS = 1000
IE_NAME = 'video.google:search'
_SEARCH_KEY = 'gvsearch'
+ _WORKING = False
_TEST = {
'url': 'gvsearch15:python language',
'info_dict': {
@@ -20,16 +21,7 @@ class GoogleSearchIE(SearchInfoExtractor):
'playlist_count': 15,
}
- def _get_n_results(self, query, n):
- """Get a specified number of results for a query"""
-
- entries = []
- res = {
- '_type': 'playlist',
- 'id': query,
- 'title': query,
- }
-
+ def _search_results(self, query):
for pagenum in itertools.count():
webpage = self._download_webpage(
'http://www.google.com/search',
@@ -44,16 +36,8 @@ class GoogleSearchIE(SearchInfoExtractor):
for hit_idx, mobj in enumerate(re.finditer(
r'<h3 class="r"><a href="([^"]+)"', webpage)):
+ if re.search(f'id="vidthumb{hit_idx + 1}"', webpage):
+ yield self.url_result(mobj.group(1))
- # Skip playlists
- if not re.search(r'id="vidthumb%d"' % (hit_idx + 1), webpage):
- continue
-
- entries.append({
- '_type': 'url',
- 'url': mobj.group(1)
- })
-
- if (len(entries) >= n) or not re.search(r'id="pnnext"', webpage):
- res['entries'] = entries[:n]
- return res
+ if not re.search(r'id="pnnext"', webpage):
+ return
diff --git a/hypervideo_dl/extractor/gopro.py b/hypervideo_dl/extractor/gopro.py
new file mode 100644
index 0000000..10cc1ae
--- /dev/null
+++ b/hypervideo_dl/extractor/gopro.py
@@ -0,0 +1,110 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ remove_end,
+ str_or_none,
+ try_get,
+ unified_timestamp,
+ url_or_none,
+)
+
+
+class GoProIE(InfoExtractor):
+ # Extractor for shared GoPro clips at gopro.com/v/<id>.
+ _VALID_URL = r'https?://(www\.)?gopro\.com/v/(?P<id>[A-Za-z0-9]+)'
+
+ _TESTS = [{
+ 'url': 'https://gopro.com/v/ZNVvED8QDzR5V',
+ 'info_dict': {
+ 'id': 'ZNVvED8QDzR5V',
+ 'title': 'My GoPro Adventure - 9/19/21',
+ 'thumbnail': r're:https?://.+',
+ 'ext': 'mp4',
+ 'timestamp': 1632072947,
+ 'upload_date': '20210919',
+ 'uploader_id': 'fireydive30018',
+ 'duration': 396062,
+ }
+ }, {
+ 'url': 'https://gopro.com/v/KRm6Vgp2peg4e',
+ 'info_dict': {
+ 'id': 'KRm6Vgp2peg4e',
+ 'title': 'じゃがいも カリカリ オーブン焼き',
+ 'thumbnail': r're:https?://.+',
+ 'ext': 'mp4',
+ 'timestamp': 1607231125,
+ 'upload_date': '20201206',
+ 'uploader_id': 'dc9bcb8b-47d2-47c6-afbc-4c48f9a3769e',
+ 'duration': 45187,
+ 'track': 'The Sky Machine',
+ }
+ }, {
+ 'url': 'https://gopro.com/v/kVrK9wlJvBMwn',
+ 'info_dict': {
+ 'id': 'kVrK9wlJvBMwn',
+ 'title': 'DARKNESS',
+ 'thumbnail': r're:https?://.+',
+ 'ext': 'mp4',
+ 'timestamp': 1594183735,
+ 'upload_date': '20200708',
+ 'uploader_id': '闇夜乃皇帝',
+ 'duration': 313075,
+ 'track': 'Battery (Live)',
+ 'artist': 'Metallica',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ # The page embeds its state as a JS assignment:
+ # window.__reflectData = {...}; grab everything up to the first ';'.
+ metadata = self._parse_json(
+ self._html_search_regex(r'window\.__reflectData\s*=\s*([^;]+)', webpage, 'metadata'), video_id)
+
+ # Only the first media item of the collection is extracted here.
+ video_info = metadata['collectionMedia'][0]
+ media_data = self._download_json(
+ 'https://api.gopro.com/media/%s/download' % video_info['id'], video_id)
+
+ # Each "variation" is one downloadable rendition of the clip.
+ formats = []
+ for fmt in try_get(media_data, lambda x: x['_embedded']['variations']) or []:
+ format_url = url_or_none(fmt.get('url'))
+ if not format_url:
+ continue
+ formats.append({
+ 'url': format_url,
+ 'format_id': str_or_none(fmt.get('quality')),
+ 'format_note': str_or_none(fmt.get('label')),
+ 'ext': str_or_none(fmt.get('type')),
+ 'width': int_or_none(fmt.get('width')),
+ 'height': int_or_none(fmt.get('height')),
+ })
+
+ self._sort_formats(formats)
+
+ # Title fallback chain: reflect data -> og/twitter meta -> <title>
+ # with the trailing ' | GoPro' suffix stripped.
+ title = str_or_none(
+ try_get(metadata, lambda x: x['collection']['title'])
+ or self._html_search_meta(['og:title', 'twitter:title'], webpage)
+ or remove_end(self._html_search_regex(
+ r'<title[^>]*>([^<]+)</title>', webpage, 'title', fatal=False), ' | GoPro'))
+ if title:
+ # Collapse embedded newlines so the title is a single line.
+ title = title.replace('\n', ' ')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': url_or_none(
+ self._html_search_meta(['og:image', 'twitter:image'], webpage)),
+ 'timestamp': unified_timestamp(
+ try_get(metadata, lambda x: x['collection']['created_at'])),
+ 'uploader_id': str_or_none(
+ try_get(metadata, lambda x: x['account']['nickname'])),
+ # NOTE(review): test data has duration values like 396062, which
+ # look like milliseconds -- confirm the intended unit.
+ 'duration': int_or_none(
+ video_info.get('source_duration')),
+ 'artist': str_or_none(
+ video_info.get('music_track_artist')),
+ 'track': str_or_none(
+ video_info.get('music_track_name')),
+ }
diff --git a/hypervideo_dl/extractor/gotostage.py b/hypervideo_dl/extractor/gotostage.py
new file mode 100644
index 0000000..6aa9610
--- /dev/null
+++ b/hypervideo_dl/extractor/gotostage.py
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ try_get,
+ url_or_none
+)
+
+import json
+
+
+class GoToStageIE(InfoExtractor):
+ # Extractor for GoToStage webinar recordings. Access requires a
+ # (throwaway) registration, performed automatically below.
+ _VALID_URL = r'https?://(?:www\.)?gotostage\.com/channel/[a-z0-9]+/recording/(?P<id>[a-z0-9]+)/watch'
+ _TESTS = [{
+ 'url': 'https://www.gotostage.com/channel/8901680603948959494/recording/60bb55548d434f21b9ce4f0e225c4895/watch',
+ 'md5': 'ca72ce990cdcd7a2bd152f7217e319a2',
+ 'info_dict': {
+ 'id': '60bb55548d434f21b9ce4f0e225c4895',
+ 'ext': 'mp4',
+ 'title': 'What is GoToStage?',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 93.924711
+ }
+ }, {
+ 'url': 'https://www.gotostage.com/channel/bacc3d3535b34bafacc3f4ef8d4df78a/recording/831e74cd3e0042be96defba627b6f676/watch?source=HOMEPAGE',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ # Recording metadata: product/content type keys are needed for the
+ # registration payload below.
+ metadata = self._download_json(
+ 'https://api.gotostage.com/contents?ids=%s' % video_id,
+ video_id,
+ note='Downloading video metadata',
+ errnote='Unable to download video metadata')[0]
+
+ # Register a dummy viewer; the placeholder name/email are never
+ # surfaced anywhere, they only satisfy the registration API.
+ registration_data = {
+ 'product': metadata['product'],
+ 'resourceType': metadata['contentType'],
+ 'productReferenceKey': metadata['productRefKey'],
+ 'firstName': 'foo',
+ 'lastName': 'bar',
+ 'email': 'foobar@example.com'
+ }
+
+ # expected_status=409: an "already registered" response still carries
+ # a usable registrationKey, so it is accepted rather than fatal.
+ registration_response = self._download_json(
+ 'https://api-registrations.logmeininc.com/registrations',
+ video_id,
+ data=json.dumps(registration_data).encode(),
+ expected_status=409,
+ headers={'Content-Type': 'application/json'},
+ note='Register user',
+ errnote='Unable to register user')
+
+ # Exchange the registration key for the actual media asset (a direct
+ # CDN URL in 'cdnLocation').
+ content_response = self._download_json(
+ 'https://api.gotostage.com/contents/%s/asset' % video_id,
+ video_id,
+ headers={'x-registrantkey': registration_response['registrationKey']},
+ note='Get download url',
+ errnote='Unable to get download url')
+
+ return {
+ 'id': video_id,
+ 'title': try_get(metadata, lambda x: x['title'], compat_str),
+ 'url': try_get(content_response, lambda x: x['cdnLocation'], compat_str),
+ 'ext': 'mp4',
+ 'thumbnail': url_or_none(try_get(metadata, lambda x: x['thumbnail']['location'])),
+ 'duration': try_get(metadata, lambda x: x['duration'], float),
+ 'categories': [try_get(metadata, lambda x: x['category'], compat_str)],
+ 'is_live': False
+ }
diff --git a/hypervideo_dl/extractor/gronkh.py b/hypervideo_dl/extractor/gronkh.py
new file mode 100644
index 0000000..a7792a5
--- /dev/null
+++ b/hypervideo_dl/extractor/gronkh.py
@@ -0,0 +1,43 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+
+class GronkhIE(InfoExtractor):
+ _VALID_URL = r'(?:https?://)(?:www\.)?gronkh\.tv/stream/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://gronkh.tv/stream/536',
+ 'info_dict': {
+ 'id': '536',
+ 'ext': 'mp4',
+ 'title': 'GTV0536, 2021-10-01 - MARTHA IS DEAD #FREiAB1830 !FF7 !horde !archiv',
+ 'view_count': 19491,
+ 'thumbnail': 'https://01.cdn.vod.farm/preview/6436746cce14e25f751260a692872b9b.jpg',
+ 'upload_date': '20211001'
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ data_json = self._download_json(f'https://api.gronkh.tv/v1/video/info?episode={id}', id)
+ m3u8_url = self._download_json(f'https://api.gronkh.tv/v1/video/playlist?episode={id}', id)['playlist_url']
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, id)
+ if data_json.get('vtt_url'):
+ subtitles.setdefault('en', []).append({
+ 'url': data_json['vtt_url'],
+ 'ext': 'vtt',
+ })
+ self._sort_formats(formats)
+ return {
+ 'id': id,
+ 'title': data_json.get('title'),
+ 'view_count': data_json.get('views'),
+ 'thumbnail': data_json.get('preview_url'),
+ 'upload_date': unified_strdate(data_json.get('created_at')),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
diff --git a/hypervideo_dl/extractor/hearthisat.py b/hypervideo_dl/extractor/hearthisat.py
index 18c2520..a3d6a05 100644
--- a/hypervideo_dl/extractor/hearthisat.py
+++ b/hypervideo_dl/extractor/hearthisat.py
@@ -1,17 +1,12 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
-from ..compat import compat_urlparse
from ..utils import (
- HEADRequest,
+ determine_ext,
KNOWN_EXTENSIONS,
- sanitized_Request,
str_to_int,
- urlencode_postdata,
- urlhandle_detect_ext,
)
@@ -27,13 +22,11 @@ class HearThisAtIE(InfoExtractor):
'title': 'Moofi - Dr. Kreep',
'thumbnail': r're:^https?://.*\.jpg$',
'timestamp': 1421564134,
- 'description': 'Listen to Dr. Kreep by Moofi on hearthis.at - Modular, Eurorack, Mutable Intruments Braids, Valhalla-DSP',
+ 'description': 'md5:1adb0667b01499f9d27e97ddfd53852a',
'upload_date': '20150118',
- 'comment_count': int,
'view_count': int,
- 'like_count': int,
'duration': 71,
- 'categories': ['Experimental'],
+ 'genre': 'Experimental',
}
}, {
# 'download' link redirects to the original webpage
@@ -43,79 +36,54 @@ class HearThisAtIE(InfoExtractor):
'id': '811296',
'ext': 'mp3',
'title': 'TwitchSF - DJ Jim Hopkins - Totally Bitchin\' 80\'s Dance Mix!',
- 'description': 'Listen to DJ Jim Hopkins - Totally Bitchin\' 80\'s Dance Mix! by TwitchSF on hearthis.at - Dance',
+ 'description': 'md5:ef26815ca8f483272a87b137ff175be2',
'upload_date': '20160328',
'timestamp': 1459186146,
'thumbnail': r're:^https?://.*\.jpg$',
- 'comment_count': int,
'view_count': int,
- 'like_count': int,
'duration': 4360,
- 'categories': ['Dance'],
+ 'genre': 'Dance',
},
}]
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
+ m = self._match_valid_url(url)
display_id = '{artist:s} - {title:s}'.format(**m.groupdict())
-
- webpage = self._download_webpage(url, display_id)
- track_id = self._search_regex(
- r'intTrackId\s*=\s*(\d+)', webpage, 'track ID')
-
- payload = urlencode_postdata({'tracks[]': track_id})
- req = sanitized_Request(self._PLAYLIST_URL, payload)
- req.add_header('Content-type', 'application/x-www-form-urlencoded')
-
- track = self._download_json(req, track_id, 'Downloading playlist')[0]
- title = '{artist:s} - {title:s}'.format(**track)
-
- categories = None
- if track.get('category'):
- categories = [track['category']]
-
- description = self._og_search_description(webpage)
- thumbnail = self._og_search_thumbnail(webpage)
-
- meta_span = r'<span[^>]+class="%s".*?</i>([^<]+)</span>'
- view_count = str_to_int(self._search_regex(
- meta_span % 'plays_count', webpage, 'view count', fatal=False))
- like_count = str_to_int(self._search_regex(
- meta_span % 'likes_count', webpage, 'like count', fatal=False))
- comment_count = str_to_int(self._search_regex(
- meta_span % 'comment_count', webpage, 'comment count', fatal=False))
- duration = str_to_int(self._search_regex(
- r'data-length="(\d+)', webpage, 'duration', fatal=False))
- timestamp = str_to_int(self._search_regex(
- r'<span[^>]+class="calctime"[^>]+data-time="(\d+)', webpage, 'timestamp', fatal=False))
+ api_url = url.replace('www.', '').replace('hearthis.at', 'api-v2.hearthis.at')
+ data_json = self._download_json(api_url, display_id)
+ track_id = data_json.get('id')
+ artist_json = data_json.get('user')
+ title = '{} - {}'.format(artist_json.get('username'), data_json.get('title'))
+ genre = data_json.get('genre')
+ description = data_json.get('description')
+ thumbnail = data_json.get('artwork_url') or data_json.get('thumb')
+ view_count = str_to_int(data_json.get('playback_count'))
+ duration = str_to_int(data_json.get('duration'))
+ timestamp = data_json.get('release_timestamp')
formats = []
- mp3_url = self._search_regex(
- r'(?s)<a class="player-link"\s+(?:[a-zA-Z0-9_:-]+="[^"]+"\s+)*?data-mp3="([^"]+)"',
- webpage, 'mp3 URL', fatal=False)
+ mp3_url = data_json.get('stream_url')
+
if mp3_url:
formats.append({
'format_id': 'mp3',
'vcodec': 'none',
'acodec': 'mp3',
'url': mp3_url,
+ 'ext': 'mp3',
})
- download_path = self._search_regex(
- r'<a class="[^"]*download_fct[^"]*"\s+href="([^"]+)"',
- webpage, 'download URL', default=None)
- if download_path:
- download_url = compat_urlparse.urljoin(url, download_path)
- ext_req = HEADRequest(download_url)
- ext_handle = self._request_webpage(
- ext_req, display_id, note='Determining extension')
- ext = urlhandle_detect_ext(ext_handle)
+
+ if data_json.get('download_url'):
+ download_url = data_json['download_url']
+ ext = determine_ext(data_json['download_filename'])
if ext in KNOWN_EXTENSIONS:
formats.append({
- 'format_id': 'download',
+ 'format_id': ext,
'vcodec': 'none',
'ext': ext,
'url': download_url,
- 'preference': 2, # Usually better quality
+ 'acodec': ext,
+ 'quality': 2, # Usually better quality
})
self._sort_formats(formats)
@@ -129,7 +97,5 @@ class HearThisAtIE(InfoExtractor):
'duration': duration,
'timestamp': timestamp,
'view_count': view_count,
- 'comment_count': comment_count,
- 'like_count': like_count,
- 'categories': categories,
+ 'genre': genre,
}
diff --git a/hypervideo_dl/extractor/hidive.py b/hypervideo_dl/extractor/hidive.py
index f26f802..15bd444 100644
--- a/hypervideo_dl/extractor/hidive.py
+++ b/hypervideo_dl/extractor/hidive.py
@@ -1,20 +1,18 @@
# coding: utf-8
-from __future__ import unicode_literals
-
import re
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
ExtractorError,
int_or_none,
+ try_get,
url_or_none,
urlencode_postdata,
)
class HiDiveIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?hidive\.com/stream/(?P<title>[^/]+)/(?P<key>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?hidive\.com/stream/(?P<id>(?P<title>[^/]+)/(?P<key>[^/?#&]+))'
# Using X-Forwarded-For results in 403 HTTP error for HLS fragments,
# so disabling geo bypass completely
_GEO_BYPASS = False
@@ -54,65 +52,71 @@ class HiDiveIE(InfoExtractor):
self._download_webpage(
self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(data))
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- title, key = mobj.group('title', 'key')
- video_id = '%s/%s' % (title, key)
-
- settings = self._download_json(
+ def _call_api(self, video_id, title, key, data={}, **kwargs):
+ data = {
+ **data,
+ 'Title': title,
+ 'Key': key,
+ 'PlayerId': 'f4f895ce1ca713ba263b91caeb1daa2d08904783',
+ }
+ return self._download_json(
'https://www.hidive.com/play/settings', video_id,
- data=urlencode_postdata({
- 'Title': title,
- 'Key': key,
- 'PlayerId': 'f4f895ce1ca713ba263b91caeb1daa2d08904783',
- }))
+ data=urlencode_postdata(data), **kwargs) or {}
+
+ def _extract_subtitles_from_rendition(self, rendition, subtitles, parsed_urls):
+ for cc_file in rendition.get('ccFiles', []):
+ cc_url = url_or_none(try_get(cc_file, lambda x: x[2]))
+ # name is used since we can't distinguish subs with same language code
+ cc_lang = try_get(cc_file, (lambda x: x[1].replace(' ', '-').lower(), lambda x: x[0]), str)
+ if cc_url not in parsed_urls and cc_lang:
+ parsed_urls.add(cc_url)
+ subtitles.setdefault(cc_lang, []).append({'url': cc_url})
+
+ def _get_subtitles(self, url, video_id, title, key, parsed_urls):
+ webpage = self._download_webpage(url, video_id, fatal=False) or ''
+ subtitles = {}
+ for caption in set(re.findall(r'data-captions=\"([^\"]+)\"', webpage)):
+ renditions = self._call_api(
+ video_id, title, key, {'Captions': caption}, fatal=False,
+ note=f'Downloading {caption} subtitle information').get('renditions') or {}
+ for rendition_id, rendition in renditions.items():
+ self._extract_subtitles_from_rendition(rendition, subtitles, parsed_urls)
+ return subtitles
+
+ def _real_extract(self, url):
+ video_id, title, key = self._match_valid_url(url).group('id', 'title', 'key')
+ settings = self._call_api(video_id, title, key)
restriction = settings.get('restrictionReason')
if restriction == 'RegionRestricted':
self.raise_geo_restricted()
-
if restriction and restriction != 'None':
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, restriction), expected=True)
- formats = []
- subtitles = {}
+ formats, parsed_urls = [], {None}
for rendition_id, rendition in settings['renditions'].items():
- bitrates = rendition.get('bitrates')
- if not isinstance(bitrates, dict):
- continue
- m3u8_url = url_or_none(bitrates.get('hls'))
- if not m3u8_url:
- continue
- formats.extend(self._extract_m3u8_formats(
- m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='%s-hls' % rendition_id, fatal=False))
- cc_files = rendition.get('ccFiles')
- if not isinstance(cc_files, list):
- continue
- for cc_file in cc_files:
- if not isinstance(cc_file, list) or len(cc_file) < 3:
- continue
- cc_lang = cc_file[0]
- cc_url = url_or_none(cc_file[2])
- if not isinstance(cc_lang, compat_str) or not cc_url:
- continue
- subtitles.setdefault(cc_lang, []).append({
- 'url': cc_url,
- })
+ audio, version, extra = rendition_id.split('_')
+ m3u8_url = url_or_none(try_get(rendition, lambda x: x['bitrates']['hls']))
+ if m3u8_url not in parsed_urls:
+ parsed_urls.add(m3u8_url)
+ frmt = self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=rendition_id, fatal=False)
+ for f in frmt:
+ f['language'] = audio
+ f['format_note'] = f'{version}, {extra}'
+ formats.extend(frmt)
self._sort_formats(formats)
- season_number = int_or_none(self._search_regex(
- r's(\d+)', key, 'season number', default=None))
- episode_number = int_or_none(self._search_regex(
- r'e(\d+)', key, 'episode number', default=None))
-
return {
'id': video_id,
'title': video_id,
- 'subtitles': subtitles,
+ 'subtitles': self.extract_subtitles(url, video_id, title, key, parsed_urls),
'formats': formats,
'series': title,
- 'season_number': season_number,
- 'episode_number': episode_number,
+ 'season_number': int_or_none(
+ self._search_regex(r's(\d+)', key, 'season number', default=None)),
+ 'episode_number': int_or_none(
+ self._search_regex(r'e(\d+)', key, 'episode number', default=None)),
+ 'http_headers': {'Referer': url}
}
diff --git a/hypervideo_dl/extractor/hotstar.py b/hypervideo_dl/extractor/hotstar.py
index 1620822..74e2728 100644
--- a/hypervideo_dl/extractor/hotstar.py
+++ b/hypervideo_dl/extractor/hotstar.py
@@ -3,15 +3,15 @@ from __future__ import unicode_literals
import hashlib
import hmac
-import json
import re
import time
import uuid
+import json
from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
- compat_str,
+ compat_str
)
from ..utils import (
determine_ext,
@@ -26,52 +26,79 @@ from ..utils import (
class HotStarBaseIE(InfoExtractor):
_AKAMAI_ENCRYPTION_KEY = b'\x05\xfc\x1a\x01\xca\xc9\x4b\xc4\x12\xfc\x53\x12\x07\x75\xf9\xee'
- def _call_api_impl(self, path, video_id, headers, query, data=None):
- st = int(time.time())
+ def _call_api_impl(self, path, video_id, query, st=None, cookies=None):
+ st = int_or_none(st) or int(time.time())
exp = st + 6000
auth = 'st=%d~exp=%d~acl=/*' % (st, exp)
auth += '~hmac=' + hmac.new(self._AKAMAI_ENCRYPTION_KEY, auth.encode(), hashlib.sha256).hexdigest()
- h = {'hotstarauth': auth}
- h.update(headers)
- return self._download_json(
- 'https://api.hotstar.com/' + path,
- video_id, headers=h, query=query, data=data)
+
+ if cookies and cookies.get('userUP'):
+ token = cookies.get('userUP').value
+ else:
+ token = self._download_json(
+ 'https://api.hotstar.com/um/v3/users',
+ video_id, note='Downloading token',
+ data=json.dumps({"device_ids": [{"id": compat_str(uuid.uuid4()), "type": "device_id"}]}).encode('utf-8'),
+ headers={
+ 'hotstarauth': auth,
+ 'x-hs-platform': 'PCTV', # or 'web'
+ 'Content-Type': 'application/json',
+ })['user_identity']
+
+ response = self._download_json(
+ 'https://api.hotstar.com/' + path, video_id, headers={
+ 'hotstarauth': auth,
+ 'x-hs-appversion': '6.72.2',
+ 'x-hs-platform': 'web',
+ 'x-hs-usertoken': token,
+ }, query=query)
+
+ if response['message'] != "Playback URL's fetched successfully":
+ raise ExtractorError(
+ response['message'], expected=True)
+ return response['data']
def _call_api(self, path, video_id, query_name='contentId'):
- response = self._call_api_impl(path, video_id, {
- 'x-country-code': 'IN',
- 'x-platform-code': 'JIO',
- }, {
+ return self._download_json('https://api.hotstar.com/' + path, video_id=video_id, query={
query_name: video_id,
'tas': 10000,
+ }, headers={
+ 'x-country-code': 'IN',
+ 'x-platform-code': 'PCTV',
})
- if response['statusCode'] != 'OK':
- raise ExtractorError(
- response['body']['message'], expected=True)
- return response['body']['results']
-
- def _call_api_v2(self, path, video_id, headers, query=None, data=None):
- h = {'X-Request-Id': compat_str(uuid.uuid4())}
- h.update(headers)
- try:
- return self._call_api_impl(
- path, video_id, h, query, data)
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError):
- if e.cause.code == 402:
- self.raise_login_required()
- message = self._parse_json(e.cause.read().decode(), video_id)['message']
- if message in ('Content not available in region', 'Country is not supported'):
- raise self.raise_geo_restricted(message)
- raise ExtractorError(message)
- raise e
+
+ def _call_api_v2(self, path, video_id, st=None, cookies=None):
+ return self._call_api_impl(
+ '%s/content/%s' % (path, video_id), video_id, st=st, cookies=cookies, query={
+ 'desired-config': 'audio_channel:stereo|container:fmp4|dynamic_range:hdr|encryption:plain|ladder:tv|package:dash|resolution:fhd|subs-tag:HotstarVIP|video_codec:h265',
+ 'device-id': cookies.get('device_id').value if cookies.get('device_id') else compat_str(uuid.uuid4()),
+ 'os-name': 'Windows',
+ 'os-version': '10',
+ })
class HotStarIE(HotStarBaseIE):
IE_NAME = 'hotstar'
- _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+[/-])?(?P<id>\d{10})'
+ _VALID_URL = r'''(?x)
+ (?:
+ hotstar\:|
+ https?://(?:www\.)?hotstar\.com(?:/in)?/(?!in/)
+ )
+ (?:
+ (?P<type>movies|sports|episode|(?P<tv>tv))
+ (?:
+ \:|
+ /[^/?#]+/
+ (?(tv)
+ (?:[^/?#]+/){2}|
+ (?:[^/?#]+/)*
+ )
+ )|
+ [^/?#]+/
+ )?
+ (?P<id>\d{10})
+ '''
_TESTS = [{
- # contentData
'url': 'https://www.hotstar.com/can-you-not-spread-rumours/1000076273',
'info_dict': {
'id': '1000076273',
@@ -82,147 +109,161 @@ class HotStarIE(HotStarBaseIE):
'upload_date': '20151111',
'duration': 381,
},
- 'params': {
- # m3u8 download
- 'skip_download': True,
- }
}, {
- # contentDetail
+ 'url': 'hotstar:1000076273',
+ 'only_matching': True,
+ }, {
'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157',
+ 'info_dict': {
+ 'id': '1000057157',
+ 'ext': 'mp4',
+ 'title': 'Radha Gopalam',
+ 'description': 'md5:be3bc342cc120bbc95b3b0960e2b0d22',
+ 'timestamp': 1140805800,
+ 'upload_date': '20060224',
+ 'duration': 9182,
+ },
+ }, {
+ 'url': 'hotstar:movies:1000057157',
'only_matching': True,
}, {
- 'url': 'http://www.hotstar.com/sports/cricket/rajitha-sizzles-on-debut-with-329/2001477583',
+ 'url': 'https://www.hotstar.com/in/sports/cricket/follow-the-blues-2021/recap-eng-fight-back-on-day-2/1260066104',
'only_matching': True,
}, {
- 'url': 'http://www.hotstar.com/1000000515',
+ 'url': 'https://www.hotstar.com/in/sports/football/most-costly-pl-transfers-ft-grealish/1260065956',
'only_matching': True,
}, {
- # only available via api v2
- 'url': 'https://www.hotstar.com/tv/ek-bhram-sarvagun-sampanna/s-2116/janhvi-targets-suman/1000234847',
+ # contentData
+ 'url': 'hotstar:sports:1260065956',
'only_matching': True,
}, {
- 'url': 'https://www.hotstar.com/in/tv/start-music/1260005217/cooks-vs-comalis/1100039717',
+ # contentData
+ 'url': 'hotstar:sports:1260066104',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.hotstar.com/tv/ek-bhram-sarvagun-sampanna/s-2116/janhvi-targets-suman/1000234847',
+ 'info_dict': {
+ 'id': '1000234847',
+ 'ext': 'mp4',
+ 'title': 'Janhvi Targets Suman',
+ 'description': 'md5:78a85509348910bd1ca31be898c5796b',
+ 'timestamp': 1556670600,
+ 'upload_date': '20190501',
+ 'duration': 1219,
+ 'channel': 'StarPlus',
+ 'channel_id': 3,
+ 'series': 'Ek Bhram - Sarvagun Sampanna',
+ 'season': 'Chapter 1',
+ 'season_number': 1,
+ 'season_id': 6771,
+ 'episode': 'Janhvi Targets Suman',
+ 'episode_number': 8,
+ },
+ }, {
+ 'url': 'hotstar:episode:1000234847',
'only_matching': True,
}]
_GEO_BYPASS = False
- _DEVICE_ID = None
- _USER_TOKEN = None
+ _TYPE = {
+ 'movies': 'movie',
+ 'sports': 'match',
+ 'episode': 'episode',
+ 'tv': 'episode',
+ None: 'content',
+ }
def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
- app_state = self._parse_json(self._search_regex(
- r'<script>window\.APP_STATE\s*=\s*({.+?})</script>',
- webpage, 'app state'), video_id)
- video_data = {}
- getters = list(
- lambda x, k=k: x['initialState']['content%s' % k]['content']
- for k in ('Data', 'Detail')
- )
- for v in app_state.values():
- content = try_get(v, getters, dict)
- if content and content.get('contentId') == video_id:
- video_data = content
- break
-
+ mobj = self._match_valid_url(url)
+ video_id = mobj.group('id')
+ video_type = mobj.group('type')
+ cookies = self._get_cookies(url)
+ video_type = self._TYPE.get(video_type, video_type)
+ video_data = self._call_api(f'o/v1/{video_type}/detail', video_id)['body']['results']['item']
title = video_data['title']
- if video_data.get('drmProtected'):
- raise ExtractorError('This video is DRM protected.', expected=True)
+ if not self.get_param('allow_unplayable_formats') and video_data.get('drmProtected'):
+ self.report_drm(video_id)
- headers = {'Referer': url}
+ headers = {'Referer': 'https://www.hotstar.com/in'}
formats = []
+ subs = {}
geo_restricted = False
-
- if not self._USER_TOKEN:
- self._DEVICE_ID = compat_str(uuid.uuid4())
- self._USER_TOKEN = self._call_api_v2('um/v3/users', video_id, {
- 'X-HS-Platform': 'PCTV',
- 'Content-Type': 'application/json',
- }, data=json.dumps({
- 'device_ids': [{
- 'id': self._DEVICE_ID,
- 'type': 'device_id',
- }],
- }).encode())['user_identity']
-
- playback_sets = self._call_api_v2(
- 'play/v2/playback/content/' + video_id, video_id, {
- 'X-HS-Platform': 'web',
- 'X-HS-AppVersion': '6.99.1',
- 'X-HS-UserToken': self._USER_TOKEN,
- }, query={
- 'device-id': self._DEVICE_ID,
- 'desired-config': 'encryption:plain',
- 'os-name': 'Windows',
- 'os-version': '10',
- })['data']['playBackSets']
+ _, urlh = self._download_webpage_handle('https://www.hotstar.com/in', video_id)
+ # Required to fix https://github.com/hypervideo/hypervideo/issues/396
+ st = urlh.headers.get('x-origin-date')
+ # change to v2 in the future
+ playback_sets = self._call_api_v2('play/v1/playback', video_id, st=st, cookies=cookies)['playBackSets']
for playback_set in playback_sets:
if not isinstance(playback_set, dict):
continue
+ dr = re.search(r'dynamic_range:(?P<dr>[a-z]+)', playback_set.get('tagsCombination')).group('dr')
format_url = url_or_none(playback_set.get('playbackUrl'))
if not format_url:
continue
format_url = re.sub(
r'(?<=//staragvod)(\d)', r'web\1', format_url)
tags = str_or_none(playback_set.get('tagsCombination')) or ''
- if tags and 'encryption:plain' not in tags:
- continue
ext = determine_ext(format_url)
+ current_formats, current_subs = [], {}
try:
if 'package:hls' in tags or ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
+ current_formats, current_subs = self._extract_m3u8_formats_and_subtitles(
format_url, video_id, 'mp4',
entry_protocol='m3u8_native',
- m3u8_id='hls', headers=headers))
+ m3u8_id=f'{dr}-hls', headers=headers)
elif 'package:dash' in tags or ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- format_url, video_id, mpd_id='dash', headers=headers))
+ current_formats, current_subs = self._extract_mpd_formats_and_subtitles(
+ format_url, video_id, mpd_id=f'{dr}-dash', headers=headers)
elif ext == 'f4m':
# produce broken files
pass
else:
- formats.append({
+ current_formats = [{
'url': format_url,
'width': int_or_none(playback_set.get('width')),
'height': int_or_none(playback_set.get('height')),
- })
+ }]
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
geo_restricted = True
continue
+ if tags and 'encryption:plain' not in tags:
+ for f in current_formats:
+ f['has_drm'] = True
+ formats.extend(current_formats)
+ subs = self._merge_subtitles(subs, current_subs)
if not formats and geo_restricted:
- self.raise_geo_restricted(countries=['IN'])
+ self.raise_geo_restricted(countries=['IN'], metadata_available=True)
self._sort_formats(formats)
for f in formats:
f.setdefault('http_headers', {}).update(headers)
- image = try_get(video_data, lambda x: x['image']['h'], compat_str)
-
return {
'id': video_id,
'title': title,
- 'thumbnail': 'https://img1.hotstarext.com/image/upload/' + image if image else None,
'description': video_data.get('description'),
'duration': int_or_none(video_data.get('duration')),
'timestamp': int_or_none(video_data.get('broadcastDate') or video_data.get('startDate')),
'formats': formats,
+ 'subtitles': subs,
'channel': video_data.get('channelName'),
- 'channel_id': str_or_none(video_data.get('channelId')),
+ 'channel_id': video_data.get('channelId'),
'series': video_data.get('showName'),
'season': video_data.get('seasonName'),
'season_number': int_or_none(video_data.get('seasonNo')),
- 'season_id': str_or_none(video_data.get('seasonId')),
+ 'season_id': video_data.get('seasonId'),
'episode': title,
'episode_number': int_or_none(video_data.get('episodeNo')),
+ 'http_headers': {
+ 'Referer': 'https://www.hotstar.com/in',
+ }
}
class HotStarPlaylistIE(HotStarBaseIE):
IE_NAME = 'hotstar:playlist'
- _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:[a-z]{2}/)?tv/[^/]+/s-\w+/list/[^/]+/t-(?P<id>\w+)'
+ _VALID_URL = r'https?://(?:www\.)?hotstar\.com/tv/[^/]+/s-\w+/list/[^/]+/t-(?P<id>\w+)'
_TESTS = [{
'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/popular-clips/t-3_2_26',
'info_dict': {
@@ -232,16 +273,12 @@ class HotStarPlaylistIE(HotStarBaseIE):
}, {
'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/extras/t-2480',
'only_matching': True,
- }, {
- 'url': 'https://www.hotstar.com/us/tv/masterchef-india/s-830/list/episodes/t-1_2_830',
- 'only_matching': True,
}]
def _real_extract(self, url):
playlist_id = self._match_id(url)
- collection = self._call_api('o/v1/tray/find', playlist_id, 'uqId')
-
+ collection = self._call_api('o/v1/tray/find', playlist_id, 'uqId')['body']['results']
entries = [
self.url_result(
'https://www.hotstar.com/%s' % video['contentId'],
@@ -250,3 +287,47 @@ class HotStarPlaylistIE(HotStarBaseIE):
if video.get('contentId')]
return self.playlist_result(entries, playlist_id)
+
+
+class HotStarSeriesIE(HotStarBaseIE):
+ IE_NAME = 'hotstar:series'
+ _VALID_URL = r'(?P<url>(?:https?://)(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P<id>\d+))'
+ _TESTS = [{
+ 'url': 'https://www.hotstar.com/in/tv/radhakrishn/1260000646',
+ 'info_dict': {
+ 'id': '1260000646',
+ },
+ 'playlist_mincount': 690,
+ }, {
+ 'url': 'https://www.hotstar.com/tv/dancee-/1260050431',
+ 'info_dict': {
+ 'id': '1260050431',
+ },
+ 'playlist_mincount': 43,
+ }, {
+ 'url': 'https://www.hotstar.com/in/tv/mahabharat/435/',
+ 'info_dict': {
+ 'id': '435',
+ },
+ 'playlist_mincount': 269,
+ }]
+
+ def _real_extract(self, url):
+ url, series_id = self._match_valid_url(url).groups()
+ headers = {
+ 'x-country-code': 'IN',
+ 'x-platform-code': 'PCTV',
+ }
+ detail_json = self._download_json('https://api.hotstar.com/o/v1/show/detail?contentId=' + series_id,
+ video_id=series_id, headers=headers)
+ id = compat_str(try_get(detail_json, lambda x: x['body']['results']['item']['id'], int))
+ item_json = self._download_json('https://api.hotstar.com/o/v1/tray/g/1/items?etid=0&tao=0&tas=10000&eid=' + id,
+ video_id=series_id, headers=headers)
+ entries = [
+ self.url_result(
+ '%s/ignoreme/%d' % (url, video['contentId']),
+ ie=HotStarIE.ie_key(), video_id=video['contentId'])
+ for video in item_json['body']['results']['items']
+ if video.get('contentId')]
+
+ return self.playlist_result(entries, series_id)
diff --git a/hypervideo_dl/extractor/hrfensehen.py b/hypervideo_dl/extractor/hrfensehen.py
new file mode 100644
index 0000000..2a994d4
--- /dev/null
+++ b/hypervideo_dl/extractor/hrfensehen.py
@@ -0,0 +1,102 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from ..utils import int_or_none, unified_timestamp, unescapeHTML
+from .common import InfoExtractor
+
+
+class HRFernsehenIE(InfoExtractor):
+ IE_NAME = 'hrfernsehen'
+ _VALID_URL = r'^https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P<id>[0-9]{6})\.html'
+
+ _TESTS = [{
+ 'url': 'https://www.hessenschau.de/tv-sendung/hessenschau-vom-26082020,video-130546.html',
+ 'md5': '5c4e0ba94677c516a2f65a84110fc536',
+ 'info_dict': {
+ 'id': '130546',
+ 'ext': 'mp4',
+ 'description': 'Sturmtief Kirsten fegt über Hessen / Die Corona-Pandemie – eine Chronologie / '
+ 'Sterbehilfe: Die Lage in Hessen / Miss Hessen leitet zwei eigene Unternehmen / '
+ 'Pop-Up Museum zeigt Schwarze Unterhaltung und Black Music',
+ 'subtitles': {'de': [{
+ 'url': 'https://hr-a.akamaihd.net/video/as/hessenschau/2020_08/hrLogo_200826200407_L385592_512x288-25p-500kbit.vtt'
+ }]},
+ 'timestamp': 1598470200,
+ 'upload_date': '20200826',
+ 'thumbnails': [{
+ 'url': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9.jpg',
+ 'id': '0'
+ }, {
+ 'url': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg',
+ 'id': '1'
+ }],
+ 'title': 'hessenschau vom 26.08.2020'
+ }
+ }, {
+ 'url': 'https://www.hr-fernsehen.de/sendungen-a-z/mex/sendungen/fair-und-gut---was-hinter-aldis-eigenem-guetesiegel-steckt,video-130544.html',
+ 'only_matching': True
+ }]
+
+ _GEO_COUNTRIES = ['DE']
+
+ def extract_airdate(self, loader_data):
+ airdate_str = loader_data.get('mediaMetadata', {}).get('agf', {}).get('airdate')
+
+ if airdate_str is None:
+ return None
+
+ return unified_timestamp(airdate_str)
+
+ def extract_formats(self, loader_data):
+ stream_formats = []
+ for stream_obj in loader_data["videoResolutionLevels"]:
+ stream_format = {
+ 'format_id': str(stream_obj['verticalResolution']) + "p",
+ 'height': stream_obj['verticalResolution'],
+ 'url': stream_obj['url'],
+ }
+
+ quality_information = re.search(r'([0-9]{3,4})x([0-9]{3,4})-([0-9]{2})p-([0-9]{3,4})kbit',
+ stream_obj['url'])
+ if quality_information:
+ stream_format['width'] = int_or_none(quality_information.group(1))
+ stream_format['height'] = int_or_none(quality_information.group(2))
+ stream_format['fps'] = int_or_none(quality_information.group(3))
+ stream_format['tbr'] = int_or_none(quality_information.group(4))
+
+ stream_formats.append(stream_format)
+
+ self._sort_formats(stream_formats)
+ return stream_formats
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_meta(
+ ['og:title', 'twitter:title', 'name'], webpage)
+ description = self._html_search_meta(
+ ['description'], webpage)
+
+ loader_str = unescapeHTML(self._search_regex(r"data-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader"))
+ loader_data = json.loads(loader_str)
+
+ info = {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'formats': self.extract_formats(loader_data),
+ 'timestamp': self.extract_airdate(loader_data)
+ }
+
+ if "subtitle" in loader_data:
+ info["subtitles"] = {"de": [{"url": loader_data["subtitle"]}]}
+
+ thumbnails = list(set([t for t in loader_data.get("previewImageUrl", {}).values()]))
+ if len(thumbnails) > 0:
+ info["thumbnails"] = [{"url": t} for t in thumbnails]
+
+ return info
diff --git a/hypervideo_dl/extractor/hrti.py b/hypervideo_dl/extractor/hrti.py
index 23f7b1f..dc5b967 100644
--- a/hypervideo_dl/extractor/hrti.py
+++ b/hypervideo_dl/extractor/hrti.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..compat import compat_HTTPError
@@ -135,7 +134,7 @@ class HRTiIE(HRTiBaseIE):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('short_id') or mobj.group('id')
display_id = mobj.group('display_id') or video_id
@@ -191,7 +190,7 @@ class HRTiPlaylistIE(HRTiBaseIE):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
category_id = mobj.group('id')
display_id = mobj.group('display_id') or category_id
diff --git a/hypervideo_dl/extractor/hungama.py b/hypervideo_dl/extractor/hungama.py
index 3fdaac5..821b16e 100644
--- a/hypervideo_dl/extractor/hungama.py
+++ b/hypervideo_dl/extractor/hungama.py
@@ -1,9 +1,12 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
int_or_none,
+ try_get,
urlencode_postdata,
)
@@ -71,14 +74,14 @@ class HungamaSongIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?hungama\.com/song/[^/]+/(?P<id>\d+)'
_TEST = {
'url': 'https://www.hungama.com/song/kitni-haseen-zindagi/2931166/',
- 'md5': 'a845a6d1ebd08d80c1035126d49bd6a0',
+ 'md5': 'd4a6a05a394ad0453a9bea3ca00e6024',
'info_dict': {
'id': '2931166',
- 'ext': 'mp4',
+ 'ext': 'mp3',
'title': 'Lucky Ali - Kitni Haseen Zindagi',
'track': 'Kitni Haseen Zindagi',
'artist': 'Lucky Ali',
- 'album': 'Aks',
+ 'album': None,
'release_year': 2000,
}
}
@@ -89,18 +92,20 @@ class HungamaSongIE(InfoExtractor):
data = self._download_json(
'https://www.hungama.com/audio-player-data/track/%s' % audio_id,
audio_id, query={'_country': 'IN'})[0]
-
track = data['song_name']
artist = data.get('singer_name')
-
- m3u8_url = self._download_json(
- data.get('file') or data['preview_link'],
- audio_id)['response']['media_url']
-
- formats = self._extract_m3u8_formats(
- m3u8_url, audio_id, ext='mp4', entry_protocol='m3u8_native',
- m3u8_id='hls')
- self._sort_formats(formats)
+ formats = []
+ media_json = self._download_json(data.get('file') or data['preview_link'], audio_id)
+ media_url = try_get(media_json, lambda x: x['response']['media_url'], str)
+ media_type = try_get(media_json, lambda x: x['response']['type'], str)
+
+ if media_url:
+ formats.append({
+ 'url': media_url,
+ 'ext': media_type,
+ 'vcodec': 'none',
+ 'acodec': media_type,
+ })
title = '%s - %s' % (artist, track) if artist else track
thumbnail = data.get('img_src') or data.get('album_image')
@@ -111,7 +116,32 @@ class HungamaSongIE(InfoExtractor):
'thumbnail': thumbnail,
'track': track,
'artist': artist,
- 'album': data.get('album_name'),
+ 'album': data.get('album_name') or None,
'release_year': int_or_none(data.get('date')),
'formats': formats,
}
+
+
+class HungamaAlbumPlaylistIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?hungama\.com/(?:playlists|album)/[^/]+/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.hungama.com/album/bhuj-the-pride-of-india/69481490/',
+ 'playlist_mincount': 7,
+ 'info_dict': {
+ 'id': '69481490',
+ },
+ }, {
+ 'url': 'https://www.hungama.com/playlists/hindi-jan-to-june-2021/123063/',
+ 'playlist_mincount': 50,
+ 'info_dict': {
+ 'id': '123063',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ ptrn = r'<meta[^>]+?property=[\"\']?music:song:url[\"\']?[^>]+?content=[\"\']?([^\"\']+)'
+ items = re.findall(ptrn, webpage)
+ entries = [self.url_result(item, ie=HungamaSongIE.ie_key()) for item in items]
+ return self.playlist_result(entries, video_id)
diff --git a/hypervideo_dl/extractor/ichinanalive.py b/hypervideo_dl/extractor/ichinanalive.py
new file mode 100644
index 0000000..cb39f82
--- /dev/null
+++ b/hypervideo_dl/extractor/ichinanalive.py
@@ -0,0 +1,167 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError, str_or_none, traverse_obj, unified_strdate
+from ..compat import compat_str
+
+
+class IchinanaLiveIE(InfoExtractor):
+ IE_NAME = '17live'
+ _VALID_URL = r'https?://(?:www\.)?17\.live/(?:[^/]+/)*(?:live|profile/r)/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://17.live/live/3773096',
+ 'info_dict': {
+ 'id': '3773096',
+ 'title': '萠珈☕🤡🍫moka',
+ 'is_live': True,
+ 'uploader': '萠珈☕🤡🍫moka',
+ 'uploader_id': '3773096',
+ 'like_count': 366,
+ 'view_count': 18121,
+ 'timestamp': 1630569012,
+ },
+ 'skip': 'running as of writing, but may be ended as of testing',
+ }, {
+ 'note': 'nothing except language differs',
+ 'url': 'https://17.live/ja/live/3773096',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return not IchinanaLiveClipIE.suitable(url) and super(IchinanaLiveIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ url = 'https://17.live/live/%s' % video_id
+
+ enter = self._download_json(
+ 'https://api-dsa.17app.co/api/v1/lives/%s/enter' % video_id, video_id,
+ headers={'Referer': url}, fatal=False, expected_status=420,
+ data=b'\0')
+ if enter and enter.get('message') == 'ended':
+ raise ExtractorError('This live has ended.', expected=True)
+
+ view_data = self._download_json(
+ 'https://api-dsa.17app.co/api/v1/lives/%s' % video_id, video_id,
+ headers={'Referer': url})
+
+ uploader = traverse_obj(
+ view_data, ('userInfo', 'displayName'), ('userInfo', 'openID'))
+
+ video_urls = view_data.get('rtmpUrls')
+ if not video_urls:
+ raise ExtractorError('unable to extract live URL information')
+ formats = []
+ for (name, value) in video_urls[0].items():
+ if not isinstance(value, compat_str):
+ continue
+ if not value.startswith('http'):
+ continue
+ quality = -1
+ if 'web' in name:
+ quality -= 1
+ if 'High' in name:
+ quality += 4
+ if 'Low' in name:
+ quality -= 2
+ formats.append({
+ 'format_id': name,
+ 'url': value,
+ 'quality': quality,
+ 'http_headers': {'Referer': url},
+ 'ext': 'flv',
+ 'vcodec': 'h264',
+ 'acodec': 'aac',
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': uploader or video_id,
+ 'formats': formats,
+ 'is_live': True,
+ 'uploader': uploader,
+ 'uploader_id': video_id,
+ 'like_count': view_data.get('receivedLikeCount'),
+ 'view_count': view_data.get('viewerCount'),
+ 'thumbnail': view_data.get('coverPhoto'),
+ 'description': view_data.get('caption'),
+ 'timestamp': view_data.get('beginTime'),
+ }
+
+
+class IchinanaLiveClipIE(InfoExtractor):
+ IE_NAME = '17live:clip'
+ _VALID_URL = r'https?://(?:www\.)?17\.live/(?:[^/]+/)*profile/r/(?P<uploader_id>\d+)/clip/(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'https://17.live/profile/r/1789280/clip/1bHQSK8KUieruFXaCH4A4upCzlN',
+ 'info_dict': {
+ 'id': '1bHQSK8KUieruFXaCH4A4upCzlN',
+ 'title': 'マチコ先生🦋Class💋',
+ 'description': 'マチ戦隊 第一次 バスターコール\n総額200万coin!\n動画制作@うぉーかー🌱Walker🎫',
+ 'uploader_id': '1789280',
+ },
+ }, {
+ 'url': 'https://17.live/ja/profile/r/1789280/clip/1bHQSK8KUieruFXaCH4A4upCzlN',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ uploader_id, video_id = self._match_valid_url(url).groups()
+ url = 'https://17.live/profile/r/%s/clip/%s' % (uploader_id, video_id)
+
+ view_data = self._download_json(
+ 'https://api-dsa.17app.co/api/v1/clips/%s' % video_id, video_id,
+ headers={'Referer': url})
+
+ uploader = traverse_obj(
+ view_data, ('userInfo', 'displayName'), ('userInfo', 'name'))
+
+ formats = []
+ if view_data.get('videoURL'):
+ formats.append({
+ 'id': 'video',
+ 'url': view_data['videoURL'],
+ 'quality': -1,
+ })
+ if view_data.get('transcodeURL'):
+ formats.append({
+ 'id': 'transcode',
+ 'url': view_data['transcodeURL'],
+ 'quality': -1,
+ })
+ if view_data.get('srcVideoURL'):
+ # highest quality
+ formats.append({
+ 'id': 'srcVideo',
+ 'url': view_data['srcVideoURL'],
+ 'quality': 1,
+ })
+
+ for fmt in formats:
+ fmt.update({
+ 'ext': 'mp4',
+ 'protocol': 'https',
+ 'vcodec': 'h264',
+ 'acodec': 'aac',
+ 'http_headers': {'Referer': url},
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': uploader or video_id,
+ 'formats': formats,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'like_count': view_data.get('likeCount'),
+ 'view_count': view_data.get('viewCount'),
+ 'thumbnail': view_data.get('imageURL'),
+ 'duration': view_data.get('duration'),
+ 'description': view_data.get('caption'),
+ 'upload_date': unified_strdate(str_or_none(view_data.get('createdAt'))),
+ }
diff --git a/hypervideo_dl/extractor/ign.py b/hypervideo_dl/extractor/ign.py
index 0d9f50e..c826eb3 100644
--- a/hypervideo_dl/extractor/ign.py
+++ b/hypervideo_dl/extractor/ign.py
@@ -100,7 +100,7 @@ class IGNIE(IGNBaseIE):
formats.append({
'ext': determine_ext(mezzanine_url, 'mp4'),
'format_id': 'mezzanine',
- 'preference': 1,
+ 'quality': 1,
'url': mezzanine_url,
})
diff --git a/hypervideo_dl/extractor/imggaming.py b/hypervideo_dl/extractor/imggaming.py
index e11f920..ef20a4b 100644
--- a/hypervideo_dl/extractor/imggaming.py
+++ b/hypervideo_dl/extractor/imggaming.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..compat import compat_HTTPError
@@ -62,10 +61,10 @@ class ImgGamingBaseIE(InfoExtractor):
raise
def _real_extract(self, url):
- domain, media_type, media_id, playlist_id = re.match(self._VALID_URL, url).groups()
+ domain, media_type, media_id, playlist_id = self._match_valid_url(url).groups()
if playlist_id:
- if self._downloader.params.get('noplaylist'):
+ if self.get_param('noplaylist'):
self.to_screen('Downloading just video %s because of --no-playlist' % media_id)
else:
self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id)
diff --git a/hypervideo_dl/extractor/imgur.py b/hypervideo_dl/extractor/imgur.py
index 511fa5f..c917cf1 100644
--- a/hypervideo_dl/extractor/imgur.py
+++ b/hypervideo_dl/extractor/imgur.py
@@ -72,7 +72,7 @@ class ImgurIE(InfoExtractor):
gif_json, video_id, transform_source=js_to_json)
formats.append({
'format_id': 'gif',
- 'preference': -10,
+ 'preference': -10, # gifs are worse than videos
'width': width,
'height': height,
'ext': 'gif',
diff --git a/hypervideo_dl/extractor/instagram.py b/hypervideo_dl/extractor/instagram.py
index 12e1014..3801c7a 100644
--- a/hypervideo_dl/extractor/instagram.py
+++ b/hypervideo_dl/extractor/instagram.py
@@ -19,6 +19,7 @@ from ..utils import (
std_headers,
try_get,
url_or_none,
+ variadic,
)
@@ -140,11 +141,13 @@ class InstagramIE(InfoExtractor):
return mobj.group('link')
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
url = mobj.group('url')
- webpage = self._download_webpage(url, video_id)
+ webpage, urlh = self._download_webpage_handle(url, video_id)
+ if 'www.instagram.com/accounts/login' in urlh.geturl().rstrip('/'):
+ self.raise_login_required('You need to log in to access this content', method='cookies')
(media, video_url, description, thumbnail, timestamp, uploader,
uploader_id, like_count, comment_count, comments, height,
@@ -188,26 +191,29 @@ class InstagramIE(InfoExtractor):
uploader_id = media.get('owner', {}).get('username')
def get_count(keys, kind):
- if not isinstance(keys, (list, tuple)):
- keys = [keys]
- for key in keys:
+ for key in variadic(keys):
count = int_or_none(try_get(
media, (lambda x: x['edge_media_%s' % key]['count'],
lambda x: x['%ss' % kind]['count'])))
if count is not None:
return count
+
like_count = get_count('preview_like', 'like')
comment_count = get_count(
('preview_comment', 'to_comment', 'to_parent_comment'), 'comment')
- comments = [{
- 'author': comment.get('user', {}).get('username'),
- 'author_id': comment.get('user', {}).get('id'),
- 'id': comment.get('id'),
- 'text': comment.get('text'),
- 'timestamp': int_or_none(comment.get('created_at')),
- } for comment in media.get(
- 'comments', {}).get('nodes', []) if comment.get('text')]
+ comments = []
+ for comment in try_get(media, lambda x: x['edge_media_to_parent_comment']['edges']):
+ comment_dict = comment.get('node', {})
+ comment_text = comment_dict.get('text')
+ if comment_text:
+ comments.append({
+ 'author': try_get(comment_dict, lambda x: x['owner']['username']),
+ 'author_id': try_get(comment_dict, lambda x: x['owner']['id']),
+ 'id': comment_dict.get('id'),
+ 'text': comment_text,
+ 'timestamp': int_or_none(comment_dict.get('created_at')),
+ })
if not video_url:
edges = try_get(
media, lambda x: x['edge_sidecar_to_children']['edges'],
@@ -273,6 +279,9 @@ class InstagramIE(InfoExtractor):
'like_count': like_count,
'comment_count': comment_count,
'comments': comments,
+ 'http_headers': {
+ 'Referer': 'https://www.instagram.com/',
+ }
}
diff --git a/hypervideo_dl/extractor/internetvideoarchive.py b/hypervideo_dl/extractor/internetvideoarchive.py
index 59b0a90..880918c 100644
--- a/hypervideo_dl/extractor/internetvideoarchive.py
+++ b/hypervideo_dl/extractor/internetvideoarchive.py
@@ -4,10 +4,7 @@ import json
import re
from .common import InfoExtractor
-from ..compat import (
- compat_parse_qs,
- compat_urlparse,
-)
+from ..utils import parse_qs
class InternetVideoArchiveIE(InfoExtractor):
@@ -32,7 +29,7 @@ class InternetVideoArchiveIE(InfoExtractor):
return 'http://video.internetvideoarchive.net/player/6/configuration.ashx?' + query
def _real_extract(self, url):
- query = compat_parse_qs(compat_urlparse.urlparse(url).query)
+ query = parse_qs(url)
video_id = query['publishedid'][0]
data = self._download_json(
'https://video.internetvideoarchive.net/videojs7/videojs7.ivasettings.ashx',
diff --git a/hypervideo_dl/extractor/iprima.py b/hypervideo_dl/extractor/iprima.py
index 648ae67..28e6609 100644
--- a/hypervideo_dl/extractor/iprima.py
+++ b/hypervideo_dl/extractor/iprima.py
@@ -136,7 +136,7 @@ class IPrimaIE(InfoExtractor):
extract_formats(src)
if not formats and '>GEO_IP_NOT_ALLOWED<' in playerpage:
- self.raise_geo_restricted(countries=['CZ'])
+ self.raise_geo_restricted(countries=['CZ'], metadata_available=True)
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/iqiyi.py b/hypervideo_dl/extractor/iqiyi.py
index 6df5214..b13b9f4 100644
--- a/hypervideo_dl/extractor/iqiyi.py
+++ b/hypervideo_dl/extractor/iqiyi.py
@@ -198,7 +198,7 @@ class IqiyiIE(InfoExtractor):
'url': stream['m3utx'],
'format_id': vd,
'ext': 'mp4',
- 'preference': self._FORMATS_MAP.get(vd, -1),
+ 'quality': self._FORMATS_MAP.get(vd, -1),
'protocol': 'm3u8_native',
})
diff --git a/hypervideo_dl/extractor/itv.py b/hypervideo_dl/extractor/itv.py
index e86c40b..4cd34a2 100644
--- a/hypervideo_dl/extractor/itv.py
+++ b/hypervideo_dl/extractor/itv.py
@@ -2,20 +2,26 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from .brightcove import BrightcoveNewIE
+
+from ..compat import compat_str
from ..utils import (
+ base_url,
clean_html,
determine_ext,
extract_attributes,
+ ExtractorError,
get_element_by_class,
JSON_LD_RE,
merge_dicts,
parse_duration,
smuggle_url,
+ try_get,
url_or_none,
+ url_basename,
+ urljoin,
)
@@ -23,15 +29,32 @@ class ITVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)'
_GEO_COUNTRIES = ['GB']
_TESTS = [{
- 'url': 'https://www.itv.com/hub/liar/2a4547a0012',
+ 'url': 'https://www.itv.com/hub/plebs/2a1873a0002',
'info_dict': {
- 'id': '2a4547a0012',
+ 'id': '2a1873a0002',
'ext': 'mp4',
- 'title': 'Liar - Series 2 - Episode 6',
- 'description': 'md5:d0f91536569dec79ea184f0a44cca089',
- 'series': 'Liar',
- 'season_number': 2,
- 'episode_number': 6,
+ 'title': 'Plebs - The Orgy',
+ 'description': 'md5:4d7159af53ebd5b36e8b3ec82a41fdb4',
+ 'series': 'Plebs',
+ 'season_number': 1,
+ 'episode_number': 1,
+ 'thumbnail': r're:https?://hubimages\.itv\.com/episode/2_1873_0002'
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.itv.com/hub/the-jonathan-ross-show/2a1166a0209',
+ 'info_dict': {
+ 'id': '2a1166a0209',
+ 'ext': 'mp4',
+ 'title': 'The Jonathan Ross Show - Series 17 - Episode 8',
+ 'description': 'md5:3023dcdd375db1bc9967186cdb3f1399',
+ 'series': 'The Jonathan Ross Show',
+ 'episode_number': 8,
+ 'season_number': 17,
+ 'thumbnail': r're:https?://hubimages\.itv\.com/episode/2_1873_0002'
},
'params': {
# m3u8 download
@@ -51,22 +74,16 @@ class ITVIE(InfoExtractor):
'only_matching': True,
}]
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- params = extract_attributes(self._search_regex(
- r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params'))
-
- ios_playlist_url = params.get('data-video-playlist') or params['data-video-id']
- hmac = params['data-video-hmac']
- headers = self.geo_verification_headers()
- headers.update({
+ def _generate_api_headers(self, hmac):
+ return merge_dicts({
'Accept': 'application/vnd.itv.vod.playlist.v2+json',
'Content-Type': 'application/json',
'hmac': hmac.upper(),
- })
- ios_playlist = self._download_json(
- ios_playlist_url, video_id, data=json.dumps({
+ }, self.geo_verification_headers())
+
+ def _call_api(self, video_id, playlist_url, headers, platform_tag, featureset, fatal=True):
+ return self._download_json(
+ playlist_url, video_id, data=json.dumps({
'user': {
'itvUserId': '',
'entitlements': [],
@@ -87,15 +104,61 @@ class ITVIE(InfoExtractor):
},
'variantAvailability': {
'featureset': {
- 'min': ['hls', 'aes', 'outband-webvtt'],
- 'max': ['hls', 'aes', 'outband-webvtt']
+ 'min': featureset,
+ 'max': featureset
},
- 'platformTag': 'dotcom'
+ 'platformTag': platform_tag
}
- }).encode(), headers=headers)
- video_data = ios_playlist['Playlist']['Video']
- ios_base_url = video_data.get('Base')
+ }).encode(), headers=headers, fatal=fatal)
+
+ def _get_subtitles(self, video_id, variants, ios_playlist_url, headers, *args, **kwargs):
+ subtitles = {}
+ # Prefer last matching featureset
+ # See: https://github.com/hypervideo/hypervideo/issues/986
+ platform_tag_subs, featureset_subs = next(
+ ((platform_tag, featureset)
+ for platform_tag, featuresets in reversed(variants.items()) for featureset in featuresets
+ if try_get(featureset, lambda x: x[2]) == 'outband-webvtt'),
+ (None, None))
+
+ if platform_tag_subs and featureset_subs:
+ subs_playlist = self._call_api(
+ video_id, ios_playlist_url, headers, platform_tag_subs, featureset_subs, fatal=False)
+ subs = try_get(subs_playlist, lambda x: x['Playlist']['Video']['Subtitles'], list) or []
+ for sub in subs:
+ if not isinstance(sub, dict):
+ continue
+ href = url_or_none(sub.get('Href'))
+ if not href:
+ continue
+ subtitles.setdefault('en', []).append({'url': href})
+ return subtitles
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ params = extract_attributes(self._search_regex(
+ r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params'))
+ variants = self._parse_json(
+ try_get(params, lambda x: x['data-video-variants'], compat_str) or '{}',
+ video_id, fatal=False)
+ # Prefer last matching featureset
+ # See: https://github.com/hypervideo/hypervideo/issues/986
+ platform_tag_video, featureset_video = next(
+ ((platform_tag, featureset)
+ for platform_tag, featuresets in reversed(variants.items()) for featureset in featuresets
+ if try_get(featureset, lambda x: x[:2]) == ['hls', 'aes']),
+ (None, None))
+ if not platform_tag_video or not featureset_video:
+ raise ExtractorError('No downloads available', expected=True, video_id=video_id)
+
+ ios_playlist_url = params.get('data-video-playlist') or params['data-video-id']
+ headers = self._generate_api_headers(params['data-video-hmac'])
+ ios_playlist = self._call_api(
+ video_id, ios_playlist_url, headers, platform_tag_video, featureset_video)
+
+ video_data = try_get(ios_playlist, lambda x: x['Playlist']['Video'], dict) or {}
+ ios_base_url = video_data.get('Base')
formats = []
for media_file in (video_data.get('MediaFiles') or []):
href = media_file.get('Href')
@@ -113,20 +176,6 @@ class ITVIE(InfoExtractor):
'url': href,
})
self._sort_formats(formats)
-
- subtitles = {}
- subs = video_data.get('Subtitles') or []
- for sub in subs:
- if not isinstance(sub, dict):
- continue
- href = url_or_none(sub.get('Href'))
- if not href:
- continue
- subtitles.setdefault('en', []).append({
- 'url': href,
- 'ext': determine_ext(href, 'vtt'),
- })
-
info = self._search_json_ld(webpage, video_id, default={})
if not info:
json_ld = self._parse_json(self._search_regex(
@@ -140,25 +189,45 @@ class ITVIE(InfoExtractor):
info = self._json_ld(item, video_id, fatal=False) or {}
break
+ thumbnails = []
+ thumbnail_url = try_get(params, lambda x: x['data-video-posterframe'], compat_str)
+ if thumbnail_url:
+ thumbnails.extend([{
+ 'url': thumbnail_url.format(width=1920, height=1080, quality=100, blur=0, bg='false'),
+ 'width': 1920,
+ 'height': 1080,
+ }, {
+ 'url': urljoin(base_url(thumbnail_url), url_basename(thumbnail_url)),
+ 'preference': -2
+ }])
+
+ thumbnail_url = self._html_search_meta(['og:image', 'twitter:image'], webpage, default=None)
+ if thumbnail_url:
+ thumbnails.append({
+ 'url': thumbnail_url,
+ })
+ self._remove_duplicate_formats(thumbnails)
+
return merge_dicts({
'id': video_id,
'title': self._html_search_meta(['og:title', 'twitter:title'], webpage),
'formats': formats,
- 'subtitles': subtitles,
+ 'subtitles': self.extract_subtitles(video_id, variants, ios_playlist_url, headers),
'duration': parse_duration(video_data.get('Duration')),
'description': clean_html(get_element_by_class('episode-info__synopsis', webpage)),
+ 'thumbnails': thumbnails
}, info)
class ITVBTCCIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TEST = {
- 'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch',
+ 'url': 'https://www.itv.com/btcc/articles/btcc-2019-brands-hatch-gp-race-action',
'info_dict': {
- 'id': 'btcc-2018-all-the-action-from-brands-hatch',
- 'title': 'BTCC 2018: All the action from Brands Hatch',
+ 'id': 'btcc-2019-brands-hatch-gp-race-action',
+ 'title': 'BTCC 2019: Brands Hatch GP race action',
},
- 'playlist_mincount': 9,
+ 'playlist_count': 12,
}
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s'
@@ -167,6 +236,16 @@ class ITVBTCCIE(InfoExtractor):
webpage = self._download_webpage(url, playlist_id)
+ json_map = try_get(self._parse_json(self._html_search_regex(
+ '(?s)<script[^>]+id=[\'"]__NEXT_DATA__[^>]*>([^<]+)</script>', webpage, 'json_map'), playlist_id),
+ lambda x: x['props']['pageProps']['article']['body']['content']) or []
+
+ # Discard empty objects
+ video_ids = []
+ for video in json_map:
+ if video['data'].get('id'):
+ video_ids.append(video['data']['id'])
+
entries = [
self.url_result(
smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {
@@ -178,7 +257,7 @@ class ITVBTCCIE(InfoExtractor):
'referrer': url,
}),
ie=BrightcoveNewIE.ie_key(), video_id=video_id)
- for video_id in re.findall(r'data-video-id=["\'](\d+)', webpage)]
+ for video_id in video_ids]
title = self._og_search_title(webpage, fatal=False)
diff --git a/hypervideo_dl/extractor/ivi.py b/hypervideo_dl/extractor/ivi.py
index 04c54e8..5f8a046 100644
--- a/hypervideo_dl/extractor/ivi.py
+++ b/hypervideo_dl/extractor/ivi.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
import json
import re
-import sys
from .common import InfoExtractor
from ..utils import (
@@ -94,20 +93,21 @@ class IviIE(InfoExtractor):
]
})
- bundled = hasattr(sys, 'frozen')
-
for site in (353, 183):
content_data = (data % site).encode()
if site == 353:
- if bundled:
- continue
try:
from Cryptodome.Cipher import Blowfish
from Cryptodome.Hash import CMAC
- pycryptodomex_found = True
+ pycryptodome_found = True
except ImportError:
- pycryptodomex_found = False
- continue
+ try:
+ from Crypto.Cipher import Blowfish
+ from Crypto.Hash import CMAC
+ pycryptodome_found = True
+ except ImportError:
+ pycryptodome_found = False
+ continue
timestamp = (self._download_json(
self._LIGHT_URL, video_id,
@@ -140,14 +140,8 @@ class IviIE(InfoExtractor):
extractor_msg = 'Video %s does not exist'
elif site == 353:
continue
- elif bundled:
- raise ExtractorError(
- 'This feature does not work from bundled exe. Run hypervideo from sources.',
- expected=True)
- elif not pycryptodomex_found:
- raise ExtractorError(
- 'pycryptodomex not found. Please install it.',
- expected=True)
+ elif not pycryptodome_found:
+ raise ExtractorError('pycryptodomex not found. Please install', expected=True)
elif message:
extractor_msg += ': ' + message
raise ExtractorError(extractor_msg % video_id, expected=True)
@@ -163,7 +157,10 @@ class IviIE(InfoExtractor):
for f in result.get('files', []):
f_url = f.get('url')
content_format = f.get('content_format')
- if not f_url or '-MDRM-' in content_format or '-FPS-' in content_format:
+ if not f_url:
+ continue
+ if (not self.get_param('allow_unplayable_formats')
+ and ('-MDRM-' in content_format or '-FPS-' in content_format)):
continue
formats.append({
'url': f_url,
@@ -242,7 +239,7 @@ class IviCompilationIE(InfoExtractor):
r'<a\b[^>]+\bhref=["\']/watch/%s/(\d+)["\']' % compilation_id, html)]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
compilation_id = mobj.group('compilationid')
season_id = mobj.group('seasonid')
diff --git a/hypervideo_dl/extractor/ivideon.py b/hypervideo_dl/extractor/ivideon.py
index 3ca824f..01e7b22 100644
--- a/hypervideo_dl/extractor/ivideon.py
+++ b/hypervideo_dl/extractor/ivideon.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import (
@@ -38,7 +37,7 @@ class IvideonIE(InfoExtractor):
_QUALITIES = ('low', 'mid', 'hi')
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
server_id, camera_id = mobj.group('id'), mobj.group('camera_id')
camera_name, description = None, None
camera_url = compat_urlparse.urljoin(
diff --git a/hypervideo_dl/extractor/iwara.py b/hypervideo_dl/extractor/iwara.py
index 907d5fc..254d986 100644
--- a/hypervideo_dl/extractor/iwara.py
+++ b/hypervideo_dl/extractor/iwara.py
@@ -1,5 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlparse
@@ -8,6 +9,8 @@ from ..utils import (
mimetype2ext,
remove_end,
url_or_none,
+ unified_strdate,
+ strip_or_none,
)
@@ -21,6 +24,10 @@ class IwaraIE(InfoExtractor):
'ext': 'mp4',
'title': '【MMD R-18】ガールフレンド carry_me_off',
'age_limit': 18,
+ 'thumbnail': 'https://i.iwara.tv/sites/default/files/videos/thumbnails/7951/thumbnail-7951_0001.png',
+ 'uploader': 'Reimu丨Action',
+ 'upload_date': '20150828',
+ 'description': 'md5:1d4905ce48c66c9299c617f08e106e0f',
},
}, {
'url': 'http://ecchi.iwara.tv/videos/Vb4yf2yZspkzkBO',
@@ -72,6 +79,19 @@ class IwaraIE(InfoExtractor):
title = remove_end(self._html_search_regex(
r'<title>([^<]+)</title>', webpage, 'title'), ' | Iwara')
+ thumbnail = self._html_search_regex(
+ r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None)
+
+ uploader = self._html_search_regex(
+ r'class="username">([^<]+)', webpage, 'uploader', fatal=False)
+
+ upload_date = unified_strdate(self._html_search_regex(
+ r'作成日:([^\s]+)', webpage, 'upload_date', fatal=False))
+
+ description = strip_or_none(self._search_regex(
+ r'<p>(.+?(?=</div))', webpage, 'description', fatal=False,
+ flags=re.DOTALL))
+
formats = []
for a_format in video_data:
format_uri = url_or_none(a_format.get('uri'))
@@ -96,4 +116,8 @@ class IwaraIE(InfoExtractor):
'title': title,
'age_limit': age_limit,
'formats': formats,
+ 'thumbnail': self._proto_relative_url(thumbnail, 'https:'),
+ 'uploader': uploader,
+ 'upload_date': upload_date,
+ 'description': description,
}
diff --git a/hypervideo_dl/extractor/jeuxvideo.py b/hypervideo_dl/extractor/jeuxvideo.py
index e9f4ed7..77c0f52 100644
--- a/hypervideo_dl/extractor/jeuxvideo.py
+++ b/hypervideo_dl/extractor/jeuxvideo.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
@@ -25,7 +24,7 @@ class JeuxVideoIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
title = mobj.group(1)
webpage = self._download_webpage(url, title)
title = self._html_search_meta('name', webpage) or self._og_search_title(webpage)
diff --git a/hypervideo_dl/extractor/joj.py b/hypervideo_dl/extractor/joj.py
index 62b28e9..6376181 100644
--- a/hypervideo_dl/extractor/joj.py
+++ b/hypervideo_dl/extractor/joj.py
@@ -1,108 +1,108 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import (
- int_or_none,
- js_to_json,
- try_get,
-)
-
-
-class JojIE(InfoExtractor):
- _VALID_URL = r'''(?x)
- (?:
- joj:|
- https?://media\.joj\.sk/embed/
- )
- (?P<id>[^/?#^]+)
- '''
- _TESTS = [{
- 'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',
- 'info_dict': {
- 'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932',
- 'ext': 'mp4',
- 'title': 'NOVÉ BÝVANIE',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 3118,
- }
- }, {
- 'url': 'https://media.joj.sk/embed/9i1cxv',
- 'only_matching': True,
- }, {
- 'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932',
- 'only_matching': True,
- }, {
- 'url': 'joj:9i1cxv',
- 'only_matching': True,
- }]
-
- @staticmethod
- def _extract_urls(webpage):
- return [
- mobj.group('url')
- for mobj in re.finditer(
- r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1',
- webpage)]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(
- 'https://media.joj.sk/embed/%s' % video_id, video_id)
-
- title = self._search_regex(
- (r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
- r'<title>(?P<title>[^<]+)'), webpage, 'title',
- default=None, group='title') or self._og_search_title(webpage)
-
- bitrates = self._parse_json(
- self._search_regex(
- r'(?s)(?:src|bitrates)\s*=\s*({.+?});', webpage, 'bitrates',
- default='{}'),
- video_id, transform_source=js_to_json, fatal=False)
-
- formats = []
- for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []:
- if isinstance(format_url, compat_str):
- height = self._search_regex(
- r'(\d+)[pP]\.', format_url, 'height', default=None)
- formats.append({
- 'url': format_url,
- 'format_id': '%sp' % height if height else None,
- 'height': int(height),
- })
- if not formats:
- playlist = self._download_xml(
- 'https://media.joj.sk/services/Video.php?clip=%s' % video_id,
- video_id)
- for file_el in playlist.findall('./files/file'):
- path = file_el.get('path')
- if not path:
- continue
- format_id = file_el.get('id') or file_el.get('label')
- formats.append({
- 'url': 'http://n16.joj.sk/storage/%s' % path.replace(
- 'dat/', '', 1),
- 'format_id': format_id,
- 'height': int_or_none(self._search_regex(
- r'(\d+)[pP]', format_id or path, 'height',
- default=None)),
- })
- self._sort_formats(formats)
-
- thumbnail = self._og_search_thumbnail(webpage)
-
- duration = int_or_none(self._search_regex(
- r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
-
- return {
- 'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'formats': formats,
- }
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ js_to_json,
+ try_get,
+)
+
+
+class JojIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ (?:
+ joj:|
+ https?://media\.joj\.sk/embed/
+ )
+ (?P<id>[^/?#^]+)
+ '''
+ _TESTS = [{
+ 'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',
+ 'info_dict': {
+ 'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932',
+ 'ext': 'mp4',
+ 'title': 'NOVÉ BÝVANIE',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 3118,
+ }
+ }, {
+ 'url': 'https://media.joj.sk/embed/9i1cxv',
+ 'only_matching': True,
+ }, {
+ 'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932',
+ 'only_matching': True,
+ }, {
+ 'url': 'joj:9i1cxv',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return [
+ mobj.group('url')
+ for mobj in re.finditer(
+ r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1',
+ webpage)]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'https://media.joj.sk/embed/%s' % video_id, video_id)
+
+ title = self._search_regex(
+ (r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
+ r'<title>(?P<title>[^<]+)'), webpage, 'title',
+ default=None, group='title') or self._og_search_title(webpage)
+
+ bitrates = self._parse_json(
+ self._search_regex(
+ r'(?s)(?:src|bitrates)\s*=\s*({.+?});', webpage, 'bitrates',
+ default='{}'),
+ video_id, transform_source=js_to_json, fatal=False)
+
+ formats = []
+ for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []:
+ if isinstance(format_url, compat_str):
+ height = self._search_regex(
+ r'(\d+)[pP]\.', format_url, 'height', default=None)
+ formats.append({
+ 'url': format_url,
+ 'format_id': '%sp' % height if height else None,
+ 'height': int(height),
+ })
+ if not formats:
+ playlist = self._download_xml(
+ 'https://media.joj.sk/services/Video.php?clip=%s' % video_id,
+ video_id)
+ for file_el in playlist.findall('./files/file'):
+ path = file_el.get('path')
+ if not path:
+ continue
+ format_id = file_el.get('id') or file_el.get('label')
+ formats.append({
+ 'url': 'http://n16.joj.sk/storage/%s' % path.replace(
+ 'dat/', '', 1),
+ 'format_id': format_id,
+ 'height': int_or_none(self._search_regex(
+ r'(\d+)[pP]', format_id or path, 'height',
+ default=None)),
+ })
+ self._sort_formats(formats)
+
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ duration = int_or_none(self._search_regex(
+ r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ }
diff --git a/hypervideo_dl/extractor/jove.py b/hypervideo_dl/extractor/jove.py
index 27e0e37..4b7dfc5 100644
--- a/hypervideo_dl/extractor/jove.py
+++ b/hypervideo_dl/extractor/jove.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -41,7 +40,7 @@ class JoveIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
diff --git a/hypervideo_dl/extractor/jwplatform.py b/hypervideo_dl/extractor/jwplatform.py
index c34b5f5..5aa508b 100644
--- a/hypervideo_dl/extractor/jwplatform.py
+++ b/hypervideo_dl/extractor/jwplatform.py
@@ -32,9 +32,14 @@ class JWPlatformIE(InfoExtractor):
@staticmethod
def _extract_urls(webpage):
- return re.findall(
- r'<(?:script|iframe)[^>]+?src=["\']((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})',
- webpage)
+ for tag, key in ((r'(?:script|iframe)', 'src'), ('input', 'value')):
+ # <input value=URL> is used by hyland.com
+            # if we find <iframe>, don't look for <input>
+ ret = re.findall(
+ r'<%s[^>]+?%s=["\']((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})' % (tag, key),
+ webpage)
+ if ret:
+ return ret
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
diff --git a/hypervideo_dl/extractor/kakao.py b/hypervideo_dl/extractor/kakao.py
index 31ce7a8..97c986d 100644
--- a/hypervideo_dl/extractor/kakao.py
+++ b/hypervideo_dl/extractor/kakao.py
@@ -3,21 +3,19 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..compat import compat_str
from ..utils import (
- ExtractorError,
int_or_none,
- str_or_none,
strip_or_none,
- try_get,
+ traverse_obj,
unified_timestamp,
- update_url_query,
)
class KakaoIE(InfoExtractor):
_VALID_URL = r'https?://(?:play-)?tv\.kakao\.com/(?:channel/\d+|embed/player)/cliplink/(?P<id>\d+|[^?#&]+@my)'
- _API_BASE_TMPL = 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/'
+ _API_BASE_TMPL = 'http://tv.kakao.com/api/v1/ft/playmeta/cliplink/%s/'
+ _CDN_API = 'https://tv.kakao.com/katz/v1/ft/cliplink/%s/readyNplay?'
_TESTS = [{
'url': 'http://tv.kakao.com/channel/2671005/cliplink/301965083',
@@ -26,7 +24,7 @@ class KakaoIE(InfoExtractor):
'id': '301965083',
'ext': 'mp4',
'title': '乃木坂46 バナナマン 「3期生紹介コーナーが始動!顔高低差GPも!」 『乃木坂工事中』',
- 'uploader_id': '2671005',
+ 'uploader_id': 2671005,
'uploader': '그랑그랑이',
'timestamp': 1488160199,
'upload_date': '20170227',
@@ -39,31 +37,17 @@ class KakaoIE(InfoExtractor):
'ext': 'mp4',
'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회',
'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)',
- 'uploader_id': '2653210',
+ 'uploader_id': 2653210,
'uploader': '쇼! 음악중심',
'timestamp': 1485684628,
'upload_date': '20170129',
}
- }, {
- # geo restricted
- 'url': 'https://tv.kakao.com/channel/3643855/cliplink/412069491',
- 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
- display_id = video_id.rstrip('@my')
api_base = self._API_BASE_TMPL % video_id
-
- player_header = {
- 'Referer': update_url_query(
- 'http://tv.kakao.com/embed/player/cliplink/%s' % video_id, {
- 'service': 'kakao_tv',
- 'autoplay': '1',
- 'profile': 'HIGH',
- 'wmode': 'transparent',
- })
- }
+ cdn_api_base = self._CDN_API % video_id
query = {
'player': 'monet_html5',
@@ -75,64 +59,69 @@ class KakaoIE(InfoExtractor):
'fields': ','.join([
'-*', 'tid', 'clipLink', 'displayTitle', 'clip', 'title',
'description', 'channelId', 'createTime', 'duration', 'playCount',
- 'likeCount', 'commentCount', 'tagList', 'channel', 'name', 'thumbnailUrl',
+ 'likeCount', 'commentCount', 'tagList', 'channel', 'name',
+ 'clipChapterThumbnailList', 'thumbnailUrl', 'timeInSec', 'isDefault',
'videoOutputList', 'width', 'height', 'kbps', 'profile', 'label'])
}
- impress = self._download_json(
- api_base + 'impress', display_id, 'Downloading video info',
- query=query, headers=player_header)
+ api_json = self._download_json(
+ api_base, video_id, 'Downloading video info')
- clip_link = impress['clipLink']
+ clip_link = api_json['clipLink']
clip = clip_link['clip']
title = clip.get('title') or clip_link.get('displayTitle')
- query.update({
- 'fields': '-*,code,message,url',
- 'tid': impress.get('tid') or '',
- })
-
formats = []
- for fmt in (clip.get('videoOutputList') or []):
- try:
- profile_name = fmt['profile']
- if profile_name == 'AUDIO':
- continue
- query['profile'] = profile_name
- try:
- fmt_url_json = self._download_json(
- api_base + 'raw/videolocation', display_id,
- 'Downloading video URL for profile %s' % profile_name,
- query=query, headers=player_header)
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- resp = self._parse_json(e.cause.read().decode(), video_id)
- if resp.get('code') == 'GeoBlocked':
- self.raise_geo_restricted()
- continue
+ for fmt in clip.get('videoOutputList', []):
+ profile_name = fmt.get('profile')
+ if not profile_name or profile_name == 'AUDIO':
+ continue
+ query.update({
+ 'profile': profile_name,
+ 'fields': '-*,url',
+ })
+
+ fmt_url_json = self._download_json(
+ cdn_api_base, video_id,
+ 'Downloading video URL for profile %s' % profile_name,
+ query=query, fatal=False)
+ fmt_url = traverse_obj(fmt_url_json, ('videoLocation', 'url'))
+ if not fmt_url:
+ continue
- fmt_url = fmt_url_json['url']
- formats.append({
- 'url': fmt_url,
- 'format_id': profile_name,
- 'width': int_or_none(fmt.get('width')),
- 'height': int_or_none(fmt.get('height')),
- 'format_note': fmt.get('label'),
- 'filesize': int_or_none(fmt.get('filesize')),
- 'tbr': int_or_none(fmt.get('kbps')),
- })
- except KeyError:
- pass
+ formats.append({
+ 'url': fmt_url,
+ 'format_id': profile_name,
+ 'width': int_or_none(fmt.get('width')),
+ 'height': int_or_none(fmt.get('height')),
+ 'format_note': fmt.get('label'),
+ 'filesize': int_or_none(fmt.get('filesize')),
+ 'tbr': int_or_none(fmt.get('kbps')),
+ })
self._sort_formats(formats)
+ thumbs = []
+ for thumb in clip.get('clipChapterThumbnailList') or []:
+ thumbs.append({
+ 'url': thumb.get('thumbnailUrl'),
+ 'id': compat_str(thumb.get('timeInSec')),
+ 'preference': -1 if thumb.get('isDefault') else 0
+ })
+ top_thumbnail = clip.get('thumbnailUrl')
+ if top_thumbnail:
+ thumbs.append({
+ 'url': top_thumbnail,
+ 'preference': 10,
+ })
+
return {
- 'id': display_id,
+ 'id': video_id,
'title': title,
'description': strip_or_none(clip.get('description')),
- 'uploader': try_get(clip_link, lambda x: x['channel']['name']),
- 'uploader_id': str_or_none(clip_link.get('channelId')),
- 'thumbnail': clip.get('thumbnailUrl'),
+ 'uploader': traverse_obj(clip_link, ('channel', 'name')),
+ 'uploader_id': clip_link.get('channelId'),
+ 'thumbnails': thumbs,
'timestamp': unified_timestamp(clip_link.get('createTime')),
'duration': int_or_none(clip.get('duration')),
'view_count': int_or_none(clip.get('playCount')),
diff --git a/hypervideo_dl/extractor/kaltura.py b/hypervideo_dl/extractor/kaltura.py
index c731612..c8f60ef 100644
--- a/hypervideo_dl/extractor/kaltura.py
+++ b/hypervideo_dl/extractor/kaltura.py
@@ -229,7 +229,7 @@ class KalturaIE(InfoExtractor):
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
partner_id, entry_id = mobj.group('partner_id', 'id')
ks = None
captions = None
@@ -309,7 +309,7 @@ class KalturaIE(InfoExtractor):
if f.get('fileExt') == 'chun':
continue
# DRM-protected video, cannot be decrypted
- if f.get('fileExt') == 'wvm':
+ if not self.get_param('allow_unplayable_formats') and f.get('fileExt') == 'wvm':
continue
if not f.get('fileExt'):
# QT indicates QuickTime; some videos have broken fileExt
diff --git a/hypervideo_dl/extractor/kanalplay.py b/hypervideo_dl/extractor/kanalplay.py
new file mode 100644
index 0000000..5e24f7e
--- /dev/null
+++ b/hypervideo_dl/extractor/kanalplay.py
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+ srt_subtitles_timecode,
+)
+
+
+class KanalPlayIE(InfoExtractor):
+ IE_DESC = 'Kanal 5/9/11 Play'
+ _VALID_URL = r'https?://(?:www\.)?kanal(?P<channel_id>5|9|11)play\.se/(?:#!/)?(?:play/)?program/\d+/video/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://www.kanal5play.se/#!/play/program/3060212363/video/3270012277',
+ 'info_dict': {
+ 'id': '3270012277',
+ 'ext': 'flv',
+ 'title': 'Saknar både dusch och avlopp',
+ 'description': 'md5:6023a95832a06059832ae93bc3c7efb7',
+ 'duration': 2636.36,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://www.kanal9play.se/#!/play/program/335032/video/246042',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.kanal11play.se/#!/play/program/232835958/video/367135199',
+ 'only_matching': True,
+ }]
+
+ def _fix_subtitles(self, subs):
+ return '\r\n\r\n'.join(
+ '%s\r\n%s --> %s\r\n%s'
+ % (
+ num,
+ srt_subtitles_timecode(item['startMillis'] / 1000.0),
+ srt_subtitles_timecode(item['endMillis'] / 1000.0),
+ item['text'],
+ ) for num, item in enumerate(subs, 1))
+
+ def _get_subtitles(self, channel_id, video_id):
+ subs = self._download_json(
+ 'http://www.kanal%splay.se/api/subtitles/%s' % (channel_id, video_id),
+ video_id, 'Downloading subtitles JSON', fatal=False)
+ return {'sv': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} if subs else {}
+
+ def _real_extract(self, url):
+ mobj = self._match_valid_url(url)
+ video_id = mobj.group('id')
+ channel_id = mobj.group('channel_id')
+
+ video = self._download_json(
+ 'http://www.kanal%splay.se/api/getVideo?format=FLASH&videoId=%s' % (channel_id, video_id),
+ video_id)
+
+ reasons_for_no_streams = video.get('reasonsForNoStreams')
+ if reasons_for_no_streams:
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, '\n'.join(reasons_for_no_streams)),
+ expected=True)
+
+ title = video['title']
+ description = video.get('description')
+ duration = float_or_none(video.get('length'), 1000)
+ thumbnail = video.get('posterUrl')
+
+ stream_base_url = video['streamBaseUrl']
+
+ formats = [{
+ 'url': stream_base_url,
+ 'play_path': stream['source'],
+ 'ext': 'flv',
+ 'tbr': float_or_none(stream.get('bitrate'), 1000),
+ 'rtmp_real_time': True,
+ } for stream in video['streams']]
+ self._sort_formats(formats)
+
+ subtitles = {}
+ if video.get('hasSubtitle'):
+ subtitles = self.extract_subtitles(channel_id, video_id)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
diff --git a/hypervideo_dl/extractor/keezmovies.py b/hypervideo_dl/extractor/keezmovies.py
index c3eb74c..027f43c 100644
--- a/hypervideo_dl/extractor/keezmovies.py
+++ b/hypervideo_dl/extractor/keezmovies.py
@@ -35,7 +35,7 @@ class KeezMoviesIE(InfoExtractor):
}]
def _extract_info(self, url, fatal=True):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = (mobj.group('display_id')
if 'display_id' in mobj.groupdict()
@@ -101,7 +101,7 @@ class KeezMoviesIE(InfoExtractor):
if not formats:
if 'title="This video is no longer available"' in webpage:
- raise ExtractorError(
+ self.raise_no_formats(
'Video %s is no longer available' % video_id, expected=True)
try:
diff --git a/hypervideo_dl/extractor/kinja.py b/hypervideo_dl/extractor/kinja.py
index 79e3026..1be8b48 100644
--- a/hypervideo_dl/extractor/kinja.py
+++ b/hypervideo_dl/extractor/kinja.py
@@ -129,7 +129,7 @@ class KinjaEmbedIE(InfoExtractor):
webpage)]
def _real_extract(self, url):
- video_type, video_id = re.match(self._VALID_URL, url).groups()
+ video_type, video_id = self._match_valid_url(url).groups()
provider = self._PROVIDER_MAP.get(video_type)
if provider:
diff --git a/hypervideo_dl/extractor/koo.py b/hypervideo_dl/extractor/koo.py
new file mode 100644
index 0000000..8154ba7
--- /dev/null
+++ b/hypervideo_dl/extractor/koo.py
@@ -0,0 +1,116 @@
+# coding: utf-8
+from __future__ import unicode_literals
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ try_get,
+)
+
+
+class KooIE(InfoExtractor):
+ _VALID_URL = r'(?:https?://)(?:www\.)?kooapp\.com/koo/[^/]+/(?P<id>[^/&#$?]+)'
+ _TESTS = [{ # Test for video in the comments
+ 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/946c4189-bc2d-4524-b95b-43f641e2adde',
+ 'info_dict': {
+ 'id': '946c4189-bc2d-4524-b95b-43f641e2adde',
+ 'ext': 'mp4',
+ 'title': 'test for video in comment',
+ 'description': 'md5:daa77dc214add4da8b6ea7d2226776e7',
+ 'timestamp': 1632215195,
+ 'uploader_id': 'ytdlpTestAccount',
+ 'uploader': 'hypervideoTestAccount',
+ 'duration': 7000,
+ 'upload_date': '20210921'
+ },
+ 'params': {'skip_download': True}
+ }, { # Test for koo with long title
+ 'url': 'https://www.kooapp.com/koo/laxman_kumarDBFEC/33decbf7-5e1e-4bb8-bfd7-04744a064361',
+ 'info_dict': {
+ 'id': '33decbf7-5e1e-4bb8-bfd7-04744a064361',
+ 'ext': 'mp4',
+ 'title': 'md5:47a71c2337295330c5a19a8af1bbf450',
+ 'description': 'md5:06a6a84e9321499486dab541693d8425',
+ 'timestamp': 1632106884,
+ 'uploader_id': 'laxman_kumarDBFEC',
+ 'uploader': 'Laxman Kumar 🇮🇳',
+ 'duration': 46000,
+ 'upload_date': '20210920'
+ },
+ 'params': {'skip_download': True}
+ }, { # Test for audio
+ 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/a2a9c88e-ce4b-4d2d-952f-d06361c5b602',
+ 'info_dict': {
+ 'id': 'a2a9c88e-ce4b-4d2d-952f-d06361c5b602',
+ 'ext': 'mp4',
+ 'title': 'Test for audio',
+ 'description': 'md5:ecb9a2b6a5d34b736cecb53788cb11e8',
+ 'timestamp': 1632211634,
+ 'uploader_id': 'ytdlpTestAccount',
+ 'uploader': 'hypervideoTestAccount',
+ 'duration': 214000,
+ 'upload_date': '20210921'
+ },
+ 'params': {'skip_download': True}
+ }, { # Test for video
+ 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/a3e56c53-c1ed-4ac9-ac02-ed1630e6b1d1',
+ 'info_dict': {
+ 'id': 'a3e56c53-c1ed-4ac9-ac02-ed1630e6b1d1',
+ 'ext': 'mp4',
+ 'title': 'Test for video',
+ 'description': 'md5:7afc4eb839074ddeb2beea5dd6fe9500',
+ 'timestamp': 1632211468,
+ 'uploader_id': 'ytdlpTestAccount',
+ 'uploader': 'hypervideoTestAccount',
+ 'duration': 14000,
+ 'upload_date': '20210921'
+ },
+ 'params': {'skip_download': True}
+ }, { # Test for link
+ 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/01bf5b94-81a5-4d8e-a387-5f732022e15a',
+ 'skip': 'No video/audio found at the provided url.',
+ 'info_dict': {
+ 'id': '01bf5b94-81a5-4d8e-a387-5f732022e15a',
+ 'title': 'Test for link',
+ 'ext': 'none',
+ },
+ }, { # Test for images
+ 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/dc05d9cd-a61d-45fd-bb07-e8019d8ca8cb',
+ 'skip': 'No video/audio found at the provided url.',
+ 'info_dict': {
+ 'id': 'dc05d9cd-a61d-45fd-bb07-e8019d8ca8cb',
+ 'title': 'Test for images',
+ 'ext': 'none',
+ },
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ data_json = self._download_json(f'https://www.kooapp.com/apiV1/ku/{id}?limit=20&offset=0&showSimilarKoos=true', id)['parentContent']
+ item_json = next(content['items'][0] for content in data_json
+ if try_get(content, lambda x: x['items'][0]['id']) == id)
+ media_json = item_json['mediaMap']
+ formats = []
+
+ mp4_url = media_json.get('videoMp4')
+ video_m3u8_url = media_json.get('videoHls')
+ if mp4_url:
+ formats.append({
+ 'url': mp4_url,
+ 'ext': 'mp4',
+ })
+ if video_m3u8_url:
+ formats.extend(self._extract_m3u8_formats(video_m3u8_url, id, fatal=False, ext='mp4'))
+ if not formats:
+ self.raise_no_formats('No video/audio found at the provided url.', expected=True)
+
+ self._sort_formats(formats)
+ return {
+ 'id': id,
+ 'title': clean_html(item_json.get('title')),
+ 'description': f'{clean_html(item_json.get("title"))}\n\n{clean_html(item_json.get("enTransliteration"))}',
+ 'timestamp': item_json.get('createdAt'),
+ 'uploader_id': item_json.get('handle'),
+ 'uploader': item_json.get('name'),
+ 'duration': media_json.get('duration'),
+ 'formats': formats,
+ }
diff --git a/hypervideo_dl/extractor/kusi.py b/hypervideo_dl/extractor/kusi.py
index 9833d35..707fe18 100644
--- a/hypervideo_dl/extractor/kusi.py
+++ b/hypervideo_dl/extractor/kusi.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import random
-import re
from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote_plus
@@ -35,7 +34,7 @@ class KUSIIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
clip_id = mobj.group('clipId')
video_id = clip_id or mobj.group('path')
diff --git a/hypervideo_dl/extractor/kuwo.py b/hypervideo_dl/extractor/kuwo.py
index cc5b2a1..460a425 100644
--- a/hypervideo_dl/extractor/kuwo.py
+++ b/hypervideo_dl/extractor/kuwo.py
@@ -49,7 +49,7 @@ class KuwoBaseIE(InfoExtractor):
'url': song_url,
'format_id': file_format['format'],
'format': file_format['format'],
- 'preference': file_format['preference'],
+ 'quality': file_format['preference'],
'abr': file_format.get('abr'),
})
diff --git a/hypervideo_dl/extractor/la7.py b/hypervideo_dl/extractor/la7.py
index c3b4ffa..363fbd6 100644
--- a/hypervideo_dl/extractor/la7.py
+++ b/hypervideo_dl/extractor/la7.py
@@ -1,10 +1,15 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
- js_to_json,
+ determine_ext,
+ float_or_none,
+ parse_duration,
smuggle_url,
+ unified_strdate,
)
@@ -23,22 +28,13 @@ class LA7IE(InfoExtractor):
'id': '0_42j6wd36',
'ext': 'mp4',
'title': 'Inc.Cool8',
- 'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico',
+ 'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico',
'thumbnail': 're:^https?://.*',
'uploader_id': 'kdla7pillole@iltrovatore.it',
'timestamp': 1443814869,
'upload_date': '20151002',
},
}, {
- # 'src' is a dictionary
- 'url': 'http://tg.la7.it/repliche-tgla7?id=189080',
- 'md5': '6b0d8888d286e39870208dfeceaf456b',
- 'info_dict': {
- 'id': '189080',
- 'ext': 'mp4',
- 'title': 'TG LA7',
- },
- }, {
'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077',
'only_matching': True,
}]
@@ -46,22 +42,162 @@ class LA7IE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
+ if not url.startswith('http'):
+ url = '%s//%s' % (self.http_scheme(), url)
+
webpage = self._download_webpage(url, video_id)
- player_data = self._parse_json(
- self._search_regex(
- [r'(?s)videoParams\s*=\s*({.+?});', r'videoLa7\(({[^;]+})\);'],
- webpage, 'player data'),
- video_id, transform_source=js_to_json)
+ player_data = self._search_regex(
+ [r'(?s)videoParams\s*=\s*({.+?});', r'videoLa7\(({[^;]+})\);'],
+ webpage, 'player data')
+ vid = self._search_regex(r'vid\s*:\s*"(.+?)",', player_data, 'vid')
return {
'_type': 'url_transparent',
- 'url': smuggle_url('kaltura:103:%s' % player_data['vid'], {
+ 'url': smuggle_url('kaltura:103:%s' % vid, {
'service_url': 'http://nkdam.iltrovatore.it',
}),
'id': video_id,
- 'title': player_data['title'],
+ 'title': self._og_search_title(webpage, default=None),
'description': self._og_search_description(webpage, default=None),
- 'thumbnail': player_data.get('poster'),
+ 'thumbnail': self._og_search_thumbnail(webpage, default=None),
'ie_key': 'Kaltura',
}
+
+
+class LA7PodcastEpisodeIE(InfoExtractor):
+ IE_NAME = 'la7.it:pod:episode'
+ _VALID_URL = r'''(?x)(https?://)?
+ (?:www\.)?la7\.it/[^/]+/podcast/([^/]+-)?(?P<id>\d+)'''
+
+ _TESTS = [{
+ 'url': 'https://www.la7.it/voicetown/podcast/la-carezza-delle-memoria-di-carlo-verdone-23-03-2021-371497',
+ 'md5': '7737d4d79b3c1a34b3de3e16297119ed',
+ 'info_dict': {
+ 'id': '371497',
+ 'ext': 'mp3',
+ 'title': '"La carezza delle memoria" di Carlo Verdone',
+ 'description': 'md5:5abf07c3c551a687db80af3f9ceb7d52',
+ 'thumbnail': 'https://www.la7.it/sites/default/files/podcast/371497.jpg',
+ 'upload_date': '20210323',
+ },
+ }, {
+ # embed url
+ 'url': 'https://www.la7.it/embed/podcast/371497',
+ 'only_matching': True,
+ }, {
+ # date already in the title
+ 'url': 'https://www.la7.it/propagandalive/podcast/lintervista-di-diego-bianchi-ad-annalisa-cuzzocrea-puntata-del-1932021-20-03-2021-371130',
+ 'only_matching': True,
+ }, {
+ # title same as show_title
+ 'url': 'https://www.la7.it/otto-e-mezzo/podcast/otto-e-mezzo-26-03-2021-372340',
+ 'only_matching': True,
+ }]
+
+ def _extract_info(self, webpage, video_id=None, ppn=None):
+ if not video_id:
+ video_id = self._search_regex(
+ r'data-nid=([\'"])(?P<vid>\d+)\1',
+ webpage, 'video_id', group='vid')
+
+ media_url = self._search_regex(
+ (r'src:\s*([\'"])(?P<url>.+?mp3.+?)\1',
+ r'data-podcast=([\'"])(?P<url>.+?mp3.+?)\1'),
+ webpage, 'media_url', group='url')
+ ext = determine_ext(media_url)
+ formats = [{
+ 'url': media_url,
+ 'format_id': ext,
+ 'ext': ext,
+ }]
+ self._sort_formats(formats)
+
+ title = self._html_search_regex(
+ (r'<div class="title">(?P<title>.+?)</',
+ r'<title>(?P<title>[^<]+)</title>',
+ r'title:\s*([\'"])(?P<title>.+?)\1'),
+ webpage, 'title', group='title')
+
+ description = (
+ self._html_search_regex(
+ (r'<div class="description">(.+?)</div>',
+ r'<div class="description-mobile">(.+?)</div>',
+ r'<div class="box-txt">([^<]+?)</div>',
+ r'<div class="field-content"><p>(.+?)</p></div>'),
+ webpage, 'description', default=None)
+ or self._html_search_meta('description', webpage))
+
+ thumb = self._html_search_regex(
+ (r'<div class="podcast-image"><img src="(.+?)"></div>',
+ r'<div class="container-embed"[^<]+url\((.+?)\);">',
+ r'<div class="field-content"><img src="(.+?)"'),
+ webpage, 'thumbnail', fatal=False, default=None)
+
+ duration = parse_duration(self._html_search_regex(
+ r'<span class="(?:durata|duration)">([\d:]+)</span>',
+ webpage, 'duration', fatal=False, default=None))
+
+ date = self._html_search_regex(
+ r'class="data">\s*(?:<span>)?([\d\.]+)\s*</',
+ webpage, 'date', default=None)
+
+ date_alt = self._search_regex(
+ r'(\d+[\./]\d+[\./]\d+)', title, 'date_alt', default=None)
+ ppn = ppn or self._search_regex(
+ r'ppN:\s*([\'"])(?P<ppn>.+?)\1',
+ webpage, 'ppn', group='ppn', default=None)
+ # if the date is not in the title
+ # and title is the same as the show_title
+ # add the date to the title
+ if date and not date_alt and ppn and ppn.lower() == title.lower():
+ title += ' del %s' % date
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'duration': float_or_none(duration),
+ 'formats': formats,
+ 'thumbnail': thumb,
+ 'upload_date': unified_strdate(date),
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ return self._extract_info(webpage, video_id)
+
+
+class LA7PodcastIE(LA7PodcastEpisodeIE):
+ IE_NAME = 'la7.it:podcast'
+ _VALID_URL = r'(https?://)?(www\.)?la7\.it/(?P<id>[^/]+)/podcast/?(?:$|[#?])'
+
+ _TESTS = [{
+ 'url': 'https://www.la7.it/propagandalive/podcast',
+ 'info_dict': {
+ 'id': 'propagandalive',
+ 'title': "Propaganda Live",
+ },
+ 'playlist_count': 10,
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+ webpage = self._download_webpage(url, playlist_id)
+
+ title = (
+ self._html_search_regex(
+ r'<h1.*?>(.+?)</h1>', webpage, 'title', fatal=False, default=None)
+ or self._og_search_title(webpage))
+ ppn = self._search_regex(
+ r'window\.ppN\s*=\s*([\'"])(?P<ppn>.+?)\1',
+ webpage, 'ppn', group='ppn', default=None)
+
+ entries = []
+ for episode in re.finditer(
+ r'<div class="container-podcast-property">([\s\S]+?)(?:</div>\s*){3}',
+ webpage):
+ entries.append(self._extract_info(episode.group(1), ppn=ppn))
+
+ return self.playlist_result(entries, playlist_id, title)
diff --git a/hypervideo_dl/extractor/lbry.py b/hypervideo_dl/extractor/lbry.py
index cfd6b83..0f87bf1 100644
--- a/hypervideo_dl/extractor/lbry.py
+++ b/hypervideo_dl/extractor/lbry.py
@@ -6,16 +6,15 @@ import json
from .common import InfoExtractor
from ..compat import (
- compat_parse_qs,
compat_str,
compat_urllib_parse_unquote,
- compat_urllib_parse_urlparse,
)
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
mimetype2ext,
+ parse_qs,
OnDemandPagedList,
try_get,
urljoin,
@@ -23,27 +22,34 @@ from ..utils import (
class LBRYBaseIE(InfoExtractor):
- _BASE_URL_REGEX = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/'
+ _BASE_URL_REGEX = r'(?:https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/|lbry://)'
_CLAIM_ID_REGEX = r'[0-9a-f]{1,40}'
- _OPT_CLAIM_ID = '[^:/?#&]+(?::%s)?' % _CLAIM_ID_REGEX
+ _OPT_CLAIM_ID = '[^:/?#&]+(?:[:#]%s)?' % _CLAIM_ID_REGEX
_SUPPORTED_STREAM_TYPES = ['video', 'audio']
def _call_api_proxy(self, method, display_id, params, resource):
- return self._download_json(
+ response = self._download_json(
'https://api.lbry.tv/api/v1/proxy',
display_id, 'Downloading %s JSON metadata' % resource,
headers={'Content-Type': 'application/json-rpc'},
data=json.dumps({
'method': method,
'params': params,
- }).encode())['result']
+ }).encode())
+ err = response.get('error')
+ if err:
+ raise ExtractorError(
+ f'{self.IE_NAME} said: {err.get("code")} - {err.get("message")}', expected=True)
+ return response['result']
def _resolve_url(self, url, display_id, resource):
return self._call_api_proxy(
'resolve', display_id, {'urls': url}, resource)[url]
def _permanent_url(self, url, claim_name, claim_id):
- return urljoin(url, '/%s:%s' % (claim_name, claim_id))
+ return urljoin(
+ url.replace('lbry://', 'https://lbry.tv/'),
+ '/%s:%s' % (claim_name, claim_id))
def _parse_stream(self, stream, url):
stream_value = stream.get('value') or {}
@@ -164,6 +170,9 @@ class LBRYIE(LBRYBaseIE):
}, {
'url': 'https://lbry.tv/@lacajadepandora:a/TRUMP-EST%C3%81-BIEN-PUESTO-con-Pilar-Baselga,-Carlos-Senra,-Luis-Palacios-(720p_30fps_H264-192kbit_AAC):1',
'only_matching': True,
+ }, {
+ 'url': 'lbry://@lbry#3f/odysee#7',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -201,7 +210,7 @@ class LBRYIE(LBRYBaseIE):
class LBRYChannelIE(LBRYBaseIE):
IE_NAME = 'lbry:channel'
- _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'(?P<id>@%s)/?(?:[?#&]|$)' % LBRYBaseIE._OPT_CLAIM_ID
+ _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'(?P<id>@%s)/?(?:[?&]|$)' % LBRYBaseIE._OPT_CLAIM_ID
_TESTS = [{
'url': 'https://lbry.tv/@LBRYFoundation:0',
'info_dict': {
@@ -213,6 +222,9 @@ class LBRYChannelIE(LBRYBaseIE):
}, {
'url': 'https://lbry.tv/@LBRYFoundation',
'only_matching': True,
+ }, {
+ 'url': 'lbry://@lbry#3f',
+ 'only_matching': True,
}]
_PAGE_SIZE = 50
@@ -248,7 +260,7 @@ class LBRYChannelIE(LBRYBaseIE):
result = self._resolve_url(
'lbry://' + display_id, display_id, 'channel')
claim_id = result['claim_id']
- qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ qs = parse_qs(url)
content = qs.get('content', [None])[0]
params = {
'fee_amount': qs.get('fee_amount', ['>=0'])[0],
diff --git a/hypervideo_dl/extractor/lecturio.py b/hypervideo_dl/extractor/lecturio.py
index 1b2dcef..9d22287 100644
--- a/hypervideo_dl/extractor/lecturio.py
+++ b/hypervideo_dl/extractor/lecturio.py
@@ -103,7 +103,7 @@ class LecturioIE(LecturioBaseIE):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
nt = mobj.group('nt') or mobj.group('nt_de')
lecture_id = mobj.group('id')
display_id = nt or lecture_id
@@ -196,7 +196,7 @@ class LecturioCourseIE(LecturioBaseIE):
}]
def _real_extract(self, url):
- nt, course_id = re.match(self._VALID_URL, url).groups()
+ nt, course_id = self._match_valid_url(url).groups()
display_id = nt or course_id
api_path = 'courses/' + course_id if course_id else 'course/content/' + nt + '.json'
course = self._download_json(
diff --git a/hypervideo_dl/extractor/leeco.py b/hypervideo_dl/extractor/leeco.py
index 7dc0ad7..d5e1142 100644
--- a/hypervideo_dl/extractor/leeco.py
+++ b/hypervideo_dl/extractor/leeco.py
@@ -185,7 +185,7 @@ class LeIE(InfoExtractor):
f['height'] = int_or_none(format_id[:-1])
formats.append(f)
- self._sort_formats(formats, ('height', 'quality', 'format_id'))
+ self._sort_formats(formats, ('res', 'quality'))
publish_time = parse_iso8601(self._html_search_regex(
r'发布时间&nbsp;([^<>]+) ', page, 'publish time', default=None),
diff --git a/hypervideo_dl/extractor/lego.py b/hypervideo_dl/extractor/lego.py
index 1e3c19d..b9d8b16 100644
--- a/hypervideo_dl/extractor/lego.py
+++ b/hypervideo_dl/extractor/lego.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
import uuid
from .common import InfoExtractor
@@ -64,7 +63,7 @@ class LEGOIE(InfoExtractor):
}
def _real_extract(self, url):
- locale, video_id = re.match(self._VALID_URL, url).groups()
+ locale, video_id = self._match_valid_url(url).groups()
countries = [locale.split('-')[1].upper()]
self._initialize_geo_bypass({
'countries': countries,
diff --git a/hypervideo_dl/extractor/libsyn.py b/hypervideo_dl/extractor/libsyn.py
index 2cf4442..d1fcda4 100644
--- a/hypervideo_dl/extractor/libsyn.py
+++ b/hypervideo_dl/extractor/libsyn.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -42,7 +41,7 @@ class LibsynIE(InfoExtractor):
}]
def _real_extract(self, url):
- url, video_id = re.match(self._VALID_URL, url).groups()
+ url, video_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, video_id)
data = self._parse_json(self._search_regex(
diff --git a/hypervideo_dl/extractor/lifenews.py b/hypervideo_dl/extractor/lifenews.py
index 42e263b..49a0a59 100644
--- a/hypervideo_dl/extractor/lifenews.py
+++ b/hypervideo_dl/extractor/lifenews.py
@@ -201,7 +201,7 @@ class LifeEmbedIE(InfoExtractor):
formats.append({
'url': original_url,
'format_id': determine_ext(original_url, None),
- 'preference': 1,
+ 'quality': 1,
})
playlist = self._parse_json(
diff --git a/hypervideo_dl/extractor/limelight.py b/hypervideo_dl/extractor/limelight.py
index 39f74d2..369141d 100644
--- a/hypervideo_dl/extractor/limelight.py
+++ b/hypervideo_dl/extractor/limelight.py
@@ -96,7 +96,9 @@ class LimelightBaseIE(InfoExtractor):
urls = []
for stream in pc_item.get('streams', []):
stream_url = stream.get('url')
- if not stream_url or stream.get('drmProtected') or stream_url in urls:
+ if not stream_url or stream_url in urls:
+ continue
+ if not self.get_param('allow_unplayable_formats') and stream.get('drmProtected'):
continue
urls.append(stream_url)
ext = determine_ext(stream_url)
@@ -158,7 +160,10 @@ class LimelightBaseIE(InfoExtractor):
for mobile_url in mobile_item.get('mobileUrls', []):
media_url = mobile_url.get('mobileUrl')
format_id = mobile_url.get('targetMediaPlatform')
- if not media_url or format_id in ('Widevine', 'SmoothStreaming') or media_url in urls:
+ if not media_url or media_url in urls:
+ continue
+ if (format_id in ('Widevine', 'SmoothStreaming')
+ and not self.get_param('allow_unplayable_formats', False)):
continue
urls.append(media_url)
ext = determine_ext(media_url)
@@ -173,7 +178,7 @@ class LimelightBaseIE(InfoExtractor):
formats.append({
'url': media_url,
'format_id': format_id,
- 'preference': -1,
+ 'quality': -10,
'ext': ext,
})
diff --git a/hypervideo_dl/extractor/line.py b/hypervideo_dl/extractor/line.py
index 2526daa..d4bcae6 100644
--- a/hypervideo_dl/extractor/line.py
+++ b/hypervideo_dl/extractor/line.py
@@ -1,12 +1,10 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
- ExtractorError,
int_or_none,
js_to_json,
str_or_none,
@@ -32,7 +30,7 @@ class LineTVIE(InfoExtractor):
}]
def _real_extract(self, url):
- series_id, segment = re.match(self._VALID_URL, url).groups()
+ series_id, segment = self._match_valid_url(url).groups()
video_id = '%s_%s' % (series_id, segment)
webpage = self._download_webpage(url, video_id)
@@ -77,7 +75,7 @@ class LineTVIE(InfoExtractor):
self._sort_formats(formats)
- if not formats[0].get('width'):
+ if formats and not formats[0].get('width'):
formats[0]['vcodec'] = 'none'
title = self._og_search_title(webpage)
@@ -155,7 +153,7 @@ class LineLiveIE(LineLiveBaseIE):
}]
def _real_extract(self, url):
- channel_id, broadcast_id = re.match(self._VALID_URL, url).groups()
+ channel_id, broadcast_id = self._match_valid_url(url).groups()
broadcast = self._download_json(
self._API_BASE_URL + '%s/broadcast/%s' % (channel_id, broadcast_id),
broadcast_id)
@@ -183,7 +181,7 @@ class LineLiveIE(LineLiveBaseIE):
if not formats:
archive_status = item.get('archiveStatus')
if archive_status != 'ARCHIVED':
- raise ExtractorError('this video has been ' + archive_status.lower(), expected=True)
+ self.raise_no_formats('this video has been ' + archive_status.lower(), expected=True)
self._sort_formats(formats)
info['formats'] = formats
return info
diff --git a/hypervideo_dl/extractor/linkedin.py b/hypervideo_dl/extractor/linkedin.py
index 26fc703..3ce906e 100644
--- a/hypervideo_dl/extractor/linkedin.py
+++ b/hypervideo_dl/extractor/linkedin.py
@@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
+from itertools import zip_longest
import re
from .common import InfoExtractor
@@ -8,6 +9,8 @@ from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
+ srt_subtitles_timecode,
+ try_get,
urlencode_postdata,
urljoin,
)
@@ -86,8 +89,18 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
},
}
+ def json2srt(self, transcript_lines, duration=None):
+ srt_data = ''
+ for line, (line_dict, next_dict) in enumerate(zip_longest(transcript_lines, transcript_lines[1:])):
+ start_time, caption = line_dict['transcriptStartAt'] / 1000, line_dict['caption']
+ end_time = next_dict['transcriptStartAt'] / 1000 if next_dict else duration or start_time + 1
+ srt_data += '%d\n%s --> %s\n%s\n\n' % (line + 1, srt_subtitles_timecode(start_time),
+ srt_subtitles_timecode(end_time),
+ caption)
+ return srt_data
+
def _real_extract(self, url):
- course_slug, video_slug = re.match(self._VALID_URL, url).groups()
+ course_slug, video_slug = self._match_valid_url(url).groups()
video_data = None
formats = []
@@ -101,6 +114,7 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
formats.append({
'format_id': 'progressive-%dp' % height,
'url': progressive_url,
+ 'ext': 'mp4',
'height': height,
'width': width,
'source_preference': 1,
@@ -124,7 +138,18 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
streaming_url, video_slug, 'mp4',
'm3u8_native', m3u8_id='hls', fatal=False))
- self._sort_formats(formats, ('width', 'height', 'source_preference', 'tbr', 'abr'))
+ # It seems like this would be correctly handled by default
+ # However, unless someone can confirm this, the old
+ # behaviour is being kept as-is
+ self._sort_formats(formats, ('res', 'source_preference'))
+ subtitles = {}
+ duration = int_or_none(video_data.get('durationInSeconds'))
+ transcript_lines = try_get(video_data, lambda x: x['transcript']['lines'], expected_type=list)
+ if transcript_lines:
+ subtitles['en'] = [{
+ 'ext': 'srt',
+ 'data': self.json2srt(transcript_lines, duration)
+ }]
return {
'id': self._get_video_id(video_data, course_slug, video_slug),
@@ -132,7 +157,8 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
'formats': formats,
'thumbnail': video_data.get('defaultThumbnail'),
'timestamp': float_or_none(video_data.get('publishedOn'), 1000),
- 'duration': int_or_none(video_data.get('durationInSeconds')),
+ 'duration': duration,
+ 'subtitles': subtitles,
}
diff --git a/hypervideo_dl/extractor/linuxacademy.py b/hypervideo_dl/extractor/linuxacademy.py
index 7ec4a65..2053970 100644
--- a/hypervideo_dl/extractor/linuxacademy.py
+++ b/hypervideo_dl/extractor/linuxacademy.py
@@ -2,7 +2,6 @@ from __future__ import unicode_literals
import json
import random
-import re
from .common import InfoExtractor
from ..compat import (
@@ -38,8 +37,8 @@ class LinuxAcademyIE(InfoExtractor):
'ext': 'mp4',
'title': 'What Is Data Science',
'description': 'md5:c574a3c20607144fb36cb65bdde76c99',
- 'timestamp': 1607387907,
- 'upload_date': '20201208',
+ 'timestamp': int, # The timestamp and upload date changes
+ 'upload_date': r're:\d+',
'duration': 304,
},
'params': {
@@ -59,6 +58,16 @@ class LinuxAcademyIE(InfoExtractor):
},
'playlist_count': 41,
'skip': 'Requires Linux Academy account credentials',
+ }, {
+ 'url': 'https://linuxacademy.com/cp/modules/view/id/39',
+ 'info_dict': {
+ 'id': '39',
+ 'title': 'Red Hat Certified Systems Administrator - RHCSA (EX200) Exam Prep (legacy)',
+ 'description': 'md5:0f1d3369e90c3fb14a79813b863c902f',
+ 'duration': 89280,
+ },
+ 'playlist_count': 73,
+ 'skip': 'Requires Linux Academy account credentials',
}]
_AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
@@ -102,7 +111,7 @@ class LinuxAcademyIE(InfoExtractor):
'client_id': self._CLIENT_ID,
'redirect_uri': self._ORIGIN_URL,
'tenant': 'lacausers',
- 'connection': 'Username-Password-Authentication',
+ 'connection': 'Username-Password-ACG-Proxy',
'username': username,
'password': password,
'sso': 'true',
@@ -152,7 +161,7 @@ class LinuxAcademyIE(InfoExtractor):
% access_token, None, 'Downloading token validation page')
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id)
@@ -162,7 +171,7 @@ class LinuxAcademyIE(InfoExtractor):
if course_id:
module = self._parse_json(
self._search_regex(
- r'window\.module\s*=\s*({.+?})\s*;', webpage, 'module'),
+ r'window\.module\s*=\s*({(?:(?!};)[^"]|"([^"]|\\")*")+})\s*;', webpage, 'module'),
item_id)
entries = []
chapter_number = None
diff --git a/hypervideo_dl/extractor/litv.py b/hypervideo_dl/extractor/litv.py
index 337b1b1..18d237e 100644
--- a/hypervideo_dl/extractor/litv.py
+++ b/hypervideo_dl/extractor/litv.py
@@ -71,7 +71,7 @@ class LiTVIE(InfoExtractor):
video_id = self._match_id(url)
- noplaylist = self._downloader.params.get('noplaylist')
+ noplaylist = self.get_param('noplaylist')
noplaylist_prompt = True
if 'force_noplaylist' in data:
noplaylist = data['force_noplaylist']
diff --git a/hypervideo_dl/extractor/livestream.py b/hypervideo_dl/extractor/livestream.py
index e55b1a2..f591289 100644
--- a/hypervideo_dl/extractor/livestream.py
+++ b/hypervideo_dl/extractor/livestream.py
@@ -84,7 +84,7 @@ class LivestreamIE(InfoExtractor):
'format_id': 'smil_%d' % tbr,
'ext': 'flv',
'tbr': tbr,
- 'preference': -1000,
+ 'preference': -1000, # Strictly inferior than all other formats?
})
return formats
@@ -212,7 +212,7 @@ class LivestreamIE(InfoExtractor):
return self.playlist_result(entries, event_id, event_data['full_name'])
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
event = mobj.group('event_id') or mobj.group('event_name')
account = mobj.group('account_id') or mobj.group('account_name')
@@ -319,7 +319,7 @@ class LivestreamOriginalIE(InfoExtractor):
return self.playlist_result(entries, folder_id)
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
user = mobj.group('user')
url_type = mobj.group('type')
content_id = mobj.group('id')
@@ -359,7 +359,7 @@ class LivestreamShortenerIE(InfoExtractor):
_VALID_URL = r'https?://livestre\.am/(?P<id>.+)'
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
id = mobj.group('id')
webpage = self._download_webpage(url, id)
diff --git a/hypervideo_dl/extractor/lnkgo.py b/hypervideo_dl/extractor/lnkgo.py
index 3e71852..1467596 100644
--- a/hypervideo_dl/extractor/lnkgo.py
+++ b/hypervideo_dl/extractor/lnkgo.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -58,7 +57,7 @@ class LnkGoIE(InfoExtractor):
_M3U8_TEMPL = 'https://vod.lnk.lt/lnk_vod/lnk/lnk/%s:%s/playlist.m3u8%s'
def _real_extract(self, url):
- display_id, video_id = re.match(self._VALID_URL, url).groups()
+ display_id, video_id = self._match_valid_url(url).groups()
video_info = self._download_json(
'https://lnk.lt/api/main/video-page/%s/%s/false' % (display_id, video_id or '0'),
diff --git a/hypervideo_dl/extractor/localnews8.py b/hypervideo_dl/extractor/localnews8.py
index aad3961..c3e9d10 100644
--- a/hypervideo_dl/extractor/localnews8.py
+++ b/hypervideo_dl/extractor/localnews8.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
@@ -25,7 +24,7 @@ class LocalNews8IE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id')
diff --git a/hypervideo_dl/extractor/lovehomeporn.py b/hypervideo_dl/extractor/lovehomeporn.py
index 8f65a3c..ca4b5f3 100644
--- a/hypervideo_dl/extractor/lovehomeporn.py
+++ b/hypervideo_dl/extractor/lovehomeporn.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .nuevo import NuevoBaseIE
@@ -23,7 +22,7 @@ class LoveHomePornIE(NuevoBaseIE):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id')
diff --git a/hypervideo_dl/extractor/lrt.py b/hypervideo_dl/extractor/lrt.py
index 89d5498..4024aef 100644
--- a/hypervideo_dl/extractor/lrt.py
+++ b/hypervideo_dl/extractor/lrt.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -47,7 +46,7 @@ class LRTIE(InfoExtractor):
webpage, var_name.replace('_', ' '), default, group=2)
def _real_extract(self, url):
- path, video_id = re.match(self._VALID_URL, url).groups()
+ path, video_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, video_id)
media_url = self._extract_js_var(webpage, 'main_url', path)
diff --git a/hypervideo_dl/extractor/lynda.py b/hypervideo_dl/extractor/lynda.py
index b3d8653..58cf172 100644
--- a/hypervideo_dl/extractor/lynda.py
+++ b/hypervideo_dl/extractor/lynda.py
@@ -128,7 +128,7 @@ class LyndaIE(LyndaBaseIE):
'Video %s is only available for members' % video_id)
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
course_id = mobj.group('course_id')
@@ -281,7 +281,7 @@ class LyndaCourseIE(LyndaBaseIE):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
course_path = mobj.group('coursepath')
course_id = mobj.group('courseid')
@@ -331,7 +331,7 @@ class LyndaCourseIE(LyndaBaseIE):
})
if unaccessible_videos > 0:
- self._downloader.report_warning(
+ self.report_warning(
'%s videos are only available for members (or paid members) and will not be downloaded. '
% unaccessible_videos + self._ACCOUNT_CREDENTIALS_HINT)
diff --git a/hypervideo_dl/extractor/magentamusik360.py b/hypervideo_dl/extractor/magentamusik360.py
new file mode 100644
index 0000000..5c27490
--- /dev/null
+++ b/hypervideo_dl/extractor/magentamusik360.py
@@ -0,0 +1,61 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class MagentaMusik360IE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?magenta-musik-360\.de/([a-z0-9-]+-(?P<id>[0-9]+)|festivals/.+)'
+ _TESTS = [{
+ 'url': 'https://www.magenta-musik-360.de/within-temptation-wacken-2019-1-9208205928595185932',
+ 'md5': '65b6f060b40d90276ec6fb9b992c1216',
+ 'info_dict': {
+ 'id': '9208205928595185932',
+ 'ext': 'm3u8',
+ 'title': 'WITHIN TEMPTATION',
+ 'description': 'Robert Westerholt und Sharon Janny den Adel gründeten die Symphonic Metal-Band. Privat sind die Niederländer ein Paar und haben zwei Kinder. Die Single Ice Queen brachte ihnen Platin und Gold und verhalf 2002 zum internationalen Durchbruch. Charakteristisch für die Band war Anfangs der hohe Gesang von Frontfrau Sharon. Stilistisch fing die Band im Gothic Metal an. Mit neuem Sound, schnellen Gitarrenriffs und Gitarrensoli, avancierte Within Temptation zur erfolgreichen Rockband. Auch dieses Jahr wird die Band ihre Fangemeinde wieder mitreißen.',
+ }
+ }, {
+ 'url': 'https://www.magenta-musik-360.de/festivals/wacken-world-wide-2020-body-count-feat-ice-t',
+ 'md5': '81010d27d7cab3f7da0b0f681b983b7e',
+ 'info_dict': {
+ 'id': '9208205928595231363',
+ 'ext': 'm3u8',
+ 'title': 'Body Count feat. Ice-T',
+ 'description': 'Body Count feat. Ice-T konnten bereits im vergangenen Jahr auf dem „Holy Ground“ in Wacken überzeugen. 2020 gehen die Crossover-Metaller aus einem Club in Los Angeles auf Sendung und bringen mit ihrer Mischung aus Metal und Hip-Hop Abwechslung und ordentlich Alarm zum WWW. Bereits seit 1990 stehen die beiden Gründer Ice-T (Gesang) und Ernie C (Gitarre) auf der Bühne. Sieben Studioalben hat die Gruppe bis jetzt veröffentlicht, darunter das Debüt „Body Count“ (1992) mit dem kontroversen Track „Cop Killer“.',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ # _match_id casts to string, but since "None" is not a valid video_id for magenta
+ # there is no risk for confusion
+ if video_id == "None":
+ webpage = self._download_webpage(url, video_id)
+ video_id = self._html_search_regex(r'data-asset-id="([^"]+)"', webpage, 'video_id')
+ json = self._download_json("https://wcps.t-online.de/cvss/magentamusic/vodplayer/v3/player/58935/%s/Main%%20Movie" % video_id, video_id)
+ xml_url = json['content']['feature']['representations'][0]['contentPackages'][0]['media']['href']
+ metadata = json['content']['feature'].get('metadata')
+ title = None
+ description = None
+ duration = None
+ thumbnails = []
+ if metadata:
+ title = metadata.get('title')
+ description = metadata.get('fullDescription')
+ duration = metadata.get('runtimeInSeconds')
+ for img_key in ('teaserImageWide', 'smallCoverImage'):
+ if img_key in metadata:
+ thumbnails.append({'url': metadata[img_key].get('href')})
+
+ xml = self._download_xml(xml_url, video_id)
+ final_url = xml[0][0][0].attrib['src']
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'url': final_url,
+ 'duration': duration,
+ 'thumbnails': thumbnails
+ }
diff --git a/hypervideo_dl/extractor/mailru.py b/hypervideo_dl/extractor/mailru.py
index 65cc474..5d9f80b 100644
--- a/hypervideo_dl/extractor/mailru.py
+++ b/hypervideo_dl/extractor/mailru.py
@@ -12,6 +12,7 @@ from ..utils import (
parse_duration,
remove_end,
try_get,
+ urljoin,
)
@@ -20,10 +21,10 @@ class MailRuIE(InfoExtractor):
IE_DESC = 'Видео@Mail.Ru'
_VALID_URL = r'''(?x)
https?://
- (?:(?:www|m)\.)?my\.mail\.ru/+
+ (?:(?:www|m|videoapi)\.)?my\.mail\.ru/+
(?:
video/.*\#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|
- (?:(?P<idv2prefix>(?:[^/]+/+){2})video/(?P<idv2suffix>[^/]+/\d+))\.html|
+ (?:videos/embed/)?(?:(?P<idv2prefix>(?:[^/]+/+){2})(?:video/(?:embed/)?)?(?P<idv2suffix>[^/]+/\d+))(?:\.html)?|
(?:video/embed|\+/video/meta)/(?P<metaid>\d+)
)
'''
@@ -93,11 +94,19 @@ class MailRuIE(InfoExtractor):
{
'url': 'https://my.mail.ru//list//sinyutin10/video/_myvideo/4.html',
'only_matching': True,
+ },
+ {
+ 'url': 'https://my.mail.ru/mail/cloud-strife/video/embed/Games/2009',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'https://videoapi.my.mail.ru/videos/embed/mail/cloud-strife/Games/2009.html',
+ 'only_matching': True,
}
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
meta_id = mobj.group('metaid')
video_id = None
@@ -108,15 +117,21 @@ class MailRuIE(InfoExtractor):
if not video_id:
video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix')
webpage = self._download_webpage(url, video_id)
- page_config = self._parse_json(self._search_regex(
+ page_config = self._parse_json(self._search_regex([
r'(?s)<script[^>]+class="sp-video__page-config"[^>]*>(.+?)</script>',
+ r'(?s)"video":\s*({.+?}),'],
webpage, 'page config', default='{}'), video_id, fatal=False)
if page_config:
- meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl')
+ meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl') or page_config.get('metadataUrl')
else:
meta_url = None
video_data = None
+
+ # fix meta_url if missing the host address
+        if meta_url and re.match(r'^\/\+\/', meta_url):
+ meta_url = urljoin('https://my.mail.ru', meta_url)
+
if meta_url:
video_data = self._download_json(
meta_url, video_id or meta_id, 'Downloading video meta JSON',
diff --git a/hypervideo_dl/extractor/manoto.py b/hypervideo_dl/extractor/manoto.py
new file mode 100644
index 0000000..d12aa5f
--- /dev/null
+++ b/hypervideo_dl/extractor/manoto.py
@@ -0,0 +1,138 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ int_or_none,
+ traverse_obj
+)
+
+
+_API_URL = 'https://dak1vd5vmi7x6.cloudfront.net/api/v1/publicrole/{}/{}?id={}'
+
+
+class ManotoTVIE(InfoExtractor):
+ IE_DESC = 'Manoto TV (Episode)'
+ _VALID_URL = r'https?://(?:www\.)?manototv\.com/episode/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'https://www.manototv.com/episode/8475',
+ 'info_dict': {
+ 'id': '8475',
+ 'series': 'خانه های رویایی با برادران اسکات',
+ 'season_number': 7,
+ 'episode_number': 25,
+ 'episode_id': 'My Dream Home S7: Carol & John',
+ 'duration': 3600,
+ 'categories': ['سرگرمی'],
+ 'title': 'کارول و جان',
+ 'description': 'md5:d0fff1f8ba5c6775d312a00165d1a97e',
+ 'thumbnail': r're:^https?://.*\.(jpeg|png|jpg)$',
+ 'ext': 'mp4'
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ }
+ }, {
+ 'url': 'https://www.manototv.com/episode/12576',
+ 'info_dict': {
+ 'id': '12576',
+ 'series': 'فیلم های ایرانی',
+ 'episode_id': 'Seh Mah Taatili',
+ 'duration': 5400,
+ 'view_count': int,
+ 'categories': ['سرگرمی'],
+ 'title': 'سه ماه تعطیلی',
+ 'description': 'سه ماه تعطیلی فیلمی به کارگردانی و نویسندگی شاپور قریب ساختهٔ سال ۱۳۵۶ است.',
+ 'thumbnail': r're:^https?://.*\.(jpeg|png|jpg)$',
+ 'ext': 'mp4'
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ episode_json = self._download_json(_API_URL.format('showmodule', 'episodedetails', video_id), video_id)
+ details = episode_json.get('details', {})
+ formats = self._extract_m3u8_formats(details.get('videoM3u8Url'), video_id, 'mp4')
+ self._sort_formats(formats)
+ return {
+ 'id': video_id,
+ 'series': details.get('showTitle'),
+ 'season_number': int_or_none(details.get('analyticsSeasonNumber')),
+ 'episode_number': int_or_none(details.get('episodeNumber')),
+ 'episode_id': details.get('analyticsEpisodeTitle'),
+ 'duration': int_or_none(details.get('durationInMinutes'), invscale=60),
+ 'view_count': details.get('viewCount'),
+ 'categories': [details.get('videoCategory')],
+ 'title': details.get('episodeTitle'),
+ 'description': clean_html(details.get('episodeDescription')),
+ 'thumbnail': details.get('episodelandscapeImgIxUrl'),
+ 'formats': formats,
+ }
+
+
+class ManotoTVShowIE(InfoExtractor):
+ IE_DESC = 'Manoto TV (Show)'
+ _VALID_URL = r'https?://(?:www\.)?manototv\.com/show/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'https://www.manototv.com/show/2526',
+ 'playlist_mincount': 68,
+ 'info_dict': {
+ 'id': '2526',
+ 'title': 'فیلم های ایرانی',
+ 'description': 'مجموعه ای از فیلم های سینمای کلاسیک ایران',
+ },
+ }]
+
+ def _real_extract(self, url):
+ show_id = self._match_id(url)
+ show_json = self._download_json(_API_URL.format('showmodule', 'details', show_id), show_id)
+ show_details = show_json.get('details', {})
+ title = show_details.get('showTitle')
+ description = show_details.get('showSynopsis')
+
+ series_json = self._download_json(_API_URL.format('showmodule', 'serieslist', show_id), show_id)
+ playlist_id = str(traverse_obj(series_json, ('details', 'list', 0, 'id')))
+
+ playlist_json = self._download_json(_API_URL.format('showmodule', 'episodelist', playlist_id), playlist_id)
+ playlist = traverse_obj(playlist_json, ('details', 'list')) or []
+
+ entries = [
+ self.url_result(
+ 'https://www.manototv.com/episode/%s' % item['slideID'], ie=ManotoTVIE.ie_key(), video_id=item['slideID'])
+ for item in playlist]
+ return self.playlist_result(entries, show_id, title, description)
+
+
+class ManotoTVLiveIE(InfoExtractor):
+ IE_DESC = 'Manoto TV (Live)'
+ _VALID_URL = r'https?://(?:www\.)?manototv\.com/live/'
+ _TEST = {
+ 'url': 'https://www.manototv.com/live/',
+ 'info_dict': {
+ 'id': 'live',
+ 'title': 'Manoto TV Live',
+ 'ext': 'mp4',
+ 'is_live': True,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = 'live'
+ json = self._download_json(_API_URL.format('livemodule', 'details', ''), video_id)
+ details = json.get('details', {})
+ video_url = details.get('liveUrl')
+ formats = self._extract_m3u8_formats(video_url, video_id, 'mp4', live=True)
+ self._sort_formats(formats)
+ return {
+ 'id': video_id,
+ 'title': 'Manoto TV Live',
+ 'is_live': True,
+ 'formats': formats,
+ }
diff --git a/hypervideo_dl/extractor/massengeschmacktv.py b/hypervideo_dl/extractor/massengeschmacktv.py
index cfcc6b2..b381d31 100644
--- a/hypervideo_dl/extractor/massengeschmacktv.py
+++ b/hypervideo_dl/extractor/massengeschmacktv.py
@@ -67,7 +67,7 @@ class MassengeschmackTVIE(InfoExtractor):
'vcodec': 'none' if format_id.startswith('Audio') else None,
})
- self._sort_formats(formats, ('width', 'height', 'filesize', 'tbr'))
+ self._sort_formats(formats)
return {
'id': episode,
diff --git a/hypervideo_dl/extractor/mdr.py b/hypervideo_dl/extractor/mdr.py
index dc6aa98..0bdd626 100644
--- a/hypervideo_dl/extractor/mdr.py
+++ b/hypervideo_dl/extractor/mdr.py
@@ -137,11 +137,11 @@ class MDRIE(InfoExtractor):
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
video_url, video_id, 'mp4', entry_protocol='m3u8_native',
- preference=0, m3u8_id='HLS', fatal=False))
+ quality=1, m3u8_id='HLS', fatal=False))
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
video_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id,
- preference=0, f4m_id='HDS', fatal=False))
+ quality=1, f4m_id='HDS', fatal=False))
else:
media_type = xpath_text(asset, './mediaType', 'media type', default='MP4')
vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000)
diff --git a/hypervideo_dl/extractor/medaltv.py b/hypervideo_dl/extractor/medaltv.py
index 67bb4de..2ece5aa 100644
--- a/hypervideo_dl/extractor/medaltv.py
+++ b/hypervideo_dl/extractor/medaltv.py
@@ -103,11 +103,11 @@ class MedalTVIE(InfoExtractor):
error = clip.get('error')
if not formats and error:
if error == 404:
- raise ExtractorError(
+ self.raise_no_formats(
'That clip does not exist.',
expected=True, video_id=video_id)
else:
- raise ExtractorError(
+ self.raise_no_formats(
'An unknown error occurred ({0}).'.format(error),
video_id=video_id)
diff --git a/hypervideo_dl/extractor/mediaite.py b/hypervideo_dl/extractor/mediaite.py
new file mode 100644
index 0000000..b670f0d
--- /dev/null
+++ b/hypervideo_dl/extractor/mediaite.py
@@ -0,0 +1,93 @@
+from __future__ import unicode_literals
+
+
+from .common import InfoExtractor
+
+
+class MediaiteIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?mediaite\.com(?!/category)(?:/[\w-]+){2}'
+ _TESTS = [{
+ 'url': 'https://www.mediaite.com/sports/bill-burr-roasts-nfl-for-promoting-black-lives-matter-while-scheduling-more-games-after-all-the-sht-they-know-about-cte/',
+ 'info_dict': {
+ 'id': 'vPHKITzy',
+ 'ext': 'm4a',
+ 'title': 'Bill Burr On NFL And Black Lives Matter',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'thumbnail': 'https://cdn.jwplayer.com/v2/media/vPHKITzy/poster.jpg?width=720',
+ 'duration': 55,
+ 'timestamp': 1631630185,
+ 'upload_date': '20210914',
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://www.mediaite.com/tv/joe-scarborough-goes-off-on-tax-breaks-for-super-wealthy-largest-income-redistribution-scam-in-american-history/',
+ 'info_dict': {
+ 'id': 'eeFcK4Xm',
+ 'ext': 'mp4',
+ 'title': 'Morning Joe-6_16_52 am - 6_21_10 am-2021-09-14.mp4',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'thumbnail': 'https://cdn.jwplayer.com/v2/media/eeFcK4Xm/poster.jpg?width=720',
+ 'duration': 258,
+ 'timestamp': 1631618057,
+ 'upload_date': '20210914',
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://www.mediaite.com/politics/watch-rudy-giuliani-impersonates-queen-elizabeth-calls-mark-milley-an-asshle-in-bizarre-9-11-speech/',
+ 'info_dict': {
+ 'id': 'EiyiXKcr',
+ 'ext': 'mp4',
+ 'title': 'Giuliani 1',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'thumbnail': 'https://cdn.jwplayer.com/v2/media/EiyiXKcr/poster.jpg?width=720',
+ 'duration': 39,
+ 'timestamp': 1631536476,
+ 'upload_date': '20210913',
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://www.mediaite.com/podcasts/clarissa-ward-says-she-decided-to-become-a-journalist-on-9-11/',
+ 'info_dict': {
+ 'id': 'TxavoRTx',
+ 'ext': 'mp4',
+ 'title': 'clarissa-ward-3.mp4',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'thumbnail': 'https://cdn.jwplayer.com/v2/media/TxavoRTx/poster.jpg?width=720',
+ 'duration': 83,
+ 'timestamp': 1631311188,
+ 'upload_date': '20210910',
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://www.mediaite.com/opinion/mainstream-media-ignores-rose-mcgowans-bombshell-allegation-that-newsoms-wife-tried-to-silence-her-on-weinstein/',
+ 'info_dict': {
+ 'id': 'sEIWvKR7',
+ 'ext': 'mp4',
+ 'title': 'KTTV_09-13-2021_05.34.21',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'thumbnail': 'https://cdn.jwplayer.com/v2/media/sEIWvKR7/poster.jpg?width=720',
+ 'duration': 52,
+ 'timestamp': 1631553328,
+ 'upload_date': '20210913',
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://www.mediaite.com/news/watch-cnbcs-jim-cramer-says-nobody-wants-to-die-getting-infected-by-unvaccinated-coworker-even-for-22-an-hour/',
+ 'info_dict': {
+ 'id': 'nwpt1elX',
+ 'ext': 'mp4',
+ 'title': "CNBC's Jim Cramer Says Nobody Wants to Die Getting Infected by Unvaccinated Coworker 'Even for $22 an Hour'.mp4",
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'thumbnail': 'https://cdn.jwplayer.com/v2/media/nwpt1elX/poster.jpg?width=720',
+ 'duration': 60,
+ 'timestamp': 1633014214,
+ 'upload_date': '20210930',
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ webpage = self._download_webpage(url, None)
+ id = self._search_regex(r'data-video-id\s?=\s?\"([^\"]+)\"', webpage, 'id')
+ data_json = self._download_json(f'https://cdn.jwplayer.com/v2/media/{id}', id)
+ return self._parse_jwplayer_data(data_json)
diff --git a/hypervideo_dl/extractor/mediaklikk.py b/hypervideo_dl/extractor/mediaklikk.py
new file mode 100644
index 0000000..b9b6d73
--- /dev/null
+++ b/hypervideo_dl/extractor/mediaklikk.py
@@ -0,0 +1,104 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ..utils import (
+ unified_strdate
+)
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse_unquote,
+ compat_str
+)
+
+
+class MediaKlikkIE(InfoExtractor):
+ _VALID_URL = r'''(?x)^https?:\/\/(?:www\.)?
+ (?:mediaklikk|m4sport|hirado|petofilive)\.hu\/.*?videok?\/
+ (?:(?P<year>[0-9]{4})/(?P<month>[0-9]{1,2})/(?P<day>[0-9]{1,2})/)?
+ (?P<id>[^/#?_]+)'''
+
+ _TESTS = [{
+ # mediaklikk. date in html.
+ 'url': 'https://mediaklikk.hu/video/hazajaro-delnyugat-bacska-a-duna-menten-palankatol-doroszloig/',
+ 'info_dict': {
+ 'id': '4754129',
+ 'title': 'Hazajáró, DÉLNYUGAT-BÁCSKA – A Duna mentén Palánkától Doroszlóig',
+ 'ext': 'mp4',
+ 'upload_date': '20210901',
+ 'thumbnail': 'http://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg'
+ }
+ }, {
+ # m4sport
+ 'url': 'https://m4sport.hu/video/2021/08/30/gyemant-liga-parizs/',
+ 'info_dict': {
+ 'id': '4754999',
+ 'title': 'Gyémánt Liga, Párizs',
+ 'ext': 'mp4',
+ 'upload_date': '20210830',
+ 'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/08/vlcsnap-2021-08-30-18h21m20s10-1024x576.jpg'
+ }
+ }, {
+ # m4sport with *video/ url and no date
+ 'url': 'https://m4sport.hu/bl-video/real-madrid-chelsea-1-1/',
+ 'info_dict': {
+ 'id': '4492099',
+ 'title': 'Real Madrid - Chelsea 1-1',
+ 'ext': 'mp4',
+ 'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/04/Sequence-01.Still001-1024x576.png'
+ }
+ }, {
+ # hirado
+ 'url': 'https://hirado.hu/videok/felteteleket-szabott-a-fovaros/',
+ 'info_dict': {
+ 'id': '4760120',
+ 'title': 'Feltételeket szabott a főváros',
+ 'ext': 'mp4',
+ 'thumbnail': 'http://hirado.hu/wp-content/uploads/sites/4/2021/09/vlcsnap-2021-09-01-20h20m37s165.jpg'
+ }
+ }, {
+ # petofilive
+ 'url': 'https://petofilive.hu/video/2021/06/07/tha-shudras-az-akusztikban/',
+ 'info_dict': {
+ 'id': '4571948',
+ 'title': 'Tha Shudras az Akusztikban',
+ 'ext': 'mp4',
+ 'upload_date': '20210607',
+ 'thumbnail': 'http://petofilive.hu/wp-content/uploads/sites/4/2021/06/vlcsnap-2021-06-07-22h14m23s915-1024x576.jpg'
+ }
+ }]
+
+ def _real_extract(self, url):
+ mobj = self._match_valid_url(url)
+ display_id = mobj.group('id')
+ webpage = self._download_webpage(url, display_id)
+
+ player_data_str = self._html_search_regex(
+ r'mtva_player_manager\.player\(document.getElementById\(.*\),\s?(\{.*\}).*\);', webpage, 'player data')
+ player_data = self._parse_json(player_data_str, display_id, compat_urllib_parse_unquote)
+ video_id = compat_str(player_data['contentId'])
+ title = player_data.get('title') or self._og_search_title(webpage, fatal=False) or \
+ self._html_search_regex(r'<h\d+\b[^>]+\bclass="article_title">([^<]+)<', webpage, 'title')
+
+ upload_date = unified_strdate(
+ '%s-%s-%s' % (mobj.group('year'), mobj.group('month'), mobj.group('day')))
+ if not upload_date:
+ upload_date = unified_strdate(self._html_search_regex(
+ r'<p+\b[^>]+\bclass="article_date">([^<]+)<', webpage, 'upload date', default=None))
+
+ player_data['video'] = player_data.pop('token')
+ player_page = self._download_webpage('https://player.mediaklikk.hu/playernew/player.php', video_id, query=player_data)
+ playlist_url = self._proto_relative_url(compat_urllib_parse_unquote(
+ self._html_search_regex(r'\"file\":\s*\"(\\?/\\?/.*playlist\.m3u8)\"', player_page, 'playlist_url')).replace('\\/', '/'))
+
+ formats = self._extract_wowza_formats(
+ playlist_url, video_id, skip_protocols=['f4m', 'smil', 'dash'])
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'display_id': display_id,
+ 'formats': formats,
+ 'upload_date': upload_date,
+ 'thumbnail': player_data.get('bgImage') or self._og_search_thumbnail(webpage)
+ }
diff --git a/hypervideo_dl/extractor/mediaset.py b/hypervideo_dl/extractor/mediaset.py
index 2c16fc9..26e7abc 100644
--- a/hypervideo_dl/extractor/mediaset.py
+++ b/hypervideo_dl/extractor/mediaset.py
@@ -4,13 +4,10 @@ from __future__ import unicode_literals
import re
from .theplatform import ThePlatformBaseIE
-from ..compat import (
- compat_parse_qs,
- compat_urllib_parse_urlparse,
-)
from ..utils import (
ExtractorError,
int_or_none,
+ parse_qs,
update_url_query,
)
@@ -30,38 +27,70 @@ class MediasetIE(ThePlatformBaseIE):
'''
_TESTS = [{
# full episode
- 'url': 'https://www.mediasetplay.mediaset.it/video/hellogoodbye/quarta-puntata_FAFU000000661824',
- 'md5': '9b75534d42c44ecef7bf1ffeacb7f85d',
+ 'url': 'https://www.mediasetplay.mediaset.it/video/mrwronglezionidamore/episodio-1_F310575103000102',
+ 'md5': 'a7e75c6384871f322adb781d3bd72c26',
'info_dict': {
- 'id': 'FAFU000000661824',
+ 'id': 'F310575103000102',
'ext': 'mp4',
- 'title': 'Quarta puntata',
+ 'title': 'Episodio 1',
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 1414.26,
- 'upload_date': '20161107',
- 'series': 'Hello Goodbye',
- 'timestamp': 1478532900,
- 'uploader': 'Rete 4',
- 'uploader_id': 'R4',
+ 'duration': 2682.0,
+ 'upload_date': '20210530',
+ 'series': 'Mr Wrong - Lezioni d\'amore',
+ 'timestamp': 1622413946,
+ 'uploader': 'Canale 5',
+ 'uploader_id': 'C5',
},
}, {
'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501',
- 'md5': '288532f0ad18307705b01e581304cd7b',
+ 'md5': '1276f966ac423d16ba255ce867de073e',
'info_dict': {
'id': 'F309013801000501',
'ext': 'mp4',
'title': 'Puntata del 25 maggio',
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 6565.007,
- 'upload_date': '20180526',
+ 'duration': 6565.008,
+ 'upload_date': '20200903',
'series': 'Matrix',
- 'timestamp': 1527326245,
+ 'timestamp': 1599172492,
'uploader': 'Canale 5',
'uploader_id': 'C5',
},
}, {
+ 'url': 'https://www.mediasetplay.mediaset.it/video/cameracafe5/episodio-69-pezzo-di-luna_F303843101017801',
+ 'md5': 'd1650ac9ff944f185556126a736df148',
+ 'info_dict': {
+ 'id': 'F303843101017801',
+ 'ext': 'mp4',
+ 'title': 'Episodio 69 - Pezzo di luna',
+ 'description': '',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 263.008,
+ 'upload_date': '20200902',
+ 'series': 'Camera Café 5',
+ 'timestamp': 1599064700,
+ 'uploader': 'Italia 1',
+ 'uploader_id': 'I1',
+ },
+ }, {
+ 'url': 'https://www.mediasetplay.mediaset.it/video/cameracafe5/episodio-51-tu-chi-sei_F303843107000601',
+ 'md5': '567e9ad375b7a27a0e370650f572a1e3',
+ 'info_dict': {
+ 'id': 'F303843107000601',
+ 'ext': 'mp4',
+ 'title': 'Episodio 51 - Tu chi sei?',
+ 'description': '',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 367.021,
+ 'upload_date': '20200902',
+ 'series': 'Camera Café 5',
+ 'timestamp': 1599069817,
+ 'uploader': 'Italia 1',
+ 'uploader_id': 'I1',
+ },
+ }, {
# clip
'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680',
'only_matching': True,
@@ -96,7 +125,7 @@ class MediasetIE(ThePlatformBaseIE):
@staticmethod
def _extract_urls(ie, webpage):
def _qs(url):
- return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ return parse_qs(url)
def _program_guid(qs):
return qs.get('programGuid', [None])[0]
@@ -135,36 +164,38 @@ class MediasetIE(ThePlatformBaseIE):
formats = []
subtitles = {}
first_e = None
- for asset_type in ('SD', 'HD'):
- # TODO: fixup ISM+none manifest URLs
- for f in ('MPEG4', 'MPEG-DASH+none', 'M3U+none'):
- try:
- tp_formats, tp_subtitles = self._extract_theplatform_smil(
- update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), {
- 'mbr': 'true',
- 'formats': f,
- 'assetTypes': asset_type,
- }), guid, 'Downloading %s %s SMIL data' % (f.split('+')[0], asset_type))
- except ExtractorError as e:
- if not first_e:
- first_e = e
- break
- for tp_f in tp_formats:
- tp_f['quality'] = 1 if asset_type == 'HD' else 0
- formats.extend(tp_formats)
- subtitles = self._merge_subtitles(subtitles, tp_subtitles)
+ asset_type = 'geoNo:HD,browser,geoIT|geoNo:HD,geoIT|geoNo:SD,browser,geoIT|geoNo:SD,geoIT|geoNo|HD|SD'
+ # TODO: fixup ISM+none manifest URLs
+ for f in ('MPEG4', 'MPEG-DASH+none', 'M3U+none'):
+ try:
+ tp_formats, tp_subtitles = self._extract_theplatform_smil(
+ update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), {
+ 'mbr': 'true',
+ 'formats': f,
+ 'assetTypes': asset_type,
+ }), guid, 'Downloading %s SMIL data' % (f.split('+')[0]))
+ except ExtractorError as e:
+ if not first_e:
+ first_e = e
+ break
+ formats.extend(tp_formats)
+ subtitles = self._merge_subtitles(subtitles, tp_subtitles)
if first_e and not formats:
raise first_e
self._sort_formats(formats)
- fields = []
- for templ, repls in (('tvSeason%sNumber', ('', 'Episode')), ('mediasetprogram$%s', ('brandTitle', 'numberOfViews', 'publishInfo'))):
- fields.extend(templ % repl for repl in repls)
feed_data = self._download_json(
- 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs/guid/-/' + guid,
- guid, fatal=False, query={'fields': ','.join(fields)})
+ 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs-v2/guid/-/' + guid,
+ guid, fatal=False)
if feed_data:
publish_info = feed_data.get('mediasetprogram$publishInfo') or {}
+ thumbnails = feed_data.get('thumbnails') or {}
+ thumbnail = None
+ for key, value in thumbnails.items():
+ if key.startswith('image_keyframe_poster-'):
+ thumbnail = value.get('url')
+ break
+
info.update({
'episode_number': int_or_none(feed_data.get('tvSeasonEpisodeNumber')),
'season_number': int_or_none(feed_data.get('tvSeasonNumber')),
@@ -172,6 +203,7 @@ class MediasetIE(ThePlatformBaseIE):
'uploader': publish_info.get('description'),
'uploader_id': publish_info.get('channel'),
'view_count': int_or_none(feed_data.get('mediasetprogram$numberOfViews')),
+ 'thumbnail': thumbnail,
})
info.update({
diff --git a/hypervideo_dl/extractor/mediasite.py b/hypervideo_dl/extractor/mediasite.py
index d6eb157..ace86c2 100644
--- a/hypervideo_dl/extractor/mediasite.py
+++ b/hypervideo_dl/extractor/mediasite.py
@@ -26,7 +26,7 @@ _ID_RE = r'(?:[0-9a-f]{32,34}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0
class MediasiteIE(InfoExtractor):
- _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/(?:default|livebroadcast)/Presentation)/(?P<id>%s)(?P<query>\?[^#]+|)' % _ID_RE
+ _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/[^/#?]+/Presentation)/(?P<id>%s)(?P<query>\?[^#]+|)' % _ID_RE
_TESTS = [
{
'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d',
@@ -122,9 +122,55 @@ class MediasiteIE(InfoExtractor):
r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE,
webpage)]
+ def __extract_slides(self, *, stream_id, snum, Stream, duration, images):
+ slide_base_url = Stream['SlideBaseUrl']
+
+ fname_template = Stream['SlideImageFileNameTemplate']
+ if fname_template != 'slide_{0:D4}.jpg':
+ self.report_warning('Unusual slide file name template; report a bug if slide downloading fails')
+ fname_template = re.sub(r'\{0:D([0-9]+)\}', r'{0:0\1}', fname_template)
+
+ fragments = []
+ for i, slide in enumerate(Stream['Slides']):
+ if i == 0:
+ if slide['Time'] > 0:
+ default_slide = images.get('DefaultSlide')
+ if default_slide is None:
+ default_slide = images.get('DefaultStreamImage')
+ if default_slide is not None:
+ default_slide = default_slide['ImageFilename']
+ if default_slide is not None:
+ fragments.append({
+ 'path': default_slide,
+ 'duration': slide['Time'] / 1000,
+ })
+
+ next_time = try_get(None, [
+ lambda _: Stream['Slides'][i + 1]['Time'],
+ lambda _: duration,
+ lambda _: slide['Time'],
+ ], expected_type=(int, float))
+
+ fragments.append({
+ 'path': fname_template.format(slide.get('Number', i + 1)),
+ 'duration': (next_time - slide['Time']) / 1000
+ })
+
+ return {
+ 'format_id': '%s-%u.slides' % (stream_id, snum),
+ 'ext': 'mhtml',
+ 'url': slide_base_url,
+ 'protocol': 'mhtml',
+ 'acodec': 'none',
+ 'vcodec': 'none',
+ 'format_note': 'Slides',
+ 'fragments': fragments,
+ 'fragment_base_url': slide_base_url,
+ }
+
def _real_extract(self, url):
url, data = unsmuggle_url(url, {})
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
resource_id = mobj.group('id')
query = mobj.group('query')
@@ -198,15 +244,20 @@ class MediasiteIE(InfoExtractor):
'ext': mimetype2ext(VideoUrl.get('MimeType')),
})
- # TODO: if Stream['HasSlideContent']:
- # synthesise an MJPEG video stream '%s-%u.slides' % (stream_type, snum)
- # from Stream['Slides']
- # this will require writing a custom downloader...
+ if Stream.get('HasSlideContent', False):
+ images = player_options['PlayerLayoutOptions']['Images']
+ stream_formats.append(self.__extract_slides(
+ stream_id=stream_id,
+ snum=snum,
+ Stream=Stream,
+ duration=presentation.get('Duration'),
+ images=images,
+ ))
# disprefer 'secondary' streams
if stream_type != 0:
for fmt in stream_formats:
- fmt['preference'] = -1
+ fmt['quality'] = -10
thumbnail_url = Stream.get('ThumbnailUrl')
if thumbnail_url:
@@ -276,7 +327,7 @@ class MediasiteCatalogIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
mediasite_url = mobj.group('url')
catalog_id = mobj.group('catalog_id')
current_folder_id = mobj.group('current_folder_id') or catalog_id
@@ -352,7 +403,7 @@ class MediasiteNamedCatalogIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
mediasite_url = mobj.group('url')
catalog_name = mobj.group('catalog_name')
diff --git a/hypervideo_dl/extractor/metacafe.py b/hypervideo_dl/extractor/metacafe.py
index 9e92416..7b2d4a0 100644
--- a/hypervideo_dl/extractor/metacafe.py
+++ b/hypervideo_dl/extractor/metacafe.py
@@ -19,7 +19,7 @@ from ..utils import (
class MetacafeIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P<video_id>[^/]+)/(?P<display_id>[^/?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P<id>[^/]+)/(?P<display_id>[^/?#]+)'
_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
IE_NAME = 'metacafe'
@@ -130,7 +130,7 @@ class MetacafeIE(InfoExtractor):
def _real_extract(self, url):
# Extract id and simplified title from URL
- video_id, display_id = re.match(self._VALID_URL, url).groups()
+ video_id, display_id = self._match_valid_url(url).groups()
# the video may come from an external site
m_external = re.match(r'^(\w{2})-(.*)$', video_id)
diff --git a/hypervideo_dl/extractor/metacritic.py b/hypervideo_dl/extractor/metacritic.py
index 7d468d7..1424288 100644
--- a/hypervideo_dl/extractor/metacritic.py
+++ b/hypervideo_dl/extractor/metacritic.py
@@ -33,7 +33,7 @@ class MetacriticIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
# The xml is not well formatted, there are raw '&'
diff --git a/hypervideo_dl/extractor/mgoon.py b/hypervideo_dl/extractor/mgoon.py
index 7bb4739..184c311 100644
--- a/hypervideo_dl/extractor/mgoon.py
+++ b/hypervideo_dl/extractor/mgoon.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -41,7 +40,7 @@ class MgoonIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
data = self._download_json(self._API_URL.format(video_id), video_id)
diff --git a/hypervideo_dl/extractor/microsoftvirtualacademy.py b/hypervideo_dl/extractor/microsoftvirtualacademy.py
index 8e0aee0..46abd2a 100644
--- a/hypervideo_dl/extractor/microsoftvirtualacademy.py
+++ b/hypervideo_dl/extractor/microsoftvirtualacademy.py
@@ -55,7 +55,7 @@ class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE):
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
course_id = mobj.group('course_id')
video_id = mobj.group('id')
@@ -152,7 +152,7 @@ class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE):
MicrosoftVirtualAcademyCourseIE, cls).suitable(url)
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
course_id = mobj.group('id')
display_id = mobj.group('display_id')
diff --git a/hypervideo_dl/extractor/mildom.py b/hypervideo_dl/extractor/mildom.py
new file mode 100644
index 0000000..c147cbb
--- /dev/null
+++ b/hypervideo_dl/extractor/mildom.py
@@ -0,0 +1,258 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+from datetime import datetime
+import itertools
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ std_headers,
+ update_url_query,
+ random_uuidv4,
+ try_get,
+)
+from ..compat import (
+ compat_str,
+)
+
+
+class MildomBaseIE(InfoExtractor):
+ _GUEST_ID = None
+ _DISPATCHER_CONFIG = None
+
+ def _call_api(self, url, video_id, query={}, note='Downloading JSON metadata', init=False):
+ url = update_url_query(url, self._common_queries(query, init=init))
+ return self._download_json(url, video_id, note=note)['body']
+
+ def _common_queries(self, query={}, init=False):
+ dc = self._fetch_dispatcher_config()
+ r = {
+ 'timestamp': self.iso_timestamp(),
+ '__guest_id': '' if init else self.guest_id(),
+ '__location': dc['location'],
+ '__country': dc['country'],
+ '__cluster': dc['cluster'],
+ '__platform': 'web',
+ '__la': self.lang_code(),
+ '__pcv': 'v2.9.44',
+ 'sfr': 'pc',
+ 'accessToken': '',
+ }
+ r.update(query)
+ return r
+
+ def _fetch_dispatcher_config(self):
+ if not self._DISPATCHER_CONFIG:
+ tmp = self._download_json(
+ 'https://disp.mildom.com/serverListV2', 'initialization',
+ note='Downloading dispatcher_config', data=json.dumps({
+ 'protover': 0,
+ 'data': base64.b64encode(json.dumps({
+ 'fr': 'web',
+ 'sfr': 'pc',
+ 'devi': 'Windows',
+ 'la': 'ja',
+ 'gid': None,
+ 'loc': '',
+ 'clu': '',
+ 'wh': '1919*810',
+ 'rtm': self.iso_timestamp(),
+ 'ua': std_headers['User-Agent'],
+ }).encode('utf8')).decode('utf8').replace('\n', ''),
+ }).encode('utf8'))
+ self._DISPATCHER_CONFIG = self._parse_json(base64.b64decode(tmp['data']), 'initialization')
+ return self._DISPATCHER_CONFIG
+
+ @staticmethod
+ def iso_timestamp():
+ 'new Date().toISOString()'
+ return datetime.utcnow().isoformat()[0:-3] + 'Z'
+
+ def guest_id(self):
+ 'getGuestId'
+ if self._GUEST_ID:
+ return self._GUEST_ID
+ self._GUEST_ID = try_get(
+ self, (
+ lambda x: x._call_api(
+ 'https://cloudac.mildom.com/nonolive/gappserv/guest/h5init', 'initialization',
+ note='Downloading guest token', init=True)['guest_id'] or None,
+ lambda x: x._get_cookies('https://www.mildom.com').get('gid').value,
+ lambda x: x._get_cookies('https://m.mildom.com').get('gid').value,
+ ), compat_str) or ''
+ return self._GUEST_ID
+
+ def lang_code(self):
+ 'getCurrentLangCode'
+ return 'ja'
+
+
+class MildomIE(MildomBaseIE):
+ IE_NAME = 'mildom'
+ IE_DESC = 'Record ongoing live by specific user in Mildom'
+ _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/(?P<id>\d+)'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ url = 'https://www.mildom.com/%s' % video_id
+
+ webpage = self._download_webpage(url, video_id)
+
+ enterstudio = self._call_api(
+ 'https://cloudac.mildom.com/nonolive/gappserv/live/enterstudio', video_id,
+ note='Downloading live metadata', query={'user_id': video_id})
+ result_video_id = enterstudio.get('log_id', video_id)
+
+ title = try_get(
+ enterstudio, (
+ lambda x: self._html_search_meta('twitter:description', webpage),
+ lambda x: x['anchor_intro'],
+ ), compat_str)
+ description = try_get(
+ enterstudio, (
+ lambda x: x['intro'],
+ lambda x: x['live_intro'],
+ ), compat_str)
+ uploader = try_get(
+ enterstudio, (
+ lambda x: self._html_search_meta('twitter:title', webpage),
+ lambda x: x['loginname'],
+ ), compat_str)
+
+ servers = self._call_api(
+ 'https://cloudac.mildom.com/nonolive/gappserv/live/liveserver', result_video_id,
+ note='Downloading live server list', query={
+ 'user_id': video_id,
+ 'live_server_type': 'hls',
+ })
+
+ stream_query = self._common_queries({
+ 'streamReqId': random_uuidv4(),
+ 'is_lhls': '0',
+ })
+ m3u8_url = update_url_query(servers['stream_server'] + '/%s_master.m3u8' % video_id, stream_query)
+ formats = self._extract_m3u8_formats(m3u8_url, result_video_id, 'mp4', headers={
+ 'Referer': 'https://www.mildom.com/',
+ 'Origin': 'https://www.mildom.com',
+ }, note='Downloading m3u8 information')
+
+ del stream_query['streamReqId'], stream_query['timestamp']
+ for fmt in formats:
+ fmt.setdefault('http_headers', {})['Referer'] = 'https://www.mildom.com/'
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': result_video_id,
+ 'title': title,
+ 'description': description,
+ 'uploader': uploader,
+ 'uploader_id': video_id,
+ 'formats': formats,
+ 'is_live': True,
+ }
+
+
+class MildomVodIE(MildomBaseIE):
+ IE_NAME = 'mildom:vod'
+ IE_DESC = 'Download a VOD in Mildom'
+ _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P<user_id>\d+)/(?P<id>(?P=user_id)-[a-zA-Z0-9]+)'
+
+ def _real_extract(self, url):
+ m = self._match_valid_url(url)
+ user_id, video_id = m.group('user_id'), m.group('id')
+ url = 'https://www.mildom.com/playback/%s/%s' % (user_id, video_id)
+
+ webpage = self._download_webpage(url, video_id)
+
+ autoplay = self._call_api(
+ 'https://cloudac.mildom.com/nonolive/videocontent/playback/getPlaybackDetail', video_id,
+ note='Downloading playback metadata', query={
+ 'v_id': video_id,
+ })['playback']
+
+ title = try_get(
+ autoplay, (
+ lambda x: self._html_search_meta('og:description', webpage),
+ lambda x: x['title'],
+ ), compat_str)
+ description = try_get(
+ autoplay, (
+ lambda x: x['video_intro'],
+ ), compat_str)
+ uploader = try_get(
+ autoplay, (
+ lambda x: x['author_info']['login_name'],
+ ), compat_str)
+
+ formats = [{
+ 'url': autoplay['audio_url'],
+ 'format_id': 'audio',
+ 'protocol': 'm3u8_native',
+ 'vcodec': 'none',
+ 'acodec': 'aac',
+ 'ext': 'm4a'
+ }]
+ for fmt in autoplay['video_link']:
+ formats.append({
+ 'format_id': 'video-%s' % fmt['name'],
+ 'url': fmt['url'],
+ 'protocol': 'm3u8_native',
+ 'width': fmt['level'] * autoplay['video_width'] // autoplay['video_height'],
+ 'height': fmt['level'],
+ 'vcodec': 'h264',
+ 'acodec': 'aac',
+ 'ext': 'mp4'
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'uploader': uploader,
+ 'uploader_id': user_id,
+ 'formats': formats,
+ }
+
+
+class MildomUserVodIE(MildomBaseIE):
+ IE_NAME = 'mildom:user:vod'
+ IE_DESC = 'Download all VODs from specific user in Mildom'
+ _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/profile/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.mildom.com/profile/10093333',
+ 'info_dict': {
+ 'id': '10093333',
+ 'title': 'Uploads from ねこばたけ',
+ },
+ 'playlist_mincount': 351,
+ }]
+
+ def _entries(self, user_id):
+ for page in itertools.count(1):
+ reply = self._call_api(
+ 'https://cloudac.mildom.com/nonolive/videocontent/profile/playbackList',
+ user_id, note='Downloading page %d' % page, query={
+ 'user_id': user_id,
+ 'page': page,
+ 'limit': '30',
+ })
+ if not reply:
+ break
+ for x in reply:
+ yield self.url_result('https://www.mildom.com/playback/%s/%s' % (user_id, x['v_id']))
+
+ def _real_extract(self, url):
+ user_id = self._match_id(url)
+ self.to_screen('This will download all VODs belonging to user. To download ongoing live video, use "https://www.mildom.com/%s" instead' % user_id)
+
+ profile = self._call_api(
+ 'https://cloudac.mildom.com/nonolive/gappserv/user/profileV2', user_id,
+ query={'user_id': user_id}, note='Downloading user profile')['user_info']
+
+ return self.playlist_result(
+ self._entries(user_id), user_id, 'Uploads from %s' % profile['loginname'])
diff --git a/hypervideo_dl/extractor/minoto.py b/hypervideo_dl/extractor/minoto.py
index 6367311..603ce94 100644
--- a/hypervideo_dl/extractor/minoto.py
+++ b/hypervideo_dl/extractor/minoto.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -14,7 +13,7 @@ class MinotoIE(InfoExtractor):
_VALID_URL = r'(?:minoto:|https?://(?:play|iframe|embed)\.minoto-video\.com/(?P<player_id>[0-9]+)/)(?P<id>[a-zA-Z0-9]+)'
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
player_id = mobj.group('player_id') or '1'
video_id = mobj.group('id')
video_data = self._download_json('http://play.minoto-video.com/%s/%s.js' % (player_id, video_id), video_id)
@@ -38,7 +37,7 @@ class MinotoIE(InfoExtractor):
'filesize': int_or_none(fmt.get('filesize')),
'width': int_or_none(fmt.get('width')),
'height': int_or_none(fmt.get('height')),
- 'codecs': parse_codecs(fmt.get('codecs')),
+ **parse_codecs(fmt.get('codecs')),
})
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/mirrativ.py b/hypervideo_dl/extractor/mirrativ.py
new file mode 100644
index 0000000..81aea54
--- /dev/null
+++ b/hypervideo_dl/extractor/mirrativ.py
@@ -0,0 +1,134 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ dict_get,
+ traverse_obj,
+ try_get,
+)
+
+
+class MirrativBaseIE(InfoExtractor):
+ def assert_error(self, response):
+ error_message = traverse_obj(response, ('status', 'error'))
+ if error_message:
+ raise ExtractorError('Mirrativ says: %s' % error_message, expected=True)
+
+
+class MirrativIE(MirrativBaseIE):
+ IE_NAME = 'mirrativ'
+ _VALID_URL = r'https?://(?:www\.)?mirrativ\.com/live/(?P<id>[^/?#&]+)'
+ LIVE_API_URL = 'https://www.mirrativ.com/api/live/live?live_id=%s'
+
+ TESTS = [{
+ 'url': 'https://mirrativ.com/live/POxyuG1KmW2982lqlDTuPw',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage('https://www.mirrativ.com/live/%s' % video_id, video_id)
+ live_response = self._download_json(self.LIVE_API_URL % video_id, video_id)
+ self.assert_error(live_response)
+
+ hls_url = dict_get(live_response, ('archive_url_hls', 'streaming_url_hls'))
+ is_live = bool(live_response.get('is_live'))
+ was_live = bool(live_response.get('is_archive'))
+ if not hls_url:
+ raise ExtractorError('Neither archive nor live is available.', expected=True)
+
+ formats = self._extract_m3u8_formats(
+ hls_url, video_id,
+ ext='mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', live=is_live)
+ rtmp_url = live_response.get('streaming_url_edge')
+ if rtmp_url:
+ keys_to_copy = ('width', 'height', 'vcodec', 'acodec', 'tbr')
+ fmt = {
+ 'format_id': 'rtmp',
+ 'url': rtmp_url,
+ 'protocol': 'rtmp',
+ 'ext': 'mp4',
+ }
+ fmt.update({k: traverse_obj(formats, (0, k)) for k in keys_to_copy})
+ formats.append(fmt)
+ self._sort_formats(formats)
+
+ title = self._og_search_title(webpage, default=None) or self._search_regex(
+ r'<title>\s*(.+?) - Mirrativ\s*</title>', webpage) or live_response.get('title')
+ description = live_response.get('description')
+ thumbnail = live_response.get('image_url')
+
+ duration = try_get(live_response, lambda x: x['ended_at'] - x['started_at'])
+ view_count = live_response.get('total_viewer_num')
+ release_timestamp = live_response.get('started_at')
+ timestamp = live_response.get('created_at')
+
+ owner = live_response.get('owner', {})
+ uploader = owner.get('name')
+ uploader_id = owner.get('user_id')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'is_live': is_live,
+ 'description': description,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'release_timestamp': release_timestamp,
+ 'timestamp': timestamp,
+ 'was_live': was_live,
+ }
+
+
+class MirrativUserIE(MirrativBaseIE):
+ IE_NAME = 'mirrativ:user'
+ _VALID_URL = r'https?://(?:www\.)?mirrativ\.com/user/(?P<id>\d+)'
+ LIVE_HISTORY_API_URL = 'https://www.mirrativ.com/api/live/live_history?user_id=%s&page=%d'
+ USER_INFO_API_URL = 'https://www.mirrativ.com/api/user/profile?user_id=%s'
+
+ _TESTS = [{
+ # Live archive is available up to 3 days
+ # see: https://helpfeel.com/mirrativ/%E9%8C%B2%E7%94%BB-5e26d3ad7b59ef0017fb49ac (Japanese)
+ 'url': 'https://www.mirrativ.com/user/110943130',
+ 'note': 'multiple archives available',
+ 'only_matching': True,
+ }]
+
+ def _entries(self, user_id):
+ page = 1
+ while page is not None:
+ api_response = self._download_json(
+ self.LIVE_HISTORY_API_URL % (user_id, page), user_id,
+ note='Downloading page %d' % page)
+ self.assert_error(api_response)
+ lives = api_response.get('lives')
+ if not lives:
+ break
+ for live in lives:
+ if not live.get('is_archive') and not live.get('is_live'):
+ # neither archive nor live is available, so skip it
+ # or the service will ban your IP address for a while
+ continue
+ live_id = live.get('live_id')
+ url = 'https://www.mirrativ.com/live/%s' % live_id
+ yield self.url_result(url, video_id=live_id, video_title=live.get('title'))
+ page = api_response.get('next_page')
+
+ def _real_extract(self, url):
+ user_id = self._match_id(url)
+ user_info = self._download_json(
+ self.USER_INFO_API_URL % user_id, user_id,
+ note='Downloading user info', fatal=False)
+ self.assert_error(user_info)
+
+ uploader = user_info.get('name')
+ description = user_info.get('description')
+
+ entries = self._entries(user_id)
+ return self.playlist_result(entries, user_id, uploader, description)
diff --git a/hypervideo_dl/extractor/mit.py b/hypervideo_dl/extractor/mit.py
index e1506a7..60e4569 100644
--- a/hypervideo_dl/extractor/mit.py
+++ b/hypervideo_dl/extractor/mit.py
@@ -98,7 +98,7 @@ class OCWMITIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
topic = mobj.group('topic')
webpage = self._download_webpage(url, topic)
diff --git a/hypervideo_dl/extractor/mixcloud.py b/hypervideo_dl/extractor/mixcloud.py
index 6931985..a0c043d 100644
--- a/hypervideo_dl/extractor/mixcloud.py
+++ b/hypervideo_dl/extractor/mixcloud.py
@@ -1,7 +1,6 @@
from __future__ import unicode_literals
import itertools
-import re
from .common import InfoExtractor
from ..compat import (
@@ -79,7 +78,7 @@ class MixcloudIE(MixcloudBaseIE):
for ch, k in compat_zip(ciphertext, itertools.cycle(key))])
def _real_extract(self, url):
- username, slug = re.match(self._VALID_URL, url).groups()
+ username, slug = self._match_valid_url(url).groups()
username, slug = compat_urllib_parse_unquote(username), compat_urllib_parse_unquote(slug)
track_id = '%s_%s' % (username, slug)
@@ -157,7 +156,7 @@ class MixcloudIE(MixcloudBaseIE):
})
if not formats and cloudcast.get('isExclusive'):
- self.raise_login_required()
+ self.raise_login_required(metadata_available=True)
self._sort_formats(formats)
@@ -214,7 +213,7 @@ class MixcloudPlaylistBaseIE(MixcloudBaseIE):
return title
def _real_extract(self, url):
- username, slug = re.match(self._VALID_URL, url).groups()
+ username, slug = self._match_valid_url(url).groups()
username = compat_urllib_parse_unquote(username)
if not slug:
slug = 'uploads'
diff --git a/hypervideo_dl/extractor/moevideo.py b/hypervideo_dl/extractor/moevideo.py
index eb9b4ce..a3f1b38 100644
--- a/hypervideo_dl/extractor/moevideo.py
+++ b/hypervideo_dl/extractor/moevideo.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -54,7 +53,7 @@ class MoeVideoIE(InfoExtractor):
]
def _real_extract(self, url):
- host, video_id = re.match(self._VALID_URL, url).groups()
+ host, video_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(
'http://%s/video/%s' % (host, video_id),
diff --git a/hypervideo_dl/extractor/mojvideo.py b/hypervideo_dl/extractor/mojvideo.py
index 165e658..0421f3f 100644
--- a/hypervideo_dl/extractor/mojvideo.py
+++ b/hypervideo_dl/extractor/mojvideo.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -26,7 +25,7 @@ class MojvideoIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id')
diff --git a/hypervideo_dl/extractor/morningstar.py b/hypervideo_dl/extractor/morningstar.py
index 0093bcd..71a22a6 100644
--- a/hypervideo_dl/extractor/morningstar.py
+++ b/hypervideo_dl/extractor/morningstar.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
@@ -25,7 +24,7 @@ class MorningstarIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
diff --git a/hypervideo_dl/extractor/motherless.py b/hypervideo_dl/extractor/motherless.py
index ef1e081..111c7c5 100644
--- a/hypervideo_dl/extractor/motherless.py
+++ b/hypervideo_dl/extractor/motherless.py
@@ -127,9 +127,9 @@ class MotherlessIE(InfoExtractor):
comment_count = webpage.count('class="media-comment-contents"')
uploader_id = self._html_search_regex(
- r'"thumb-member-username">\s+<a href="/m/([^"]+)"',
- webpage, 'uploader_id')
-
+ (r'"media-meta-member">\s+<a href="/m/([^"]+)"',
+ r'<span\b[^>]+\bclass="username">([^<]+)</span>'),
+ webpage, 'uploader_id', fatal=False)
categories = self._html_search_meta('keywords', webpage, default=None)
if categories:
categories = [cat.strip() for cat in categories.split(',')]
@@ -169,7 +169,18 @@ class MotherlessGroupIE(InfoExtractor):
'description': 'Sex can be funny. Wide smiles,laugh, games, fun of '
'any kind!'
},
- 'playlist_mincount': 9,
+ 'playlist_mincount': 0,
+ 'expected_warnings': [
+ 'This group has no videos.',
+ ]
+ }, {
+ 'url': 'https://motherless.com/g/beautiful_cock',
+ 'info_dict': {
+ 'id': 'beautiful_cock',
+ 'title': 'Beautiful Cock',
+ 'description': 'Group for lovely cocks yours, mine, a friends anything human',
+ },
+ 'playlist_mincount': 2500,
}]
@classmethod
@@ -209,11 +220,18 @@ class MotherlessGroupIE(InfoExtractor):
description = self._html_search_meta(
'description', webpage, fatal=False)
page_count = self._int(self._search_regex(
- r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT',
- webpage, 'page_count'), 'page_count')
+ r'(\d+)</(?:a|span)><(?:a|span)[^>]+rel="next">',
+ webpage, 'page_count', default=0), 'page_count')
+ if not page_count:
+ message = self._search_regex(
+ r'class="error-page"[^>]*>\s*<p[^>]*>\s*(?P<error_msg>[^<]+)(?<=\S)\s*',
+ webpage, 'error_msg', default=None) or 'This group has no videos.'
+ self.report_warning(message, group_id)
PAGE_SIZE = 80
def _get_page(idx):
+ if not page_count:
+ return
webpage = self._download_webpage(
page_url, group_id, query={'page': idx + 1},
note='Downloading page %d/%d' % (idx + 1, page_count)
diff --git a/hypervideo_dl/extractor/moviezine.py b/hypervideo_dl/extractor/moviezine.py
index 85cc6e2..730da4b 100644
--- a/hypervideo_dl/extractor/moviezine.py
+++ b/hypervideo_dl/extractor/moviezine.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
@@ -21,7 +20,7 @@ class MoviezineIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
diff --git a/hypervideo_dl/extractor/msn.py b/hypervideo_dl/extractor/msn.py
index e59b0b7..f34e210 100644
--- a/hypervideo_dl/extractor/msn.py
+++ b/hypervideo_dl/extractor/msn.py
@@ -67,7 +67,7 @@ class MSNIE(InfoExtractor):
}]
def _real_extract(self, url):
- display_id, page_id = re.match(self._VALID_URL, url).groups()
+ display_id, page_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, display_id)
@@ -132,7 +132,7 @@ class MSNIE(InfoExtractor):
'width': int_or_none(file_.get('width')),
'height': int_or_none(file_.get('height')),
'vbr': int_or_none(self._search_regex(r'_(\d+)\.mp4', format_url, 'vbr', default=None)),
- 'preference': 1 if format_id == '1001' else None,
+ 'quality': 1 if format_id == '1001' else None,
})
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/mtv.py b/hypervideo_dl/extractor/mtv.py
index 5a5205c..e060884 100644
--- a/hypervideo_dl/extractor/mtv.py
+++ b/hypervideo_dl/extractor/mtv.py
@@ -14,6 +14,7 @@ from ..utils import (
fix_xml_ampersands,
float_or_none,
HEADRequest,
+ int_or_none,
RegexNotFoundError,
sanitized_Request,
strip_or_none,
@@ -43,7 +44,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
# Remove the templates, like &device={device}
return re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', url)
- def _get_feed_url(self, uri):
+ def _get_feed_url(self, uri, url=None):
return self._FEED_URL
def _get_thumbnail_url(self, uri, itemdoc):
@@ -176,6 +177,22 @@ class MTVServicesInfoExtractor(InfoExtractor):
raise ExtractorError('Could not find video title')
title = title.strip()
+ series = find_xpath_attr(
+ itemdoc, './/{http://search.yahoo.com/mrss/}category',
+ 'scheme', 'urn:mtvn:franchise')
+ season = find_xpath_attr(
+ itemdoc, './/{http://search.yahoo.com/mrss/}category',
+ 'scheme', 'urn:mtvn:seasonN')
+ episode = find_xpath_attr(
+ itemdoc, './/{http://search.yahoo.com/mrss/}category',
+ 'scheme', 'urn:mtvn:episodeN')
+ series = series.text if series is not None else None
+ season = season.text if season is not None else None
+ episode = episode.text if episode is not None else None
+ if season and episode:
+ # episode number includes season, so remove it
+ episode = re.sub(r'^%s' % season, '', episode)
+
# This a short id that's used in the webpage urls
mtvn_id = None
mtvn_id_node = find_xpath_attr(itemdoc, './/{http://search.yahoo.com/mrss/}category',
@@ -201,6 +218,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
'description': description,
'duration': float_or_none(content_el.attrib.get('duration')),
'timestamp': timestamp,
+ 'series': series,
+ 'season_number': int_or_none(season),
+ 'episode_number': int_or_none(episode),
}
def _get_feed_query(self, uri):
@@ -209,9 +229,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
data['lang'] = self._LANG
return data
- def _get_videos_info(self, uri, use_hls=True):
+ def _get_videos_info(self, uri, use_hls=True, url=None):
video_id = self._id_from_uri(uri)
- feed_url = self._get_feed_url(uri)
+ feed_url = self._get_feed_url(uri, url)
info_url = update_url_query(feed_url, self._get_feed_query(uri))
return self._get_videos_info_from_url(info_url, video_id, use_hls)
@@ -229,6 +249,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
if info:
entries.append(info)
+ # TODO: should be multi-video
return self.playlist_result(
entries, playlist_title=title, playlist_description=description)
@@ -292,13 +313,17 @@ class MTVServicesInfoExtractor(InfoExtractor):
video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer')
mgid = video_player['props']['media']['video']['config']['uri']
+ if not mgid:
+ mgid = self._search_regex(
+ r'"media":{"video":{"config":{"uri":"(mgid:.*?)"', webpage, 'mgid', default=None)
+
return mgid
def _real_extract(self, url):
title = url_basename(url)
webpage = self._download_webpage(url, title)
mgid = self._extract_mgid(webpage)
- videos_info = self._get_videos_info(mgid)
+ videos_info = self._get_videos_info(mgid, url=url)
return videos_info
@@ -327,14 +352,14 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
if mobj:
return mobj.group('url')
- def _get_feed_url(self, uri):
+ def _get_feed_url(self, uri, url=None):
video_id = self._id_from_uri(uri)
config = self._download_json(
'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge' % uri, video_id)
return self._remove_template_parameter(config['feedWithQueryParams'])
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
mgid = mobj.group('mgid')
return self._get_videos_info(mgid)
@@ -416,7 +441,7 @@ class MTVVideoIE(MTVServicesInfoExtractor):
return 'http://mtv.mtvnimages.com/uri/' + uri
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('videoid')
uri = mobj.groupdict().get('mgid')
if uri is None:
@@ -486,3 +511,152 @@ class MTVDEIE(MTVServicesInfoExtractor):
'arcEp': 'mtv.de',
'mgid': uri,
}
+
+
+class MTVItaliaIE(MTVServicesInfoExtractor):
+ IE_NAME = 'mtv.it'
+ _VALID_URL = r'https?://(?:www\.)?mtv\.it/(?:episodi|video|musica)/(?P<id>[0-9a-z]+)'
+ _TESTS = [{
+ 'url': 'http://www.mtv.it/episodi/24bqab/mario-una-serie-di-maccio-capatonda-cavoli-amario-episodio-completo-S1-E1',
+ 'info_dict': {
+ 'id': '0f0fc78e-45fc-4cce-8f24-971c25477530',
+ 'ext': 'mp4',
+ 'title': 'Cavoli amario (episodio completo)',
+ 'description': 'md5:4962bccea8fed5b7c03b295ae1340660',
+ 'series': 'Mario - Una Serie Di Maccio Capatonda',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+ _GEO_COUNTRIES = ['IT']
+ _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'
+
+ def _get_feed_query(self, uri):
+ return {
+ 'arcEp': 'mtv.it',
+ 'mgid': uri,
+ }
+
+
+class MTVItaliaProgrammaIE(MTVItaliaIE):
+ IE_NAME = 'mtv.it:programma'
+ _VALID_URL = r'https?://(?:www\.)?mtv\.it/(?:programmi|playlist)/(?P<id>[0-9a-z]+)'
+ _TESTS = [{
+ # program page: general
+ 'url': 'http://www.mtv.it/programmi/s2rppv/mario-una-serie-di-maccio-capatonda',
+ 'info_dict': {
+ 'id': 'a6f155bc-8220-4640-aa43-9b95f64ffa3d',
+ 'title': 'Mario - Una Serie Di Maccio Capatonda',
+ 'description': 'md5:72fbffe1f77ccf4e90757dd4e3216153',
+ },
+ 'playlist_count': 2,
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # program page: specific season
+ 'url': 'http://www.mtv.it/programmi/d9ncjf/mario-una-serie-di-maccio-capatonda-S2',
+ 'info_dict': {
+ 'id': '4deeb5d8-f272-490c-bde2-ff8d261c6dd1',
+ 'title': 'Mario - Una Serie Di Maccio Capatonda - Stagione 2',
+ },
+ 'playlist_count': 34,
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # playlist page + redirect
+ 'url': 'http://www.mtv.it/playlist/sexy-videos/ilctal',
+ 'info_dict': {
+ 'id': 'dee8f9ee-756d-493b-bf37-16d1d2783359',
+ 'title': 'Sexy Videos',
+ },
+ 'playlist_mincount': 145,
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+ _GEO_COUNTRIES = ['IT']
+ _FEED_URL = 'http://www.mtv.it/feeds/triforce/manifest/v8'
+
+ def _get_entries(self, title, url):
+ while True:
+ pg = self._search_regex(r'/(\d+)$', url, 'entries', '1')
+ entries = self._download_json(url, title, 'page %s' % pg)
+ url = try_get(
+ entries, lambda x: x['result']['nextPageURL'], compat_str)
+ entries = try_get(
+ entries, (
+ lambda x: x['result']['data']['items'],
+ lambda x: x['result']['data']['seasons']),
+ list)
+ for entry in entries or []:
+ if entry.get('canonicalURL'):
+ yield self.url_result(entry['canonicalURL'])
+ if not url:
+ break
+
+ def _real_extract(self, url):
+ query = {'url': url}
+ info_url = update_url_query(self._FEED_URL, query)
+ video_id = self._match_id(url)
+ info = self._download_json(info_url, video_id).get('manifest')
+
+ redirect = try_get(
+ info, lambda x: x['newLocation']['url'], compat_str)
+ if redirect:
+ return self.url_result(redirect)
+
+ title = info.get('title')
+ video_id = try_get(
+ info, lambda x: x['reporting']['itemId'], compat_str)
+ parent_id = try_get(
+ info, lambda x: x['reporting']['parentId'], compat_str)
+
+ playlist_url = current_url = None
+ for z in (info.get('zones') or {}).values():
+ if z.get('moduleName') in ('INTL_M304', 'INTL_M209'):
+ info_url = z.get('feed')
+ if z.get('moduleName') in ('INTL_M308', 'INTL_M317'):
+ playlist_url = playlist_url or z.get('feed')
+ if z.get('moduleName') in ('INTL_M300',):
+ current_url = current_url or z.get('feed')
+
+ if not info_url:
+ raise ExtractorError('No info found')
+
+ if video_id == parent_id:
+ video_id = self._search_regex(
+ r'([^\/]+)/[^\/]+$', info_url, 'video_id')
+
+ info = self._download_json(info_url, video_id, 'Show infos')
+ info = try_get(info, lambda x: x['result']['data'], dict)
+ title = title or try_get(
+ info, (
+ lambda x: x['title'],
+ lambda x: x['headline']),
+ compat_str)
+ description = try_get(info, lambda x: x['content'], compat_str)
+
+ if current_url:
+ season = try_get(
+ self._download_json(playlist_url, video_id, 'Seasons info'),
+ lambda x: x['result']['data'], dict)
+ current = try_get(
+ season, lambda x: x['currentSeason'], compat_str)
+ seasons = try_get(
+ season, lambda x: x['seasons'], list) or []
+
+ if current in [s.get('eTitle') for s in seasons]:
+ playlist_url = current_url
+
+ title = re.sub(
+ r'[-|]\s*(?:mtv\s*italia|programma|playlist)',
+ '', title, flags=re.IGNORECASE).strip()
+
+ return self.playlist_result(
+ self._get_entries(title, playlist_url),
+ video_id, title, description)
diff --git a/hypervideo_dl/extractor/muenchentv.py b/hypervideo_dl/extractor/muenchentv.py
index 2cc2bf2..d256236 100644
--- a/hypervideo_dl/extractor/muenchentv.py
+++ b/hypervideo_dl/extractor/muenchentv.py
@@ -61,7 +61,7 @@ class MuenchenTVIE(InfoExtractor):
'tbr': int_or_none(s.get('label')),
'ext': 'mp4',
'format_id': format_id,
- 'preference': -100 if '.smil' in s['file'] else 0,
+ 'preference': -100 if '.smil' in s['file'] else 0, # Strictly inferior than all other formats?
})
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/musescore.py b/hypervideo_dl/extractor/musescore.py
new file mode 100644
index 0000000..dcd2638
--- /dev/null
+++ b/hypervideo_dl/extractor/musescore.py
@@ -0,0 +1,67 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class MuseScoreIE(InfoExtractor):
+ _VALID_URL = r'(?:https?://)(?:www\.)?musescore\.com/(?:user/\d+|[^/]+)(?:/scores)?/(?P<id>[^#&?]+)'
+ _TESTS = [{
+ 'url': 'https://musescore.com/user/73797/scores/142975',
+ 'info_dict': {
+ 'id': '142975',
+ 'ext': 'mp3',
+ 'title': 'WA Mozart Marche Turque (Turkish March fingered)',
+ 'description': 'md5:7ede08230e4eaabd67a4a98bb54d07be',
+ 'thumbnail': r're:(?:https?://)(?:www\.)?musescore\.com/.*\.png[^$]+',
+ 'uploader': 'PapyPiano',
+ 'creator': 'Wolfgang Amadeus Mozart',
+ }
+ }, {
+ 'url': 'https://musescore.com/user/36164500/scores/6837638',
+ 'info_dict': {
+ 'id': '6837638',
+ 'ext': 'mp3',
+ 'title': 'Sweet Child O\' Mine – Guns N\' Roses sweet child',
+ 'description': 'md5:4dca71191c14abc312a0a4192492eace',
+ 'thumbnail': r're:(?:https?://)(?:www\.)?musescore\.com/.*\.png[^$]+',
+ 'uploader': 'roxbelviolin',
+ 'creator': 'Guns N´Roses Arr. Roxbel Violin',
+ }
+ }, {
+ 'url': 'https://musescore.com/classicman/fur-elise',
+ 'info_dict': {
+ 'id': '33816',
+ 'ext': 'mp3',
+ 'title': 'Für Elise – Beethoven',
+ 'description': 'md5:49515a3556d5ecaf9fa4b2514064ac34',
+ 'thumbnail': r're:(?:https?://)(?:www\.)?musescore\.com/.*\.png[^$]+',
+ 'uploader': 'ClassicMan',
+ 'creator': 'Ludwig van Beethoven (1770–1827)',
+ }
+ }, {
+ 'url': 'https://musescore.com/minh_cuteee/scores/6555384',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ webpage = self._download_webpage(url, None)
+ url = self._og_search_url(webpage) or url
+ id = self._match_id(url)
+ mp3_url = self._download_json(f'https://musescore.com/api/jmuse?id={id}&index=0&type=mp3&v2=1', id,
+ headers={'authorization': '63794e5461e4cfa046edfbdddfccc1ac16daffd2'})['info']['url']
+ formats = [{
+ 'url': mp3_url,
+ 'ext': 'mp3',
+ 'vcodec': 'none',
+ }]
+
+ return {
+ 'id': id,
+ 'formats': formats,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'uploader': self._html_search_meta('musescore:author', webpage, 'uploader'),
+ 'creator': self._html_search_meta('musescore:composer', webpage, 'composer'),
+ }
diff --git a/hypervideo_dl/extractor/mxplayer.py b/hypervideo_dl/extractor/mxplayer.py
new file mode 100644
index 0000000..5874556
--- /dev/null
+++ b/hypervideo_dl/extractor/mxplayer.py
@@ -0,0 +1,222 @@
+from __future__ import unicode_literals
+
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import try_get
+
+
+class MxplayerIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?mxplayer\.in/(?P<type>movie|show/[-\w]+/[-\w]+)/(?P<display_id>[-\w]+)-(?P<id>\w+)'
+ _TESTS = [{
+ 'url': 'https://www.mxplayer.in/show/watch-my-girlfriend-is-an-alien-hindi-dubbed/season-1/episode-1-online-9d2013d31d5835bb8400e3b3c5e7bb72',
+ 'info_dict': {
+ 'id': '9d2013d31d5835bb8400e3b3c5e7bb72',
+ 'ext': 'mp4',
+ 'title': 'Episode 1',
+ 'description': 'md5:62ed43eb9fec5efde5cf3bd1040b7670',
+ 'season_number': 1,
+ 'episode_number': 1,
+ 'duration': 2451,
+ 'season': 'Season 1',
+ 'series': 'My Girlfriend Is An Alien (Hindi Dubbed)',
+ 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/9d2013d31d5835bb8400e3b3c5e7bb72/en/16x9/320x180/9562f5f8df42cad09c9a9c4e69eb1567_1920x1080.webp',
+ 'episode': 'Episode 1'
+ },
+ 'params': {
+ 'format': 'bv',
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.mxplayer.in/movie/watch-knock-knock-hindi-dubbed-movie-online-b9fa28df3bfb8758874735bbd7d2655a?watch=true',
+ 'info_dict': {
+ 'id': 'b9fa28df3bfb8758874735bbd7d2655a',
+ 'ext': 'mp4',
+ 'title': 'Knock Knock (Hindi Dubbed)',
+ 'description': 'md5:b195ba93ff1987309cfa58e2839d2a5b',
+ 'season_number': 0,
+ 'episode_number': 0,
+ 'duration': 5970,
+ 'season': 'Season 0',
+ 'series': None,
+ 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/b9fa28df3bfb8758874735bbd7d2655a/en/16x9/320x180/test_pic1588676032011.webp',
+ 'episode': 'Episode 0'
+ },
+ 'params': {
+ 'format': 'bv',
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.mxplayer.in/show/watch-shaitaan/season-1/the-infamous-taxi-gang-of-meerut-online-45055d5bcff169ad48f2ad7552a83d6c',
+ 'info_dict': {
+ 'id': '45055d5bcff169ad48f2ad7552a83d6c',
+ 'ext': 'mp4',
+ 'title': 'The infamous taxi gang of Meerut',
+ 'description': 'md5:033a0a7e3fd147be4fb7e07a01a3dc28',
+ 'season_number': 1,
+ 'episode_number': 1,
+ 'duration': 2332,
+ 'season': 'Season 1',
+ 'series': 'Shaitaan',
+ 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/45055d5bcff169ad48f2ad7552a83d6c/en/16x9/320x180/voot_8e7d5f8d8183340869279c732c1e3a43.webp',
+ 'episode': 'Episode 1'
+ },
+ 'params': {
+ 'format': 'best',
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.mxplayer.in/show/watch-aashram/chapter-1/duh-swapna-online-d445579792b0135598ba1bc9088a84cb',
+ 'info_dict': {
+ 'id': 'd445579792b0135598ba1bc9088a84cb',
+ 'ext': 'mp4',
+ 'title': 'Duh Swapna',
+ 'description': 'md5:35ff39c4bdac403c53be1e16a04192d8',
+ 'season_number': 1,
+ 'episode_number': 3,
+ 'duration': 2568,
+ 'season': 'Chapter 1',
+ 'series': 'Aashram',
+ 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/d445579792b0135598ba1bc9088a84cb/en/4x3/1600x1200/test_pic1624819307993.webp',
+ 'episode': 'Episode 3'
+ },
+ 'params': {
+ 'format': 'bv',
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.mxplayer.in/show/watch-dangerous/season-1/chapter-1-online-5a351b4f9fb69436f6bd6ae3a1a75292',
+ 'info_dict': {
+ 'id': '5a351b4f9fb69436f6bd6ae3a1a75292',
+ 'ext': 'mp4',
+ 'title': 'Chapter 1',
+ 'description': 'md5:233886b8598bc91648ac098abe1d288f',
+ 'season_number': 1,
+ 'episode_number': 1,
+ 'duration': 1305,
+ 'season': 'Season 1',
+ 'series': 'Dangerous',
+ 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/5a351b4f9fb69436f6bd6ae3a1a75292/en/4x3/1600x1200/test_pic1624706302350.webp',
+ 'episode': 'Episode 1'
+ },
+ 'params': {
+ 'format': 'bv',
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.mxplayer.in/movie/watch-the-attacks-of-2611-movie-online-0452f0d80226c398d63ce7e3ea40fa2d',
+ 'info_dict': {
+ 'id': '0452f0d80226c398d63ce7e3ea40fa2d',
+ 'ext': 'mp4',
+ 'title': 'The Attacks of 26/11',
+ 'description': 'md5:689bacd29e97b3f31eaf519eb14127e5',
+ 'season_number': 0,
+ 'episode_number': 0,
+ 'duration': 6085,
+ 'season': 'Season 0',
+ 'series': None,
+ 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/0452f0d80226c398d63ce7e3ea40fa2d/en/16x9/320x180/00c8955dab5e5d340dbde643f9b1f6fd_1920x1080.webp',
+ 'episode': 'Episode 0'
+ },
+ 'params': {
+ 'format': 'best',
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ type, display_id, video_id = self._match_valid_url(url).groups()
+ type = 'movie_film' if type == 'movie' else 'tvshow_episode'
+ API_URL = 'https://androidapi.mxplay.com/v1/detail/'
+ headers = {
+ 'X-Av-Code': '23',
+ 'X-Country': 'IN',
+ 'X-Platform': 'android',
+ 'X-App-Version': '1370001318',
+ 'X-Resolution': '3840x2160',
+ }
+ data_json = self._download_json(f'{API_URL}{type}/{video_id}', display_id, headers=headers)['profile']
+
+ season, series = None, None
+ for dct in data_json.get('levelInfos', []):
+ if dct.get('type') == 'tvshow_season':
+ season = dct.get('name')
+ elif dct.get('type') == 'tvshow_show':
+ series = dct.get('name')
+ thumbnails = []
+ for thumb in data_json.get('poster', []):
+ thumbnails.append({
+ 'url': thumb.get('url'),
+ 'width': thumb.get('width'),
+ 'height': thumb.get('height'),
+ })
+
+ formats = []
+ subtitles = {}
+ for dct in data_json.get('playInfo', []):
+ if dct.get('extension') == 'mpd':
+ frmt, subs = self._extract_mpd_formats_and_subtitles(dct.get('playUrl'), display_id, fatal=False)
+ formats.extend(frmt)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ elif dct.get('extension') == 'm3u8':
+ frmt, subs = self._extract_m3u8_formats_and_subtitles(dct.get('playUrl'), display_id, fatal=False)
+ formats.extend(frmt)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ self._sort_formats(formats)
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': data_json.get('name') or display_id,
+ 'description': data_json.get('description'),
+ 'season_number': data_json.get('seasonNum'),
+ 'episode_number': data_json.get('episodeNum'),
+ 'duration': data_json.get('duration'),
+ 'season': season,
+ 'series': series,
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+
+class MxplayerShowIE(InfoExtractor):
+ _VALID_URL = r'(?:https?://)(?:www\.)?mxplayer\.in/show/(?P<display_id>[-\w]+)-(?P<id>\w+)/?(?:$|[#?])'
+ _TESTS = [{
+ 'url': 'https://www.mxplayer.in/show/watch-chakravartin-ashoka-samrat-series-online-a8f44e3cc0814b5601d17772cedf5417',
+ 'playlist_mincount': 440,
+ 'info_dict': {
+ 'id': 'a8f44e3cc0814b5601d17772cedf5417',
+ 'title': 'Watch Chakravartin Ashoka Samrat Series Online',
+ }
+ }]
+
+ _API_SHOW_URL = "https://api.mxplay.com/v1/web/detail/tab/tvshowseasons?type=tv_show&id={}&device-density=2&platform=com.mxplay.desktop&content-languages=hi,en"
+ _API_EPISODES_URL = "https://api.mxplay.com/v1/web/detail/tab/tvshowepisodes?type=season&id={}&device-density=1&platform=com.mxplay.desktop&content-languages=hi,en&{}"
+
+ def _entries(self, show_id):
+ show_json = self._download_json(
+ self._API_SHOW_URL.format(show_id),
+ video_id=show_id, headers={'Referer': 'https://mxplayer.in'})
+ page_num = 0
+ for season in show_json.get('items') or []:
+ season_id = try_get(season, lambda x: x['id'], compat_str)
+ next_url = ''
+ while next_url is not None:
+ page_num += 1
+ season_json = self._download_json(
+ self._API_EPISODES_URL.format(season_id, next_url),
+ video_id=season_id,
+ headers={'Referer': 'https://mxplayer.in'},
+ note='Downloading JSON metadata page %d' % page_num)
+ for episode in season_json.get('items') or []:
+ video_url = episode['webUrl']
+ yield self.url_result(
+ 'https://mxplayer.in%s' % video_url,
+ ie=MxplayerIE.ie_key(), video_id=video_url.split('-')[-1])
+ next_url = season_json.get('next')
+
+ def _real_extract(self, url):
+ display_id, show_id = self._match_valid_url(url).groups()
+ return self.playlist_result(
+ self._entries(show_id), playlist_id=show_id,
+ playlist_title=display_id.replace('-', ' ').title())
diff --git a/hypervideo_dl/extractor/mychannels.py b/hypervideo_dl/extractor/mychannels.py
index b1ffe78..d820d4e 100644
--- a/hypervideo_dl/extractor/mychannels.py
+++ b/hypervideo_dl/extractor/mychannels.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
@@ -21,7 +20,7 @@ class MyChannelsIE(InfoExtractor):
}
def _real_extract(self, url):
- id_type, url_id = re.match(self._VALID_URL, url).groups()
+ id_type, url_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, url_id)
video_data = self._html_search_regex(r'<div([^>]+data-%s-id="%s"[^>]+)>' % (id_type, url_id), webpage, 'video data')
diff --git a/hypervideo_dl/extractor/myspace.py b/hypervideo_dl/extractor/myspace.py
index e164d59..4227d42 100644
--- a/hypervideo_dl/extractor/myspace.py
+++ b/hypervideo_dl/extractor/myspace.py
@@ -46,18 +46,6 @@ class MySpaceIE(InfoExtractor):
'uploader_id': 'killsorrow',
},
}, {
- 'add_ie': ['Youtube'],
- 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041',
- 'info_dict': {
- 'id': 'xqds0B_meys',
- 'ext': 'webm',
- 'title': 'Three Days Grace - Animal I Have Become',
- 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb',
- 'uploader': 'ThreeDaysGraceVEVO',
- 'uploader_id': 'ThreeDaysGraceVEVO',
- 'upload_date': '20091002',
- },
- }, {
'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426',
'only_matching': True,
}, {
@@ -66,7 +54,7 @@ class MySpaceIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('video_id') or mobj.group('song_id')
is_song = mobj.group('mediatype').startswith('music/song')
webpage = self._download_webpage(url, video_id)
@@ -191,7 +179,7 @@ class MySpaceAlbumIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
playlist_id = mobj.group('id')
display_id = mobj.group('title') + playlist_id
webpage = self._download_webpage(url, display_id)
diff --git a/hypervideo_dl/extractor/myvideoge.py b/hypervideo_dl/extractor/myvideoge.py
new file mode 100644
index 0000000..0a1d7d0
--- /dev/null
+++ b/hypervideo_dl/extractor/myvideoge.py
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import js_to_json
+
+
+class MyVideoGeIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?myvideo\.ge/v/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'https://www.myvideo.ge/v/3941048',
+ 'md5': '8c192a7d2b15454ba4f29dc9c9a52ea9',
+ 'info_dict': {
+ 'id': '3941048',
+ 'ext': 'mp4',
+ 'title': 'The best prikol',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'md5:d72addd357b0dd914e704781f7f777d8',
+ 'description': 'md5:5c0371f540f5888d603ebfedd46b6df3'
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_regex(r'<h1[^>]*>([^<]+)</h1>', webpage, 'title')
+ description = self._og_search_description(webpage)
+ thumbnail = self._html_search_meta(['og:image'], webpage)
+ uploader = self._search_regex(r'<a[^>]+class="mv_user_name"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False)
+
+ jwplayer_sources = self._parse_json(
+ self._search_regex(
+ r"(?s)jwplayer\(\"mvplayer\"\).setup\(.*?sources: (.*?])", webpage, 'jwplayer sources'),
+ video_id, transform_source=js_to_json)
+
+ def _formats_key(f):
+ if f['label'] == 'SD':
+ return -1
+ elif f['label'] == 'HD':
+ return 1
+ else:
+ return 0
+
+ jwplayer_sources = sorted(jwplayer_sources, key=_formats_key)
+
+ formats = self._parse_jwplayer_formats(jwplayer_sources, video_id)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'uploader': uploader,
+ 'formats': formats,
+ 'thumbnail': thumbnail
+ }
diff --git a/hypervideo_dl/extractor/n1.py b/hypervideo_dl/extractor/n1.py
new file mode 100644
index 0000000..7a09c67
--- /dev/null
+++ b/hypervideo_dl/extractor/n1.py
@@ -0,0 +1,136 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .youtube import YoutubeIE
+from .reddit import RedditRIE
+from .common import InfoExtractor
+from ..utils import (
+ unified_timestamp,
+ extract_attributes,
+)
+
+
+class N1InfoAssetIE(InfoExtractor):
+ _VALID_URL = r'https?://best-vod\.umn\.cdn\.united\.cloud/stream\?asset=(?P<id>[^&]+)'
+ _TESTS = [{
+ 'url': 'https://best-vod.umn.cdn.united.cloud/stream?asset=ljsottomazilirija3060921-n1info-si-worldwide&stream=hp1400&t=0&player=m3u8v&sp=n1info&u=n1info&p=n1Sh4redSecre7iNf0',
+ 'md5': '28b08b32aeaff2b8562736ccd5a66fe7',
+ 'info_dict': {
+ 'id': 'ljsottomazilirija3060921-n1info-si-worldwide',
+ 'ext': 'mp4',
+ 'title': 'ljsottomazilirija3060921-n1info-si-worldwide',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ formats = self._extract_m3u8_formats(
+ url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video_id,
+ 'formats': formats,
+ }
+
+
+class N1InfoIIE(InfoExtractor):
+ IE_NAME = 'N1Info:article'
+ _VALID_URL = r'https?://(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)/(?:[^/]+/){1,2}(?P<id>[^/]+)'
+ _TESTS = [{
+ # Youtube embedded
+ 'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/',
+ 'md5': '01ddb6646d0fd9c4c7d990aa77fe1c5a',
+ 'info_dict': {
+ 'id': 'L5Hd4hQVUpk',
+ 'ext': 'mp4',
+ 'upload_date': '20210913',
+ 'title': 'Ozmo i USO21, ep. 13: Novak Đoković – Danil Medvedev | Ključevi Poraza, Budućnost | SPORT KLUB TENIS',
+ 'description': 'md5:467f330af1effedd2e290f10dc31bb8e',
+ 'uploader': 'Sport Klub',
+ 'uploader_id': 'sportklub',
+ }
+ }, {
+ 'url': 'https://rs.n1info.com/vesti/djilas-los-plan-za-metro-nece-resiti-nijedan-saobracajni-problem/',
+ 'info_dict': {
+ 'id': 'bgmetrosot2409zta20210924174316682-n1info-rs-worldwide',
+ 'ext': 'mp4',
+ 'title': 'Đilas: Predlog izgradnje metroa besmislen; SNS odbacuje navode',
+ 'upload_date': '20210924',
+ 'timestamp': 1632481347,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://n1info.si/novice/slovenija/zadnji-dnevi-na-kopaliscu-ilirija-ilirija-ni-umrla-ubili-so-jo/',
+ 'info_dict': {
+ 'id': 'ljsottomazilirija3060921-n1info-si-worldwide',
+ 'ext': 'mp4',
+ 'title': 'Zadnji dnevi na kopališču Ilirija: “Ilirija ni umrla, ubili so jo”',
+ 'timestamp': 1632567630,
+ 'upload_date': '20210925',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # Reddit embedded
+ 'url': 'https://ba.n1info.com/lifestyle/vucic-bolji-od-tita-ako-izgubi-ja-cu-da-crknem-jugoslavija-je-gotova/',
+ 'info_dict': {
+ 'id': '2wmfee9eycp71',
+ 'ext': 'mp4',
+ 'title': '"Ako Vučić izgubi izbore, ja ću da crknem, Jugoslavija je gotova"',
+ 'upload_date': '20210924',
+ 'timestamp': 1632448649.0,
+ 'uploader': 'YouLotWhatDontStop',
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_regex(r'<h1[^>]+>(.+?)</h1>', webpage, 'title')
+ timestamp = unified_timestamp(self._html_search_meta('article:published_time', webpage))
+
+ videos = re.findall(r'(?m)(<video[^>]+>)', webpage)
+ entries = []
+ for video in videos:
+ video_data = extract_attributes(video)
+ entries.append({
+ '_type': 'url_transparent',
+ 'url': video_data.get('data-url'),
+ 'id': video_data.get('id'),
+ 'title': title,
+ 'thumbnail': video_data.get('data-thumbnail'),
+ 'timestamp': timestamp,
+ 'ie_key': N1InfoAssetIE.ie_key()})
+
+ embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage)
+ for embedded_video in embedded_videos:
+ video_data = extract_attributes(embedded_video)
+ url = video_data.get('src')
+ if url.startswith('https://www.youtube.com'):
+ entries.append(self.url_result(url, ie=YoutubeIE.ie_key()))
+ elif url.startswith('https://www.redditmedia.com'):
+ entries.append(self.url_result(url, ie=RedditRIE.ie_key()))
+
+ return {
+ '_type': 'playlist',
+ 'id': video_id,
+ 'title': title,
+ 'timestamp': timestamp,
+ 'entries': entries,
+ }
diff --git a/hypervideo_dl/extractor/naver.py b/hypervideo_dl/extractor/naver.py
index 61fc591..acf53c1 100644
--- a/hypervideo_dl/extractor/naver.py
+++ b/hypervideo_dl/extractor/naver.py
@@ -164,3 +164,88 @@ class NaverIE(NaverBaseIE):
'age_limit': 19 if current_clip.get('adult') else None,
})
return info
+
+
+class NaverLiveIE(InfoExtractor):
+ IE_NAME = 'Naver:live'
+ _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/l/(?P<id>\d+)'
+ _GEO_BYPASS = False
+ _TESTS = [{
+ 'url': 'https://tv.naver.com/l/52010',
+ 'info_dict': {
+ 'id': '52010',
+ 'ext': 'm3u8',
+ 'title': '[LIVE] 뉴스특보 : "수도권 거리두기, 2주간 2단계로 조정"',
+ 'description': 'md5:df7f0c237a5ed5e786ce5c91efbeaab3',
+ 'channel_id': 'NTV-ytnnews24-0',
+ 'start_time': 1597026780000,
+ },
+ }, {
+ 'url': 'https://tv.naver.com/l/51549',
+ 'info_dict': {
+ 'id': '51549',
+ 'ext': 'm3u8',
+ 'title': '연합뉴스TV - 코로나19 뉴스특보',
+ 'description': 'md5:c655e82091bc21e413f549c0eaccc481',
+ 'channel_id': 'NTV-yonhapnewstv-0',
+ 'start_time': 1596406380000,
+ },
+ }, {
+ 'url': 'https://tv.naver.com/l/54887',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ page = self._download_webpage(url, video_id, 'Downloading Page', 'Unable to download Page')
+ secure_url = self._search_regex(r'sApiF:\s+(?:"|\')([^"\']+)', page, 'secureurl')
+
+ info = self._extract_video_info(video_id, secure_url)
+ info.update({
+ 'description': self._og_search_description(page)
+ })
+
+ return info
+
+ def _extract_video_info(self, video_id, url):
+ video_data = self._download_json(url, video_id, headers=self.geo_verification_headers())
+ meta = video_data.get('meta')
+ status = meta.get('status')
+
+ if status == 'CLOSED':
+ raise ExtractorError('Stream is offline.', expected=True)
+ elif status != 'OPENED':
+ raise ExtractorError('Unknown status %s' % status)
+
+ title = meta.get('title')
+ stream_list = video_data.get('streams')
+
+ if stream_list is None:
+ raise ExtractorError('Could not get stream data.', expected=True)
+
+ formats = []
+ for quality in stream_list:
+ if not quality.get('url'):
+ continue
+
+ prop = quality.get('property')
+ if prop.get('abr'): # This abr doesn't mean Average audio bitrate.
+ continue
+
+ formats.extend(self._extract_m3u8_formats(
+ quality.get('url'), video_id, 'm3u8',
+ m3u8_id=quality.get('qualityId'), live=True
+ ))
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'channel_id': meta.get('channelId'),
+ 'channel_url': meta.get('channelUrl'),
+ 'thumbnail': meta.get('imgUrl'),
+ 'start_time': meta.get('startTime'),
+ 'categories': [meta.get('categoryId')],
+ 'is_live': True
+ }
diff --git a/hypervideo_dl/extractor/nba.py b/hypervideo_dl/extractor/nba.py
index fbc7ada..7390ef8 100644
--- a/hypervideo_dl/extractor/nba.py
+++ b/hypervideo_dl/extractor/nba.py
@@ -5,10 +5,8 @@ import re
from .turner import TurnerBaseIE
from ..compat import (
- compat_parse_qs,
compat_str,
compat_urllib_parse_unquote,
- compat_urllib_parse_urlparse,
)
from ..utils import (
int_or_none,
@@ -16,6 +14,7 @@ from ..utils import (
OnDemandPagedList,
parse_duration,
parse_iso8601,
+ parse_qs,
try_get,
update_url_query,
urljoin,
@@ -165,9 +164,9 @@ class NBAWatchIE(NBAWatchBaseIE):
def _real_extract(self, url):
display_id = self._match_id(url)
- collection_id = compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('collection', [None])[0]
+ collection_id = parse_qs(url).get('collection', [None])[0]
if collection_id:
- if self._downloader.params.get('noplaylist'):
+ if self.get_param('noplaylist'):
self.to_screen('Downloading just video %s because of --no-playlist' % display_id)
else:
self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % collection_id)
@@ -303,7 +302,7 @@ class NBABaseIE(NBACVPBaseIE):
formats.append({
'format_id': 'source',
'url': source_url,
- 'preference': 1,
+ 'quality': 1,
})
m3u8_url = video.get('m3u8')
@@ -337,7 +336,7 @@ class NBABaseIE(NBACVPBaseIE):
return info
def _real_extract(self, url):
- team, display_id = re.match(self._VALID_URL, url).groups()
+ team, display_id = self._match_valid_url(url).groups()
if '/play#/' in url:
display_id = compat_urllib_parse_unquote(display_id)
else:
@@ -359,7 +358,7 @@ class NBAEmbedIE(NBABaseIE):
}]
def _real_extract(self, url):
- qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ qs = parse_qs(url)
content_id = qs['contentId'][0]
team = qs.get('team', [None])[0]
if not team:
diff --git a/hypervideo_dl/extractor/nbc.py b/hypervideo_dl/extractor/nbc.py
index 0d77648..f304f19 100644
--- a/hypervideo_dl/extractor/nbc.py
+++ b/hypervideo_dl/extractor/nbc.py
@@ -10,7 +10,9 @@ from .adobepass import AdobePassIE
from ..compat import compat_urllib_parse_unquote
from ..utils import (
int_or_none,
+ parse_age_limit,
parse_duration,
+ RegexNotFoundError,
smuggle_url,
try_get,
unified_timestamp,
@@ -18,7 +20,7 @@ from ..utils import (
)
-class NBCIE(AdobePassIE):
+class NBCIE(ThePlatformIE):
_VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>n?\d+))'
_TESTS = [
@@ -84,7 +86,7 @@ class NBCIE(AdobePassIE):
]
def _real_extract(self, url):
- permalink, video_id = re.match(self._VALID_URL, url).groups()
+ permalink, video_id = self._match_valid_url(url).groups()
permalink = 'http' + compat_urllib_parse_unquote(permalink)
video_data = self._download_json(
'https://friendship.nbc.co/v2/graphql', video_id, query={
@@ -132,7 +134,9 @@ class NBCIE(AdobePassIE):
'manifest': 'm3u',
}
video_id = video_data['mpxGuid']
- title = video_data['secondaryTitle']
+ tp_path = 'NnzsPC/media/guid/%s/%s' % (video_data.get('mpxAccountId') or '2410887629', video_id)
+ tpm = self._download_theplatform_metadata(tp_path, video_id)
+ title = tpm.get('title') or video_data.get('secondaryTitle')
if video_data.get('locked'):
resource = self._get_mvpd_resource(
video_data.get('resourceId') or 'nbcentertainment',
@@ -142,18 +146,40 @@ class NBCIE(AdobePassIE):
theplatform_url = smuggle_url(update_url_query(
'http://link.theplatform.com/s/NnzsPC/media/guid/%s/%s' % (video_data.get('mpxAccountId') or '2410887629', video_id),
query), {'force_smil_url': True})
+
+ # Empty string or 0 can be valid values for these. So the check must be `is None`
+ description = video_data.get('description')
+ if description is None:
+ description = tpm.get('description')
+ episode_number = int_or_none(video_data.get('episodeNumber'))
+ if episode_number is None:
+ episode_number = int_or_none(tpm.get('nbcu$airOrder'))
+ rating = video_data.get('rating')
+ if rating is None:
+ rating = try_get(tpm, lambda x: x['ratings'][0]['rating'])
+ season_number = int_or_none(video_data.get('seasonNumber'))
+ if season_number is None:
+ season_number = int_or_none(tpm.get('nbcu$seasonNumber'))
+ series = video_data.get('seriesShortTitle')
+ if series is None:
+ series = tpm.get('nbcu$seriesShortTitle')
+ tags = video_data.get('keywords')
+ if tags is None or len(tags) == 0:
+ tags = tpm.get('keywords')
+
return {
'_type': 'url_transparent',
+ 'age_limit': parse_age_limit(rating),
+ 'description': description,
+ 'episode': title,
+ 'episode_number': episode_number,
'id': video_id,
+ 'ie_key': 'ThePlatform',
+ 'season_number': season_number,
+ 'series': series,
+ 'tags': tags,
'title': title,
'url': theplatform_url,
- 'description': video_data.get('description'),
- 'tags': video_data.get('keywords'),
- 'season_number': int_or_none(video_data.get('seasonNumber')),
- 'episode_number': int_or_none(video_data.get('episodeNumber')),
- 'episode': title,
- 'series': video_data.get('seriesShortTitle'),
- 'ie_key': 'ThePlatform',
}
@@ -435,7 +461,7 @@ class NBCNewsIE(ThePlatformIE):
class NBCOlympicsIE(InfoExtractor):
IE_NAME = 'nbcolympics'
- _VALID_URL = r'https?://www\.nbcolympics\.com/video/(?P<id>[a-z-]+)'
+ _VALID_URL = r'https?://www\.nbcolympics\.com/videos?/(?P<id>[0-9a-z-]+)'
_TEST = {
# Geo-restricted to US
@@ -458,13 +484,18 @@ class NBCOlympicsIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
- drupal_settings = self._parse_json(self._search_regex(
- r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
- webpage, 'drupal settings'), display_id)
+ try:
+ drupal_settings = self._parse_json(self._search_regex(
+ r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
+ webpage, 'drupal settings'), display_id)
- iframe_url = drupal_settings['vod']['iframe_url']
- theplatform_url = iframe_url.replace(
- 'vplayer.nbcolympics.com', 'player.theplatform.com')
+ iframe_url = drupal_settings['vod']['iframe_url']
+ theplatform_url = iframe_url.replace(
+ 'vplayer.nbcolympics.com', 'player.theplatform.com')
+ except RegexNotFoundError:
+ theplatform_url = self._search_regex(
+ r"([\"'])embedUrl\1: *([\"'])(?P<embedUrl>.+)\2",
+ webpage, 'embedding URL', group="embedUrl")
return {
'_type': 'url_transparent',
@@ -477,43 +508,79 @@ class NBCOlympicsIE(InfoExtractor):
class NBCOlympicsStreamIE(AdobePassIE):
IE_NAME = 'nbcolympics:stream'
_VALID_URL = r'https?://stream\.nbcolympics\.com/(?P<id>[0-9a-z-]+)'
- _TEST = {
- 'url': 'http://stream.nbcolympics.com/2018-winter-olympics-nbcsn-evening-feb-8',
- 'info_dict': {
- 'id': '203493',
- 'ext': 'mp4',
- 'title': 're:Curling, Alpine, Luge [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
+ _TESTS = [
+ {
+ 'note': 'Tokenized m3u8 source URL',
+ 'url': 'https://stream.nbcolympics.com/womens-soccer-group-round-11',
+ 'info_dict': {
+ 'id': '2019740',
+ 'ext': 'mp4',
+ 'title': r"re:Women's Group Stage - Netherlands vs\. Brazil [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$",
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }, {
+ 'note': 'Plain m3u8 source URL',
+ 'url': 'https://stream.nbcolympics.com/gymnastics-event-finals-mens-floor-pommel-horse-womens-vault-bars',
+ 'info_dict': {
+ 'id': '2021729',
+ 'ext': 'mp4',
+ 'title': r're:Event Finals: M Floor, W Vault, M Pommel, W Uneven Bars [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
},
- }
- _DATA_URL_TEMPLATE = 'http://stream.nbcolympics.com/data/%s_%s.json'
+ ]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
pid = self._search_regex(r'pid\s*=\s*(\d+);', webpage, 'pid')
- resource = self._search_regex(
- r"resource\s*=\s*'(.+)';", webpage,
- 'resource').replace("' + pid + '", pid)
+
event_config = self._download_json(
- self._DATA_URL_TEMPLATE % ('event_config', pid),
- pid)['eventConfig']
- title = self._live_title(event_config['eventTitle'])
+ f'http://stream.nbcolympics.com/data/event_config_{pid}.json',
+ pid, 'Downloading event config')['eventConfig']
+
+ title = event_config['eventTitle']
+ is_live = {'live': True, 'replay': False}.get(event_config.get('eventStatus'))
+ if is_live:
+ title = self._live_title(title)
+
source_url = self._download_json(
- self._DATA_URL_TEMPLATE % ('live_sources', pid),
- pid)['videoSources'][0]['sourceUrl']
- media_token = self._extract_mvpd_auth(
- url, pid, event_config.get('requestorId', 'NBCOlympics'), resource)
- formats = self._extract_m3u8_formats(self._download_webpage(
- 'http://sp.auth.adobe.com/tvs/v1/sign', pid, query={
- 'cdn': 'akamai',
- 'mediaToken': base64.b64encode(media_token.encode()),
- 'resource': base64.b64encode(resource.encode()),
- 'url': source_url,
- }), pid, 'mp4')
+ f'https://api-leap.nbcsports.com/feeds/assets/{pid}?application=NBCOlympics&platform=desktop&format=nbc-player&env=staging',
+ pid, 'Downloading leap config'
+ )['videoSources'][0]['cdnSources']['primary'][0]['sourceUrl']
+
+ if event_config.get('cdnToken'):
+ ap_resource = self._get_mvpd_resource(
+ event_config.get('resourceId', 'NBCOlympics'),
+ re.sub(r'[^\w\d ]+', '', event_config['eventTitle']), pid,
+ event_config.get('ratingId', 'NO VALUE'))
+ media_token = self._extract_mvpd_auth(url, pid, event_config.get('requestorId', 'NBCOlympics'), ap_resource)
+
+ source_url = self._download_json(
+ 'https://tokens.playmakerservices.com/', pid, 'Retrieving tokenized URL',
+ data=json.dumps({
+ 'application': 'NBCSports',
+ 'authentication-type': 'adobe-pass',
+ 'cdn': 'akamai',
+ 'pid': pid,
+ 'platform': 'desktop',
+ 'requestorId': 'NBCOlympics',
+ 'resourceId': base64.b64encode(ap_resource.encode()).decode(),
+ 'token': base64.b64encode(media_token.encode()).decode(),
+ 'url': source_url,
+ 'version': 'v1',
+ }).encode(),
+ )['akamai'][0]['tokenizedUrl']
+
+ formats = self._extract_m3u8_formats(source_url, pid, 'mp4', live=is_live)
+ for f in formats:
+ # -http_seekable requires ffmpeg 4.3+ but it doesn't seem possible to
+ # download with ffmpeg without this option
+ f['_ffmpeg_args'] = ['-seekable', '0', '-http_seekable', '0', '-icy', '0']
self._sort_formats(formats)
return {
@@ -521,5 +588,5 @@ class NBCOlympicsStreamIE(AdobePassIE):
'display_id': display_id,
'title': title,
'formats': formats,
- 'is_live': True,
+ 'is_live': is_live,
}
diff --git a/hypervideo_dl/extractor/ndr.py b/hypervideo_dl/extractor/ndr.py
index ddd828d..f2bae2c 100644
--- a/hypervideo_dl/extractor/ndr.py
+++ b/hypervideo_dl/extractor/ndr.py
@@ -1,135 +1,136 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
determine_ext,
int_or_none,
- merge_dicts,
- parse_iso8601,
+ parse_duration,
qualities,
try_get,
+ unified_strdate,
urljoin,
)
class NDRBaseIE(InfoExtractor):
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
display_id = next(group for group in mobj.groups() if group)
+ id = mobj.group('id')
webpage = self._download_webpage(url, display_id)
- return self._extract_embed(webpage, display_id)
+ return self._extract_embed(webpage, display_id, id)
class NDRIE(NDRBaseIE):
IE_NAME = 'ndr'
IE_DESC = 'NDR.de - Norddeutscher Rundfunk'
- _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P<id>[^/?#]+),[\da-z]+\.html'
+ _VALID_URL = r'https?://(?:www\.)?(?:daserste\.)?ndr\.de/(?:[^/]+/)*(?P<display_id>[^/?#]+),(?P<id>[\da-z]+)\.html'
_TESTS = [{
- # httpVideo, same content id
'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html',
- 'md5': '6515bc255dc5c5f8c85bbc38e035a659',
'info_dict': {
'id': 'hafengeburtstag988',
- 'display_id': 'Party-Poette-und-Parade',
'ext': 'mp4',
'title': 'Party, Pötte und Parade',
+ 'thumbnail': 'https://www.ndr.de/fernsehen/hafengeburtstag990_v-contentxl.jpg',
'description': 'md5:ad14f9d2f91d3040b6930c697e5f6b4c',
- 'uploader': 'ndrtv',
- 'timestamp': 1431108900,
- 'upload_date': '20150510',
+ 'series': None,
+ 'channel': 'NDR Fernsehen',
+ 'upload_date': '20150508',
'duration': 3498,
},
- 'params': {
- 'skip_download': True,
- },
}, {
- # httpVideo, different content id
- 'url': 'http://www.ndr.de/sport/fussball/40-Osnabrueck-spielt-sich-in-einen-Rausch,osna270.html',
- 'md5': '1043ff203eab307f0c51702ec49e9a71',
+ 'url': 'https://www.ndr.de/sport/fussball/Rostocks-Matchwinner-Froede-Ein-Hansa-Debuet-wie-im-Maerchen,hansa10312.html',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.ndr.de/nachrichten/niedersachsen/kommunalwahl_niedersachsen_2021/Grosse-Parteien-zufrieden-mit-Ergebnissen-der-Kommunalwahl,kommunalwahl1296.html',
'info_dict': {
- 'id': 'osna272',
- 'display_id': '40-Osnabrueck-spielt-sich-in-einen-Rausch',
+ 'id': 'kommunalwahl1296',
'ext': 'mp4',
- 'title': 'Osnabrück - Wehen Wiesbaden: Die Highlights',
- 'description': 'md5:32e9b800b3d2d4008103752682d5dc01',
- 'uploader': 'ndrtv',
- 'timestamp': 1442059200,
- 'upload_date': '20150912',
- 'duration': 510,
- },
- 'params': {
- 'skip_download': True,
+ 'title': 'Die Spitzenrunde: Die Wahl aus Sicht der Landespolitik',
+ 'thumbnail': 'https://www.ndr.de/fernsehen/screenshot1194912_v-contentxl.jpg',
+ 'description': 'md5:5c6e2ad744cef499135735a1036d7aa7',
+ 'series': 'Hallo Niedersachsen',
+ 'channel': 'NDR Fernsehen',
+ 'upload_date': '20210913',
+ 'duration': 438,
},
}, {
- # httpAudio, same content id
+ 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html',
+ 'info_dict': {
+ 'id': 'sendung1091858',
+ 'ext': 'mp4',
+ 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring',
+ 'thumbnail': 'https://www.ndr.de/fernsehen/screenshot983938_v-contentxl.jpg',
+ 'description': 'md5:700f6de264010585012a72f97b0ac0c9',
+ 'series': 'extra 3',
+ 'channel': 'NDR Fernsehen',
+ 'upload_date': '20201111',
+ 'duration': 1749,
+ }
+ }, {
'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html',
- 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8',
'info_dict': {
'id': 'audio51535',
- 'display_id': 'La-Valette-entgeht-der-Hinrichtung',
'ext': 'mp3',
'title': 'La Valette entgeht der Hinrichtung',
+ 'thumbnail': 'https://www.ndr.de/mediathek/mediathekbild140_v-podcast.jpg',
'description': 'md5:22f9541913a40fe50091d5cdd7c9f536',
- 'uploader': 'ndrinfo',
- 'timestamp': 1290626100,
'upload_date': '20140729',
- 'duration': 884,
- },
- 'params': {
- 'skip_download': True,
- },
- }, {
- # with subtitles
- 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html',
- 'info_dict': {
- 'id': 'extra18674',
- 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring',
- 'ext': 'mp4',
- 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring',
- 'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6',
- 'uploader': 'ndrtv',
- 'upload_date': '20201113',
- 'duration': 1749,
- 'subtitles': {
- 'de': [{
- 'ext': 'ttml',
- 'url': r're:^https://www\.ndr\.de.+',
- }],
- },
+ 'duration': 884.0,
},
- 'params': {
- 'skip_download': True,
- },
- 'expected_warnings': ['Unable to download f4m manifest'],
- }, {
- 'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html',
- 'only_matching': True,
+ 'expected_warnings': ['unable to extract json url'],
}]
- def _extract_embed(self, webpage, display_id):
- embed_url = self._html_search_meta(
- 'embedURL', webpage, 'embed URL',
- default=None) or self._search_regex(
- r'\bembedUrl["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
- 'embed URL', group='url')
- description = self._search_regex(
- r'<p[^>]+itemprop="description">([^<]+)</p>',
- webpage, 'description', default=None) or self._og_search_description(webpage)
- timestamp = parse_iso8601(
- self._search_regex(
- r'<span[^>]+itemprop="(?:datePublished|uploadDate)"[^>]+content="([^"]+)"',
- webpage, 'upload date', default=None))
- info = self._search_json_ld(webpage, display_id, default={})
- return merge_dicts({
- '_type': 'url_transparent',
- 'url': embed_url,
- 'display_id': display_id,
- 'description': description,
- 'timestamp': timestamp,
- }, info)
+ def _extract_embed(self, webpage, display_id, id):
+ formats = []
+ base_url = 'https://www.ndr.de'
+ json_url = self._search_regex(r'<iframe[^>]+src=\"([^\"]+)_theme-ndrde[^\.]*\.html\"', webpage,
+ 'json url', fatal=False)
+ if json_url:
+ data_json = self._download_json(base_url + json_url.replace('ardplayer_image', 'ardjson_image') + '.json',
+ id, fatal=False)
+ info_json = data_json.get('_info', {})
+ media_json = try_get(data_json, lambda x: x['_mediaArray'][0]['_mediaStreamArray'])
+ for media in media_json:
+ if media.get('_quality') == 'auto':
+ formats.extend(self._extract_m3u8_formats(media['_stream'], id))
+ subtitles = {}
+ sub_url = data_json.get('_subtitleUrl')
+ if sub_url:
+ subtitles.setdefault('de', []).append({
+ 'url': base_url + sub_url,
+ })
+ self._sort_formats(formats)
+ return {
+ 'id': id,
+ 'title': info_json.get('clipTitle'),
+ 'thumbnail': urljoin(base_url, data_json.get('_previewImage')),
+ 'description': info_json.get('clipDescription'),
+ 'series': info_json.get('seriesTitle') or None,
+ 'channel': info_json.get('channelTitle'),
+ 'upload_date': unified_strdate(info_json.get('clipDate')),
+ 'duration': data_json.get('_duration'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+ else:
+ json_url = base_url + self._search_regex(r'apiUrl\s?=\s?\'([^\']+)\'', webpage, 'json url').replace(
+ '_belongsToPodcast-', '')
+ data_json = self._download_json(json_url, id, fatal=False)
+ return {
+ 'id': id,
+ 'title': data_json.get('title'),
+ 'thumbnail': urljoin(base_url, data_json.get('poster')),
+ 'description': data_json.get('summary'),
+ 'upload_date': unified_strdate(data_json.get('publicationDate')),
+ 'duration': parse_duration(data_json.get('duration')),
+ 'formats': [{
+ 'url': try_get(data_json, (lambda x: x['audio'][0]['url'], lambda x: x['files'][0]['url'])),
+ 'vcodec': 'none',
+ 'ext': 'mp3',
+ }],
+ }
class NJoyIE(NDRBaseIE):
@@ -175,7 +176,7 @@ class NJoyIE(NDRBaseIE):
'only_matching': True,
}]
- def _extract_embed(self, webpage, display_id):
+ def _extract_embed(self, webpage, display_id, id):
video_id = self._search_regex(
r'<iframe[^>]+id="pp_([\da-z]+)"', webpage, 'embed id')
description = self._search_regex(
@@ -202,7 +203,7 @@ class NDREmbedBaseIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id') or mobj.group('id_s')
ppjson = self._download_json(
@@ -291,7 +292,7 @@ class NDREmbedBaseIE(InfoExtractor):
class NDREmbedIE(NDREmbedBaseIE):
IE_NAME = 'ndr:embed'
- _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)\.html'
+ _VALID_URL = r'https?://(?:www\.)?(?:daserste\.)?ndr\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)\.html'
_TESTS = [{
'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html',
'md5': '8b9306142fe65bbdefb5ce24edb6b0a9',
diff --git a/hypervideo_dl/extractor/nebula.py b/hypervideo_dl/extractor/nebula.py
new file mode 100644
index 0000000..9698a35
--- /dev/null
+++ b/hypervideo_dl/extractor/nebula.py
@@ -0,0 +1,238 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import time
+
+from urllib.error import HTTPError
+from .common import InfoExtractor
+from ..compat import compat_str, compat_urllib_parse_unquote, compat_urllib_parse_quote
+from ..utils import (
+ ExtractorError,
+ parse_iso8601,
+ try_get,
+ urljoin,
+)
+
+
+class NebulaIE(InfoExtractor):
+
+ _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/videos/(?P<id>[-\w]+)'
+ _TESTS = [
+ {
+ 'url': 'https://nebula.app/videos/that-time-disney-remade-beauty-and-the-beast',
+ 'md5': 'fe79c4df8b3aa2fea98a93d027465c7e',
+ 'info_dict': {
+ 'id': '5c271b40b13fd613090034fd',
+ 'ext': 'mp4',
+ 'title': 'That Time Disney Remade Beauty and the Beast',
+ 'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.',
+ 'upload_date': '20180731',
+ 'timestamp': 1533009600,
+ 'channel': 'Lindsay Ellis',
+ 'uploader': 'Lindsay Ellis',
+ },
+ 'params': {
+ 'usenetrc': True,
+ },
+ 'skip': 'All Nebula content requires authentication',
+ },
+ {
+ 'url': 'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
+ 'md5': '6d4edd14ce65720fa63aba5c583fb328',
+ 'info_dict': {
+ 'id': '5e7e78171aaf320001fbd6be',
+ 'ext': 'mp4',
+ 'title': 'Landing Craft - How The Allies Got Ashore',
+ 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
+ 'upload_date': '20200327',
+ 'timestamp': 1585348140,
+ 'channel': 'The Logistics of D-Day',
+ 'uploader': 'The Logistics of D-Day',
+ },
+ 'params': {
+ 'usenetrc': True,
+ },
+ 'skip': 'All Nebula content requires authentication',
+ },
+ {
+ 'url': 'https://nebula.app/videos/money-episode-1-the-draw',
+ 'md5': '8c7d272910eea320f6f8e6d3084eecf5',
+ 'info_dict': {
+ 'id': '5e779ebdd157bc0001d1c75a',
+ 'ext': 'mp4',
+ 'title': 'Episode 1: The Draw',
+ 'description': r'contains:There’s free money on offer… if the players can all work together.',
+ 'upload_date': '20200323',
+ 'timestamp': 1584980400,
+ 'channel': 'Tom Scott Presents: Money',
+ 'uploader': 'Tom Scott Presents: Money',
+ },
+ 'params': {
+ 'usenetrc': True,
+ },
+ 'skip': 'All Nebula content requires authentication',
+ },
+ {
+ 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
+ 'only_matching': True,
+ },
+ ]
+ _NETRC_MACHINE = 'watchnebula'
+
+ _nebula_token = None
+
+ def _retrieve_nebula_auth(self):
+ """
+ Log in to Nebula and return a Nebula API token
+ """
+
+ username, password = self._get_login_info()
+ if not (username and password):
+ self.raise_login_required()
+
+ self.report_login()
+ data = json.dumps({'email': username, 'password': password}).encode('utf8')
+ response = self._download_json(
+ 'https://api.watchnebula.com/api/v1/auth/login/',
+ data=data, fatal=False, video_id=None,
+ headers={
+ 'content-type': 'application/json',
+ # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint
+ 'cookie': ''
+ },
+ note='Authenticating to Nebula with supplied credentials',
+ errnote='Authentication failed or rejected')
+ if not response or not response.get('key'):
+ self.raise_login_required()
+
+ # save nebula token as cookie
+ self._set_cookie(
+ 'nebula.app', 'nebula-auth',
+ compat_urllib_parse_quote(
+ json.dumps({
+ "apiToken": response["key"],
+ "isLoggingIn": False,
+ "isLoggingOut": False,
+ }, separators=(",", ":"))),
+ expire_time=int(time.time()) + 86400 * 365,
+ )
+
+ return response['key']
+
+ def _retrieve_zype_api_key(self, page_url, display_id):
+ """
+ Retrieves the Zype API key
+ """
+
+ # Find the js that has the API key from the webpage and download it
+ webpage = self._download_webpage(page_url, video_id=display_id)
+ main_script_relpath = self._search_regex(
+ r'<script[^>]*src="(?P<script_relpath>[^"]*main.[0-9a-f]*.chunk.js)"[^>]*>', webpage,
+ group='script_relpath', name='script relative path', fatal=True)
+ main_script_abspath = urljoin(page_url, main_script_relpath)
+ main_script = self._download_webpage(main_script_abspath, video_id=display_id,
+ note='Retrieving Zype API key')
+
+ api_key = self._search_regex(
+ r'REACT_APP_ZYPE_API_KEY\s*:\s*"(?P<api_key>[\w-]*)"', main_script,
+ group='api_key', name='API key', fatal=True)
+
+ return api_key
+
+ def _call_zype_api(self, path, params, video_id, api_key, note):
+ """
+ A helper for making calls to the Zype API.
+ """
+ query = {'api_key': api_key, 'per_page': 1}
+ query.update(params)
+ return self._download_json('https://api.zype.com' + path, video_id, query=query, note=note)
+
+ def _call_nebula_api(self, path, video_id, access_token, note):
+ """
+ A helper for making calls to the Nebula API.
+ """
+ return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={
+ 'Authorization': 'Token {access_token}'.format(access_token=access_token)
+ }, note=note)
+
+ def _fetch_zype_access_token(self, video_id):
+ try:
+ user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token')
+ except ExtractorError as exc:
+ # if 401, attempt credential auth and retry
+ if exc.cause and isinstance(exc.cause, HTTPError) and exc.cause.code == 401:
+ self._nebula_token = self._retrieve_nebula_auth()
+ user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token')
+ else:
+ raise
+
+ access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str)
+ if not access_token:
+ if try_get(user_object, lambda x: x['is_subscribed'], bool):
+ # TODO: Reimplement the same Zype token polling the Nebula frontend implements
+ # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532
+ raise ExtractorError(
+ 'Unable to extract Zype access token from Nebula API authentication endpoint. '
+ 'Open an arbitrary video in a browser with this account to generate a token',
+ expected=True)
+ raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint')
+ return access_token
+
+ def _extract_channel_title(self, video_meta):
+ # TODO: Implement the API calls giving us the channel list,
+ # so that we can do the title lookup and then figure out the channel URL
+ categories = video_meta.get('categories', []) if video_meta else []
+ # the channel name is the value of the first category
+ for category in categories:
+ if category.get('value'):
+ return category['value'][0]
+
+ def _real_initialize(self):
+ # check cookie jar for valid token
+ nebula_cookies = self._get_cookies('https://nebula.app')
+ nebula_cookie = nebula_cookies.get('nebula-auth')
+ if nebula_cookie:
+ self.to_screen('Authenticating to Nebula with token from cookie jar')
+ nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value)
+ self._nebula_token = self._parse_json(nebula_cookie_value, None).get('apiToken')
+
+ # try to authenticate using credentials if no valid token has been found
+ if not self._nebula_token:
+ self._nebula_token = self._retrieve_nebula_auth()
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ api_key = self._retrieve_zype_api_key(url, display_id)
+
+ response = self._call_zype_api('/videos', {'friendly_title': display_id},
+ display_id, api_key, note='Retrieving metadata from Zype')
+ if len(response.get('response') or []) != 1:
+ raise ExtractorError('Unable to find video on Zype API')
+ video_meta = response['response'][0]
+
+ video_id = video_meta['_id']
+ zype_access_token = self._fetch_zype_access_token(display_id)
+
+ channel_title = self._extract_channel_title(video_meta)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ '_type': 'url_transparent',
+ 'ie_key': 'Zype',
+ 'url': 'https://player.zype.com/embed/%s.html?access_token=%s' % (video_id, zype_access_token),
+ 'title': video_meta.get('title'),
+ 'description': video_meta.get('description'),
+ 'timestamp': parse_iso8601(video_meta.get('published_at')),
+ 'thumbnails': [{
+ 'id': tn.get('name'), # this appears to be null
+ 'url': tn['url'],
+ 'width': tn.get('width'),
+ 'height': tn.get('height'),
+ } for tn in video_meta.get('thumbnails', [])],
+ 'duration': video_meta.get('duration'),
+ 'channel': channel_title,
+ 'uploader': channel_title, # we chose uploader = channel name
+ # TODO: uploader_url, channel_id, channel_url
+ }
diff --git a/hypervideo_dl/extractor/neteasemusic.py b/hypervideo_dl/extractor/neteasemusic.py
index 978a058..7652371 100644
--- a/hypervideo_dl/extractor/neteasemusic.py
+++ b/hypervideo_dl/extractor/neteasemusic.py
@@ -405,7 +405,7 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE):
name = info['name']
description = info['description']
- if not info['songs'] or self._downloader.params.get('noplaylist'):
+ if not info['songs'] or self.get_param('noplaylist'):
if info['songs']:
self.to_screen(
'Downloading just the main audio %s because of --no-playlist'
diff --git a/hypervideo_dl/extractor/netzkino.py b/hypervideo_dl/extractor/netzkino.py
index aec3026..4ad0d8e 100644
--- a/hypervideo_dl/extractor/netzkino.py
+++ b/hypervideo_dl/extractor/netzkino.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -13,17 +12,16 @@ from ..utils import (
class NetzkinoIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/(?P<category>[^/]+)/(?P<id>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/[^/]+/(?P<id>[^/]+)'
- _TEST = {
- 'url': 'http://www.netzkino.de/#!/scifikino/rakete-zum-mond',
+ _TESTS = [{
+ 'url': 'https://www.netzkino.de/#!/scifikino/rakete-zum-mond',
'md5': '92a3f8b76f8d7220acce5377ea5d4873',
'info_dict': {
'id': 'rakete-zum-mond',
'ext': 'mp4',
- 'title': 'Rakete zum Mond (Endstation Mond, Destination Moon)',
- 'comments': 'mincount:3',
- 'description': 'md5:1eddeacc7e62d5a25a2d1a7290c64a28',
+ 'title': 'Rakete zum Mond \u2013 Jules Verne',
+ 'description': 'md5:f0a8024479618ddbfa450ff48ffa6c60',
'upload_date': '20120813',
'thumbnail': r're:https?://.*\.jpg$',
'timestamp': 1344858571,
@@ -32,17 +30,30 @@ class NetzkinoIE(InfoExtractor):
'params': {
'skip_download': 'Download only works from Germany',
}
- }
+ }, {
+ 'url': 'https://www.netzkino.de/#!/filme/dr-jekyll-mrs-hyde-2',
+ 'md5': 'c7728b2dadd04ff6727814847a51ef03',
+ 'info_dict': {
+ 'id': 'dr-jekyll-mrs-hyde-2',
+ 'ext': 'mp4',
+ 'title': 'Dr. Jekyll & Mrs. Hyde 2',
+ 'description': 'md5:c2e9626ebd02de0a794b95407045d186',
+ 'upload_date': '20190130',
+ 'thumbnail': r're:https?://.*\.jpg$',
+ 'timestamp': 1548849437,
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': 'Download only works from Germany',
+ }
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- category_id = mobj.group('category')
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
- api_url = 'http://api.netzkino.de.simplecache.net/capi-2.0a/categories/%s.json?d=www' % category_id
- api_info = self._download_json(api_url, video_id)
- info = next(
- p for p in api_info['posts'] if p['slug'] == video_id)
+ api_url = 'https://api.netzkino.de.simplecache.net/capi-2.0a/movies/%s.json?d=www' % video_id
+ info = self._download_json(api_url, video_id)
custom_fields = info['custom_fields']
production_js = self._download_webpage(
@@ -67,23 +78,12 @@ class NetzkinoIE(InfoExtractor):
} for key, tpl in templates.items()]
self._sort_formats(formats)
- comments = [{
- 'timestamp': parse_iso8601(c.get('date'), delimiter=' '),
- 'id': c['id'],
- 'author': c['name'],
- 'html': c['content'],
- 'parent': 'root' if c.get('parent', 0) == 0 else c['parent'],
- } for c in info.get('comments', [])]
-
return {
'id': video_id,
'formats': formats,
- 'comments': comments,
'title': info['title'],
'age_limit': int_or_none(custom_fields.get('FSK')[0]),
'timestamp': parse_iso8601(info.get('date'), delimiter=' '),
'description': clean_html(info.get('content')),
'thumbnail': info.get('thumbnail'),
- 'playlist_title': api_info.get('title'),
- 'playlist_id': category_id,
}
diff --git a/hypervideo_dl/extractor/newgrounds.py b/hypervideo_dl/extractor/newgrounds.py
index 82e7cf5..bbbd9e8 100644
--- a/hypervideo_dl/extractor/newgrounds.py
+++ b/hypervideo_dl/extractor/newgrounds.py
@@ -1,19 +1,23 @@
+# coding: utf-8
from __future__ import unicode_literals
+import functools
import re
from .common import InfoExtractor
from ..utils import (
extract_attributes,
int_or_none,
+ parse_count,
parse_duration,
- parse_filesize,
unified_timestamp,
+ OnDemandPagedList,
+ try_get,
)
class NewgroundsIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:audio/listen|portal/view)/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:audio/listen|portal/view)/(?P<id>\d+)(?:/format/flash)?'
_TESTS = [{
'url': 'https://www.newgrounds.com/audio/listen/549479',
'md5': 'fe6033d297591288fa1c1f780386f07a',
@@ -25,17 +29,20 @@ class NewgroundsIE(InfoExtractor):
'timestamp': 1378878540,
'upload_date': '20130911',
'duration': 143,
+ 'description': 'md5:6d885138814015dfd656c2ddb00dacfc',
},
}, {
- 'url': 'https://www.newgrounds.com/portal/view/673111',
- 'md5': '3394735822aab2478c31b1004fe5e5bc',
+ 'url': 'https://www.newgrounds.com/portal/view/1',
+ 'md5': 'fbfb40e2dc765a7e830cb251d370d981',
'info_dict': {
- 'id': '673111',
+ 'id': '1',
'ext': 'mp4',
- 'title': 'Dancin',
- 'uploader': 'Squirrelman82',
- 'timestamp': 1460256780,
- 'upload_date': '20160410',
+ 'title': 'Scrotum 1',
+ 'uploader': 'Brian-Beaton',
+ 'timestamp': 955064100,
+ 'upload_date': '20000406',
+ 'description': 'Scrotum plays "catch."',
+ 'age_limit': 17,
},
}, {
# source format unavailable, additional mp4 formats
@@ -44,70 +51,123 @@ class NewgroundsIE(InfoExtractor):
'id': '689400',
'ext': 'mp4',
'title': 'ZTV News Episode 8',
- 'uploader': 'BennettTheSage',
+ 'uploader': 'ZONE-SAMA',
'timestamp': 1487965140,
'upload_date': '20170224',
+ 'description': 'ZTV News Episode 8 (February 2017)',
+ 'age_limit': 17,
},
'params': {
'skip_download': True,
},
+ }, {
+ 'url': 'https://www.newgrounds.com/portal/view/297383',
+ 'md5': '2c11f5fd8cb6b433a63c89ba3141436c',
+ 'info_dict': {
+ 'id': '297383',
+ 'ext': 'mp4',
+ 'title': 'Metal Gear Awesome',
+ 'uploader': 'Egoraptor',
+ 'timestamp': 1140663240,
+ 'upload_date': '20060223',
+ 'description': 'Metal Gear is awesome is so is this movie.',
+ 'age_limit': 13,
+ }
+ }, {
+ 'url': 'https://www.newgrounds.com/portal/view/297383/format/flash',
+ 'md5': '5d05585a9a0caca059f5abfbd3865524',
+ 'info_dict': {
+ 'id': '297383',
+ 'ext': 'swf',
+ 'title': 'Metal Gear Awesome',
+ 'description': 'Metal Gear is awesome is so is this movie.',
+ 'uploader': 'Egoraptor',
+ 'upload_date': '20060223',
+ 'timestamp': 1140663240,
+ 'age_limit': 13,
+ }
}]
+ _AGE_LIMIT = {
+ 'e': 0,
+ 't': 13,
+ 'm': 17,
+ 'a': 18,
+ }
def _real_extract(self, url):
media_id = self._match_id(url)
-
+ formats = []
+ uploader = None
webpage = self._download_webpage(url, media_id)
title = self._html_search_regex(
- r'<title>([^>]+)</title>', webpage, 'title')
+ r'<title>(.+?)</title>', webpage, 'title')
- media_url = self._parse_json(self._search_regex(
- r'"url"\s*:\s*("[^"]+"),', webpage, ''), media_id)
+ media_url_string = self._search_regex(
+ r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None)
- formats = [{
- 'url': media_url,
- 'format_id': 'source',
- 'quality': 1,
- }]
+ if media_url_string:
+ media_url = self._parse_json(media_url_string, media_id)
+ formats = [{
+ 'url': media_url,
+ 'format_id': 'source',
+ 'quality': 1,
+ }]
+ else:
+ json_video = self._download_json('https://www.newgrounds.com/portal/video/' + media_id, media_id, headers={
+ 'Accept': 'application/json',
+ 'Referer': url,
+ 'X-Requested-With': 'XMLHttpRequest'
+ })
- max_resolution = int_or_none(self._search_regex(
- r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution',
- default=None))
- if max_resolution:
- url_base = media_url.rpartition('.')[0]
- for resolution in (360, 720, 1080):
- if resolution > max_resolution:
- break
- formats.append({
- 'url': '%s.%dp.mp4' % (url_base, resolution),
- 'format_id': '%dp' % resolution,
- 'height': resolution,
- })
+ uploader = json_video.get('author')
+ media_formats = json_video.get('sources', [])
+ for media_format in media_formats:
+ media_sources = media_formats[media_format]
+ for source in media_sources:
+ formats.append({
+ 'format_id': media_format,
+ 'quality': int_or_none(media_format[:-1]),
+ 'url': source.get('src')
+ })
- self._check_formats(formats, media_id)
- self._sort_formats(formats)
+ if not uploader:
+ uploader = self._html_search_regex(
+ (r'(?s)<h4[^>]*>(.+?)</h4>.*?<em>\s*(?:Author|Artist)\s*</em>',
+ r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader',
+ fatal=False)
- uploader = self._html_search_regex(
- (r'(?s)<h4[^>]*>(.+?)</h4>.*?<em>\s*Author\s*</em>',
- r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader',
- fatal=False)
+ age_limit = self._html_search_regex(
+ r'<h2\s*class=["\']rated-([^"\'])["\'][^>]+>', webpage, 'age_limit', default='e')
+ age_limit = self._AGE_LIMIT.get(age_limit)
timestamp = unified_timestamp(self._html_search_regex(
(r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+</dd>\s*<dd>[^<]+)',
r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+)'), webpage, 'timestamp',
default=None))
- duration = parse_duration(self._search_regex(
- r'(?s)<dd>\s*Song\s*</dd>\s*<dd>.+?</dd>\s*<dd>([^<]+)', webpage,
+ duration = parse_duration(self._html_search_regex(
+ r'"duration"\s*:\s*["\']?(\d+)["\']?', webpage,
'duration', default=None))
- filesize_approx = parse_filesize(self._html_search_regex(
- r'(?s)<dd>\s*Song\s*</dd>\s*<dd>(.+?)</dd>', webpage, 'filesize',
+ view_count = parse_count(self._html_search_regex(
+ r'(?s)<dt>\s*(?:Views|Listens)\s*</dt>\s*<dd>([\d\.,]+)</dd>', webpage,
+ 'view count', default=None))
+
+ filesize = int_or_none(self._html_search_regex(
+ r'"filesize"\s*:\s*["\']?([\d]+)["\']?,', webpage, 'filesize',
default=None))
+
+ video_type_description = self._html_search_regex(
+ r'"description"\s*:\s*["\']?([^"\']+)["\']?,', webpage, 'description',
+ default=None)
+
if len(formats) == 1:
- formats[0]['filesize_approx'] = filesize_approx
+ formats[0]['filesize'] = filesize
- if '<dd>Song' in webpage:
+ if video_type_description == 'Audio File':
formats[0]['vcodec'] = 'none'
+ self._check_formats(formats, media_id)
+ self._sort_formats(formats)
return {
'id': media_id,
@@ -116,10 +176,15 @@ class NewgroundsIE(InfoExtractor):
'timestamp': timestamp,
'duration': duration,
'formats': formats,
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'description': self._og_search_description(webpage),
+ 'age_limit': age_limit,
+ 'view_count': view_count,
}
class NewgroundsPlaylistIE(InfoExtractor):
+ IE_NAME = 'Newgrounds:playlist'
_VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:collection|[^/]+/search/[^/]+)/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.newgrounds.com/collection/cats',
@@ -127,14 +192,14 @@ class NewgroundsPlaylistIE(InfoExtractor):
'id': 'cats',
'title': 'Cats',
},
- 'playlist_mincount': 46,
+ 'playlist_mincount': 45,
}, {
- 'url': 'http://www.newgrounds.com/portal/search/author/ZONE-SAMA',
+ 'url': 'https://www.newgrounds.com/collection/dogs',
'info_dict': {
- 'id': 'ZONE-SAMA',
- 'title': 'Portal Search: ZONE-SAMA',
+ 'id': 'dogs',
+ 'title': 'Dogs',
},
- 'playlist_mincount': 47,
+ 'playlist_mincount': 26,
}, {
'url': 'http://www.newgrounds.com/audio/search/title/cats',
'only_matching': True,
@@ -155,14 +220,64 @@ class NewgroundsPlaylistIE(InfoExtractor):
entries = []
for a, path, media_id in re.findall(
- r'(<a[^>]+\bhref=["\']/?((?:portal/view|audio/listen)/(\d+))[^>]+>)',
+ r'(<a[^>]+\bhref=["\'][^"\']+((?:portal/view|audio/listen)/(\d+))[^>]+>)',
webpage):
a_class = extract_attributes(a).get('class')
if a_class not in ('item-portalsubmission', 'item-audiosubmission'):
continue
entries.append(
self.url_result(
- 'https://www.newgrounds.com/%s' % path,
+ f'https://www.newgrounds.com/{path}',
ie=NewgroundsIE.ie_key(), video_id=media_id))
return self.playlist_result(entries, playlist_id, title)
+
+
+class NewgroundsUserIE(InfoExtractor):
+ IE_NAME = 'Newgrounds:user'
+ _VALID_URL = r'https?://(?P<id>[^\.]+)\.newgrounds\.com/(?:movies|audio)/?(?:[#?]|$)'
+ _TESTS = [{
+ 'url': 'https://burn7.newgrounds.com/audio',
+ 'info_dict': {
+ 'id': 'burn7',
+ },
+ 'playlist_mincount': 150,
+ }, {
+ 'url': 'https://burn7.newgrounds.com/movies',
+ 'info_dict': {
+ 'id': 'burn7',
+ },
+ 'playlist_mincount': 2,
+ }, {
+ 'url': 'https://brian-beaton.newgrounds.com/movies',
+ 'info_dict': {
+ 'id': 'brian-beaton',
+ },
+ 'playlist_mincount': 10,
+ }]
+ _PAGE_SIZE = 30
+
+ def _fetch_page(self, channel_id, url, page):
+ page += 1
+ posts_info = self._download_json(
+ f'{url}/page/{page}', channel_id,
+ note=f'Downloading page {page}', headers={
+ 'Accept': 'application/json, text/javascript, */*; q = 0.01',
+ 'X-Requested-With': 'XMLHttpRequest',
+ })
+ sequence = posts_info.get('sequence', [])
+ for year in sequence:
+ posts = try_get(posts_info, lambda x: x['years'][str(year)]['items'])
+ for post in posts:
+ path, media_id = self._search_regex(
+ r'<a[^>]+\bhref=["\'][^"\']+((?:portal/view|audio/listen)/(\d+))[^>]+>',
+ post, 'url', group=(1, 2))
+ yield self.url_result(f'https://www.newgrounds.com/{path}', NewgroundsIE.ie_key(), media_id)
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+
+ entries = OnDemandPagedList(functools.partial(
+ self._fetch_page, channel_id, url), self._PAGE_SIZE)
+
+ return self.playlist_result(entries, channel_id)
diff --git a/hypervideo_dl/extractor/nexx.py b/hypervideo_dl/extractor/nexx.py
index 586c1b7..860d636 100644
--- a/hypervideo_dl/extractor/nexx.py
+++ b/hypervideo_dl/extractor/nexx.py
@@ -289,7 +289,7 @@ class NexxIE(InfoExtractor):
return formats
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
domain_id = mobj.group('domain_id') or mobj.group('domain_id_s')
video_id = mobj.group('id')
diff --git a/hypervideo_dl/extractor/nfhsnetwork.py b/hypervideo_dl/extractor/nfhsnetwork.py
new file mode 100644
index 0000000..802f6ca
--- /dev/null
+++ b/hypervideo_dl/extractor/nfhsnetwork.py
@@ -0,0 +1,144 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+from ..utils import (
+ try_get,
+ unified_strdate,
+ unified_timestamp
+)
+
+
+class NFHSNetworkIE(InfoExtractor):
+ IE_NAME = 'NFHSNetwork'
+ _VALID_URL = r'https?://(?:www\.)?nfhsnetwork\.com/events/[\w-]+/(?P<id>(?:gam|evt|dd|)?[\w\d]{0,10})'
+ _TESTS = [{
+ # Auto-generated two-team sport (pixellot)
+ 'url': 'https://www.nfhsnetwork.com/events/rockford-high-school-rockford-mi/gamcf7e54cfbc',
+ 'info_dict': {
+ 'id': 'gamcf7e54cfbc',
+ 'ext': 'mp4',
+ 'title': 'Rockford vs Spring Lake - Girls Varsity Lacrosse 03/27/2021',
+ 'uploader': 'MHSAA - Michigan: Rockford High School, Rockford, MI',
+ 'uploader_id': 'cd2622cf76',
+ 'uploader_url': 'https://www.nfhsnetwork.com/schools/rockford-high-school-rockford-mi',
+ 'location': 'Rockford, Michigan',
+ 'timestamp': 1616859000,
+ 'upload_date': '20210327'
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ }
+ }, {
+ # Non-sport activity with description
+ 'url': 'https://www.nfhsnetwork.com/events/limon-high-school-limon-co/evt4a30e3726c',
+ 'info_dict': {
+ 'id': 'evt4a30e3726c',
+ 'ext': 'mp4',
+ 'title': 'Drama Performance Limon High School vs. Limon High School - 12/13/2020',
+ 'description': 'Join the broadcast of the Limon High School Musical Performance at 2 PM.',
+ 'uploader': 'CHSAA: Limon High School, Limon, CO',
+ 'uploader_id': '7d2d121332',
+ 'uploader_url': 'https://www.nfhsnetwork.com/schools/limon-high-school-limon-co',
+ 'location': 'Limon, Colorado',
+ 'timestamp': 1607893200,
+ 'upload_date': '20201213'
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ }
+ }, {
+ # Postseason game
+ 'url': 'https://www.nfhsnetwork.com/events/nfhs-network-special-events/dd8de71d45',
+ 'info_dict': {
+ 'id': 'dd8de71d45',
+ 'ext': 'mp4',
+ 'title': '2015 UA Holiday Classic Tournament: National Division - 12/26/2015',
+ 'uploader': 'SoCal Sports Productions',
+ 'uploader_id': '063dba0150',
+ 'uploader_url': 'https://www.nfhsnetwork.com/affiliates/socal-sports-productions',
+ 'location': 'San Diego, California',
+ 'timestamp': 1451187000,
+ 'upload_date': '20151226'
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ }
+ }, {
+ # Video with no broadcasts object
+ 'url': 'https://www.nfhsnetwork.com/events/wiaa-wi/9aa2f92f82',
+ 'info_dict': {
+ 'id': '9aa2f92f82',
+ 'ext': 'mp4',
+ 'title': 'Competitive Equity - 01/21/2015',
+ 'description': 'Committee members discuss points of their research regarding a competitive equity plan',
+ 'uploader': 'WIAA - Wisconsin: Wisconsin Interscholastic Athletic Association',
+ 'uploader_id': 'a49f7d1002',
+ 'uploader_url': 'https://www.nfhsnetwork.com/associations/wiaa-wi',
+ 'location': 'Stevens Point, Wisconsin',
+ 'timestamp': 1421856000,
+ 'upload_date': '20150121'
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ }
+ }
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ data = self._download_json(
+ 'https://cfunity.nfhsnetwork.com/v2/game_or_event/' + video_id,
+ video_id)
+ publisher = data.get('publishers')[0] # always exists
+ broadcast = (publisher.get('broadcasts') or publisher.get('vods'))[0] # some (older) videos don't have a broadcasts object
+ uploader = publisher.get('formatted_name') or publisher.get('name')
+ uploaderID = publisher.get('publisher_key')
+ pubType = publisher.get('type')
+ uploaderPrefix = (
+ "schools" if pubType == "school"
+ else "associations" if "association" in pubType
+ else "affiliates" if (pubType == "publisher" or pubType == "affiliate")
+ else "schools")
+ uploaderPage = 'https://www.nfhsnetwork.com/%s/%s' % (uploaderPrefix, publisher.get('slug'))
+ location = '%s, %s' % (data.get('city'), data.get('state_name'))
+ description = broadcast.get('description')
+ isLive = broadcast.get('on_air') or broadcast.get('status') == 'on_air' or False
+
+ timestamp = unified_timestamp(data.get('local_start_time'))
+ upload_date = unified_strdate(data.get('local_start_time'))
+
+ title = (
+ self._og_search_title(webpage)
+ or self._html_search_regex(r'<h1 class="sr-hidden">(.*?)</h1>', webpage, 'title'))
+ title = title.split('|')[0].strip()
+
+ video_type = 'broadcasts' if isLive else 'vods'
+ key = broadcast.get('key') if isLive else try_get(publisher, lambda x: x['vods'][0]['key'])
+ m3u8_url = self._download_json(
+ 'https://cfunity.nfhsnetwork.com/v2/%s/%s/url' % (video_type, key),
+ video_id).get('video_url')
+
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', live=isLive)
+ self._sort_formats(formats, ['res', 'tbr'])
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'description': description,
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'uploader_id': uploaderID,
+ 'uploader_url': uploaderPage,
+ 'location': location,
+ 'upload_date': upload_date,
+ 'is_live': isLive
+ }
diff --git a/hypervideo_dl/extractor/nhk.py b/hypervideo_dl/extractor/nhk.py
index 8a9331a..950a3d0 100644
--- a/hypervideo_dl/extractor/nhk.py
+++ b/hypervideo_dl/extractor/nhk.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import urljoin
@@ -22,7 +21,7 @@ class NhkBaseIE(InfoExtractor):
def _extract_episode_info(self, url, episode=None):
fetch_episode = episode is None
- lang, m_type, episode_id = re.match(NhkVodIE._VALID_URL, url).groups()
+ lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups()
if episode_id.isdigit():
episode_id = episode_id[:4] + '-' + episode_id[4:]
@@ -158,7 +157,7 @@ class NhkVodProgramIE(NhkBaseIE):
}]
def _real_extract(self, url):
- lang, m_type, program_id, episode_type = re.match(self._VALID_URL, url).groups()
+ lang, m_type, program_id, episode_type = self._match_valid_url(url).groups()
episodes = self._call_api(
program_id, lang, m_type == 'video', False, episode_type == 'clip')
diff --git a/hypervideo_dl/extractor/nhl.py b/hypervideo_dl/extractor/nhl.py
index eddfe1f..d3a5e17 100644
--- a/hypervideo_dl/extractor/nhl.py
+++ b/hypervideo_dl/extractor/nhl.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import compat_str
@@ -14,7 +13,7 @@ from ..utils import (
class NHLBaseIE(InfoExtractor):
def _real_extract(self, url):
- site, tmp_id = re.match(self._VALID_URL, url).groups()
+ site, tmp_id = self._match_valid_url(url).groups()
video_data = self._download_json(
'https://%s/%s/%sid/v1/%s/details/web-v1.json'
% (self._CONTENT_DOMAIN, site[:3], 'item/' if site == 'mlb' else '', tmp_id), tmp_id)
diff --git a/hypervideo_dl/extractor/nick.py b/hypervideo_dl/extractor/nick.py
index 2e8b302..ba7da76 100644
--- a/hypervideo_dl/extractor/nick.py
+++ b/hypervideo_dl/extractor/nick.py
@@ -1,66 +1,73 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .mtv import MTVServicesInfoExtractor
from ..utils import update_url_query
class NickIE(MTVServicesInfoExtractor):
- # None of videos on the website are still alive?
IE_NAME = 'nick.com'
- _VALID_URL = r'https?://(?P<domain>(?:(?:www|beta)\.)?nick(?:jr)?\.com)/(?:[^/]+/)?(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)'
+ _VALID_URL = r'https?://(?P<domain>(?:www\.)?nick(?:jr)?\.com)/(?:[^/]+/)?(?P<type>videos/clip|[^/]+/videos|episodes/[^/]+)/(?P<id>[^/?#.]+)'
_FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm'
_GEO_COUNTRIES = ['US']
_TESTS = [{
- 'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html',
+ 'url': 'https://www.nick.com/episodes/sq47rw/spongebob-squarepants-a-place-for-pets-lockdown-for-love-season-13-ep-1',
+ 'info_dict': {
+ 'description': 'md5:0650a9eb88955609d5c1d1c79292e234',
+ 'title': 'A Place for Pets/Lockdown for Love',
+ },
'playlist': [
{
- 'md5': '6e5adc1e28253bbb1b28ab05403dd4d4',
+ 'md5': 'cb8a2afeafb7ae154aca5a64815ec9d6',
'info_dict': {
- 'id': 'be6a17b0-412d-11e5-8ff7-0026b9414f30',
+ 'id': '85ee8177-d6ce-48f8-9eee-a65364f8a6df',
'ext': 'mp4',
- 'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S1',
- 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.',
+ 'title': 'SpongeBob SquarePants: "A Place for Pets/Lockdown for Love" S1',
+ 'description': 'A Place for Pets/Lockdown for Love: When customers bring pets into the Krusty Krab, Mr. Krabs realizes pets are more profitable than owners. Plankton ruins another date with Karen, so she puts the Chum Bucket on lockdown until he proves his affection.',
}
},
{
- 'md5': 'd7be441fc53a1d4882fa9508a1e5b3ce',
+ 'md5': '839a04f49900a1fcbf517020d94e0737',
'info_dict': {
- 'id': 'be6b8f96-412d-11e5-8ff7-0026b9414f30',
+ 'id': '2e2a9960-8fd4-411d-868b-28eb1beb7fae',
'ext': 'mp4',
- 'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S2',
- 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.',
+ 'title': 'SpongeBob SquarePants: "A Place for Pets/Lockdown for Love" S2',
+ 'description': 'A Place for Pets/Lockdown for Love: When customers bring pets into the Krusty Krab, Mr. Krabs realizes pets are more profitable than owners. Plankton ruins another date with Karen, so she puts the Chum Bucket on lockdown until he proves his affection.',
}
},
{
- 'md5': 'efffe1728a234b2b0d2f2b343dd1946f',
+ 'md5': 'f1145699f199770e2919ee8646955d46',
'info_dict': {
- 'id': 'be6cf7e6-412d-11e5-8ff7-0026b9414f30',
+ 'id': 'dc91c304-6876-40f7-84a6-7aece7baa9d0',
'ext': 'mp4',
- 'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S3',
- 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.',
+ 'title': 'SpongeBob SquarePants: "A Place for Pets/Lockdown for Love" S3',
+ 'description': 'A Place for Pets/Lockdown for Love: When customers bring pets into the Krusty Krab, Mr. Krabs realizes pets are more profitable than owners. Plankton ruins another date with Karen, so she puts the Chum Bucket on lockdown until he proves his affection.',
+
}
},
{
- 'md5': '1ec6690733ab9f41709e274a1d5c7556',
+ 'md5': 'd463116875aee2585ee58de3b12caebd',
'info_dict': {
- 'id': 'be6e3354-412d-11e5-8ff7-0026b9414f30',
+ 'id': '5d929486-cf4c-42a1-889a-6e0d183a101a',
'ext': 'mp4',
- 'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S4',
- 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.',
+ 'title': 'SpongeBob SquarePants: "A Place for Pets/Lockdown for Love" S4',
+ 'description': 'A Place for Pets/Lockdown for Love: When customers bring pets into the Krusty Krab, Mr. Krabs realizes pets are more profitable than owners. Plankton ruins another date with Karen, so she puts the Chum Bucket on lockdown until he proves his affection.',
+
}
},
],
}, {
- 'url': 'http://www.nickjr.com/paw-patrol/videos/pups-save-a-goldrush-s3-ep302-full-episode/',
- 'only_matching': True,
- }, {
- 'url': 'http://beta.nick.com/nicky-ricky-dicky-and-dawn/videos/nicky-ricky-dicky-dawn-301-full-episode/',
- 'only_matching': True,
+ 'url': 'http://www.nickjr.com/blues-clues-and-you/videos/blues-clues-and-you-original-209-imagination-station/',
+ 'info_dict': {
+ 'id': '31631529-2fc5-430b-b2ef-6a74b4609abd',
+ 'ext': 'mp4',
+ 'description': 'md5:9d65a66df38e02254852794b2809d1cf',
+ 'title': 'Blue\'s Imagination Station',
+ },
+ 'skip': 'Not accessible?'
}]
def _get_feed_query(self, uri):
@@ -70,7 +77,9 @@ class NickIE(MTVServicesInfoExtractor):
}
def _real_extract(self, url):
- domain, display_id = re.match(self._VALID_URL, url).groups()
+ domain, video_type, display_id = self._match_valid_url(url).groups()
+ if video_type.startswith("episodes"):
+ return super()._real_extract(url)
video_data = self._download_json(
'http://%s/data/video.endLevel.json' % domain,
display_id, query={
@@ -108,7 +117,7 @@ class NickBrIE(MTVServicesInfoExtractor):
}]
def _real_extract(self, url):
- domain, display_id = re.match(self._VALID_URL, url).groups()
+ domain, display_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, display_id)
uri = self._search_regex(
r'data-(?:contenturi|mgid)="([^"]+)', webpage, 'mgid')
@@ -176,21 +185,11 @@ class NickDeIE(MTVServicesInfoExtractor):
'only_matching': True,
}]
- def _extract_mrss_url(self, webpage, host):
- return update_url_query(self._search_regex(
- r'data-mrss=(["\'])(?P<url>http.+?)\1', webpage, 'mrss url', group='url'),
- {'siteKey': host})
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- host = mobj.group('host')
-
- webpage = self._download_webpage(url, video_id)
-
- mrss_url = self._extract_mrss_url(webpage, host)
-
- return self._get_videos_info_from_url(mrss_url, video_id)
+ def _get_feed_url(self, uri, url=None):
+ video_id = self._id_from_uri(uri)
+ config = self._download_json(
+ 'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge&ref=%s' % (uri, url), video_id)
+ return self._remove_template_parameter(config['feedWithQueryParams'])
class NickNightIE(NickDeIE):
@@ -245,5 +244,5 @@ class NickRuIE(MTVServicesInfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- mgid = self._extract_mgid(webpage)
+ mgid = self._extract_mgid(webpage, url)
return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid)
diff --git a/hypervideo_dl/extractor/niconico.py b/hypervideo_dl/extractor/niconico.py
index a85fc3d..76f0870 100644
--- a/hypervideo_dl/extractor/niconico.py
+++ b/hypervideo_dl/extractor/niconico.py
@@ -2,25 +2,28 @@
from __future__ import unicode_literals
import datetime
-import functools
+import itertools
import json
-import math
+import re
-from .common import InfoExtractor
+from .common import InfoExtractor, SearchInfoExtractor
+from ..postprocessor.ffmpeg import FFmpegPostProcessor
from ..compat import (
+ compat_str,
compat_parse_qs,
compat_urllib_parse_urlparse,
)
from ..utils import (
- determine_ext,
- dict_get,
ExtractorError,
+ dict_get,
float_or_none,
- InAdvancePagedList,
int_or_none,
+ OnDemandPagedList,
parse_duration,
parse_iso8601,
+ PostProcessingError,
remove_start,
+ str_or_none,
try_get,
unified_timestamp,
urlencode_postdata,
@@ -34,7 +37,7 @@ class NiconicoIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.nicovideo.jp/watch/sm22312215',
- 'md5': 'd1a75c0823e2f629128c43e1212760f9',
+ 'md5': 'a5bad06f1347452102953f323c69da34',
'info_dict': {
'id': 'sm22312215',
'ext': 'mp4',
@@ -162,6 +165,11 @@ class NiconicoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'
_NETRC_MACHINE = 'niconico'
+ _API_HEADERS = {
+ 'X-Frontend-ID': '6',
+ 'X-Frontend-Version': '0'
+ }
+
def _real_initialize(self):
self._login()
@@ -188,40 +196,92 @@ class NiconicoIE(InfoExtractor):
if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login':
login_ok = False
if not login_ok:
- self._downloader.report_warning('unable to log in: bad username or password')
+ self.report_warning('unable to log in: bad username or password')
return login_ok
- def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality):
- def yesno(boolean):
- return 'yes' if boolean else 'no'
-
- session_api_data = api_data['video']['dmcInfo']['session_api']
- session_api_endpoint = session_api_data['urls'][0]
-
- format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality]))
+ def _get_heartbeat_info(self, info_dict):
+
+ video_id, video_src_id, audio_src_id = info_dict['url'].split(':')[1].split('/')
+
+ api_data = (
+ info_dict.get('_api_data')
+ or self._parse_json(
+ self._html_search_regex(
+ 'data-api-data="([^"]+)"',
+ self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id),
+ 'API data', default='{}'),
+ video_id))
+
+ session_api_data = try_get(api_data, lambda x: x['media']['delivery']['movie']['session'])
+ session_api_endpoint = try_get(session_api_data, lambda x: x['urls'][0])
+
+ def ping():
+ status = try_get(
+ self._download_json(
+ 'https://nvapi.nicovideo.jp/v1/2ab0cbaa/watch', video_id,
+ query={'t': try_get(api_data, lambda x: x['media']['delivery']['trackingId'])},
+ note='Acquiring permission for downloading video',
+ headers=self._API_HEADERS),
+ lambda x: x['meta']['status'])
+ if status != 200:
+ self.report_warning('Failed to acquire permission for playing video. The video may not download.')
+
+ yesno = lambda x: 'yes' if x else 'no'
+
+ # m3u8 (encryption)
+ if try_get(api_data, lambda x: x['media']['delivery']['encryption']) is not None:
+ protocol = 'm3u8'
+ encryption = self._parse_json(session_api_data['token'], video_id)['hls_encryption']
+ session_api_http_parameters = {
+ 'parameters': {
+ 'hls_parameters': {
+ 'encryption': {
+ encryption: {
+ 'encrypted_key': try_get(api_data, lambda x: x['media']['delivery']['encryption']['encryptedKey']),
+ 'key_uri': try_get(api_data, lambda x: x['media']['delivery']['encryption']['keyUri'])
+ }
+ },
+ 'transfer_preset': '',
+ 'use_ssl': yesno(session_api_endpoint['isSsl']),
+ 'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']),
+ 'segment_duration': 6000,
+ }
+ }
+ }
+ # http
+ else:
+ protocol = 'http'
+ session_api_http_parameters = {
+ 'parameters': {
+ 'http_output_download_parameters': {
+ 'use_ssl': yesno(session_api_endpoint['isSsl']),
+ 'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']),
+ }
+ }
+ }
session_response = self._download_json(
session_api_endpoint['url'], video_id,
query={'_format': 'json'},
headers={'Content-Type': 'application/json'},
- note='Downloading JSON metadata for %s' % format_id,
+ note='Downloading JSON metadata for %s' % info_dict['format_id'],
data=json.dumps({
'session': {
'client_info': {
- 'player_id': session_api_data['player_id'],
+ 'player_id': session_api_data.get('playerId'),
},
'content_auth': {
- 'auth_type': session_api_data['auth_types'][session_api_data['protocols'][0]],
- 'content_key_timeout': session_api_data['content_key_timeout'],
+ 'auth_type': try_get(session_api_data, lambda x: x['authTypes'][session_api_data['protocols'][0]]),
+ 'content_key_timeout': session_api_data.get('contentKeyTimeout'),
'service_id': 'nicovideo',
- 'service_user_id': session_api_data['service_user_id']
+ 'service_user_id': session_api_data.get('serviceUserId')
},
- 'content_id': session_api_data['content_id'],
+ 'content_id': session_api_data.get('contentId'),
'content_src_id_sets': [{
'content_src_ids': [{
'src_id_to_mux': {
- 'audio_src_ids': [audio_quality['id']],
- 'video_src_ids': [video_quality['id']],
+ 'audio_src_ids': [audio_src_id],
+ 'video_src_ids': [video_src_id],
}
}]
}],
@@ -229,52 +289,81 @@ class NiconicoIE(InfoExtractor):
'content_uri': '',
'keep_method': {
'heartbeat': {
- 'lifetime': session_api_data['heartbeat_lifetime']
+ 'lifetime': session_api_data.get('heartbeatLifetime')
}
},
- 'priority': session_api_data['priority'],
+ 'priority': session_api_data.get('priority'),
'protocol': {
'name': 'http',
'parameters': {
- 'http_parameters': {
- 'parameters': {
- 'http_output_download_parameters': {
- 'use_ssl': yesno(session_api_endpoint['is_ssl']),
- 'use_well_known_port': yesno(session_api_endpoint['is_well_known_port']),
- }
- }
- }
+ 'http_parameters': session_api_http_parameters
}
},
- 'recipe_id': session_api_data['recipe_id'],
+ 'recipe_id': session_api_data.get('recipeId'),
'session_operation_auth': {
'session_operation_auth_by_signature': {
- 'signature': session_api_data['signature'],
- 'token': session_api_data['token'],
+ 'signature': session_api_data.get('signature'),
+ 'token': session_api_data.get('token'),
}
},
'timing_constraint': 'unlimited'
}
}).encode())
- resolution = video_quality.get('resolution', {})
+ info_dict['url'] = session_response['data']['session']['content_uri']
+ info_dict['protocol'] = protocol
+
+ # get heartbeat info
+ heartbeat_info_dict = {
+ 'url': session_api_endpoint['url'] + '/' + session_response['data']['session']['id'] + '?_format=json&_method=PUT',
+ 'data': json.dumps(session_response['data']),
+ # interval, convert milliseconds to seconds, then halve to make a buffer.
+ 'interval': float_or_none(session_api_data.get('heartbeatLifetime'), scale=3000),
+ 'ping': ping
+ }
+
+ return info_dict, heartbeat_info_dict
+
+ def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality):
+ def parse_format_id(id_code):
+ mobj = re.match(r'''(?x)
+ (?:archive_)?
+ (?:(?P<codec>[^_]+)_)?
+ (?:(?P<br>[\d]+)kbps_)?
+ (?:(?P<res>[\d+]+)p_)?
+ ''', '%s_' % id_code)
+ return mobj.groupdict() if mobj else {}
+
+ protocol = 'niconico_dmc'
+ format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality]))
+ vdict = parse_format_id(video_quality['id'])
+ adict = parse_format_id(audio_quality['id'])
+ resolution = try_get(video_quality, lambda x: x['metadata']['resolution'], dict) or {'height': vdict.get('res')}
+ vbr = try_get(video_quality, lambda x: x['metadata']['bitrate'], float)
return {
- 'url': session_response['data']['session']['content_uri'],
+ 'url': '%s:%s/%s/%s' % (protocol, video_id, video_quality['id'], audio_quality['id']),
'format_id': format_id,
+ 'format_note': 'DMC %s' % try_get(video_quality, lambda x: x['metadata']['label'], compat_str),
'ext': 'mp4', # Session API are used in HTML5, which always serves mp4
- 'abr': float_or_none(audio_quality.get('bitrate'), 1000),
- 'vbr': float_or_none(video_quality.get('bitrate'), 1000),
- 'height': resolution.get('height'),
- 'width': resolution.get('width'),
+ 'vcodec': vdict.get('codec'),
+ 'acodec': adict.get('codec'),
+ 'vbr': float_or_none(vbr, 1000) or float_or_none(vdict.get('br')),
+ 'abr': float_or_none(audio_quality.get('bitrate'), 1000) or float_or_none(adict.get('br')),
+ 'height': int_or_none(resolution.get('height', vdict.get('res'))),
+ 'width': int_or_none(resolution.get('width')),
+ 'quality': -2 if 'low' in format_id else -1, # Default quality value is -1
+ 'protocol': protocol,
+ 'http_headers': {
+ 'Origin': 'https://www.nicovideo.jp',
+ 'Referer': 'https://www.nicovideo.jp/watch/' + video_id,
+ }
}
def _real_extract(self, url):
video_id = self._match_id(url)
- # Get video webpage. We are not actually interested in it for normal
- # cases, but need the cookies in order to be able to download the
- # info webpage
+ # Get video webpage for API data.
webpage, handle = self._download_webpage_handle(
'http://www.nicovideo.jp/watch/' + video_id, video_id)
if video_id.startswith('so'):
@@ -284,86 +373,136 @@ class NiconicoIE(InfoExtractor):
'data-api-data="([^"]+)"', webpage,
'API data', default='{}'), video_id)
- def _format_id_from_url(video_url):
- return 'economy' if video_real_url.endswith('low') else 'normal'
-
- try:
- video_real_url = api_data['video']['smileInfo']['url']
- except KeyError: # Flash videos
- # Get flv info
- flv_info_webpage = self._download_webpage(
- 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
- video_id, 'Downloading flv info')
-
- flv_info = compat_parse_qs(flv_info_webpage)
- if 'url' not in flv_info:
- if 'deleted' in flv_info:
- raise ExtractorError('The video has been deleted.',
- expected=True)
- elif 'closed' in flv_info:
- raise ExtractorError('Niconico videos now require logging in',
- expected=True)
- elif 'error' in flv_info:
- raise ExtractorError('%s reports error: %s' % (
- self.IE_NAME, flv_info['error'][0]), expected=True)
- else:
- raise ExtractorError('Unable to find video URL')
-
- video_info_xml = self._download_xml(
- 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id,
- video_id, note='Downloading video info page')
-
- def get_video_info(items):
- if not isinstance(items, list):
- items = [items]
- for item in items:
- ret = xpath_text(video_info_xml, './/' + item)
- if ret:
- return ret
-
- video_real_url = flv_info['url'][0]
-
- extension = get_video_info('movie_type')
- if not extension:
- extension = determine_ext(video_real_url)
-
- formats = [{
- 'url': video_real_url,
- 'ext': extension,
- 'format_id': _format_id_from_url(video_real_url),
- }]
- else:
- formats = []
-
- dmc_info = api_data['video'].get('dmcInfo')
- if dmc_info: # "New" HTML5 videos
- quality_info = dmc_info['quality']
- for audio_quality in quality_info['audios']:
- for video_quality in quality_info['videos']:
- if not audio_quality['available'] or not video_quality['available']:
- continue
- formats.append(self._extract_format_for_quality(
- api_data, video_id, audio_quality, video_quality))
-
- self._sort_formats(formats)
- else: # "Old" HTML5 videos
- formats = [{
+ def get_video_info_web(items):
+ return dict_get(api_data['video'], items)
+
+ # Get video info
+ video_info_xml = self._download_xml(
+ 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id,
+ video_id, note='Downloading video info page')
+
+ def get_video_info_xml(items):
+ if not isinstance(items, list):
+ items = [items]
+ for item in items:
+ ret = xpath_text(video_info_xml, './/' + item)
+ if ret:
+ return ret
+
+ if get_video_info_xml('error'):
+ error_code = get_video_info_xml('code')
+
+ if error_code == 'DELETED':
+ raise ExtractorError('The video has been deleted.',
+ expected=True)
+ elif error_code == 'NOT_FOUND':
+ raise ExtractorError('The video is not found.',
+ expected=True)
+ elif error_code == 'COMMUNITY':
+ self.to_screen('%s: The video is community members only.' % video_id)
+ else:
+ raise ExtractorError('%s reports error: %s' % (self.IE_NAME, error_code))
+
+ # Start extracting video formats
+ formats = []
+
+ # Get HTML5 videos info
+ quality_info = try_get(api_data, lambda x: x['media']['delivery']['movie'])
+ if not quality_info:
+ raise ExtractorError('The video can\'t be downloaded', expected=True)
+
+ for audio_quality in quality_info.get('audios') or {}:
+ for video_quality in quality_info.get('videos') or {}:
+ if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'):
+ continue
+ formats.append(self._extract_format_for_quality(
+ api_data, video_id, audio_quality, video_quality))
+
+ # Get flv/swf info
+ timestamp = None
+ video_real_url = try_get(api_data, lambda x: x['video']['smileInfo']['url'])
+ if video_real_url:
+ is_economy = video_real_url.endswith('low')
+
+ if is_economy:
+ self.report_warning('Site is currently in economy mode! You will only have access to lower quality streams')
+
+ # Invoking ffprobe to determine resolution
+ pp = FFmpegPostProcessor(self._downloader)
+ cookies = self._get_cookies('https://nicovideo.jp').output(header='', sep='; path=/; domain=nicovideo.jp;\n')
+
+ self.to_screen('%s: %s' % (video_id, 'Checking smile format with ffprobe'))
+
+ try:
+ metadata = pp.get_metadata_object(video_real_url, ['-cookies', cookies])
+ except PostProcessingError as err:
+ raise ExtractorError(err.msg, expected=True)
+
+ v_stream = a_stream = {}
+
+ # Some complex swf files doesn't have video stream (e.g. nm4809023)
+ for stream in metadata['streams']:
+ if stream['codec_type'] == 'video':
+ v_stream = stream
+ elif stream['codec_type'] == 'audio':
+ a_stream = stream
+
+ # Community restricted videos seem to have issues with the thumb API not returning anything at all
+ filesize = int(
+ (get_video_info_xml('size_high') if not is_economy else get_video_info_xml('size_low'))
+ or metadata['format']['size']
+ )
+ extension = (
+ get_video_info_xml('movie_type')
+ or 'mp4' if 'mp4' in metadata['format']['format_name'] else metadata['format']['format_name']
+ )
+
+ # 'creation_time' tag on video stream of re-encoded SMILEVIDEO mp4 files are '1970-01-01T00:00:00.000000Z'.
+ timestamp = (
+ parse_iso8601(get_video_info_web('first_retrieve'))
+ or unified_timestamp(get_video_info_web('postedDateTime'))
+ )
+ metadata_timestamp = (
+ parse_iso8601(try_get(v_stream, lambda x: x['tags']['creation_time']))
+ or timestamp if extension != 'mp4' else 0
+ )
+
+ # According to compconf, smile videos from pre-2017 are always better quality than their DMC counterparts
+ smile_threshold_timestamp = parse_iso8601('2016-12-08T00:00:00+09:00')
+
+ is_source = timestamp < smile_threshold_timestamp or metadata_timestamp > 0
+
+ # If movie file size is unstable, old server movie is not source movie.
+ if filesize > 1:
+ formats.append({
'url': video_real_url,
- 'ext': 'mp4',
- 'format_id': _format_id_from_url(video_real_url),
- }]
-
- def get_video_info(items):
- return dict_get(api_data['video'], items)
+ 'format_id': 'smile' if not is_economy else 'smile_low',
+ 'format_note': 'SMILEVIDEO source' if not is_economy else 'SMILEVIDEO low quality',
+ 'ext': extension,
+ 'container': extension,
+ 'vcodec': v_stream.get('codec_name'),
+ 'acodec': a_stream.get('codec_name'),
+ # Some complex swf files doesn't have total bit rate metadata (e.g. nm6049209)
+ 'tbr': int_or_none(metadata['format'].get('bit_rate'), scale=1000),
+ 'vbr': int_or_none(v_stream.get('bit_rate'), scale=1000),
+ 'abr': int_or_none(a_stream.get('bit_rate'), scale=1000),
+ 'height': int_or_none(v_stream.get('height')),
+ 'width': int_or_none(v_stream.get('width')),
+ 'source_preference': 5 if not is_economy else -2,
+ 'quality': 5 if is_source and not is_economy else None,
+ 'filesize': filesize
+ })
+
+ self._sort_formats(formats)
# Start extracting information
- title = get_video_info('title')
- if not title:
- title = self._og_search_title(webpage, default=None)
- if not title:
- title = self._html_search_regex(
+ title = (
+ get_video_info_xml('title') # prefer to get the untranslated original title
+ or get_video_info_web(['originalTitle', 'title'])
+ or self._og_search_title(webpage, default=None)
+ or self._html_search_regex(
r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>',
- webpage, 'video title')
+ webpage, 'video title'))
watch_api_data_string = self._html_search_regex(
r'<div[^>]+id="watchAPIDataContainer"[^>]+>([^<]+)</div>',
@@ -372,14 +511,15 @@ class NiconicoIE(InfoExtractor):
video_detail = watch_api_data.get('videoDetail', {})
thumbnail = (
- get_video_info(['thumbnail_url', 'thumbnailURL'])
+ self._html_search_regex(r'<meta property="og:image" content="([^"]+)">', webpage, 'thumbnail data', default=None)
+ or dict_get( # choose highest from 720p to 240p
+ get_video_info_web('thumbnail'),
+ ['ogp', 'player', 'largeUrl', 'middleUrl', 'url'])
or self._html_search_meta('image', webpage, 'thumbnail', default=None)
or video_detail.get('thumbnail'))
- description = get_video_info('description')
+ description = get_video_info_web('description')
- timestamp = (parse_iso8601(get_video_info('first_retrieve'))
- or unified_timestamp(get_video_info('postedDateTime')))
if not timestamp:
match = self._html_search_meta('datePublished', webpage, 'date published', default=None)
if match:
@@ -388,19 +528,25 @@ class NiconicoIE(InfoExtractor):
timestamp = parse_iso8601(
video_detail['postedAt'].replace('/', '-'),
delimiter=' ', timezone=datetime.timedelta(hours=9))
+ timestamp = timestamp or try_get(api_data, lambda x: parse_iso8601(x['video']['registeredAt']))
- view_count = int_or_none(get_video_info(['view_counter', 'viewCount']))
+ view_count = int_or_none(get_video_info_web(['view_counter', 'viewCount']))
if not view_count:
match = self._html_search_regex(
r'>Views: <strong[^>]*>([^<]+)</strong>',
webpage, 'view count', default=None)
if match:
view_count = int_or_none(match.replace(',', ''))
- view_count = view_count or video_detail.get('viewCount')
+ view_count = (
+ view_count
+ or video_detail.get('viewCount')
+ or try_get(api_data, lambda x: x['video']['count']['view']))
+
+ comment_count = (
+ int_or_none(get_video_info_web('comment_num'))
+ or video_detail.get('commentCount')
+ or try_get(api_data, lambda x: x['video']['count']['comment']))
- comment_count = (int_or_none(get_video_info('comment_num'))
- or video_detail.get('commentCount')
- or try_get(api_data, lambda x: x['thread']['commentCount']))
if not comment_count:
match = self._html_search_regex(
r'>Comments: <strong[^>]*>([^<]+)</strong>',
@@ -409,22 +555,41 @@ class NiconicoIE(InfoExtractor):
comment_count = int_or_none(match.replace(',', ''))
duration = (parse_duration(
- get_video_info('length')
+ get_video_info_web('length')
or self._html_search_meta(
'video:duration', webpage, 'video duration', default=None))
or video_detail.get('length')
- or get_video_info('duration'))
+ or get_video_info_web('duration'))
+
+ webpage_url = get_video_info_web('watch_url') or url
- webpage_url = get_video_info('watch_url') or url
+ # for channel movie and community movie
+ channel_id = try_get(
+ api_data,
+ (lambda x: x['channel']['globalId'],
+ lambda x: x['community']['globalId']))
+ channel = try_get(
+ api_data,
+ (lambda x: x['channel']['name'],
+ lambda x: x['community']['name']))
# Note: cannot use api_data.get('owner', {}) because owner may be set to "null"
# in the JSON, which will cause None to be returned instead of {}.
owner = try_get(api_data, lambda x: x.get('owner'), dict) or {}
- uploader_id = get_video_info(['ch_id', 'user_id']) or owner.get('id')
- uploader = get_video_info(['ch_name', 'user_nickname']) or owner.get('nickname')
+ uploader_id = str_or_none(
+ get_video_info_web(['ch_id', 'user_id'])
+ or owner.get('id')
+ or channel_id
+ )
+ uploader = (
+ get_video_info_web(['ch_name', 'user_nickname'])
+ or owner.get('nickname')
+ or channel
+ )
return {
'id': video_id,
+ '_api_data': api_data,
'title': title,
'formats': formats,
'thumbnail': thumbnail,
@@ -432,6 +597,8 @@ class NiconicoIE(InfoExtractor):
'uploader': uploader,
'timestamp': timestamp,
'uploader_id': uploader_id,
+ 'channel': channel,
+ 'channel_id': channel_id,
'view_count': view_count,
'comment_count': comment_count,
'duration': duration,
@@ -440,7 +607,7 @@ class NiconicoIE(InfoExtractor):
class NiconicoPlaylistIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/)?mylist/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/|my/)?mylist/(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.nicovideo.jp/mylist/27411728',
@@ -456,60 +623,175 @@ class NiconicoPlaylistIE(InfoExtractor):
'url': 'https://www.nicovideo.jp/user/805442/mylist/27411728',
'only_matching': True,
}]
- _PAGE_SIZE = 100
- def _call_api(self, list_id, resource, query):
- return self._download_json(
- 'https://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id,
- 'Downloading %s JSON metatdata' % resource, query=query,
- headers={'X-Frontend-Id': 6})['data']['mylist']
-
- def _parse_owner(self, item):
- owner = item.get('owner') or {}
- if owner:
- return {
- 'uploader': owner.get('name'),
- 'uploader_id': owner.get('id'),
- }
- return {}
-
- def _fetch_page(self, list_id, page):
- page += 1
- items = self._call_api(list_id, 'page %d' % page, {
- 'page': page,
- 'pageSize': self._PAGE_SIZE,
- })['items']
- for item in items:
- video = item.get('video') or {}
- video_id = video.get('id')
- if not video_id:
- continue
- count = video.get('count') or {}
- get_count = lambda x: int_or_none(count.get(x))
- info = {
+ _API_HEADERS = {
+ 'X-Frontend-ID': '6',
+ 'X-Frontend-Version': '0'
+ }
+
+ def _real_extract(self, url):
+ list_id = self._match_id(url)
+
+ def get_page_data(pagenum, pagesize):
+ return self._download_json(
+ 'http://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id,
+ query={'page': 1 + pagenum, 'pageSize': pagesize},
+ headers=self._API_HEADERS).get('data').get('mylist')
+
+ data = get_page_data(0, 1)
+ title = data.get('name')
+ description = data.get('description')
+ uploader = data.get('owner').get('name')
+ uploader_id = data.get('owner').get('id')
+
+ def pagefunc(pagenum):
+ data = get_page_data(pagenum, 25)
+ return ({
'_type': 'url',
- 'id': video_id,
- 'title': video.get('title'),
- 'url': 'https://www.nicovideo.jp/watch/' + video_id,
- 'description': video.get('shortDescription'),
- 'duration': int_or_none(video.get('duration')),
- 'view_count': get_count('view'),
- 'comment_count': get_count('comment'),
- 'ie_key': NiconicoIE.ie_key(),
- }
- info.update(self._parse_owner(video))
- yield info
+ 'url': 'http://www.nicovideo.jp/watch/' + item.get('watchId'),
+ } for item in data.get('items'))
+
+ return {
+ '_type': 'playlist',
+ 'id': list_id,
+ 'title': title,
+ 'description': description,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'entries': OnDemandPagedList(pagefunc, 25),
+ }
+
+
+NicovideoSearchIE_NAME = 'nicovideo:search'
+
+
+class NicovideoSearchURLIE(InfoExtractor):
+ IE_NAME = f'{NicovideoSearchIE_NAME}_url'
+ IE_DESC = 'Nico video search URLs'
+ _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/search/(?P<id>[^?#&]+)?'
+ _TESTS = [{
+ 'url': 'http://www.nicovideo.jp/search/sm9',
+ 'info_dict': {
+ 'id': 'sm9',
+ 'title': 'sm9'
+ },
+ 'playlist_mincount': 40,
+ }, {
+ 'url': 'https://www.nicovideo.jp/search/sm9?sort=h&order=d&end=2020-12-31&start=2020-01-01',
+ 'info_dict': {
+ 'id': 'sm9',
+ 'title': 'sm9'
+ },
+ 'playlist_count': 31,
+ }]
+
+ def _entries(self, url, item_id, query=None, note='Downloading page %(page)s'):
+ query = query or {}
+ pages = [query['page']] if 'page' in query else itertools.count(1)
+ for page_num in pages:
+ query['page'] = str(page_num)
+ webpage = self._download_webpage(url, item_id, query=query, note=note % {'page': page_num})
+ results = re.findall(r'(?<=data-video-id=)["\']?(?P<videoid>.*?)(?=["\'])', webpage)
+ for item in results:
+ yield self.url_result(f'http://www.nicovideo.jp/watch/{item}', 'Niconico', item)
+ if not results:
+ break
+
+ def _real_extract(self, url):
+ query = self._match_id(url)
+ return self.playlist_result(self._entries(url, query), query, query)
+
+
+class NicovideoSearchIE(SearchInfoExtractor, NicovideoSearchURLIE):
+ IE_DESC = 'Nico video searches'
+ _MAX_RESULTS = float('inf')
+ IE_NAME = NicovideoSearchIE_NAME
+ _SEARCH_KEY = 'nicosearch'
+ _TESTS = []
+
+ def _search_results(self, query):
+ return self._entries(
+ self._proto_relative_url(f'//www.nicovideo.jp/search/{query}'), query)
+
+
+class NicovideoSearchDateIE(NicovideoSearchIE):
+ IE_DESC = 'Nico video searches, newest first'
+ IE_NAME = f'{NicovideoSearchIE_NAME}:date'
+ _SEARCH_KEY = 'nicosearchdate'
+ _TESTS = [{
+ 'url': 'nicosearchdateall:a',
+ 'info_dict': {
+ 'id': 'a',
+ 'title': 'a'
+ },
+ 'playlist_mincount': 1610,
+ }]
+
+ _START_DATE = datetime.date(2007, 1, 1)
+ _RESULTS_PER_PAGE = 32
+ _MAX_PAGES = 50
+
+ def _entries(self, url, item_id, start_date=None, end_date=None):
+ start_date, end_date = start_date or self._START_DATE, end_date or datetime.datetime.now().date()
+
+ # If the last page has a full page of videos, we need to break down the query interval further
+ last_page_len = len(list(self._get_entries_for_date(
+ url, item_id, start_date, end_date, self._MAX_PAGES,
+ note=f'Checking number of videos from {start_date} to {end_date}')))
+ if (last_page_len == self._RESULTS_PER_PAGE and start_date != end_date):
+ midpoint = start_date + ((end_date - start_date) // 2)
+ yield from self._entries(url, item_id, midpoint, end_date)
+ yield from self._entries(url, item_id, start_date, midpoint)
+ else:
+ self.to_screen(f'{item_id}: Downloading results from {start_date} to {end_date}')
+ yield from self._get_entries_for_date(
+ url, item_id, start_date, end_date, note=' Downloading page %(page)s')
+
+ def _get_entries_for_date(self, url, item_id, start_date, end_date=None, page_num=None, note=None):
+ query = {
+ 'start': str(start_date),
+ 'end': str(end_date or start_date),
+ 'sort': 'f',
+ 'order': 'd',
+ }
+ if page_num:
+ query['page'] = str(page_num)
+
+ yield from NicovideoSearchURLIE._entries(self, url, item_id, query=query, note=note)
+
+
+class NiconicoUserIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/user/(?P<id>\d+)/?(?:$|[#?])'
+ _TEST = {
+ 'url': 'https://www.nicovideo.jp/user/419948',
+ 'info_dict': {
+ 'id': '419948',
+ },
+ 'playlist_mincount': 101,
+ }
+ _API_URL = "https://nvapi.nicovideo.jp/v1/users/%s/videos?sortKey=registeredAt&sortOrder=desc&pageSize=%s&page=%s"
+ _PAGE_SIZE = 100
+
+ _API_HEADERS = {
+ 'X-Frontend-ID': '6',
+ 'X-Frontend-Version': '0'
+ }
+
+ def _entries(self, list_id):
+ total_count = 1
+ count = page_num = 0
+ while count < total_count:
+ json_parsed = self._download_json(
+ self._API_URL % (list_id, self._PAGE_SIZE, page_num + 1), list_id,
+ headers=self._API_HEADERS,
+ note='Downloading JSON metadata%s' % (' page %d' % page_num if page_num else ''))
+ if not page_num:
+ total_count = int_or_none(json_parsed['data'].get('totalCount'))
+ for entry in json_parsed["data"]["items"]:
+ count += 1
+ yield self.url_result('https://www.nicovideo.jp/watch/%s' % entry['id'])
+ page_num += 1
def _real_extract(self, url):
list_id = self._match_id(url)
- mylist = self._call_api(list_id, 'list', {
- 'pageSize': 1,
- })
- entries = InAdvancePagedList(
- functools.partial(self._fetch_page, list_id),
- math.ceil(mylist['totalItemCount'] / self._PAGE_SIZE),
- self._PAGE_SIZE)
- result = self.playlist_result(
- entries, list_id, mylist.get('name'), mylist.get('description'))
- result.update(self._parse_owner(mylist))
- return result
+ return self.playlist_result(self._entries(list_id), list_id, ie=NiconicoIE.ie_key())
diff --git a/hypervideo_dl/extractor/ninecninemedia.py b/hypervideo_dl/extractor/ninecninemedia.py
index cfc2203..4aaf21a 100644
--- a/hypervideo_dl/extractor/ninecninemedia.py
+++ b/hypervideo_dl/extractor/ninecninemedia.py
@@ -1,11 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
float_or_none,
int_or_none,
parse_iso8601,
@@ -20,7 +18,7 @@ class NineCNineMediaIE(InfoExtractor):
_API_BASE_TEMPLATE = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/'
def _real_extract(self, url):
- destination_code, content_id = re.match(self._VALID_URL, url).groups()
+ destination_code, content_id = self._match_valid_url(url).groups()
api_base_url = self._API_BASE_TEMPLATE % (destination_code, content_id)
content = self._download_json(api_base_url, content_id, query={
'$include': '[Media.Name,Season,ContentPackages.Duration,ContentPackages.Id]',
@@ -34,8 +32,9 @@ class NineCNineMediaIE(InfoExtractor):
'$include': '[HasClosedCaptions]',
})
- if try_get(content_package, lambda x: x['Constraints']['Security']['Type']):
- raise ExtractorError('This video is DRM protected.', expected=True)
+ if (not self.get_param('allow_unplayable_formats')
+ and try_get(content_package, lambda x: x['Constraints']['Security']['Type'])):
+ self.report_drm(content_id)
manifest_base_url = content_package_url + 'manifest.'
formats = []
diff --git a/hypervideo_dl/extractor/ninenow.py b/hypervideo_dl/extractor/ninenow.py
index 6157dc7..6043674 100644
--- a/hypervideo_dl/extractor/ninenow.py
+++ b/hypervideo_dl/extractor/ninenow.py
@@ -8,6 +8,10 @@ from ..utils import (
int_or_none,
float_or_none,
smuggle_url,
+ str_or_none,
+ try_get,
+ unified_strdate,
+ unified_timestamp,
)
@@ -37,6 +41,24 @@ class NineNowIE(InfoExtractor):
# DRM protected
'url': 'https://www.9now.com.au/andrew-marrs-history-of-the-world/season-1/episode-1',
'only_matching': True,
+ }, {
+ # episode of series
+ 'url': 'https://www.9now.com.au/lego-masters/season-3/episode-3',
+ 'info_dict': {
+ 'id': '6249614030001',
+ 'title': 'Episode 3',
+ 'ext': 'mp4',
+ 'season_number': 3,
+ 'episode_number': 3,
+ 'description': 'In the first elimination of the competition, teams will have 10 hours to build a world inside a snow globe.',
+ 'uploader_id': '4460760524001',
+ 'timestamp': 1619002200,
+ 'upload_date': '20210421',
+ },
+ 'expected_warnings': ['Ignoring subtitle tracks'],
+ 'params':{
+ 'skip_download': True,
+ }
}]
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4460760524001/default_default/index.html?videoId=%s'
@@ -59,25 +81,31 @@ class NineNowIE(InfoExtractor):
cache = page_data.get(kind, {}).get('%sCache' % kind, {})
if not cache:
continue
- common_data = (cache.get(current_key) or list(cache.values())[0])[kind]
+ common_data = {
+ 'episode': (cache.get(current_key) or list(cache.values())[0])[kind],
+ 'season': (cache.get(current_key) or list(cache.values())[0]).get('season', None)
+ }
break
else:
raise ExtractorError('Unable to find video data')
- video_data = common_data['video']
-
- if video_data.get('drm'):
- raise ExtractorError('This video is DRM protected.', expected=True)
-
- brightcove_id = video_data.get('brightcoveId') or 'ref:' + video_data['referenceId']
- video_id = compat_str(video_data.get('id') or brightcove_id)
- title = common_data['name']
+ if not self.get_param('allow_unplayable_formats') and try_get(common_data, lambda x: x['episode']['video']['drm'], bool):
+ self.report_drm(display_id)
+ brightcove_id = try_get(
+ common_data, lambda x: x['episode']['video']['brightcoveId'], compat_str) or 'ref:%s' % common_data['episode']['video']['referenceId']
+ video_id = str_or_none(try_get(common_data, lambda x: x['episode']['video']['id'])) or brightcove_id
+ title = try_get(common_data, lambda x: x['episode']['name'], compat_str)
+ season_number = try_get(common_data, lambda x: x['season']['seasonNumber'], int)
+ episode_number = try_get(common_data, lambda x: x['episode']['episodeNumber'], int)
+ timestamp = unified_timestamp(try_get(common_data, lambda x: x['episode']['airDate'], compat_str))
+ release_date = unified_strdate(try_get(common_data, lambda x: x['episode']['availability'], compat_str))
+ thumbnails_data = try_get(common_data, lambda x: x['episode']['image']['sizes'], dict) or {}
thumbnails = [{
'id': thumbnail_id,
'url': thumbnail_url,
- 'width': int_or_none(thumbnail_id[1:])
- } for thumbnail_id, thumbnail_url in common_data.get('image', {}).get('sizes', {}).items()]
+ 'width': int_or_none(thumbnail_id[1:]),
+ } for thumbnail_id, thumbnail_url in thumbnails_data.items()]
return {
'_type': 'url_transparent',
@@ -86,8 +114,12 @@ class NineNowIE(InfoExtractor):
{'geo_countries': self._GEO_COUNTRIES}),
'id': video_id,
'title': title,
- 'description': common_data.get('description'),
- 'duration': float_or_none(video_data.get('duration'), 1000),
+ 'description': try_get(common_data, lambda x: x['episode']['description'], compat_str),
+ 'duration': float_or_none(try_get(common_data, lambda x: x['episode']['video']['duration'], float), 1000),
'thumbnails': thumbnails,
'ie_key': 'BrightcoveNew',
+ 'season_number': season_number,
+ 'episode_number': episode_number,
+ 'timestamp': timestamp,
+ 'release_date': release_date,
}
diff --git a/hypervideo_dl/extractor/nitter.py b/hypervideo_dl/extractor/nitter.py
new file mode 100644
index 0000000..a0546cd
--- /dev/null
+++ b/hypervideo_dl/extractor/nitter.py
@@ -0,0 +1,228 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ parse_count,
+ unified_strdate,
+ unified_timestamp,
+ remove_end,
+ determine_ext,
+)
+import re
+import random
+
+
+class NitterIE(InfoExtractor):
+ # Taken from https://github.com/zedeus/nitter/wiki/Instances
+
+ NON_HTTP_INSTANCES = (
+ '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
+ 'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
+ 'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
+ 'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
+ 'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
+ 'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
+ '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
+
+ 'nitter.i2p',
+ 'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',
+
+ 'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
+ )
+
+ HTTP_INSTANCES = (
+ 'nitter.42l.fr',
+ 'nitter.pussthecat.org',
+ 'nitter.nixnet.services',
+ 'nitter.mastodont.cat',
+ 'nitter.tedomum.net',
+ 'nitter.fdn.fr',
+ 'nitter.1d4.us',
+ 'nitter.kavin.rocks',
+ 'tweet.lambda.dance',
+ 'nitter.cc',
+ 'nitter.vxempire.xyz',
+ 'nitter.unixfox.eu',
+ 'nitter.domain.glass',
+ 'nitter.himiko.cloud',
+ 'nitter.eu',
+ 'nitter.namazso.eu',
+ 'nitter.mailstation.de',
+ 'nitter.actionsack.com',
+ 'nitter.cattube.org',
+ 'nitter.dark.fail',
+ 'birdsite.xanny.family',
+ 'nitter.40two.app',
+ 'nitter.skrep.in',
+
+ # not in the list anymore
+ 'nitter.snopyta.org',
+ )
+
+ DEAD_INSTANCES = (
+ # maintenance
+ 'nitter.ethibox.fr',
+
+ # official, rate limited
+ 'nitter.net',
+ # offline
+ 'nitter.13ad.de',
+ 'nitter.weaponizedhumiliation.com',
+ )
+
+ INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES
+
+ _INSTANCES_RE = '(?:' + '|'.join([re.escape(instance) for instance in INSTANCES]) + ')'
+ _VALID_URL = r'https?://%(instance)s/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?' % {'instance': _INSTANCES_RE}
+ current_instance = random.choice(HTTP_INSTANCES)
+
+ _TESTS = [
+ {
+ # GIF (wrapped in mp4)
+ 'url': 'https://%s/firefox/status/1314279897502629888#m' % current_instance,
+ 'info_dict': {
+ 'id': '1314279897502629888',
+ 'ext': 'mp4',
+ 'title': 'Firefox 🔥 - You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. \n\nReport harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg\n\n#UnfckTheInternet',
+ 'description': 'You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. \n\nReport harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg\n\n#UnfckTheInternet',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Firefox 🔥',
+ 'uploader_id': 'firefox',
+ 'uploader_url': 'https://%s/firefox' % current_instance,
+ 'upload_date': '20201008',
+ 'timestamp': 1602183720,
+ },
+ }, { # normal video
+ 'url': 'https://%s/Le___Doc/status/1299715685392756737#m' % current_instance,
+ 'info_dict': {
+ 'id': '1299715685392756737',
+ 'ext': 'mp4',
+ 'title': 'Le Doc - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
+ 'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Le Doc',
+ 'uploader_id': 'Le___Doc',
+ 'uploader_url': 'https://%s/Le___Doc' % current_instance,
+ 'upload_date': '20200829',
+ 'timestamp': 1598711341,
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ },
+ }, { # video embed in a "Streaming Political Ads" box
+ 'url': 'https://%s/mozilla/status/1321147074491092994#m' % current_instance,
+ 'info_dict': {
+ 'id': '1321147074491092994',
+ 'ext': 'mp4',
+ 'title': "Mozilla - Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows?\n\nThis isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. \n\nLearn more ➡️ https://mzl.la/StreamingAds",
+ 'description': "Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows?\n\nThis isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. \n\nLearn more ➡️ https://mzl.la/StreamingAds",
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Mozilla',
+ 'uploader_id': 'mozilla',
+ 'uploader_url': 'https://%s/mozilla' % current_instance,
+ 'upload_date': '20201027',
+ 'timestamp': 1603820982
+ },
+ }, { # not the first tweet but main-tweet
+ 'url': 'https://%s/TheNaturalNu/status/1379050895539724290#m' % current_instance,
+ 'info_dict': {
+ 'id': '1379050895539724290',
+ 'ext': 'mp4',
+ 'title': 'Dorothy Zbornak - This had me hollering!!',
+ 'description': 'This had me hollering!!',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Dorothy Zbornak',
+ 'uploader_id': 'TheNaturalNu',
+ 'uploader_url': 'https://%s/TheNaturalNu' % current_instance,
+ 'timestamp': 1617626329,
+ 'upload_date': '20210405'
+ }
+ }
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ parsed_url = compat_urlparse.urlparse(url)
+ base_url = '%s://%s' % (parsed_url.scheme, parsed_url.netloc)
+
+ self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
+ full_webpage = self._download_webpage(url, video_id)
+
+ main_tweet_start = full_webpage.find('class="main-tweet"')
+ if main_tweet_start > 0:
+ webpage = full_webpage[main_tweet_start:]
+ if not webpage:
+ webpage = full_webpage
+
+ video_url = '%s%s' % (base_url, self._html_search_regex(r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url'))
+ ext = determine_ext(video_url)
+
+ if ext == 'unknown_video':
+ formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
+ else:
+ formats = [{
+ 'url': video_url,
+ 'ext': ext
+ }]
+
+ title = self._og_search_description(full_webpage)
+ if not title:
+ title = self._html_search_regex(r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title')
+ description = title
+
+ mobj = self._match_valid_url(url)
+ uploader_id = (
+ mobj.group('uploader_id')
+ or self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
+ )
+
+ if uploader_id:
+ uploader_url = '%s/%s' % (base_url, uploader_id)
+
+ uploader = self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
+
+ if uploader:
+ title = '%s - %s' % (uploader, title)
+
+ view_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-play[^>]*></span>\s([^<]+)</div>', webpage, 'view count', fatal=False))
+ like_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-heart[^>]*></span>\s([^<]+)</div>', webpage, 'like count', fatal=False))
+ repost_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-retweet[^>]*></span>\s([^<]+)</div>', webpage, 'repost count', fatal=False))
+ comment_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-comment[^>]*></span>\s([^<]+)</div>', webpage, 'repost count', fatal=False))
+
+ thumbnail = self._html_search_meta('og:image', full_webpage, 'thumbnail url')
+ if not thumbnail:
+ thumbnail = '%s%s' % (base_url, self._html_search_regex(r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False))
+ thumbnail = remove_end(thumbnail, '%3Asmall')
+
+ thumbnails = []
+ thumbnail_ids = ('thumb', 'small', 'large', 'medium', 'orig')
+ for id in thumbnail_ids:
+ thumbnails.append({
+ 'id': id,
+ 'url': thumbnail + '%3A' + id,
+ })
+
+ date = self._html_search_regex(r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"', webpage, 'upload date', fatal=False)
+ upload_date = unified_strdate(date)
+ timestamp = unified_timestamp(date)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'uploader': uploader,
+ 'timestamp': timestamp,
+ 'uploader_id': uploader_id,
+ 'uploader_url': uploader_url,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'repost_count': repost_count,
+ 'comment_count': comment_count,
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ }
diff --git a/hypervideo_dl/extractor/noco.py b/hypervideo_dl/extractor/noco.py
new file mode 100644
index 0000000..78c4952
--- /dev/null
+++ b/hypervideo_dl/extractor/noco.py
@@ -0,0 +1,235 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import time
+import hashlib
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+)
+from ..utils import (
+ clean_html,
+ ExtractorError,
+ int_or_none,
+ float_or_none,
+ parse_iso8601,
+ parse_qs,
+ sanitized_Request,
+ urlencode_postdata,
+)
+
+
+class NocoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)'
+ _LOGIN_URL = 'https://noco.tv/do.php'
+ _API_URL_TEMPLATE = 'https://api.noco.tv/1.1/%s?ts=%s&tk=%s'
+ _SUB_LANG_TEMPLATE = '&sub_lang=%s'
+ _NETRC_MACHINE = 'noco'
+
+ _TESTS = [
+ {
+ 'url': 'http://noco.tv/emission/11538/nolife/ami-ami-idol-hello-france/',
+ 'md5': '0a993f0058ddbcd902630b2047ef710e',
+ 'info_dict': {
+ 'id': '11538',
+ 'ext': 'mp4',
+ 'title': 'Ami Ami Idol - Hello! France',
+ 'description': 'md5:4eaab46ab68fa4197a317a88a53d3b86',
+ 'upload_date': '20140412',
+ 'uploader': 'Nolife',
+ 'uploader_id': 'NOL',
+ 'duration': 2851.2,
+ },
+ 'skip': 'Requires noco account',
+ },
+ {
+ 'url': 'http://noco.tv/emission/12610/lbl42/the-guild/s01e01-wake-up-call',
+ 'md5': 'c190f1f48e313c55838f1f412225934d',
+ 'info_dict': {
+ 'id': '12610',
+ 'ext': 'mp4',
+ 'title': 'The Guild #1 - Wake-Up Call',
+ 'timestamp': 1403863200,
+ 'upload_date': '20140627',
+ 'uploader': 'LBL42',
+ 'uploader_id': 'LBL',
+ 'duration': 233.023,
+ },
+ 'skip': 'Requires noco account',
+ }
+ ]
+
+ def _real_initialize(self):
+ self._login()
+
+ def _login(self):
+ username, password = self._get_login_info()
+ if username is None:
+ return
+
+ login = self._download_json(
+ self._LOGIN_URL, None, 'Logging in',
+ data=urlencode_postdata({
+ 'a': 'login',
+ 'cookie': '1',
+ 'username': username,
+ 'password': password,
+ }),
+ headers={
+ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+ })
+
+ if 'erreur' in login:
+ raise ExtractorError('Unable to login: %s' % clean_html(login['erreur']), expected=True)
+
+ @staticmethod
+ def _ts():
+ return int(time.time() * 1000)
+
+ def _call_api(self, path, video_id, note, sub_lang=None):
+ ts = compat_str(self._ts() + self._ts_offset)
+ tk = hashlib.md5((hashlib.md5(ts.encode('ascii')).hexdigest() + '#8S?uCraTedap6a').encode('ascii')).hexdigest()
+ url = self._API_URL_TEMPLATE % (path, ts, tk)
+ if sub_lang:
+ url += self._SUB_LANG_TEMPLATE % sub_lang
+
+ request = sanitized_Request(url)
+ request.add_header('Referer', self._referer)
+
+ resp = self._download_json(request, video_id, note)
+
+ if isinstance(resp, dict) and resp.get('error'):
+ self._raise_error(resp['error'], resp['description'])
+
+ return resp
+
+ def _raise_error(self, error, description):
+ raise ExtractorError(
+ '%s returned error: %s - %s' % (self.IE_NAME, error, description),
+ expected=True)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ # Timestamp adjustment offset between server time and local time
+ # must be calculated in order to use timestamps closest to server's
+ # in all API requests (see https://github.com/ytdl-org/youtube-dl/issues/7864)
+ webpage = self._download_webpage(url, video_id)
+
+ player_url = self._search_regex(
+ r'(["\'])(?P<player>https?://noco\.tv/(?:[^/]+/)+NocoPlayer.+?\.swf.*?)\1',
+ webpage, 'noco player', group='player',
+ default='http://noco.tv/cdata/js/player/NocoPlayer-v1.2.40.swf')
+
+ qs = parse_qs(player_url)
+ ts = int_or_none(qs.get('ts', [None])[0])
+ self._ts_offset = ts - self._ts() if ts else 0
+ self._referer = player_url
+
+ medias = self._call_api(
+ 'shows/%s/medias' % video_id,
+ video_id, 'Downloading video JSON')
+
+ show = self._call_api(
+ 'shows/by_id/%s' % video_id,
+ video_id, 'Downloading show JSON')[0]
+
+ options = self._call_api(
+ 'users/init', video_id,
+ 'Downloading user options JSON')['options']
+ audio_lang_pref = options.get('audio_language') or options.get('language', 'fr')
+
+ if audio_lang_pref == 'original':
+ audio_lang_pref = show['original_lang']
+ if len(medias) == 1:
+ audio_lang_pref = list(medias.keys())[0]
+ elif audio_lang_pref not in medias:
+ audio_lang_pref = 'fr'
+
+ qualities = self._call_api(
+ 'qualities',
+ video_id, 'Downloading qualities JSON')
+
+ formats = []
+
+ for audio_lang, audio_lang_dict in medias.items():
+ preference = 1 if audio_lang == audio_lang_pref else 0
+ for sub_lang, lang_dict in audio_lang_dict['video_list'].items():
+ for format_id, fmt in lang_dict['quality_list'].items():
+ format_id_extended = 'audio-%s_sub-%s_%s' % (audio_lang, sub_lang, format_id)
+
+ video = self._call_api(
+ 'shows/%s/video/%s/%s' % (video_id, format_id.lower(), audio_lang),
+ video_id, 'Downloading %s video JSON' % format_id_extended,
+ sub_lang if sub_lang != 'none' else None)
+
+ file_url = video['file']
+ if not file_url:
+ continue
+
+ if file_url in ['forbidden', 'not found']:
+ popmessage = video['popmessage']
+ self._raise_error(popmessage['title'], popmessage['message'])
+
+ formats.append({
+ 'url': file_url,
+ 'format_id': format_id_extended,
+ 'width': int_or_none(fmt.get('res_width')),
+ 'height': int_or_none(fmt.get('res_lines')),
+ 'abr': int_or_none(fmt.get('audiobitrate'), 1000),
+ 'vbr': int_or_none(fmt.get('videobitrate'), 1000),
+ 'filesize': int_or_none(fmt.get('filesize')),
+ 'format_note': qualities[format_id].get('quality_name'),
+ 'quality': qualities[format_id].get('priority'),
+ 'language_preference': preference,
+ })
+
+ self._sort_formats(formats)
+
+ timestamp = parse_iso8601(show.get('online_date_start_utc'), ' ')
+
+ if timestamp is not None and timestamp < 0:
+ timestamp = None
+
+ uploader = show.get('partner_name')
+ uploader_id = show.get('partner_key')
+ duration = float_or_none(show.get('duration_ms'), 1000)
+
+ thumbnails = []
+ for thumbnail_key, thumbnail_url in show.items():
+ m = re.search(r'^screenshot_(?P<width>\d+)x(?P<height>\d+)$', thumbnail_key)
+ if not m:
+ continue
+ thumbnails.append({
+ 'url': thumbnail_url,
+ 'width': int(m.group('width')),
+ 'height': int(m.group('height')),
+ })
+
+ episode = show.get('show_TT') or show.get('show_OT')
+ family = show.get('family_TT') or show.get('family_OT')
+ episode_number = show.get('episode_number')
+
+ title = ''
+ if family:
+ title += family
+ if episode_number:
+ title += ' #' + compat_str(episode_number)
+ if episode:
+ title += ' - ' + compat_str(episode)
+
+ description = show.get('show_resume') or show.get('family_resume')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnails': thumbnails,
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'duration': duration,
+ 'formats': formats,
+ }
diff --git a/hypervideo_dl/extractor/nova.py b/hypervideo_dl/extractor/nova.py
index 47b9748..3acb881 100644
--- a/hypervideo_dl/extractor/nova.py
+++ b/hypervideo_dl/extractor/nova.py
@@ -39,7 +39,7 @@ class NovaEmbedIE(InfoExtractor):
player = self._parse_json(
self._search_regex(
- r'Player\.init\s*\([^,]+,\s*({.+?})\s*,\s*{.+?}\s*\)\s*;',
+ r'Player\.init\s*\([^,]+,\s*(?:\w+\s*\?\s*{.+?}\s*:\s*)?({.+})\s*,\s*{.+?}\s*\)\s*;',
webpage, 'player', default='{}'), video_id, fatal=False)
if player:
for format_id, format_list in player['tracks'].items():
@@ -190,7 +190,7 @@ class NovaIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
display_id = mobj.group('id')
site = mobj.group('site')
diff --git a/hypervideo_dl/extractor/novaplay.py b/hypervideo_dl/extractor/novaplay.py
new file mode 100644
index 0000000..724986a
--- /dev/null
+++ b/hypervideo_dl/extractor/novaplay.py
@@ -0,0 +1,63 @@
+# coding: utf-8
+from .common import InfoExtractor
+from ..utils import int_or_none, parse_duration, parse_iso8601
+
+
+class NovaPlayIE(InfoExtractor):
+ _VALID_URL = r'https://play\.nova\.bg/video/.*/(?P<id>\d+)'
+ _TESTS = [
+ {
+ 'url': 'https://play.nova.bg/video/bratya/season-3/bratq-2021-10-08/548677',
+ 'md5': 'b1127a84e61bed1632b7c2ca9cbb4153',
+ 'info_dict': {
+ 'id': '548677',
+ 'ext': 'mp4',
+ 'title': 'Братя',
+ 'alt_title': 'bratya/season-3/bratq-2021-10-08',
+ 'duration': 1603.0,
+ 'timestamp': 1633724150,
+ 'upload_date': '20211008',
+ 'thumbnail': 'https://nbg-img.fite.tv/img/548677_460x260.jpg',
+ 'description': 'Сезон 3 Епизод 25'
+ },
+ },
+ {
+ 'url': 'https://play.nova.bg/video/igri-na-volqta/season-3/igri-na-volqta-2021-09-20-1/548227',
+ 'md5': '5fd61b8ecbe582fc021019d570965d58',
+ 'info_dict': {
+ 'id': '548227',
+ 'ext': 'mp4',
+ 'title': 'Игри на волята: България (20.09.2021) - част 1',
+ 'alt_title': 'gri-na-volqta/season-3/igri-na-volqta-2021-09-20-1',
+ 'duration': 4060.0,
+ 'timestamp': 1632167564,
+ 'upload_date': '20210920',
+ 'thumbnail': 'https://nbg-img.fite.tv/img/548227_460x260.jpg',
+ 'description': 'Сезон 3 Епизод 13'
+ },
+ }
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ video_props = self._parse_json(self._search_regex(
+ r'<script\s?id=\"__NEXT_DATA__\"\s?type=\"application/json\">({.+})</script>',
+ webpage, 'video_props'), video_id)['props']['pageProps']['video']
+ m3u8_url = self._download_json(
+ f'https://nbg-api.fite.tv/api/v2/videos/{video_id}/streams',
+ video_id, headers={'x-flipps-user-agent': 'Flipps/75/9.7'})[0]['url']
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls')
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video_props['title'],
+ 'alt_title': video_props.get('slug'),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'description': self._og_search_description(webpage),
+ 'formats': formats,
+ 'duration': parse_duration(video_props['duration']),
+ 'timestamp': parse_iso8601(video_props['published_at']),
+ 'view_count': int_or_none(video_props['view_count']),
+ }
diff --git a/hypervideo_dl/extractor/npo.py b/hypervideo_dl/extractor/npo.py
index e525ad9..ed547d0 100644
--- a/hypervideo_dl/extractor/npo.py
+++ b/hypervideo_dl/extractor/npo.py
@@ -246,9 +246,8 @@ class NPOIE(NPOBaseIE):
})
if not formats:
- if drm:
- raise ExtractorError('This video is DRM protected.', expected=True)
- return
+ if not self.get_param('allow_unplayable_formats') and drm:
+ self.report_drm(video_id)
self._sort_formats(formats)
@@ -425,7 +424,7 @@ class NPOIE(NPOBaseIE):
stream_url, video_id, fatal=False)
# f4m downloader downloads only piece of live stream
for f4m_format in f4m_formats:
- f4m_format['preference'] = -1
+ f4m_format['preference'] = -5
formats.extend(f4m_formats)
elif stream_type == 'hls':
formats.extend(self._extract_m3u8_formats(
diff --git a/hypervideo_dl/extractor/nrk.py b/hypervideo_dl/extractor/nrk.py
index 40dee21..b556bc6 100644
--- a/hypervideo_dl/extractor/nrk.py
+++ b/hypervideo_dl/extractor/nrk.py
@@ -58,7 +58,7 @@ class NRKBaseIE(InfoExtractor):
def _call_api(self, path, video_id, item=None, note=None, fatal=True, query=None):
return self._download_json(
- urljoin('http://psapi.nrk.no/', path),
+ urljoin('https://psapi.nrk.no/', path),
video_id, note or 'Downloading %s JSON' % item,
fatal=fatal, query=query,
headers={'Accept-Encoding': 'gzip, deflate, br'})
@@ -452,7 +452,7 @@ class NRKTVEpisodeIE(InfoExtractor):
}]
def _real_extract(self, url):
- display_id, season_number, episode_number = re.match(self._VALID_URL, url).groups()
+ display_id, season_number, episode_number = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, display_id)
@@ -594,7 +594,7 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE):
else super(NRKTVSeasonIE, cls).suitable(url))
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
domain = mobj.group('domain')
serie_kind = mobj.group('serie_kind')
serie = mobj.group('serie')
@@ -692,7 +692,7 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE):
else super(NRKTVSeriesIE, cls).suitable(url))
def _real_extract(self, url):
- site, serie_kind, series_id = re.match(self._VALID_URL, url).groups()
+ site, serie_kind, series_id = self._match_valid_url(url).groups()
is_radio = site == 'radio.nrk'
domain = 'radio' if is_radio else 'tv'
diff --git a/hypervideo_dl/extractor/ntvde.py b/hypervideo_dl/extractor/ntvde.py
index 101a537..035582e 100644
--- a/hypervideo_dl/extractor/ntvde.py
+++ b/hypervideo_dl/extractor/ntvde.py
@@ -62,7 +62,7 @@ class NTVDeIE(InfoExtractor):
m3u8_url = compat_urlparse.urljoin('http://video.n-tv.de', vdata['videoM3u8'])
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
- preference=0, m3u8_id='hls', fatal=False))
+ quality=1, m3u8_id='hls', fatal=False))
self._sort_formats(formats)
return {
diff --git a/hypervideo_dl/extractor/nuvid.py b/hypervideo_dl/extractor/nuvid.py
index ab6bfcd..7487824 100644
--- a/hypervideo_dl/extractor/nuvid.py
+++ b/hypervideo_dl/extractor/nuvid.py
@@ -1,71 +1,73 @@
+# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
parse_duration,
+ int_or_none,
+ try_get,
)
class NuvidIE(InfoExtractor):
_VALID_URL = r'https?://(?:www|m)\.nuvid\.com/video/(?P<id>[0-9]+)'
- _TEST = {
- 'url': 'http://m.nuvid.com/video/1310741/',
- 'md5': 'eab207b7ac4fccfb4e23c86201f11277',
+ _TESTS = [{
+ 'url': 'https://www.nuvid.com/video/6513023/italian-babe',
+ 'md5': '772d2f8288f3d3c5c45f7a41761c7844',
+ 'info_dict': {
+ 'id': '6513023',
+ 'ext': 'mp4',
+ 'title': 'italian babe',
+ 'duration': 321.0,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://m.nuvid.com/video/6523263',
'info_dict': {
- 'id': '1310741',
+ 'id': '6523263',
'ext': 'mp4',
- 'title': 'Horny babes show their awesome bodeis and',
- 'duration': 129,
'age_limit': 18,
+ 'title': 'Slut brunette college student anal dorm',
}
- }
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
- page_url = 'http://m.nuvid.com/video/%s' % video_id
- webpage = self._download_webpage(
- page_url, video_id, 'Downloading video page')
- # When dwnld_speed exists and has a value larger than the MP4 file's
- # bitrate, Nuvid returns the MP4 URL
- # It's unit is 100bytes/millisecond, see mobile-nuvid-min.js for the algorithm
- self._set_cookie('nuvid.com', 'dwnld_speed', '10.0')
- mp4_webpage = self._download_webpage(
- page_url, video_id, 'Downloading video page for MP4 format')
+ qualities = {
+ 'lq': '360p',
+ 'hq': '720p',
+ }
+
+ json_url = f'https://www.nuvid.com/player_config_json/?vid={video_id}&aid=0&domain_id=0&embed=0&check_speed=0'
+ video_data = self._download_json(
+ json_url, video_id, headers={
+ 'Accept': 'application/json, text/javascript, */*; q = 0.01',
+ 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
+ })
- html5_video_re = r'(?s)<(?:video|audio)[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']',
- video_url = self._html_search_regex(html5_video_re, webpage, video_id)
- mp4_video_url = self._html_search_regex(html5_video_re, mp4_webpage, video_id)
formats = [{
- 'url': video_url,
- }]
- if mp4_video_url != video_url:
- formats.append({
- 'url': mp4_video_url,
- })
+ 'url': source,
+ 'format_id': qualities.get(quality),
+ 'height': int_or_none((qualities.get(quality) or '')[:-1]),
+ } for quality, source in video_data.get('files').items() if source]
- title = self._html_search_regex(
- [r'<span title="([^"]+)">',
- r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>',
- r'<span[^>]+class="title_thumb">([^<]+)</span>'], webpage, 'title').strip()
- thumbnails = [
- {
- 'url': thumb_url,
- } for thumb_url in re.findall(r'<img src="([^"]+)" alt="" />', webpage)
- ]
- thumbnail = thumbnails[0]['url'] if thumbnails else None
- duration = parse_duration(self._html_search_regex(
- [r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})',
- r'<span[^>]+class="view_time">([^<]+)</span>'], webpage, 'duration', fatal=False))
+ self._check_formats(formats, video_id)
+ self._sort_formats(formats)
+
+ title = video_data.get('title')
+ thumbnail_base_url = try_get(video_data, lambda x: x['thumbs']['url'])
+ thumbnail_extension = try_get(video_data, lambda x: x['thumbs']['extension'])
+ thumbnail_id = self._search_regex(
+ r'/media/videos/tmb/%s/preview/(\d+)' % video_id + (thumbnail_extension or ''), video_data.get('poster', ''), 'thumbnail id', default=19)
+ thumbnail = f'{thumbnail_base_url}player/{thumbnail_id}{thumbnail_extension}'
+ duration = parse_duration(video_data.get('duration') or video_data.get('duration_format'))
return {
'id': video_id,
+ 'formats': formats,
'title': title,
- 'thumbnails': thumbnails,
'thumbnail': thumbnail,
'duration': duration,
'age_limit': 18,
- 'formats': formats,
}
diff --git a/hypervideo_dl/extractor/nytimes.py b/hypervideo_dl/extractor/nytimes.py
index 976b1c6..9996473 100644
--- a/hypervideo_dl/extractor/nytimes.py
+++ b/hypervideo_dl/extractor/nytimes.py
@@ -46,6 +46,7 @@ class NYTimesBaseIE(InfoExtractor):
urls = []
formats = []
+ subtitles = {}
for video in video_data.get('renditions', []):
video_url = video.get('url')
format_id = video.get('type')
@@ -54,9 +55,11 @@ class NYTimesBaseIE(InfoExtractor):
urls.append(video_url)
ext = mimetype2ext(video.get('mimetype')) or determine_ext(video_url)
if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
+ m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
video_url, video_id, 'mp4', 'm3u8_native',
- m3u8_id=format_id or 'hls', fatal=False))
+ m3u8_id=format_id or 'hls', fatal=False)
+ formats.extend(m3u8_fmts)
+ subtitles = self._merge_subtitles(subtitles, m3u8_subs)
elif ext == 'mpd':
continue
# formats.extend(self._extract_mpd_formats(
@@ -72,7 +75,7 @@ class NYTimesBaseIE(InfoExtractor):
'tbr': int_or_none(video.get('bitrate'), 1000) or None,
'ext': ext,
})
- self._sort_formats(formats, ('height', 'width', 'filesize', 'tbr', 'fps', 'format_id'))
+ self._sort_formats(formats)
thumbnails = []
for image in video_data.get('images', []):
@@ -96,6 +99,7 @@ class NYTimesBaseIE(InfoExtractor):
'uploader': video_data.get('byline'),
'duration': float_or_none(video_data.get('duration'), 1000),
'formats': formats,
+ 'subtitles': subtitles,
'thumbnails': thumbnails,
}
diff --git a/hypervideo_dl/extractor/nzherald.py b/hypervideo_dl/extractor/nzherald.py
new file mode 100644
index 0000000..e5601b4
--- /dev/null
+++ b/hypervideo_dl/extractor/nzherald.py
@@ -0,0 +1,98 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .brightcove import BrightcoveNewIE
+from .common import InfoExtractor
+
+from ..compat import compat_str
+from ..utils import (
+ ExtractorError,
+ traverse_obj
+)
+
+
+class NZHeraldIE(InfoExtractor):
+ IE_NAME = 'nzherald'
+ _VALID_URL = r'https?://(?:www\.)?nzherald\.co\.nz/[\w\/-]+\/(?P<id>[A-Z0-9]+)'
+ _TESTS = [
+ {
+ 'url': 'https://www.nzherald.co.nz/nz/weather-heavy-rain-gales-across-nz-most-days-this-week/PTG7QWY4E2225YHZ5NAIRBTYTQ/',
+ 'info_dict': {
+ 'id': '6271084466001',
+ 'ext': 'mp4',
+ 'title': 'MetService severe weather warning: September 6th - 7th',
+ 'timestamp': 1630891576,
+ 'upload_date': '20210906',
+ 'uploader_id': '1308227299001',
+ 'description': 'md5:db6ca335a22e2cdf37ab9d2bcda52902'
+ }
+
+ }, {
+ # Webpage has brightcove embed player url
+ 'url': 'https://www.nzherald.co.nz/travel/pencarrow-coastal-trail/HDVTPJEPP46HJ2UEMK4EGD2DFI/',
+ 'info_dict': {
+ 'id': '6261791733001',
+ 'ext': 'mp4',
+ 'title': 'Pencarrow Coastal Trail',
+ 'timestamp': 1625102897,
+ 'upload_date': '20210701',
+ 'uploader_id': '1308227299001',
+ 'description': 'md5:d361aaa0c6498f7ac1bc4fc0a0aec1e4'
+ }
+
+ }, {
+ # two video embeds of the same video
+ 'url': 'https://www.nzherald.co.nz/nz/truck-driver-captured-cutting-off-motorist-on-state-highway-1-in-canterbury/FIHNJB7PLLPHWQPK4S7ZBDUC4I/',
+ 'info_dict': {
+ 'id': '6251114530001',
+ 'ext': 'mp4',
+ 'title': 'Truck travelling north from Rakaia runs car off road',
+ 'timestamp': 1619730509,
+ 'upload_date': '20210429',
+ 'uploader_id': '1308227299001',
+ 'description': 'md5:4cae7dfb7613ac4c73b9e73a75c6b5d7'
+ }
+ }, {
+ 'url': 'https://www.nzherald.co.nz/kahu/kaupapa-companies-my-taiao-supporting-maori-in-study-and-business/PQBO2J25WCG77VGRX7W7BVYEAI/',
+ 'only_matching': True
+ }, {
+ 'url': 'https://nzherald.co.nz/the-country/video/focus-nzs-first-mass-covid-19-vaccination-event/N5I7IL3BRFLZSD33TLDLYJDGK4/',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.nzherald.co.nz/the-vision-is-clear/news/tvic-damian-roper-planting-trees-an-addiction/AN2AAEPNRK5VLISDWQAJZB6ATQ',
+ 'only_matching': True
+ }
+ ]
+
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1308227299001/S1BXZn8t_default/index.html?videoId=%s'
+
+ def _extract_bc_embed_url(self, webpage):
+ """The initial webpage may include the brightcove player embed url"""
+ bc_url = BrightcoveNewIE._extract_url(self, webpage)
+ return bc_url or self._search_regex(
+ r'(?:embedUrl)\"\s*:\s*\"(?P<embed_url>%s)' % BrightcoveNewIE._VALID_URL,
+ webpage, 'embed url', default=None, group='embed_url')
+
+ def _real_extract(self, url):
+ article_id = self._match_id(url)
+ webpage = self._download_webpage(url, article_id)
+ bc_url = self._extract_bc_embed_url(webpage)
+
+ if not bc_url:
+ fusion_metadata = self._parse_json(
+ self._search_regex(r'Fusion\.globalContent\s*=\s*({.+?})\s*;', webpage, 'fusion metadata'), article_id)
+
+ video_metadata = fusion_metadata.get('video')
+ bc_video_id = traverse_obj(
+ video_metadata or fusion_metadata, # fusion metadata is the video metadata for video-only pages
+ 'brightcoveId', ('content_elements', ..., 'referent', 'id'),
+ get_all=False, expected_type=compat_str)
+
+ if not bc_video_id:
+ if isinstance(video_metadata, dict) and len(video_metadata) == 0:
+ raise ExtractorError('This article does not have a video.', expected=True)
+ else:
+ raise ExtractorError('Failed to extract brightcove video id')
+ bc_url = self.BRIGHTCOVE_URL_TEMPLATE % bc_video_id
+
+ return self.url_result(bc_url, 'BrightcoveNew')
diff --git a/hypervideo_dl/extractor/odnoklassniki.py b/hypervideo_dl/extractor/odnoklassniki.py
index 7ed9fac..9cacd38 100644
--- a/hypervideo_dl/extractor/odnoklassniki.py
+++ b/hypervideo_dl/extractor/odnoklassniki.py
@@ -247,8 +247,7 @@ class OdnoklassnikiIE(InfoExtractor):
m3u8_url = metadata.get('hlsMasterPlaylistUrl')
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
- m3u8_url, video_id, 'mp4', entry_protocol='m3u8',
- m3u8_id='hls', fatal=False))
+ m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
rtmp_url = metadata.get('rtmpUrl')
if rtmp_url:
formats.append({
@@ -260,7 +259,7 @@ class OdnoklassnikiIE(InfoExtractor):
if not formats:
payment_info = metadata.get('paymentInfo')
if payment_info:
- raise ExtractorError('This video is paid, subscribe to download it', expected=True)
+ self.raise_no_formats('This video is paid, subscribe to download it', expected=True)
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/olympics.py b/hypervideo_dl/extractor/olympics.py
new file mode 100644
index 0000000..0bc9206
--- /dev/null
+++ b/hypervideo_dl/extractor/olympics.py
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+
+class OlympicsReplayIE(InfoExtractor):
+ _VALID_URL = r'(?:https?://)(?:www\.)?olympics\.com/tokyo-2020/(?:[a-z]{2}/)?replay/(?P<id>[^/#&?]+)'
+ _TESTS = [{
+ 'url': 'https://olympics.com/tokyo-2020/en/replay/300622eb-abc0-43ea-b03b-c5f2d429ec7b/jumping-team-qualifier',
+ 'info_dict': {
+ 'id': '300622eb-abc0-43ea-b03b-c5f2d429ec7b',
+ 'ext': 'mp4',
+ 'title': 'Jumping Team Qualifier',
+ 'release_date': '20210806',
+ 'upload_date': '20210713',
+ },
+ 'params': {
+ 'format': 'bv',
+ },
+ }, {
+ 'url': 'https://olympics.com/tokyo-2020/en/replay/bd242924-4b22-49a5-a846-f1d4c809250d/mens-bronze-medal-match-hun-esp',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ # The parameters are hardcoded in the webpage; it is not necessary to download the webpage just for these parameters.
+ # If downloading the webpage ever serves other purposes as well, extract these parameters from it instead.
+ token_url = 'https://appovptok.ovpobs.tv/api/identity/app/token?api_key=OTk5NDcxOjpvY3N3LWFwaXVzZXI%3D&api_secret=ODY4ODM2MjE3ODMwYmVjNTAxMWZlMDJiMTYxZmY0MjFiMjMwMjllMjJmNDA1YWRiYzA5ODcxYTZjZTljZDkxOTo6NTM2NWIzNjRlMTM1ZmI2YWNjNmYzMGMzOGM3NzZhZTY%3D'
+ token = self._download_webpage(token_url, id)
+ headers = {'x-obs-app-token': token}
+ data_json = self._download_json(f'https://appocswtok.ovpobs.tv/api/schedule-sessions/{id}?include=stream',
+ id, headers=headers)
+ meta_data = data_json['data']['attributes']
+ for t_dict in data_json['included']:
+ if t_dict.get('type') == 'Stream':
+ stream_data = t_dict['attributes']
+ m3u8_url = self._download_json(
+ 'https://meteringtok.ovpobs.tv/api/playback-sessions', id, headers=headers, query={
+ 'alias': stream_data['alias'],
+ 'stream': stream_data['stream'],
+ 'type': 'vod'
+ })['data']['attributes']['url']
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, id)
+ self._sort_formats(formats)
+
+ return {
+ 'id': id,
+ 'title': meta_data['title'],
+ 'release_date': unified_strdate(meta_data.get('start') or meta_data.get('broadcastPublished')),
+ 'upload_date': unified_strdate(meta_data.get('publishedAt')),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
diff --git a/hypervideo_dl/extractor/on24.py b/hypervideo_dl/extractor/on24.py
new file mode 100644
index 0000000..d4d8244
--- /dev/null
+++ b/hypervideo_dl/extractor/on24.py
@@ -0,0 +1,91 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ strip_or_none,
+ try_get,
+ urljoin,
+)
+
+
+class On24IE(InfoExtractor):
+ IE_NAME = 'on24'
+ IE_DESC = 'ON24'
+
+ _VALID_URL = r'''(?x)
+ https?://event\.on24\.com/(?:
+ wcc/r/(?P<id_1>\d{7})/(?P<key_1>[0-9A-F]{32})|
+ eventRegistration/(?:console/EventConsoleApollo|EventLobbyServlet\?target=lobby30)
+ \.jsp\?(?:[^/#?]*&)?eventid=(?P<id_2>\d{7})[^/#?]*&key=(?P<key_2>[0-9A-F]{32})
+ )'''
+
+ _TESTS = [{
+ 'url': 'https://event.on24.com/eventRegistration/console/EventConsoleApollo.jsp?uimode=nextgeneration&eventid=2197467&sessionid=1&key=5DF57BE53237F36A43B478DD36277A84&contenttype=A&eventuserid=305999&playerwidth=1000&playerheight=650&caller=previewLobby&text_language_id=en&format=fhaudio&newConsole=false',
+ 'info_dict': {
+ 'id': '2197467',
+ 'ext': 'wav',
+ 'title': 'Pearson Test of English General/Pearson English International Certificate Teacher Training Guide',
+ 'upload_date': '20200219',
+ 'timestamp': 1582149600.0,
+ 'view_count': int,
+ }
+ }, {
+ 'url': 'https://event.on24.com/wcc/r/2639291/82829018E813065A122363877975752E?mode=login&email=johnsmith@gmail.com',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://event.on24.com/eventRegistration/console/EventConsoleApollo.jsp?&eventid=2639291&sessionid=1&username=&partnerref=&format=fhvideo1&mobile=&flashsupportedmobiledevice=&helpcenter=&key=82829018E813065A122363877975752E&newConsole=true&nxChe=true&newTabCon=true&text_language_id=en&playerwidth=748&playerheight=526&eventuserid=338788762&contenttype=A&mediametricsessionid=384764716&mediametricid=3558192&usercd=369267058&mode=launch',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = self._match_valid_url(url)
+ event_id = mobj.group('id_1') or mobj.group('id_2')
+ event_key = mobj.group('key_1') or mobj.group('key_2')
+
+ event_data = self._download_json(
+ 'https://event.on24.com/apic/utilApp/EventConsoleCachedServlet',
+ event_id, query={
+ 'eventId': event_id,
+ 'displayProfile': 'player',
+ 'key': event_key,
+ 'contentType': 'A'
+ })
+ event_id = str(try_get(event_data, lambda x: x['presentationLogInfo']['eventid'])) or event_id
+ language = event_data.get('localelanguagecode')
+
+ formats = []
+ for media in event_data.get('mediaUrlInfo', []):
+ media_url = urljoin('https://event.on24.com/media/news/corporatevideo/events/', str(media.get('url')))
+ if not media_url:
+ continue
+ media_type = media.get('code')
+ if media_type == 'fhvideo1':
+ formats.append({
+ 'format_id': 'video',
+ 'url': media_url,
+ 'language': language,
+ 'ext': 'mp4',
+ 'vcodec': 'avc1.640020',
+ 'acodec': 'mp4a.40.2',
+ })
+ elif media_type == 'audio':
+ formats.append({
+ 'format_id': 'audio',
+ 'url': media_url,
+ 'language': language,
+ 'ext': 'wav',
+ 'vcodec': 'none',
+ 'acodec': 'wav'
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': event_id,
+ 'title': strip_or_none(event_data.get('description')),
+ 'timestamp': int_or_none(try_get(event_data, lambda x: x['session']['startdate']), 1000),
+ 'webpage_url': f'https://event.on24.com/wcc/r/{event_id}/{event_key}',
+ 'view_count': event_data.get('registrantcount'),
+ 'formats': formats,
+ }
diff --git a/hypervideo_dl/extractor/ondemandkorea.py b/hypervideo_dl/extractor/ondemandkorea.py
index df1ce3c..cc3c587 100644
--- a/hypervideo_dl/extractor/ondemandkorea.py
+++ b/hypervideo_dl/extractor/ondemandkorea.py
@@ -11,18 +11,34 @@ from ..utils import (
class OnDemandKoreaIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?P<id>[^/]+)\.html'
_GEO_COUNTRIES = ['US', 'CA']
- _TEST = {
- 'url': 'http://www.ondemandkorea.com/ask-us-anything-e43.html',
+ _TESTS = [{
+ 'url': 'https://www.ondemandkorea.com/ask-us-anything-e43.html',
'info_dict': {
'id': 'ask-us-anything-e43',
'ext': 'mp4',
- 'title': 'Ask Us Anything : E43',
+ 'title': 'Ask Us Anything : Gain, Ji Soo - 09/24/2016',
+ 'description': 'A talk show/game show with a school theme where celebrity guests appear as “transfer students.”',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {
'skip_download': 'm3u8 download'
}
- }
+ }, {
+ 'url': 'https://www.ondemandkorea.com/confession-e01-1.html',
+ 'info_dict': {
+ 'id': 'confession-e01-1',
+ 'ext': 'mp4',
+ 'title': 'Confession : E01',
+ 'description': 'Choi Do-hyun, a criminal attorney, is the son of a death row convict. Ever since Choi Pil-su got arrested for murder, Do-hyun has wanted to solve his ',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'subtitles': {
+ 'English': 'mincount:1',
+ },
+ },
+ 'params': {
+ 'skip_download': 'm3u8 download'
+ }
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -44,11 +60,18 @@ class OnDemandKoreaIE(InfoExtractor):
'This video is only available to ODK PLUS members.',
expected=True)
- title = self._og_search_title(webpage)
+ if 'ODK PREMIUM Members Only' in webpage:
+ raise ExtractorError(
+ 'This video is only available to ODK PREMIUM members.',
+ expected=True)
+
+ title = self._search_regex(
+ r'class=["\']episode_title["\'][^>]*>([^<]+)',
+ webpage, 'episode_title', fatal=False) or self._og_search_title(webpage)
jw_config = self._parse_json(
self._search_regex(
- r'(?s)jwplayer\(([\'"])(?:(?!\1).)+\1\)\.setup\s*\((?P<options>.+?)\);',
+ r'(?s)odkPlayer\.init.*?(?P<options>{[^;]+}).*?;',
webpage, 'jw config', group='options'),
video_id, transform_source=js_to_json)
info = self._parse_jwplayer_data(
@@ -57,6 +80,7 @@ class OnDemandKoreaIE(InfoExtractor):
info.update({
'title': title,
- 'thumbnail': self._og_search_thumbnail(webpage),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage)
})
return info
diff --git a/hypervideo_dl/extractor/onet.py b/hypervideo_dl/extractor/onet.py
index e55b2ac..bf53ea0 100644
--- a/hypervideo_dl/extractor/onet.py
+++ b/hypervideo_dl/extractor/onet.py
@@ -138,7 +138,7 @@ class OnetIE(OnetBaseIE):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
display_id, video_id = mobj.group('display_id', 'id')
webpage = self._download_webpage(url, display_id)
@@ -182,7 +182,7 @@ class OnetChannelIE(OnetBaseIE):
video_id = remove_start(current_clip_info['ckmId'], 'mvp:')
video_name = url_basename(current_clip_info['url'])
- if self._downloader.params.get('noplaylist'):
+ if self.get_param('noplaylist'):
self.to_screen(
'Downloading just video %s because of --no-playlist' % video_name)
return self._extract_from_id(video_id, webpage)
diff --git a/hypervideo_dl/extractor/ooyala.py b/hypervideo_dl/extractor/ooyala.py
index eb957b8..20cfa0a 100644
--- a/hypervideo_dl/extractor/ooyala.py
+++ b/hypervideo_dl/extractor/ooyala.py
@@ -10,7 +10,6 @@ from ..compat import (
)
from ..utils import (
determine_ext,
- ExtractorError,
float_or_none,
int_or_none,
try_get,
@@ -85,7 +84,7 @@ class OoyalaBaseIE(InfoExtractor):
'fps': float_or_none(stream.get('framerate')),
})
if not formats and not auth_data.get('authorized'):
- raise ExtractorError('%s said: %s' % (
+ self.raise_no_formats('%s said: %s' % (
self.IE_NAME, auth_data['message']), expected=True)
self._sort_formats(formats)
@@ -205,6 +204,6 @@ class OoyalaExternalIE(OoyalaBaseIE):
}
def _real_extract(self, url):
- partner_id, video_id, pcode = re.match(self._VALID_URL, url).groups()
+ partner_id, video_id, pcode = self._match_valid_url(url).groups()
content_tree_url = self._CONTENT_TREE_BASE + 'external_id/%s/%s:%s' % (pcode, partner_id, video_id)
return self._extract(content_tree_url, video_id)
diff --git a/hypervideo_dl/extractor/openload.py b/hypervideo_dl/extractor/openload.py
index 0c20d01..dfdd0e5 100644
--- a/hypervideo_dl/extractor/openload.py
+++ b/hypervideo_dl/extractor/openload.py
@@ -17,6 +17,7 @@ from ..utils import (
get_exe_version,
is_outdated_version,
std_headers,
+ process_communicate_or_kill,
)
@@ -226,7 +227,7 @@ class PhantomJSwrapper(object):
self.exe, '--ssl-protocol=any',
self._TMP_FILES['script'].name
], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- out, err = p.communicate()
+ out, err = process_communicate_or_kill(p)
if p.returncode != 0:
raise ExtractorError(
'Executing JS failed\n:' + encodeArgument(err))
diff --git a/hypervideo_dl/extractor/openrec.py b/hypervideo_dl/extractor/openrec.py
new file mode 100644
index 0000000..d7073ab
--- /dev/null
+++ b/hypervideo_dl/extractor/openrec.py
@@ -0,0 +1,126 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ traverse_obj,
+ try_get,
+ unified_strdate
+)
+from ..compat import compat_str
+
+
+class OpenRecIE(InfoExtractor):
+ IE_NAME = 'openrec'
+ _VALID_URL = r'https?://(?:www\.)?openrec\.tv/live/(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'https://www.openrec.tv/live/2p8v31qe4zy',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.openrec.tv/live/wez93eqvjzl',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage('https://www.openrec.tv/live/%s' % video_id, video_id)
+
+ window_stores = self._parse_json(
+ self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id)
+ movie_store = traverse_obj(
+ window_stores,
+ ('v8', 'state', 'movie'),
+ ('v8', 'movie'),
+ expected_type=dict)
+ if not movie_store:
+ raise ExtractorError('Failed to extract live info')
+
+ title = movie_store.get('title')
+ description = movie_store.get('introduction')
+ thumbnail = movie_store.get('thumbnailUrl')
+
+ channel_user = movie_store.get('channel', {}).get('user')
+ uploader = try_get(channel_user, lambda x: x['name'], compat_str)
+ uploader_id = try_get(channel_user, lambda x: x['id'], compat_str)
+
+ timestamp = traverse_obj(movie_store, ('startedAt', 'time'), expected_type=int)
+
+ m3u8_playlists = movie_store.get('media')
+ formats = []
+ for (name, m3u8_url) in m3u8_playlists.items():
+ if not m3u8_url:
+ continue
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4', entry_protocol='m3u8',
+ m3u8_id='hls-%s' % name, live=True))
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'timestamp': timestamp,
+ 'is_live': True,
+ }
+
+
+class OpenRecCaptureIE(InfoExtractor):
+ IE_NAME = 'openrec:capture'
+ _VALID_URL = r'https?://(?:www\.)?openrec\.tv/capture/(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'https://www.openrec.tv/capture/l9nk2x4gn14',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.openrec.tv/capture/mldjr82p7qk',
+ 'info_dict': {
+ 'id': 'mldjr82p7qk',
+ 'title': 'たいじの恥ずかしい英語力',
+ 'uploader': 'たいちゃんねる',
+ 'uploader_id': 'Yaritaiji',
+ 'upload_date': '20210803',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage('https://www.openrec.tv/capture/%s' % video_id, video_id)
+
+ window_stores = self._parse_json(
+ self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id)
+ movie_store = window_stores.get('movie')
+
+ capture_data = window_stores.get('capture')
+ if not capture_data:
+ raise ExtractorError('Cannot extract title')
+ title = capture_data.get('title')
+ thumbnail = capture_data.get('thumbnailUrl')
+ upload_date = unified_strdate(capture_data.get('createdAt'))
+
+ channel_info = movie_store.get('channel') or {}
+ uploader = channel_info.get('name')
+ uploader_id = channel_info.get('id')
+
+ m3u8_url = capture_data.get('source')
+ if not m3u8_url:
+ raise ExtractorError('Cannot extract m3u8 url')
+ formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls')
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'upload_date': upload_date,
+ }
diff --git a/hypervideo_dl/extractor/ora.py b/hypervideo_dl/extractor/ora.py
index 1d42be3..422d0b3 100644
--- a/hypervideo_dl/extractor/ora.py
+++ b/hypervideo_dl/extractor/ora.py
@@ -55,7 +55,7 @@ class OraTVIE(InfoExtractor):
formats.append({
'url': http_template % q,
'format_id': q,
- 'preference': preference(q),
+ 'quality': preference(q),
})
self._sort_formats(formats)
else:
diff --git a/hypervideo_dl/extractor/orf.py b/hypervideo_dl/extractor/orf.py
index ed8a9a8..428ec97 100644
--- a/hypervideo_dl/extractor/orf.py
+++ b/hypervideo_dl/extractor/orf.py
@@ -98,6 +98,9 @@ class ORFTVthekIE(InfoExtractor):
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
src, video_id, f4m_id=format_id, fatal=False))
+ elif ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ src, video_id, mpd_id=format_id, fatal=False))
else:
formats.append({
'format_id': format_id,
@@ -180,7 +183,7 @@ class ORFTVthekIE(InfoExtractor):
class ORFRadioIE(InfoExtractor):
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
show_date = mobj.group('date')
show_id = mobj.group('show')
diff --git a/hypervideo_dl/extractor/packtpub.py b/hypervideo_dl/extractor/packtpub.py
index 11ad3b3..c06fca7 100644
--- a/hypervideo_dl/extractor/packtpub.py
+++ b/hypervideo_dl/extractor/packtpub.py
@@ -1,7 +1,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..compat import (
@@ -66,7 +65,7 @@ class PacktPubIE(PacktPubBaseIE):
raise
def _real_extract(self, url):
- course_id, chapter_id, video_id, display_id = re.match(self._VALID_URL, url).groups()
+ course_id, chapter_id, video_id, display_id = self._match_valid_url(url).groups()
headers = {}
if self._TOKEN:
@@ -123,7 +122,7 @@ class PacktPubCourseIE(PacktPubBaseIE):
PacktPubCourseIE, cls).suitable(url)
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
url, course_id = mobj.group('url', 'id')
course = self._download_json(
diff --git a/hypervideo_dl/extractor/palcomp3.py b/hypervideo_dl/extractor/palcomp3.py
index fb29d83..d0a62fb 100644
--- a/hypervideo_dl/extractor/palcomp3.py
+++ b/hypervideo_dl/extractor/palcomp3.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import compat_str
@@ -64,7 +63,7 @@ class PalcoMP3BaseIE(InfoExtractor):
self._ARTIST_FIELDS_TMPL = self._ARTIST_FIELDS_TMPL % self._MUSIC_FIELDS
def _real_extract(self, url):
- artist_slug, music_slug = re.match(self._VALID_URL, url).groups()
+ artist_slug, music_slug = self._match_valid_url(url).groups()
artist_fields = self._ARTIST_FIELDS_TMPL % music_slug
music = self._call_api(artist_slug, artist_fields)['artist']['music']
return self._parse_music(music)
@@ -109,9 +108,9 @@ class PalcoMP3ArtistIE(PalcoMP3BaseIE):
}
name'''
- @ classmethod
+ @classmethod
def suitable(cls, url):
- return False if re.match(PalcoMP3IE._VALID_URL, url) else super(PalcoMP3ArtistIE, cls).suitable(url)
+ return False if PalcoMP3IE._match_valid_url(url) else super(PalcoMP3ArtistIE, cls).suitable(url)
def _real_extract(self, url):
artist_slug = self._match_id(url)
diff --git a/hypervideo_dl/extractor/pandoratv.py b/hypervideo_dl/extractor/pandoratv.py
index 538738c..6230053 100644
--- a/hypervideo_dl/extractor/pandoratv.py
+++ b/hypervideo_dl/extractor/pandoratv.py
@@ -1,17 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import (
compat_str,
- compat_urlparse,
)
from ..utils import (
ExtractorError,
float_or_none,
parse_duration,
+ parse_qs,
str_to_int,
urlencode_postdata,
)
@@ -71,12 +70,12 @@ class PandoraTVIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
user_id = mobj.group('user_id')
video_id = mobj.group('id')
if not user_id or not video_id:
- qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ qs = parse_qs(url)
video_id = qs.get('prgid', [None])[0]
user_id = qs.get('ch_userid', [None])[0]
if any(not f for f in (video_id, user_id,)):
diff --git a/hypervideo_dl/extractor/paramountplus.py b/hypervideo_dl/extractor/paramountplus.py
new file mode 100644
index 0000000..338b84d
--- /dev/null
+++ b/hypervideo_dl/extractor/paramountplus.py
@@ -0,0 +1,145 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .cbs import CBSBaseIE
+from ..utils import (
+ int_or_none,
+ url_or_none,
+)
+
+
+class ParamountPlusIE(CBSBaseIE):
+ _VALID_URL = r'''(?x)
+ (?:
+ paramountplus:|
+ https?://(?:www\.)?(?:
+ paramountplus\.com/(?:shows/[^/]+/video|movies/[^/]+)/
+ )(?P<id>[\w-]+))'''
+
+ # All tests are blocked outside US
+ _TESTS = [{
+ 'url': 'https://www.paramountplus.com/shows/catdog/video/Oe44g5_NrlgiZE3aQVONleD6vXc8kP0k/catdog-climb-every-catdog-the-canine-mutiny/',
+ 'info_dict': {
+ 'id': 'Oe44g5_NrlgiZE3aQVONleD6vXc8kP0k',
+ 'ext': 'mp4',
+ 'title': 'CatDog - Climb Every CatDog/The Canine Mutiny',
+ 'description': 'md5:7ac835000645a69933df226940e3c859',
+ 'duration': 1418,
+ 'timestamp': 920264400,
+ 'upload_date': '19990301',
+ 'uploader': 'CBSI-NEW',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }, {
+ 'url': 'https://www.paramountplus.com/shows/tooning-out-the-news/video/6hSWYWRrR9EUTz7IEe5fJKBhYvSUfexd/7-23-21-week-in-review-rep-jahana-hayes-howard-fineman-sen-michael-bennet-sheera-frenkel-cecilia-kang-/',
+ 'info_dict': {
+ 'id': '6hSWYWRrR9EUTz7IEe5fJKBhYvSUfexd',
+ 'ext': 'mp4',
+ 'title': '7/23/21 WEEK IN REVIEW (Rep. Jahana Hayes/Howard Fineman/Sen. Michael Bennet/Sheera Frenkel & Cecilia Kang)',
+ 'description': 'md5:f4adcea3e8b106192022e121f1565bae',
+ 'duration': 2506,
+ 'timestamp': 1627063200,
+ 'upload_date': '20210723',
+ 'uploader': 'CBSI-NEW',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }, {
+ 'url': 'https://www.paramountplus.com/movies/daddys-home/vM2vm0kE6vsS2U41VhMRKTOVHyQAr6pC',
+ 'info_dict': {
+ 'id': 'vM2vm0kE6vsS2U41VhMRKTOVHyQAr6pC',
+ 'ext': 'mp4',
+ 'title': 'Daddy\'s Home',
+ 'upload_date': '20151225',
+ 'description': 'md5:a0beaf24e8d3b0e81b2ee41d47c06f33',
+ 'uploader': 'CBSI-NEW',
+ 'timestamp': 1451030400,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ 'format': 'bestvideo',
+ },
+ 'expected_warnings': ['Ignoring subtitle tracks'], # TODO: Investigate this
+ }, {
+ 'url': 'https://www.paramountplus.com/movies/sonic-the-hedgehog/5EKDXPOzdVf9voUqW6oRuocyAEeJGbEc',
+ 'info_dict': {
+ 'id': '5EKDXPOzdVf9voUqW6oRuocyAEeJGbEc',
+ 'ext': 'mp4',
+ 'uploader': 'CBSI-NEW',
+ 'description': 'md5:bc7b6fea84ba631ef77a9bda9f2ff911',
+ 'timestamp': 1577865600,
+ 'title': 'Sonic the Hedgehog',
+ 'upload_date': '20200101',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ 'format': 'bestvideo',
+ },
+ 'expected_warnings': ['Ignoring subtitle tracks'],
+ }, {
+ 'url': 'https://www.paramountplus.com/shows/all-rise/video/QmR1WhNkh1a_IrdHZrbcRklm176X_rVc/all-rise-space/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.paramountplus.com/movies/million-dollar-american-princesses-meghan-and-harry/C0LpgNwXYeB8txxycdWdR9TjxpJOsdCq',
+ 'only_matching': True,
+ }]
+
+ def _extract_video_info(self, content_id, mpx_acc=2198311517):
+ items_data = self._download_json(
+ 'https://www.paramountplus.com/apps-api/v2.0/androidtv/video/cid/%s.json' % content_id,
+ content_id, query={'locale': 'en-us', 'at': 'ABCqWNNSwhIqINWIIAG+DFzcFUvF8/vcN6cNyXFFfNzWAIvXuoVgX+fK4naOC7V8MLI='}, headers=self.geo_verification_headers())
+
+ asset_types = {
+ item.get('assetType'): {
+ 'format': 'SMIL',
+ 'formats': 'MPEG4,M3U',
+ } for item in items_data['itemList']
+ }
+ item = items_data['itemList'][-1]
+ return self._extract_common_video_info(content_id, asset_types, mpx_acc, extra_info={
+ 'title': item.get('title'),
+ 'series': item.get('seriesTitle'),
+ 'season_number': int_or_none(item.get('seasonNum')),
+ 'episode_number': int_or_none(item.get('episodeNum')),
+ 'duration': int_or_none(item.get('duration')),
+ 'thumbnail': url_or_none(item.get('thumbnail')),
+ })
+
+
+class ParamountPlusSeriesIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?paramountplus\.com/shows/(?P<id>[a-zA-Z0-9-_]+)/?(?:[#?]|$)'
+ _TESTS = [{
+ 'url': 'https://www.paramountplus.com/shows/drake-josh',
+ 'playlist_mincount': 50,
+ 'info_dict': {
+ 'id': 'drake-josh',
+ }
+ }, {
+ 'url': 'https://www.paramountplus.com/shows/hawaii_five_0/',
+ 'playlist_mincount': 240,
+ 'info_dict': {
+ 'id': 'hawaii_five_0',
+ }
+ }, {
+ 'url': 'https://www.paramountplus.com/shows/spongebob-squarepants/',
+ 'playlist_mincount': 248,
+ 'info_dict': {
+ 'id': 'spongebob-squarepants',
+ }
+ }]
+ _API_URL = 'https://www.paramountplus.com/shows/{}/xhr/episodes/page/0/size/100000/xs/0/season/0/'
+
+ def _entries(self, show_name):
+ show_json = self._download_json(self._API_URL.format(show_name), video_id=show_name)
+ if show_json.get('success'):
+ for episode in show_json['result']['data']:
+ yield self.url_result(
+ 'https://www.paramountplus.com%s' % episode['url'],
+ ie=ParamountPlusIE.ie_key(), video_id=episode['content_id'])
+
+ def _real_extract(self, url):
+ show_name = self._match_id(url)
+ return self.playlist_result(self._entries(show_name), playlist_id=show_name)
diff --git a/hypervideo_dl/extractor/parliamentliveuk.py b/hypervideo_dl/extractor/parliamentliveuk.py
index bdd5ff5..869ebd8 100644
--- a/hypervideo_dl/extractor/parliamentliveuk.py
+++ b/hypervideo_dl/extractor/parliamentliveuk.py
@@ -1,6 +1,14 @@
+# coding: utf-8
from __future__ import unicode_literals
+import json
+import uuid
+
from .common import InfoExtractor
+from ..utils import (
+ unified_timestamp,
+ try_get,
+)
class ParliamentLiveUKIE(InfoExtractor):
@@ -11,12 +19,14 @@ class ParliamentLiveUKIE(InfoExtractor):
_TESTS = [{
'url': 'http://parliamentlive.tv/Event/Index/c1e9d44d-fd6c-4263-b50f-97ed26cc998b',
'info_dict': {
- 'id': '1_af9nv9ym',
+ 'id': 'c1e9d44d-fd6c-4263-b50f-97ed26cc998b',
'ext': 'mp4',
'title': 'Home Affairs Committee',
- 'uploader_id': 'FFMPEG-01',
- 'timestamp': 1422696664,
- 'upload_date': '20150131',
+ 'timestamp': 1395153872,
+ 'upload_date': '20140318',
+ },
+ 'params': {
+ 'format': 'bestvideo',
},
}, {
'url': 'http://parliamentlive.tv/event/index/3f24936f-130f-40bf-9a5d-b3d6479da6a4',
@@ -25,19 +35,49 @@ class ParliamentLiveUKIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(
- 'http://vodplayer.parliamentlive.tv/?mid=' + video_id, video_id)
- widget_config = self._parse_json(self._search_regex(
- r'(?s)kWidgetConfig\s*=\s*({.+});',
- webpage, 'kaltura widget config'), video_id)
- kaltura_url = 'kaltura:%s:%s' % (
- widget_config['wid'][1:], widget_config['entry_id'])
- event_title = self._download_json(
- 'http://parliamentlive.tv/Event/GetShareVideo/' + video_id, video_id)['event']['title']
+ video_info = self._download_json(f'https://www.parliamentlive.tv/Event/GetShareVideo/{video_id}', video_id)
+ _DEVICE_ID = str(uuid.uuid4())
+ auth = 'Bearer ' + self._download_json(
+ 'https://exposure.api.redbee.live/v2/customer/UKParliament/businessunit/ParliamentLive/auth/anonymous',
+ video_id, headers={
+ 'Origin': 'https://videoplayback.parliamentlive.tv',
+ 'Accept': 'application/json, text/plain, */*',
+ 'Content-Type': 'application/json;charset=utf-8'
+ }, data=json.dumps({
+ 'deviceId': _DEVICE_ID,
+ 'device': {
+ 'deviceId': _DEVICE_ID,
+ 'width': 653,
+ 'height': 368,
+ 'type': 'WEB',
+ 'name': ' Mozilla Firefox 91'
+ }
+ }).encode('utf-8'))['sessionToken']
+
+ video_urls = self._download_json(
+ f'https://exposure.api.redbee.live/v2/customer/UKParliament/businessunit/ParliamentLive/entitlement/{video_id}/play',
+ video_id, headers={'Authorization': auth, 'Accept': 'application/json, text/plain, */*'})['formats']
+
+ formats = []
+ for format in video_urls:
+ if not format.get('mediaLocator'):
+ continue
+ if format.get('format') == 'DASH':
+ formats.extend(self._extract_mpd_formats(
+ format['mediaLocator'], video_id, mpd_id='dash', fatal=False))
+ elif format.get('format') == 'SMOOTHSTREAMING':
+ formats.extend(self._extract_ism_formats(
+ format['mediaLocator'], video_id, ism_id='ism', fatal=False))
+ elif format.get('format') == 'HLS':
+ formats.extend(self._extract_m3u8_formats(
+ format['mediaLocator'], video_id, m3u8_id='hls', fatal=False))
+
+ self._sort_formats(formats)
+
return {
- '_type': 'url_transparent',
- 'title': event_title,
- 'description': '',
- 'url': kaltura_url,
- 'ie_key': 'Kaltura',
+ 'id': video_id,
+ 'formats': formats,
+ 'title': video_info['event']['title'],
+ 'timestamp': unified_timestamp(try_get(video_info, lambda x: x['event']['publishedStartTime'])),
+ 'thumbnail': video_info.get('thumbnailUrl'),
}
diff --git a/hypervideo_dl/extractor/parlview.py b/hypervideo_dl/extractor/parlview.py
new file mode 100644
index 0000000..c85eaa7
--- /dev/null
+++ b/hypervideo_dl/extractor/parlview.py
@@ -0,0 +1,68 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ try_get,
+ unified_timestamp,
+)
+
+
+class ParlviewIE(InfoExtractor):
+
+ _VALID_URL = r'https?://(?:www\.)?parlview\.aph\.gov\.au/(?:[^/]+)?\bvideoID=(?P<id>\d{6})'
+ _TESTS = [{
+ 'url': 'https://parlview.aph.gov.au/mediaPlayer.php?videoID=542661',
+ 'info_dict': {
+ 'id': '542661',
+ 'ext': 'mp4',
+ 'title': "Australia's Family Law System [Part 2]",
+ 'duration': 5799,
+ 'description': 'md5:7099883b391619dbae435891ca871a62',
+ 'timestamp': 1621430700,
+ 'upload_date': '20210519',
+ 'uploader': 'Joint Committee',
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'https://parlview.aph.gov.au/mediaPlayer.php?videoID=539936',
+ 'only_matching': True,
+ }]
+ _API_URL = 'https://parlview.aph.gov.au/api_v3/1/playback/getUniversalPlayerConfig?videoID=%s&format=json'
+ _MEDIA_INFO_URL = 'https://parlview.aph.gov.au/ajaxPlayer.php?videoID=%s&tabNum=4&action=loadTab'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ media = self._download_json(self._API_URL % video_id, video_id).get('media')
+ timestamp = try_get(media, lambda x: x['timeMap']['source']['timecode_offsets'][0], compat_str) or '/'
+
+ stream = try_get(media, lambda x: x['renditions'][0], dict)
+ if not stream:
+ self.raise_no_formats('No streams were detected')
+ elif stream.get('streamType') != 'VOD':
+ self.raise_no_formats('Unknown type of stream was detected: "%s"' % str(stream.get('streamType')))
+ formats = self._extract_m3u8_formats(stream['url'], video_id, 'mp4', 'm3u8_native')
+ self._sort_formats(formats)
+
+ media_info = self._download_webpage(
+ self._MEDIA_INFO_URL % video_id, video_id, note='Downloading media info', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'url': url,
+ 'title': self._html_search_regex(r'<h2>([^<]+)<', webpage, 'title', fatal=False),
+ 'formats': formats,
+ 'duration': int_or_none(media.get('duration')),
+ 'timestamp': unified_timestamp(timestamp.split('/', 1)[1].replace('_', ' ')),
+ 'description': self._html_search_regex(
+ r'<div[^>]+class="descripti?on"[^>]*>[^>]+<strong>[^>]+>[^>]+>([^<]+)',
+ webpage, 'description', fatal=False),
+ 'uploader': self._html_search_regex(
+ r'<td>[^>]+>Channel:[^>]+>([^<]+)', media_info, 'channel', fatal=False),
+ 'thumbnail': media.get('staticImage'),
+ }
diff --git a/hypervideo_dl/extractor/patreon.py b/hypervideo_dl/extractor/patreon.py
index 761a4b1..a189c02 100644
--- a/hypervideo_dl/extractor/patreon.py
+++ b/hypervideo_dl/extractor/patreon.py
@@ -1,7 +1,12 @@
# coding: utf-8
from __future__ import unicode_literals
+import itertools
+
from .common import InfoExtractor
+from .vimeo import VimeoIE
+
+from ..compat import compat_urllib_parse_unquote
from ..utils import (
clean_html,
determine_ext,
@@ -11,6 +16,7 @@ from ..utils import (
parse_iso8601,
str_or_none,
try_get,
+ url_or_none,
)
@@ -63,6 +69,20 @@ class PatreonIE(InfoExtractor):
}, {
'url': 'https://www.patreon.com/posts/743933',
'only_matching': True,
+ }, {
+ 'url': 'https://www.patreon.com/posts/kitchen-as-seen-51706779',
+ 'md5': '96656690071f6d64895866008484251b',
+ 'info_dict': {
+ 'id': '555089736',
+ 'ext': 'mp4',
+ 'title': 'KITCHEN AS SEEN ON DEEZ NUTS EXTENDED!',
+ 'uploader': 'Cold Ones',
+ 'thumbnail': 're:^https?://.*$',
+ 'upload_date': '20210526',
+ 'description': 'md5:557a409bd79d3898689419094934ba79',
+ 'uploader_id': '14936315',
+ },
+ 'skip': 'Patron-only content'
}]
# Currently Patreon exposes download URL via hidden CSS, so login is not
@@ -137,6 +157,19 @@ class PatreonIE(InfoExtractor):
})
if not info.get('url'):
+ # handle Vimeo embeds
+ if try_get(attributes, lambda x: x['embed']['provider']) == 'Vimeo':
+ embed_html = try_get(attributes, lambda x: x['embed']['html'])
+ v_url = url_or_none(compat_urllib_parse_unquote(
+ self._search_regex(r'src=(https%3A%2F%2Fplayer\.vimeo\.com.+)%3F', embed_html, 'vimeo url', fatal=False)))
+ if v_url:
+ info.update({
+ '_type': 'url_transparent',
+ 'url': VimeoIE._smuggle_referrer(v_url, 'https://patreon.com'),
+ 'ie_key': 'Vimeo',
+ })
+
+ if not info.get('url'):
embed_url = try_get(attributes, lambda x: x['embed']['url'])
if embed_url:
info.update({
@@ -154,3 +187,56 @@ class PatreonIE(InfoExtractor):
})
return info
+
+
+class PatreonUserIE(InfoExtractor):
+
+ _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?P<id>[-_\w\d]+)/?(?:posts/?)?'
+
+ _TESTS = [{
+ 'url': 'https://www.patreon.com/dissonancepod/',
+ 'info_dict': {
+ 'title': 'dissonancepod',
+ },
+ 'playlist_mincount': 68,
+ 'expected_warnings': 'Post not viewable by current user! Skipping!',
+ }, {
+ 'url': 'https://www.patreon.com/dissonancepod/posts',
+ 'only_matching': True
+ }, ]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if PatreonIE.suitable(url) else super(PatreonUserIE, cls).suitable(url)
+
+ def _entries(self, campaign_id, user_id):
+ cursor = None
+ params = {
+ 'fields[campaign]': 'show_audio_post_download_links,name,url',
+ 'fields[post]': 'current_user_can_view,embed,image,is_paid,post_file,published_at,patreon_url,url,post_type,thumbnail_url,title',
+ 'filter[campaign_id]': campaign_id,
+ 'filter[is_draft]': 'false',
+ 'sort': '-published_at',
+ 'json-api-version': 1.0,
+ 'json-api-use-default-includes': 'false',
+ }
+
+ for page in itertools.count(1):
+
+ params.update({'page[cursor]': cursor} if cursor else {})
+ posts_json = self._download_json('https://www.patreon.com/api/posts', user_id, note='Downloading posts page %d' % page, query=params, headers={'Cookie': '.'})
+
+ cursor = try_get(posts_json, lambda x: x['meta']['pagination']['cursors']['next'])
+
+ for post in posts_json.get('data') or []:
+ yield self.url_result(url_or_none(try_get(post, lambda x: x['attributes']['patreon_url'])), 'Patreon')
+
+ if cursor is None:
+ break
+
+ def _real_extract(self, url):
+
+ user_id = self._match_id(url)
+ webpage = self._download_webpage(url, user_id, headers={'Cookie': '.'})
+ campaign_id = self._search_regex(r'https://www.patreon.com/api/campaigns/(\d+)/?', webpage, 'Campaign ID')
+ return self.playlist_result(self._entries(campaign_id, user_id), playlist_title=user_id)
diff --git a/hypervideo_dl/extractor/pbs.py b/hypervideo_dl/extractor/pbs.py
index d4baa16..0eabf9b 100644
--- a/hypervideo_dl/extractor/pbs.py
+++ b/hypervideo_dl/extractor/pbs.py
@@ -436,7 +436,7 @@ class PBSIE(InfoExtractor):
self._set_cookie('.pbs.org', 'pbsol.station', station)
def _extract_webpage(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
description = None
@@ -600,6 +600,7 @@ class PBSIE(InfoExtractor):
formats = []
http_url = None
+ hls_subs = {}
for num, redirect in enumerate(redirects):
redirect_id = redirect.get('eeid')
@@ -622,8 +623,9 @@ class PBSIE(InfoExtractor):
continue
if determine_ext(format_url) == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- format_url, display_id, 'mp4', m3u8_id='hls', fatal=False))
+ hls_formats, hls_subs = self._extract_m3u8_formats_and_subtitles(
+ format_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
+ formats.extend(hls_formats)
else:
formats.append({
'url': format_url,
@@ -666,25 +668,12 @@ class PBSIE(InfoExtractor):
age_limit = US_RATINGS.get(rating_str)
subtitles = {}
- closed_captions_url = info.get('closed_captions_url')
- if closed_captions_url:
- subtitles['en'] = [{
- 'ext': 'ttml',
- 'url': closed_captions_url,
- }]
- mobj = re.search(r'/(\d+)_Encoded\.dfxp', closed_captions_url)
- if mobj:
- ttml_caption_suffix, ttml_caption_id = mobj.group(0, 1)
- ttml_caption_id = int(ttml_caption_id)
- subtitles['en'].extend([{
- 'url': closed_captions_url.replace(
- ttml_caption_suffix, '/%d_Encoded.srt' % (ttml_caption_id + 1)),
- 'ext': 'srt',
- }, {
- 'url': closed_captions_url.replace(
- ttml_caption_suffix, '/%d_Encoded.vtt' % (ttml_caption_id + 2)),
- 'ext': 'vtt',
- }])
+ captions = info.get('cc') or {}
+ for caption_url in captions.values():
+ subtitles.setdefault('en', []).append({
+ 'url': caption_url
+ })
+ subtitles = self._merge_subtitles(subtitles, hls_subs)
# info['title'] is often incomplete (e.g. 'Full Episode', 'Episode 5', etc)
# Try turning it to 'program - title' naming scheme if possible
diff --git a/hypervideo_dl/extractor/peertube.py b/hypervideo_dl/extractor/peertube.py
index d9b13ad..1e22f24 100644
--- a/hypervideo_dl/extractor/peertube.py
+++ b/hypervideo_dl/extractor/peertube.py
@@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
+import functools
import re
from .common import InfoExtractor
@@ -13,12 +14,644 @@ from ..utils import (
unified_timestamp,
url_or_none,
urljoin,
+ OnDemandPagedList,
)
class PeerTubeIE(InfoExtractor):
_INSTANCES_RE = r'''(?:
# Taken from https://instances.joinpeertube.org/instances
+ 40two\.tube|
+ a\.metube\.ch|
+ advtv\.ml|
+ algorithmic\.tv|
+ alimulama\.com|
+ arcana\.fun|
+ archive\.vidicon\.org|
+ artefac-paris\.tv|
+ auf1\.eu|
+ battlepenguin\.video|
+ beertube\.epgn\.ch|
+ befree\.nohost\.me|
+ bideoak\.argia\.eus|
+ birkeundnymphe\.de|
+ bitcointv\.com|
+ cattube\.org|
+ clap\.nerv-project\.eu|
+ climatejustice\.video|
+ comf\.tube|
+ conspiracydistillery\.com|
+ darkvapor\.nohost\.me|
+ daschauher\.aksel\.rocks|
+ digitalcourage\.video|
+ dreiecksnebel\.alex-detsch\.de|
+ eduvid\.org|
+ evangelisch\.video|
+ exo\.tube|
+ fair\.tube|
+ fediverse\.tv|
+ film\.k-prod\.fr|
+ flim\.txmn\.tk|
+ fotogramas\.politicaconciencia\.org|
+ ftsi\.ru|
+ gary\.vger\.cloud|
+ graeber\.video|
+ greatview\.video|
+ grypstube\.uni-greifswald\.de|
+ highvoltage\.tv|
+ hpstube\.fr|
+ htp\.live|
+ hyperreal\.tube|
+ juggling\.digital|
+ kino\.kompot\.si|
+ kino\.schuerz\.at|
+ kinowolnosc\.pl|
+ kirche\.peertube-host\.de|
+ kodcast\.com|
+ kolektiva\.media|
+ kraut\.zone|
+ kumi\.tube|
+ lastbreach\.tv|
+ lepetitmayennais\.fr\.nf|
+ lexx\.impa\.me|
+ libertynode\.tv|
+ libra\.syntazia\.org|
+ libremedia\.video|
+ live\.libratoi\.org|
+ live\.nanao\.moe|
+ live\.toobnix\.org|
+ livegram\.net|
+ lolitube\.freedomchan\.moe|
+ lucarne\.balsamine\.be|
+ maindreieck-tv\.de|
+ mani\.tube|
+ manicphase\.me|
+ media\.gzevd\.de|
+ media\.inno3\.cricket|
+ media\.kaitaia\.life|
+ media\.krashboyz\.org|
+ media\.over-world\.org|
+ media\.skewed\.de|
+ media\.undeadnetwork\.de|
+ medias\.pingbase\.net|
+ melsungen\.peertube-host\.de|
+ mirametube\.fr|
+ mojotube\.net|
+ monplaisirtube\.ddns\.net|
+ mountaintown\.video|
+ my\.bunny\.cafe|
+ myfreetube\.de|
+ mytube\.kn-cloud\.de|
+ mytube\.madzel\.de|
+ myworkoutarenapeertube\.cf|
+ nanawel-peertube\.dyndns\.org|
+ nastub\.cz|
+ offenes\.tv|
+ orgdup\.media|
+ ovaltube\.codinglab\.ch|
+ p2ptv\.ru|
+ p\.eertu\.be|
+ p\.lu|
+ peer\.azurs\.fr|
+ peertube1\.zeteo\.me|
+ peertube\.020\.pl|
+ peertube\.0x5e\.eu|
+ peertube\.alpharius\.io|
+ peertube\.am-networks\.fr|
+ peertube\.anduin\.net|
+ peertube\.anzui\.dev|
+ peertube\.arbleizez\.bzh|
+ peertube\.art3mis\.de|
+ peertube\.atilla\.org|
+ peertube\.atsuchan\.page|
+ peertube\.aukfood\.net|
+ peertube\.aventer\.biz|
+ peertube\.b38\.rural-it\.org|
+ peertube\.beeldengeluid\.nl|
+ peertube\.be|
+ peertube\.bgzashtita\.es|
+ peertube\.bitsandlinux\.com|
+ peertube\.biz|
+ peertube\.boba\.best|
+ peertube\.br0\.fr|
+ peertube\.bridaahost\.ynh\.fr|
+ peertube\.bubbletea\.dev|
+ peertube\.bubuit\.net|
+ peertube\.cabaal\.net|
+ peertube\.cats-home\.net|
+ peertube\.chemnitz\.freifunk\.net|
+ peertube\.chevro\.fr|
+ peertube\.chrisspiegl\.com|
+ peertube\.chtisurel\.net|
+ peertube\.cipherbliss\.com|
+ peertube\.cloud\.sans\.pub|
+ peertube\.cpge-brizeux\.fr|
+ peertube\.ctseuro\.com|
+ peertube\.cuatrolibertades\.org|
+ peertube\.cybercirujas\.club|
+ peertube\.cythin\.com|
+ peertube\.davigge\.com|
+ peertube\.dc\.pini\.fr|
+ peertube\.debian\.social|
+ peertube\.demonix\.fr|
+ peertube\.designersethiques\.org|
+ peertube\.desmu\.fr|
+ peertube\.devloprog\.org|
+ peertube\.devol\.it|
+ peertube\.dtmf\.ca|
+ peertube\.ecologie\.bzh|
+ peertube\.eu\.org|
+ peertube\.european-pirates\.eu|
+ peertube\.euskarabildua\.eus|
+ peertube\.fenarinarsa\.com|
+ peertube\.fomin\.site|
+ peertube\.forsud\.be|
+ peertube\.francoispelletier\.org|
+ peertube\.freenet\.ru|
+ peertube\.freetalklive\.com|
+ peertube\.functional\.cafe|
+ peertube\.gardeludwig\.fr|
+ peertube\.gargantia\.fr|
+ peertube\.gcfamily\.fr|
+ peertube\.genma\.fr|
+ peertube\.get-racing\.de|
+ peertube\.gidikroon\.eu|
+ peertube\.gruezishop\.ch|
+ peertube\.habets\.house|
+ peertube\.hackerfraternity\.org|
+ peertube\.ichigo\.everydayimshuflin\.com|
+ peertube\.ignifi\.me|
+ peertube\.inapurna\.org|
+ peertube\.informaction\.info|
+ peertube\.interhop\.org|
+ peertube\.iselfhost\.com|
+ peertube\.it|
+ peertube\.jensdiemer\.de|
+ peertube\.joffreyverd\.fr|
+ peertube\.kalua\.im|
+ peertube\.kathryl\.fr|
+ peertube\.keazilla\.net|
+ peertube\.klaewyss\.fr|
+ peertube\.kodcast\.com|
+ peertube\.kx\.studio|
+ peertube\.lagvoid\.com|
+ peertube\.lavallee\.tech|
+ peertube\.le5emeaxe\.fr|
+ peertube\.lestutosdeprocessus\.fr|
+ peertube\.librenet\.co\.za|
+ peertube\.logilab\.fr|
+ peertube\.louisematic\.site|
+ peertube\.luckow\.org|
+ peertube\.luga\.at|
+ peertube\.lyceeconnecte\.fr|
+ peertube\.manalejandro\.com|
+ peertube\.marud\.fr|
+ peertube\.mattone\.net|
+ peertube\.maxweiss\.io|
+ peertube\.monlycee\.net|
+ peertube\.mxinfo\.fr|
+ peertube\.myrasp\.eu|
+ peertube\.nebelcloud\.de|
+ peertube\.netzbegruenung\.de|
+ peertube\.newsocial\.tech|
+ peertube\.nicolastissot\.fr|
+ peertube\.nz|
+ peertube\.offerman\.com|
+ peertube\.opencloud\.lu|
+ peertube\.orthus\.link|
+ peertube\.patapouf\.xyz|
+ peertube\.pi2\.dev|
+ peertube\.plataformess\.org|
+ peertube\.pl|
+ peertube\.portaesgnos\.org|
+ peertube\.r2\.enst\.fr|
+ peertube\.r5c3\.fr|
+ peertube\.radres\.xyz|
+ peertube\.red|
+ peertube\.robonomics\.network|
+ peertube\.rtnkv\.cloud|
+ peertube\.runfox\.tk|
+ peertube\.satoshishop\.de|
+ peertube\.scic-tetris\.org|
+ peertube\.securitymadein\.lu|
+ peertube\.semweb\.pro|
+ peertube\.social\.my-wan\.de|
+ peertube\.soykaf\.org|
+ peertube\.stefofficiel\.me|
+ peertube\.stream|
+ peertube\.su|
+ peertube\.swrs\.net|
+ peertube\.takeko\.cyou|
+ peertube\.tangentfox\.com|
+ peertube\.taxinachtegel\.de|
+ peertube\.thenewoil\.xyz|
+ peertube\.ti-fr\.com|
+ peertube\.tiennot\.net|
+ peertube\.troback\.com|
+ peertube\.tspu\.edu\.ru|
+ peertube\.tux\.ovh|
+ peertube\.tv|
+ peertube\.tweb\.tv|
+ peertube\.ucy\.de|
+ peertube\.underworld\.fr|
+ peertube\.us\.to|
+ peertube\.ventresmous\.fr|
+ peertube\.vlaki\.cz|
+ peertube\.w\.utnw\.de|
+ peertube\.westring\.digital|
+ peertube\.xwiki\.com|
+ peertube\.zoz-serv\.org|
+ peervideo\.ru|
+ periscope\.numenaute\.org|
+ perron-tube\.de|
+ petitlutinartube\.fr|
+ phijkchu\.com|
+ pierre\.tube|
+ piraten\.space|
+ play\.rosano\.ca|
+ player\.ojamajo\.moe|
+ plextube\.nl|
+ pocketnetpeertube1\.nohost\.me|
+ pocketnetpeertube3\.nohost\.me|
+ pocketnetpeertube4\.nohost\.me|
+ pocketnetpeertube5\.nohost\.me|
+ pocketnetpeertube6\.nohost\.me|
+ pt\.24-7\.ro|
+ pt\.apathy\.top|
+ pt\.diaspodon\.fr|
+ pt\.fedi\.tech|
+ pt\.maciej\.website|
+ ptb\.lunarviews\.net|
+ ptmir1\.inter21\.net|
+ ptmir2\.inter21\.net|
+ ptmir3\.inter21\.net|
+ ptmir4\.inter21\.net|
+ ptmir5\.inter21\.net|
+ ptube\.horsentiers\.fr|
+ ptube\.xmanifesto\.club|
+ queermotion\.org|
+ re-wizja\.re-medium\.com|
+ regarder\.sans\.pub|
+ ruraletv\.ovh|
+ s1\.gegenstimme\.tv|
+ s2\.veezee\.tube|
+ sdmtube\.fr|
+ sender-fm\.veezee\.tube|
+ serv1\.wiki-tube\.de|
+ serv3\.wiki-tube\.de|
+ sickstream\.net|
+ sleepy\.tube|
+ sovran\.video|
+ spectra\.video|
+ stream\.elven\.pw|
+ stream\.k-prod\.fr|
+ stream\.shahab\.nohost\.me|
+ streamsource\.video|
+ studios\.racer159\.com|
+ testtube\.florimond\.eu|
+ tgi\.hosted\.spacebear\.ee|
+ thaitube\.in\.th|
+ the\.jokertv\.eu|
+ theater\.ethernia\.net|
+ thecool\.tube|
+ tilvids\.com|
+ toob\.bub\.org|
+ tpaw\.video|
+ truetube\.media|
+ tuba\.lhub\.pl|
+ tube-aix-marseille\.beta\.education\.fr|
+ tube-amiens\.beta\.education\.fr|
+ tube-besancon\.beta\.education\.fr|
+ tube-bordeaux\.beta\.education\.fr|
+ tube-clermont-ferrand\.beta\.education\.fr|
+ tube-corse\.beta\.education\.fr|
+ tube-creteil\.beta\.education\.fr|
+ tube-dijon\.beta\.education\.fr|
+ tube-education\.beta\.education\.fr|
+ tube-grenoble\.beta\.education\.fr|
+ tube-lille\.beta\.education\.fr|
+ tube-limoges\.beta\.education\.fr|
+ tube-montpellier\.beta\.education\.fr|
+ tube-nancy\.beta\.education\.fr|
+ tube-nantes\.beta\.education\.fr|
+ tube-nice\.beta\.education\.fr|
+ tube-normandie\.beta\.education\.fr|
+ tube-orleans-tours\.beta\.education\.fr|
+ tube-outremer\.beta\.education\.fr|
+ tube-paris\.beta\.education\.fr|
+ tube-poitiers\.beta\.education\.fr|
+ tube-reims\.beta\.education\.fr|
+ tube-rennes\.beta\.education\.fr|
+ tube-strasbourg\.beta\.education\.fr|
+ tube-toulouse\.beta\.education\.fr|
+ tube-versailles\.beta\.education\.fr|
+ tube1\.it\.tuwien\.ac\.at|
+ tube\.abolivier\.bzh|
+ tube\.ac-amiens\.fr|
+ tube\.aerztefueraufklaerung\.de|
+ tube\.alexx\.ml|
+ tube\.amic37\.fr|
+ tube\.anufrij\.de|
+ tube\.apolut\.net|
+ tube\.arkhalabs\.io|
+ tube\.arthack\.nz|
+ tube\.as211696\.net|
+ tube\.avensio\.de|
+ tube\.azbyka\.ru|
+ tube\.azkware\.net|
+ tube\.bachaner\.fr|
+ tube\.bmesh\.org|
+ tube\.borked\.host|
+ tube\.bstly\.de|
+ tube\.chaoszone\.tv|
+ tube\.chatelet\.ovh|
+ tube\.cloud-libre\.eu|
+ tube\.cms\.garden|
+ tube\.cowfee\.moe|
+ tube\.cryptography\.dog|
+ tube\.darknight-coffee\.org|
+ tube\.dev\.lhub\.pl|
+ tube\.distrilab\.fr|
+ tube\.dsocialize\.net|
+ tube\.ebin\.club|
+ tube\.fdn\.fr|
+ tube\.florimond\.eu|
+ tube\.foxarmy\.ml|
+ tube\.foxden\.party|
+ tube\.frischesicht\.de|
+ tube\.futuretic\.fr|
+ tube\.gnous\.eu|
+ tube\.grap\.coop|
+ tube\.graz\.social|
+ tube\.grin\.hu|
+ tube\.hackerscop\.org|
+ tube\.hordearii\.fr|
+ tube\.jeena\.net|
+ tube\.kai-stuht\.com|
+ tube\.kockatoo\.org|
+ tube\.kotur\.org|
+ tube\.lacaveatonton\.ovh|
+ tube\.linkse\.media|
+ tube\.lokad\.com|
+ tube\.lucie-philou\.com|
+ tube\.melonbread\.xyz|
+ tube\.mfraters\.net|
+ tube\.motuhake\.xyz|
+ tube\.mrbesen\.de|
+ tube\.nah\.re|
+ tube\.nchoco\.net|
+ tube\.novg\.net|
+ tube\.nox-rhea\.org|
+ tube\.nuagelibre\.fr|
+ tube\.nx12\.net|
+ tube\.octaplex\.net|
+ tube\.odat\.xyz|
+ tube\.oisux\.org|
+ tube\.opportunis\.me|
+ tube\.org\.il|
+ tube\.ortion\.xyz|
+ tube\.others\.social|
+ tube\.picasoft\.net|
+ tube\.plomlompom\.com|
+ tube\.pmj\.rocks|
+ tube\.portes-imaginaire\.org|
+ tube\.pyngu\.com|
+ tube\.rebellion\.global|
+ tube\.rhythms-of-resistance\.org|
+ tube\.rita\.moe|
+ tube\.rsi\.cnr\.it|
+ tube\.s1gm4\.eu|
+ tube\.saumon\.io|
+ tube\.schleuss\.online|
+ tube\.schule\.social|
+ tube\.seditio\.fr|
+ tube\.shanti\.cafe|
+ tube\.shela\.nu|
+ tube\.skrep\.in|
+ tube\.sp-codes\.de|
+ tube\.sp4ke\.com|
+ tube\.superseriousbusiness\.org|
+ tube\.systest\.eu|
+ tube\.tappret\.fr|
+ tube\.tardis\.world|
+ tube\.toontoet\.nl|
+ tube\.tpshd\.de|
+ tube\.troopers\.agency|
+ tube\.tylerdavis\.xyz|
+ tube\.undernet\.uy|
+ tube\.vigilian-consulting\.nl|
+ tube\.vraphim\.com|
+ tube\.wehost\.lgbt|
+ tube\.wien\.rocks|
+ tube\.wolfe\.casa|
+ tube\.xd0\.de|
+ tube\.xy-space\.de|
+ tube\.yapbreak\.fr|
+ tubedu\.org|
+ tubes\.jodh\.us|
+ tuktube\.com|
+ turkum\.me|
+ tututu\.tube|
+ tuvideo\.encanarias\.info|
+ tv1\.cocu\.cc|
+ tv1\.gomntu\.space|
+ tv2\.cocu\.cc|
+ tv\.adn\.life|
+ tv\.atmx\.ca|
+ tv\.bitma\.st|
+ tv\.generallyrubbish\.net\.au|
+ tv\.lumbung\.space|
+ tv\.mattchristiansenmedia\.com|
+ tv\.netwhood\.online|
+ tv\.neue\.city|
+ tv\.piejacker\.net|
+ tv\.pirateradio\.social|
+ tv\.undersco\.re|
+ tvox\.ru|
+ twctube\.twc-zone\.eu|
+ unfilter\.tube|
+ v\.basspistol\.org|
+ v\.kisombrella\.top|
+ v\.lastorder\.xyz|
+ v\.lor\.sh|
+ v\.phreedom\.club|
+ v\.sil\.sh|
+ v\.szy\.io|
+ v\.xxxapex\.com|
+ veezee\.tube|
+ vid\.dascoyote\.xyz|
+ vid\.garwood\.io|
+ vid\.ncrypt\.at|
+ vid\.pravdastalina\.info|
+ vid\.qorg11\.net|
+ vid\.rajeshtaylor\.com|
+ vid\.samtripoli\.com|
+ vid\.werefox\.dev|
+ vid\.wildeboer\.net|
+ video-cave-v2\.de|
+ video\.076\.ne\.jp|
+ video\.1146\.nohost\.me|
+ video\.altertek\.org|
+ video\.anartist\.org|
+ video\.apps\.thedoodleproject\.net|
+ video\.artist\.cx|
+ video\.asgardius\.company|
+ video\.balsillie\.net|
+ video\.bards\.online|
+ video\.binarydad\.com|
+ video\.blast-info\.fr|
+ video\.catgirl\.biz|
+ video\.cigliola\.com|
+ video\.cm-en-transition\.fr|
+ video\.cnt\.social|
+ video\.coales\.co|
+ video\.codingfield\.com|
+ video\.comptoir\.net|
+ video\.comune\.trento\.it|
+ video\.cpn\.so|
+ video\.csc49\.fr|
+ video\.cybre\.town|
+ video\.demokratischer-sommer\.de|
+ video\.discord-insoumis\.fr|
+ video\.dolphincastle\.com|
+ video\.dresden\.network|
+ video\.ecole-89\.com|
+ video\.elgrillolibertario\.org|
+ video\.emergeheart\.info|
+ video\.eradicatinglove\.xyz|
+ video\.ethantheenigma\.me|
+ video\.exodus-privacy\.eu\.org|
+ video\.fbxl\.net|
+ video\.fhtagn\.org|
+ video\.greenmycity\.eu|
+ video\.guerredeclasse\.fr|
+ video\.gyt\.is|
+ video\.hackers\.town|
+ video\.hardlimit\.com|
+ video\.hooli\.co|
+ video\.igem\.org|
+ video\.internet-czas-dzialac\.pl|
+ video\.islameye\.com|
+ video\.kicik\.fr|
+ video\.kuba-orlik\.name|
+ video\.kyushojitsu\.ca|
+ video\.lavolte\.net|
+ video\.lespoesiesdheloise\.fr|
+ video\.liberta\.vip|
+ video\.liege\.bike|
+ video\.linc\.systems|
+ video\.linux\.it|
+ video\.linuxtrent\.it|
+ video\.lokal\.social|
+ video\.lono\.space|
+ video\.lunasqu\.ee|
+ video\.lundi\.am|
+ video\.marcorennmaus\.de|
+ video\.mass-trespass\.uk|
+ video\.mugoreve\.fr|
+ video\.mundodesconocido\.com|
+ video\.mycrowd\.ca|
+ video\.nogafam\.es|
+ video\.odayacres\.farm|
+ video\.ozgurkon\.org|
+ video\.p1ng0ut\.social|
+ video\.p3x\.de|
+ video\.pcf\.fr|
+ video\.pony\.gallery|
+ video\.potate\.space|
+ video\.pourpenser\.pro|
+ video\.progressiv\.dev|
+ video\.resolutions\.it|
+ video\.rw501\.de|
+ video\.screamer\.wiki|
+ video\.sdm-tools\.net|
+ video\.sftblw\.moe|
+ video\.shitposter\.club|
+ video\.skyn3t\.in|
+ video\.soi\.ch|
+ video\.stuartbrand\.co\.uk|
+ video\.thinkof\.name|
+ video\.toot\.pt|
+ video\.triplea\.fr|
+ video\.turbo\.chat|
+ video\.vaku\.org\.ua|
+ video\.veloma\.org|
+ video\.violoncello\.ch|
+ video\.wilkie\.how|
+ video\.wsf2021\.info|
+ videorelay\.co|
+ videos-passages\.huma-num\.fr|
+ videos\.3d-wolf\.com|
+ videos\.ahp-numerique\.fr|
+ videos\.alexandrebadalo\.pt|
+ videos\.archigny\.net|
+ videos\.benjaminbrady\.ie|
+ videos\.buceoluegoexisto\.com|
+ videos\.capas\.se|
+ videos\.casually\.cat|
+ videos\.cloudron\.io|
+ videos\.coletivos\.org|
+ videos\.danksquad\.org|
+ videos\.denshi\.live|
+ videos\.fromouter\.space|
+ videos\.fsci\.in|
+ videos\.globenet\.org|
+ videos\.hauspie\.fr|
+ videos\.hush\.is|
+ videos\.john-livingston\.fr|
+ videos\.jordanwarne\.xyz|
+ videos\.lavoixdessansvoix\.org|
+ videos\.leslionsfloorball\.fr|
+ videos\.lucero\.top|
+ videos\.martyn\.berlin|
+ videos\.mastodont\.cat|
+ videos\.monstro1\.com|
+ videos\.npo\.city|
+ videos\.optoutpod\.com|
+ videos\.petch\.rocks|
+ videos\.pzelawski\.xyz|
+ videos\.rampin\.org|
+ videos\.scanlines\.xyz|
+ videos\.shmalls\.pw|
+ videos\.sibear\.fr|
+ videos\.stadtfabrikanten\.org|
+ videos\.tankernn\.eu|
+ videos\.testimonia\.org|
+ videos\.thisishowidontdisappear\.com|
+ videos\.traumaheilung\.net|
+ videos\.trom\.tf|
+ videos\.wakkerewereld\.nu|
+ videos\.weblib\.re|
+ videos\.yesil\.club|
+ vids\.roshless\.me|
+ vids\.tekdmn\.me|
+ vidz\.dou\.bet|
+ vod\.lumikko\.dev|
+ vs\.uniter\.network|
+ vulgarisation-informatique\.fr|
+ watch\.breadtube\.tv|
+ watch\.deranalyst\.ch|
+ watch\.ignorance\.eu|
+ watch\.krazy\.party|
+ watch\.libertaria\.space|
+ watch\.rt4mn\.org|
+ watch\.softinio\.com|
+ watch\.tubelab\.video|
+ web-fellow\.de|
+ webtv\.vandoeuvre\.net|
+ wechill\.space|
+ wikileaks\.video|
+ wiwi\.video|
+ worldofvids\.com|
+ wwtube\.net|
+ www4\.mir\.inter21\.net|
+ www\.birkeundnymphe\.de|
+ www\.captain-german\.com|
+ www\.wiki-tube\.de|
+ xxivproduction\.video|
+ xxx\.noho\.st|
+
+ # from youtube-dl
peertube\.rainbowswingers\.net|
tube\.stanisic\.nl|
peer\.suiri\.us|
@@ -410,24 +1043,24 @@ class PeerTubeIE(InfoExtractor):
video\.colibris-outilslibres\.org|
tube\.svnet\.fr|
peertube\.video|
- peertube3\.cpy\.re|
peertube2\.cpy\.re|
+ peertube3\.cpy\.re|
videos\.tcit\.fr|
peertube\.cpy\.re|
canard\.tube
)'''
- _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}'
+ _UUID_RE = r'[\da-zA-Z]{22}|[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}'
_API_BASE = 'https://%s/api/v1/videos/%s/%s'
_VALID_URL = r'''(?x)
(?:
peertube:(?P<host>[^:]+):|
- https?://(?P<host_2>%s)/(?:videos/(?:watch|embed)|api/v\d/videos)/
+ https?://(?P<host_2>%s)/(?:videos/(?:watch|embed)|api/v\d/videos|w)/
)
(?P<id>%s)
''' % (_INSTANCES_RE, _UUID_RE)
_TESTS = [{
'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d',
- 'md5': '9bed8c0137913e17b86334e5885aacff',
+ 'md5': '8563064d245a4be5705bddb22bb00a28',
'info_dict': {
'id': '9c9de5e8-0a1e-484a-b099-e80766180a6d',
'ext': 'mp4',
@@ -439,9 +1072,9 @@ class PeerTubeIE(InfoExtractor):
'uploader': 'Framasoft',
'uploader_id': '3',
'uploader_url': 'https://framatube.org/accounts/framasoft',
- 'channel': 'Les vidéos de Framasoft',
- 'channel_id': '2',
- 'channel_url': 'https://framatube.org/video-channels/bf54d359-cfad-4935-9d45-9d6be93f63e8',
+ 'channel': 'A propos de PeerTube',
+ 'channel_id': '2215',
+ 'channel_url': 'https://framatube.org/video-channels/joinpeertube',
'language': 'en',
'license': 'Attribution - Share Alike',
'duration': 113,
@@ -452,6 +1085,39 @@ class PeerTubeIE(InfoExtractor):
'categories': ['Science & Technology'],
}
}, {
+ 'url': 'https://peertube2.cpy.re/w/122d093a-1ede-43bd-bd34-59d2931ffc5e',
+ 'info_dict': {
+ 'id': '122d093a-1ede-43bd-bd34-59d2931ffc5e',
+ 'ext': 'mp4',
+ 'title': 'E2E tests',
+ 'uploader_id': '37855',
+ 'timestamp': 1589276219,
+ 'upload_date': '20200512',
+ 'uploader': 'chocobozzz',
+ }
+ }, {
+ 'url': 'https://peertube2.cpy.re/w/3fbif9S3WmtTP8gGsC5HBd',
+ 'info_dict': {
+ 'id': '3fbif9S3WmtTP8gGsC5HBd',
+ 'ext': 'mp4',
+ 'title': 'E2E tests',
+ 'uploader_id': '37855',
+ 'timestamp': 1589276219,
+ 'upload_date': '20200512',
+ 'uploader': 'chocobozzz',
+ },
+ }, {
+ 'url': 'https://peertube2.cpy.re/api/v1/videos/3fbif9S3WmtTP8gGsC5HBd',
+ 'info_dict': {
+ 'id': '3fbif9S3WmtTP8gGsC5HBd',
+ 'ext': 'mp4',
+ 'title': 'E2E tests',
+ 'uploader_id': '37855',
+ 'timestamp': 1589276219,
+ 'upload_date': '20200512',
+ 'uploader': 'chocobozzz',
+ },
+ }, {
# Issue #26002
'url': 'peertube:spacepub.space:d8943b2d-8280-497b-85ec-bc282ec2afdc',
'info_dict': {
@@ -464,29 +1130,30 @@ class PeerTubeIE(InfoExtractor):
'uploader': 'Drew DeVault',
}
}, {
- 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44',
+ 'url': 'https://peertube.debian.social/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44',
'only_matching': True,
}, {
# nsfw
- 'url': 'https://tube.22decembre.eu/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39',
+ 'url': 'https://vod.ksite.de/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39',
'only_matching': True,
}, {
- 'url': 'https://tube.22decembre.eu/videos/embed/fed67262-6edb-4d1c-833b-daa9085c71d7',
+ 'url': 'https://vod.ksite.de/videos/embed/fed67262-6edb-4d1c-833b-daa9085c71d7',
'only_matching': True,
}, {
- 'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8',
+ 'url': 'https://peertube.tv/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8',
'only_matching': True,
}, {
- 'url': 'peertube:video.blender.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205',
+ 'url': 'peertube:framatube.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205',
'only_matching': True,
}]
@staticmethod
def _extract_peertube_url(webpage, source_url):
mobj = re.match(
- r'https?://(?P<host>[^/]+)/videos/(?:watch|embed)/(?P<id>%s)'
+ r'https?://(?P<host>[^/]+)/(?:videos/(?:watch|embed)|w)/(?P<id>%s)'
% PeerTubeIE._UUID_RE, source_url)
if mobj and any(p in webpage for p in (
+ 'meta property="og:platform" content="PeerTube"',
'<title>PeerTube<',
'There will be other non JS-based clients to access PeerTube',
'>We are sorry but it seems that PeerTube is not compatible with your web browser.<')):
@@ -529,7 +1196,7 @@ class PeerTubeIE(InfoExtractor):
return subtitles
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
host = mobj.group('host') or mobj.group('host_2')
video_id = mobj.group('id')
@@ -569,15 +1236,15 @@ class PeerTubeIE(InfoExtractor):
formats.append(f)
self._sort_formats(formats)
- full_description = self._call_api(
- host, video_id, 'description', note='Downloading description JSON',
- fatal=False)
+ description = video.get('description')
+ if description and len(description) >= 250:
+ # description is shortened
+ full_description = self._call_api(
+ host, video_id, 'description', note='Downloading description JSON',
+ fatal=False)
- description = None
- if isinstance(full_description, dict):
- description = str_or_none(full_description.get('description'))
- if not description:
- description = video.get('description')
+ if isinstance(full_description, dict):
+ description = str_or_none(full_description.get('description')) or description
subtitles = self.extract_subtitles(host, video_id)
@@ -626,3 +1293,110 @@ class PeerTubeIE(InfoExtractor):
'subtitles': subtitles,
'webpage_url': webpage_url,
}
+
+
+class PeerTubePlaylistIE(InfoExtractor):
+ IE_NAME = 'PeerTube:Playlist'
+ _TYPES = {
+ 'a': 'accounts',
+ 'c': 'video-channels',
+ 'w/p': 'video-playlists',
+ }
+ _VALID_URL = r'''(?x)
+ https?://(?P<host>%s)/(?P<type>(?:%s))/
+ (?P<id>[^/]+)
+ ''' % (PeerTubeIE._INSTANCES_RE, '|'.join(_TYPES.keys()))
+ _TESTS = [{
+ 'url': 'https://peertube.tux.ovh/w/p/3af94cba-95e8-4b74-b37a-807ab6d82526',
+ 'info_dict': {
+ 'id': '3af94cba-95e8-4b74-b37a-807ab6d82526',
+ 'description': 'playlist',
+ 'timestamp': 1611171863,
+ 'title': 'playlist',
+ },
+ 'playlist_mincount': 6,
+ }, {
+ 'url': 'https://peertube.tux.ovh/w/p/wkyqcQBnsvFxtUB2pkYc1e',
+ 'info_dict': {
+ 'id': 'wkyqcQBnsvFxtUB2pkYc1e',
+ 'description': 'Cette liste de vidéos contient uniquement les jeux qui peuvent être terminés en une seule vidéo.',
+ 'title': 'Let\'s Play',
+ 'timestamp': 1604147331,
+ },
+ 'playlist_mincount': 6,
+ }, {
+ 'url': 'https://peertube.debian.social/w/p/hFdJoTuyhNJVa1cDWd1d12',
+ 'info_dict': {
+ 'id': 'hFdJoTuyhNJVa1cDWd1d12',
+ 'description': 'Diversas palestras do Richard Stallman no Brasil.',
+ 'title': 'Richard Stallman no Brasil',
+ 'timestamp': 1599676222,
+ },
+ 'playlist_mincount': 9,
+ }, {
+ 'url': 'https://peertube2.cpy.re/a/chocobozzz/videos',
+ 'info_dict': {
+ 'id': 'chocobozzz',
+ 'timestamp': 1553874564,
+ 'title': 'chocobozzz',
+ },
+ 'playlist_mincount': 2,
+ }, {
+ 'url': 'https://framatube.org/c/bf54d359-cfad-4935-9d45-9d6be93f63e8/videos',
+ 'info_dict': {
+ 'id': 'bf54d359-cfad-4935-9d45-9d6be93f63e8',
+ 'timestamp': 1519917377,
+ 'title': 'Les vidéos de Framasoft',
+ },
+ 'playlist_mincount': 345,
+ }, {
+ 'url': 'https://peertube2.cpy.re/c/blender_open_movies@video.blender.org/videos',
+ 'info_dict': {
+ 'id': 'blender_open_movies@video.blender.org',
+ 'timestamp': 1542287810,
+ 'title': 'Official Blender Open Movies',
+ },
+ 'playlist_mincount': 11,
+ }]
+ _API_BASE = 'https://%s/api/v1/%s/%s%s'
+ _PAGE_SIZE = 30
+
+ def call_api(self, host, name, path, base, **kwargs):
+ return self._download_json(
+ self._API_BASE % (host, base, name, path), name, **kwargs)
+
+ def fetch_page(self, host, id, type, page):
+ page += 1
+ video_data = self.call_api(
+ host, id,
+ f'/videos?sort=-createdAt&start={self._PAGE_SIZE * (page - 1)}&count={self._PAGE_SIZE}&nsfw=both',
+ type, note=f'Downloading page {page}').get('data', [])
+ for video in video_data:
+ shortUUID = video.get('shortUUID') or try_get(video, lambda x: x['video']['shortUUID'])
+ video_title = video.get('name') or try_get(video, lambda x: x['video']['name'])
+ yield self.url_result(
+ f'https://{host}/w/{shortUUID}', PeerTubeIE.ie_key(),
+ video_id=shortUUID, video_title=video_title)
+
+ def _extract_playlist(self, host, type, id):
+ info = self.call_api(host, id, '', type, note='Downloading playlist information', fatal=False)
+
+ playlist_title = info.get('displayName')
+ playlist_description = info.get('description')
+ playlist_timestamp = unified_timestamp(info.get('createdAt'))
+ channel = try_get(info, lambda x: x['ownerAccount']['name']) or info.get('displayName')
+ channel_id = try_get(info, lambda x: x['ownerAccount']['id']) or info.get('id')
+ thumbnail = info.get('thumbnailPath')
+ thumbnail = f'https://{host}{thumbnail}' if thumbnail else None
+
+ entries = OnDemandPagedList(functools.partial(
+ self.fetch_page, host, id, type), self._PAGE_SIZE)
+
+ return self.playlist_result(
+ entries, id, playlist_title, playlist_description,
+ timestamp=playlist_timestamp, channel=channel, channel_id=channel_id, thumbnail=thumbnail)
+
+ def _real_extract(self, url):
+ type, host, id = self._match_valid_url(url).group('type', 'host', 'id')
+ type = self._TYPES[type]
+ return self._extract_playlist(host, type, id)
diff --git a/hypervideo_dl/extractor/peloton.py b/hypervideo_dl/extractor/peloton.py
new file mode 100644
index 0000000..287d341
--- /dev/null
+++ b/hypervideo_dl/extractor/peloton.py
@@ -0,0 +1,222 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_HTTPError,
+ compat_urllib_parse,
+)
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+ str_or_none,
+ traverse_obj,
+ url_or_none,
+)
+
+
+class PelotonIE(InfoExtractor):
+ IE_NAME = 'peloton'
+ _NETRC_MACHINE = 'peloton'
+ _VALID_URL = r'https?://members\.onepeloton\.com/classes/player/(?P<id>[a-f0-9]+)'
+ _TESTS = [{
+ 'url': 'https://members.onepeloton.com/classes/player/0e9653eb53544eeb881298c8d7a87b86',
+ 'info_dict': {
+ 'id': '0e9653eb53544eeb881298c8d7a87b86',
+ 'title': '20 min Chest & Back Strength',
+ 'ext': 'mp4',
+ 'thumbnail': r're:^https?://.+\.jpg',
+ 'description': 'md5:fcd5be9b9eda0194b470e13219050a66',
+ 'creator': 'Chase Tucker',
+ 'release_timestamp': 1556141400,
+ 'timestamp': 1556141400,
+ 'upload_date': '20190424',
+ 'duration': 1389,
+ 'categories': ['Strength'],
+ 'tags': ['Workout Mat', 'Light Weights', 'Medium Weights'],
+ 'is_live': False,
+ 'chapters': 'count:1',
+ 'subtitles': {'en': [{
+ 'url': r're:^https?://.+',
+ 'ext': 'vtt'
+ }]},
+ }, 'params': {
+ 'skip_download': 'm3u8',
+ },
+ '_skip': 'Account needed'
+ }, {
+ 'url': 'https://members.onepeloton.com/classes/player/26603d53d6bb4de1b340514864a6a6a8',
+ 'info_dict': {
+ 'id': '26603d53d6bb4de1b340514864a6a6a8',
+ 'title': '30 min Earth Day Run',
+ 'ext': 'm4a',
+ 'thumbnail': r're:https://.+\.jpg',
+ 'description': 'md5:adc065a073934d7ee0475d217afe0c3d',
+ 'creator': 'Selena Samuela',
+ 'release_timestamp': 1587567600,
+ 'timestamp': 1587567600,
+ 'upload_date': '20200422',
+ 'duration': 1802,
+ 'categories': ['Running'],
+ 'is_live': False,
+ 'chapters': 'count:3'
+ }, 'params': {
+ 'skip_download': 'm3u8',
+ },
+ '_skip': 'Account needed'
+ }]
+
+ _MANIFEST_URL_TEMPLATE = '%s?hdnea=%s'
+
+ def _start_session(self, video_id):
+ self._download_webpage('https://api.onepeloton.com/api/started_client_session', video_id, note='Starting session')
+
+ def _login(self, video_id):
+ username, password = self._get_login_info()
+ if not (username and password):
+ self.raise_login_required()
+ try:
+ self._download_json(
+ 'https://api.onepeloton.com/auth/login', video_id, note='Logging in',
+ data=json.dumps({
+ 'username_or_email': username,
+ 'password': password,
+ 'with_pubsub': False
+ }).encode(),
+ headers={'Content-Type': 'application/json', 'User-Agent': 'web'})
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ json_string = self._webpage_read_content(e.cause, None, video_id)
+ res = self._parse_json(json_string, video_id)
+ raise ExtractorError(res['message'], expected=res['message'] == 'Login failed')
+ else:
+ raise
+
+ def _get_token(self, video_id):
+ try:
+ subscription = self._download_json(
+ 'https://api.onepeloton.com/api/subscription/stream', video_id, note='Downloading token',
+ data=json.dumps({}).encode(), headers={'Content-Type': 'application/json'})
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ json_string = self._webpage_read_content(e.cause, None, video_id)
+ res = self._parse_json(json_string, video_id)
+ raise ExtractorError(res['message'], expected=res['message'] == 'Stream limit reached')
+ else:
+ raise
+ return subscription['token']
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ try:
+ self._start_session(video_id)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ self._login(video_id)
+ self._start_session(video_id)
+ else:
+ raise
+
+ metadata = self._download_json('https://api.onepeloton.com/api/ride/%s/details?stream_source=multichannel' % video_id, video_id)
+ ride_data = metadata.get('ride')
+ if not ride_data:
+ raise ExtractorError('Missing stream metadata')
+ token = self._get_token(video_id)
+
+ is_live = False
+ if ride_data.get('content_format') == 'audio':
+ url = self._MANIFEST_URL_TEMPLATE % (ride_data.get('vod_stream_url'), compat_urllib_parse.quote(token))
+ formats = [{
+ 'url': url,
+ 'ext': 'm4a',
+ 'format_id': 'audio',
+ 'vcodec': 'none',
+ }]
+ subtitles = {}
+ else:
+ if ride_data.get('vod_stream_url'):
+ url = 'https://members.onepeloton.com/.netlify/functions/m3u8-proxy?displayLanguage=en&acceptedSubtitles=%s&url=%s?hdnea=%s' % (
+ ','.join([re.sub('^([a-z]+)-([A-Z]+)$', r'\1', caption) for caption in ride_data['captions']]),
+ ride_data['vod_stream_url'],
+ compat_urllib_parse.quote(compat_urllib_parse.quote(token)))
+ elif ride_data.get('live_stream_url'):
+ url = self._MANIFEST_URL_TEMPLATE % (ride_data.get('live_stream_url'), compat_urllib_parse.quote(token))
+ is_live = True
+ else:
+ raise ExtractorError('Missing video URL')
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
+
+ if metadata.get('instructor_cues'):
+ subtitles['cues'] = [{
+ 'data': json.dumps(metadata.get('instructor_cues')),
+ 'ext': 'json'
+ }]
+
+ category = ride_data.get('fitness_discipline_display_name')
+ chapters = [{
+ 'start_time': segment.get('start_time_offset'),
+ 'end_time': segment.get('start_time_offset') + segment.get('length'),
+ 'title': segment.get('name')
+ } for segment in traverse_obj(metadata, ('segments', 'segment_list'))]
+
+ self._sort_formats(formats)
+ return {
+ 'id': video_id,
+ 'title': ride_data.get('title'),
+ 'formats': formats,
+ 'thumbnail': url_or_none(ride_data.get('image_url')),
+ 'description': str_or_none(ride_data.get('description')),
+ 'creator': traverse_obj(ride_data, ('instructor', 'name')),
+ 'release_timestamp': ride_data.get('original_air_time'),
+ 'timestamp': ride_data.get('original_air_time'),
+ 'subtitles': subtitles,
+ 'duration': float_or_none(ride_data.get('length')),
+ 'categories': [category] if category else None,
+ 'tags': traverse_obj(ride_data, ('equipment_tags', ..., 'name')),
+ 'is_live': is_live,
+ 'chapters': chapters
+ }
+
+
+class PelotonLiveIE(InfoExtractor):
+ IE_NAME = 'peloton:live'
+ IE_DESC = 'Peloton Live'
+ _VALID_URL = r'https?://members\.onepeloton\.com/player/live/(?P<id>[a-f0-9]+)'
+ _TEST = {
+ 'url': 'https://members.onepeloton.com/player/live/eedee2d19f804a9788f53aa8bd38eb1b',
+ 'info_dict': {
+ 'id': '32edc92d28044be5bf6c7b6f1f8d1cbc',
+ 'title': '30 min HIIT Ride: Live from Home',
+ 'ext': 'mp4',
+ 'thumbnail': r're:^https?://.+\.png',
+ 'description': 'md5:f0d7d8ed3f901b7ee3f62c1671c15817',
+ 'creator': 'Alex Toussaint',
+ 'release_timestamp': 1587736620,
+ 'timestamp': 1587736620,
+ 'upload_date': '20200424',
+ 'duration': 2014,
+ 'categories': ['Cycling'],
+ 'is_live': False,
+ 'chapters': 'count:3'
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': 'm3u8',
+ },
+ '_skip': 'Account needed'
+ }
+
+ def _real_extract(self, url):
+ workout_id = self._match_id(url)
+ peloton = self._download_json(f'https://api.onepeloton.com/api/peloton/{workout_id}', workout_id)
+
+ if peloton.get('ride_id'):
+ if not peloton.get('is_live') or peloton.get('is_encore') or peloton.get('status') != 'PRE_START':
+ return self.url_result('https://members.onepeloton.com/classes/player/%s' % peloton['ride_id'])
+ else:
+ raise ExtractorError('Ride has not started', expected=True)
+ else:
+ raise ExtractorError('Missing video ID')
diff --git a/hypervideo_dl/extractor/performgroup.py b/hypervideo_dl/extractor/performgroup.py
index 26942bf..c00d393 100644
--- a/hypervideo_dl/extractor/performgroup.py
+++ b/hypervideo_dl/extractor/performgroup.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import int_or_none
@@ -34,7 +33,7 @@ class PerformGroupIE(InfoExtractor):
})
def _real_extract(self, url):
- player_id, auth_token = re.search(self._VALID_URL, url).groups()
+ player_id, auth_token = self._match_valid_url(url).groups()
bootstrap = self._call_api('bootstrap', auth_token, player_id, url)
video = bootstrap['config']['dataSource']['sourceItems'][0]['videos'][0]
video_id = video['uuid']
diff --git a/hypervideo_dl/extractor/periscope.py b/hypervideo_dl/extractor/periscope.py
index b159063..b93a02b 100644
--- a/hypervideo_dl/extractor/periscope.py
+++ b/hypervideo_dl/extractor/periscope.py
@@ -12,6 +12,10 @@ from ..utils import (
class PeriscopeBaseIE(InfoExtractor):
+ _M3U8_HEADERS = {
+ 'Referer': 'https://www.periscope.tv/'
+ }
+
def _call_api(self, method, query, item_id):
return self._download_json(
'https://api.periscope.tv/api/v2/%s' % method,
@@ -54,9 +58,11 @@ class PeriscopeBaseIE(InfoExtractor):
m3u8_url, video_id, 'mp4',
entry_protocol='m3u8_native'
if state in ('ended', 'timed_out') else 'm3u8',
- m3u8_id=format_id, fatal=fatal)
+ m3u8_id=format_id, fatal=fatal, headers=self._M3U8_HEADERS)
if len(m3u8_formats) == 1:
self._add_width_and_height(m3u8_formats[0], width, height)
+ for f in m3u8_formats:
+ f.setdefault('http_headers', {}).update(self._M3U8_HEADERS)
return m3u8_formats
diff --git a/hypervideo_dl/extractor/philharmoniedeparis.py b/hypervideo_dl/extractor/philharmoniedeparis.py
index 03da64b..9f4899c 100644
--- a/hypervideo_dl/extractor/philharmoniedeparis.py
+++ b/hypervideo_dl/extractor/philharmoniedeparis.py
@@ -79,7 +79,7 @@ class PhilharmonieDeParisIE(InfoExtractor):
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
- if not formats:
+ if not formats and not self.get_param('ignore_no_formats'):
return
self._sort_formats(formats)
return {
diff --git a/hypervideo_dl/extractor/photobucket.py b/hypervideo_dl/extractor/photobucket.py
index 6c8bbe1..53aebe2 100644
--- a/hypervideo_dl/extractor/photobucket.py
+++ b/hypervideo_dl/extractor/photobucket.py
@@ -1,7 +1,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
@@ -23,7 +22,7 @@ class PhotobucketIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
video_extension = mobj.group('ext')
diff --git a/hypervideo_dl/extractor/piksel.py b/hypervideo_dl/extractor/piksel.py
index ecf56ff..a362664 100644
--- a/hypervideo_dl/extractor/piksel.py
+++ b/hypervideo_dl/extractor/piksel.py
@@ -85,7 +85,7 @@ class PikselIE(InfoExtractor):
return response
def _real_extract(self, url):
- ref_id, display_id = re.match(self._VALID_URL, url).groups()
+ ref_id, display_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, display_id)
app_token = self._search_regex([
r'clientAPI\s*:\s*"([^"]+)"',
diff --git a/hypervideo_dl/extractor/pinterest.py b/hypervideo_dl/extractor/pinterest.py
index 42528d7..80e9cd0 100644
--- a/hypervideo_dl/extractor/pinterest.py
+++ b/hypervideo_dl/extractor/pinterest.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..compat import compat_str
@@ -56,8 +55,7 @@ class PinterestBaseIE(InfoExtractor):
'height': int_or_none(format_dict.get('height')),
'duration': duration,
})
- self._sort_formats(
- formats, field_preference=('height', 'width', 'tbr', 'format_id'))
+ self._sort_formats(formats)
description = data.get('description') or data.get('description_html') or data.get('seo_description')
timestamp = unified_timestamp(data.get('created_at'))
@@ -166,7 +164,7 @@ class PinterestCollectionIE(PinterestBaseIE):
PinterestCollectionIE, cls).suitable(url)
def _real_extract(self, url):
- username, slug = re.match(self._VALID_URL, url).groups()
+ username, slug = self._match_valid_url(url).groups()
board = self._call_api(
'Board', slug, {
'slug': slug,
diff --git a/hypervideo_dl/extractor/pladform.py b/hypervideo_dl/extractor/pladform.py
index e86c653..dc20300 100644
--- a/hypervideo_dl/extractor/pladform.py
+++ b/hypervideo_dl/extractor/pladform.py
@@ -4,11 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urlparse
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
+ parse_qs,
xpath_text,
qualities,
)
@@ -56,7 +56,7 @@ class PladformIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ qs = parse_qs(url)
pl = qs.get('pl', ['1'])[0]
video = self._download_xml(
diff --git a/hypervideo_dl/extractor/playfm.py b/hypervideo_dl/extractor/playfm.py
index e766ccc..4298cbe 100644
--- a/hypervideo_dl/extractor/playfm.py
+++ b/hypervideo_dl/extractor/playfm.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import compat_str
@@ -35,7 +34,7 @@ class PlayFMIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
slug = mobj.group('slug')
diff --git a/hypervideo_dl/extractor/playplustv.py b/hypervideo_dl/extractor/playplustv.py
index 1e30ab2..fd72a37 100644
--- a/hypervideo_dl/extractor/playplustv.py
+++ b/hypervideo_dl/extractor/playplustv.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..compat import compat_HTTPError
@@ -63,7 +62,7 @@ class PlayPlusTVIE(InfoExtractor):
self._profile = self._call_api('Profiles')['list'][0]['_id']
def _real_extract(self, url):
- project_id, media_id = re.match(self._VALID_URL, url).groups()
+ project_id, media_id = self._match_valid_url(url).groups()
media = self._call_api(
'Media', media_id, {
'profileId': self._profile,
diff --git a/hypervideo_dl/extractor/playtvak.py b/hypervideo_dl/extractor/playtvak.py
index 4c5f579..84e92dd 100644
--- a/hypervideo_dl/extractor/playtvak.py
+++ b/hypervideo_dl/extractor/playtvak.py
@@ -150,7 +150,7 @@ class PlaytvakIE(InfoExtractor):
ext = 'mp4'
# Some streams have mp3 audio which does not play
# well with ffmpeg filter aac_adtstoasc
- preference = -1
+ preference = -10
elif format_ == 'adobe': # f4m manifest fails with 404 in 80% of requests
continue
else: # Other formats not supported yet
diff --git a/hypervideo_dl/extractor/playwire.py b/hypervideo_dl/extractor/playwire.py
index 4d96a10..9c9e597 100644
--- a/hypervideo_dl/extractor/playwire.py
+++ b/hypervideo_dl/extractor/playwire.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -46,7 +45,7 @@ class PlaywireIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
publisher_id, video_id = mobj.group('publisher_id'), mobj.group('id')
player = self._download_json(
diff --git a/hypervideo_dl/extractor/pluralsight.py b/hypervideo_dl/extractor/pluralsight.py
index 2d63855..801057e 100644
--- a/hypervideo_dl/extractor/pluralsight.py
+++ b/hypervideo_dl/extractor/pluralsight.py
@@ -17,6 +17,7 @@ from ..utils import (
float_or_none,
int_or_none,
parse_duration,
+ parse_qs,
qualities,
srt_subtitles_timecode,
try_get,
@@ -273,7 +274,7 @@ query viewClip {
return srt
def _real_extract(self, url):
- qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ qs = parse_qs(url)
author = qs.get('author', [None])[0]
name = qs.get('name', [None])[0]
@@ -337,11 +338,11 @@ query viewClip {
# In order to minimize the number of calls to ViewClip API and reduce
# the probability of being throttled or banned by Pluralsight we will request
# only single format until formats listing was explicitly requested.
- if self._downloader.params.get('listformats', False):
+ if self.get_param('listformats', False):
allowed_qualities = ALLOWED_QUALITIES
else:
def guess_allowed_qualities():
- req_format = self._downloader.params.get('format') or 'best'
+ req_format = self.get_param('format') or 'best'
req_format_split = req_format.split('-', 1)
if len(req_format_split) > 1:
req_ext, req_quality = req_format_split
@@ -349,7 +350,7 @@ query viewClip {
for allowed_quality in ALLOWED_QUALITIES:
if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities:
return (AllowedQuality(req_ext, (req_quality, )), )
- req_ext = 'webm' if self._downloader.params.get('prefer_free_formats') else 'mp4'
+ req_ext = 'webm' if self.get_param('prefer_free_formats') else 'mp4'
return (AllowedQuality(req_ext, (best_quality, )), )
allowed_qualities = guess_allowed_qualities()
diff --git a/hypervideo_dl/extractor/plutotv.py b/hypervideo_dl/extractor/plutotv.py
new file mode 100644
index 0000000..0cf8246
--- /dev/null
+++ b/hypervideo_dl/extractor/plutotv.py
@@ -0,0 +1,184 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import uuid
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+ compat_urlparse,
+)
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+ int_or_none,
+ try_get,
+ url_or_none,
+)
+
+
+class PlutoTVIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ https?://(?:www\.)?pluto\.tv(?:/en)?/on-demand
+ /(?P<video_type>movies|series)
+ /(?P<series_or_movie_slug>[^/]+)
+ (?:
+ /seasons?/(?P<season_no>\d+)
+ (?:/episode/(?P<episode_slug>[^/]+))?
+ )?
+ /?(?:$|[#?])'''
+
+ _INFO_URL = 'https://service-vod.clusters.pluto.tv/v3/vod/slugs/'
+ _INFO_QUERY_PARAMS = {
+ 'appName': 'web',
+ 'appVersion': 'na',
+ 'clientID': compat_str(uuid.uuid1()),
+ 'clientModelNumber': 'na',
+ 'serverSideAds': 'false',
+ 'deviceMake': 'unknown',
+ 'deviceModel': 'web',
+ 'deviceType': 'web',
+ 'deviceVersion': 'unknown',
+ 'sid': compat_str(uuid.uuid1()),
+ }
+ _TESTS = [
+ {
+ 'url': 'https://pluto.tv/on-demand/series/i-love-money/season/2/episode/its-in-the-cards-2009-2-3',
+ 'md5': 'ebcdd8ed89aaace9df37924f722fd9bd',
+ 'info_dict': {
+ 'id': '5de6c598e9379ae4912df0a8',
+ 'ext': 'mp4',
+ 'title': 'It\'s In The Cards',
+ 'episode': 'It\'s In The Cards',
+ 'description': 'The teams face off against each other in a 3-on-2 soccer showdown. Strategy comes into play, though, as each team gets to select their opposing teams’ two defenders.',
+ 'series': 'I Love Money',
+ 'season_number': 2,
+ 'episode_number': 3,
+ 'duration': 3600,
+ }
+ }, {
+ 'url': 'https://pluto.tv/on-demand/series/i-love-money/season/1/',
+ 'playlist_count': 11,
+ 'info_dict': {
+ 'id': '5de6c582e9379ae4912dedbd',
+ 'title': 'I Love Money - Season 1',
+ }
+ }, {
+ 'url': 'https://pluto.tv/on-demand/series/i-love-money/',
+ 'playlist_count': 26,
+ 'info_dict': {
+ 'id': '5de6c582e9379ae4912dedbd',
+ 'title': 'I Love Money',
+ }
+ }, {
+ 'url': 'https://pluto.tv/on-demand/movies/arrival-2015-1-1',
+ 'md5': '3cead001d317a018bf856a896dee1762',
+ 'info_dict': {
+ 'id': '5e83ac701fa6a9001bb9df24',
+ 'ext': 'mp4',
+ 'title': 'Arrival',
+ 'description': 'When mysterious spacecraft touch down across the globe, an elite team - led by expert translator Louise Banks (Academy Award® nominee Amy Adams) – races against time to decipher their intent.',
+ 'duration': 9000,
+ }
+ }, {
+ 'url': 'https://pluto.tv/en/on-demand/series/manhunters-fugitive-task-force/seasons/1/episode/third-times-the-charm-1-1',
+ 'only_matching': True,
+ }
+ ]
+
+    def _to_ad_free_formats(self, video_id, formats, subtitles):  # try to replace ad-stitched HLS formats with their ad-free masters
+        ad_free_formats, ad_free_subtitles, m3u8_urls = [], {}, set()
+        for fmt in formats:
+            res = self._download_webpage(
+                fmt.get('url'), video_id, note='Downloading m3u8 playlist',
+                fatal=False)
+            if not res:
+                continue
+            first_segment_url = re.search(
+                r'^(https?://.*/)0\-(end|[0-9]+)/[^/]+\.ts$', res,
+                re.MULTILINE)  # segment URLs of the shape <base>/0-end/<name>.ts (or 0-<n>/)
+            if first_segment_url:
+                m3u8_urls.add(
+                    compat_urlparse.urljoin(first_segment_url.group(1), '0-end/master.m3u8'))  # rewrite to the 0-end master playlist next to the segments
+                continue
+            first_segment_url = re.search(
+                r'^(https?://.*/).+\-0+\.ts$', res,
+                re.MULTILINE)  # alternative layout: segments like <base>/<name>-000.ts
+            if first_segment_url:
+                m3u8_urls.add(
+                    compat_urlparse.urljoin(first_segment_url.group(1), 'master.m3u8'))
+                continue
+
+        for m3u8_url in m3u8_urls:
+            fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+            ad_free_formats.extend(fmts)
+            ad_free_subtitles = self._merge_subtitles(ad_free_subtitles, subs)
+        if ad_free_formats:
+            formats, subtitles = ad_free_formats, ad_free_subtitles
+        else:
+            self.report_warning('Unable to find ad-free formats')  # fall back to the original (possibly ad-stitched) formats
+        return formats, subtitles
+
+ def _get_video_info(self, video_json, slug, series_name=None):
+ video_id = video_json.get('_id', slug)
+ formats, subtitles = [], {}
+ for video_url in try_get(video_json, lambda x: x['stitched']['urls'], list) or []:
+ if video_url.get('type') != 'hls':
+ continue
+ url = url_or_none(video_url.get('url'))
+
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
+ url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+
+ formats, subtitles = self._to_ad_free_formats(video_id, formats, subtitles)
+ self._sort_formats(formats)
+
+ info = {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'title': video_json.get('name'),
+ 'description': video_json.get('description'),
+ 'duration': float_or_none(video_json.get('duration'), scale=1000),
+ }
+ if series_name:
+ info.update({
+ 'series': series_name,
+ 'episode': video_json.get('name'),
+ 'season_number': int_or_none(video_json.get('season')),
+ 'episode_number': int_or_none(video_json.get('number')),
+ })
+ return info
+
+ def _real_extract(self, url):
+ mobj = self._match_valid_url(url).groupdict()
+ info_slug = mobj['series_or_movie_slug']
+ video_json = self._download_json(self._INFO_URL + info_slug, info_slug, query=self._INFO_QUERY_PARAMS)
+
+ if mobj['video_type'] == 'series':
+ series_name = video_json.get('name', info_slug)
+ season_number, episode_slug = mobj.get('season_number'), mobj.get('episode_slug')
+
+ videos = []
+ for season in video_json['seasons']:
+ if season_number is not None and season_number != int_or_none(season.get('number')):
+ continue
+ for episode in season['episodes']:
+ if episode_slug is not None and episode_slug != episode.get('slug'):
+ continue
+ videos.append(self._get_video_info(episode, episode_slug, series_name))
+ if not videos:
+ raise ExtractorError('Failed to find any videos to extract')
+ if episode_slug is not None and len(videos) == 1:
+ return videos[0]
+ playlist_title = series_name
+ if season_number is not None:
+ playlist_title += ' - Season %d' % season_number
+ return self.playlist_result(videos,
+ playlist_id=video_json.get('_id', info_slug),
+ playlist_title=playlist_title)
+ return self._get_video_info(video_json, info_slug)
diff --git a/hypervideo_dl/extractor/podomatic.py b/hypervideo_dl/extractor/podomatic.py
index e782e3f..673a3ab 100644
--- a/hypervideo_dl/extractor/podomatic.py
+++ b/hypervideo_dl/extractor/podomatic.py
@@ -1,7 +1,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..utils import int_or_none
@@ -46,7 +45,7 @@ class PodomaticIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
channel = mobj.group('channel') or mobj.group('channel_2')
diff --git a/hypervideo_dl/extractor/pokemon.py b/hypervideo_dl/extractor/pokemon.py
index 80222d4..402b574 100644
--- a/hypervideo_dl/extractor/pokemon.py
+++ b/hypervideo_dl/extractor/pokemon.py
@@ -1,12 +1,14 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
extract_attributes,
int_or_none,
+ js_to_json,
+ merge_dicts,
)
@@ -47,7 +49,7 @@ class PokemonIE(InfoExtractor):
}]
def _real_extract(self, url):
- video_id, display_id = re.match(self._VALID_URL, url).groups()
+ video_id, display_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, video_id or display_id)
video_data = extract_attributes(self._search_regex(
r'(<[^>]+data-video-id="%s"[^>]*>)' % (video_id if video_id else '[a-z0-9]{32}'),
@@ -69,3 +71,70 @@ class PokemonIE(InfoExtractor):
'episode_number': int_or_none(video_data.get('data-video-episode')),
'ie_key': 'LimelightMedia',
}
+
+
+class PokemonWatchIE(InfoExtractor):
+ _VALID_URL = r'https?://watch\.pokemon\.com/[a-z]{2}-[a-z]{2}/(?:#/)?player(?:\.html)?\?id=(?P<id>[a-z0-9]{32})'
+ _API_URL = 'https://www.pokemon.com/api/pokemontv/v2/channels/{0:}'
+ _TESTS = [{
+ 'url': 'https://watch.pokemon.com/en-us/player.html?id=8309a40969894a8e8d5bc1311e9c5667',
+ 'md5': '62833938a31e61ab49ada92f524c42ff',
+ 'info_dict': {
+ 'id': '8309a40969894a8e8d5bc1311e9c5667',
+ 'ext': 'mp4',
+ 'title': 'Lillier and the Staff!',
+ 'description': 'md5:338841b8c21b283d24bdc9b568849f04',
+ }
+ }, {
+ 'url': 'https://watch.pokemon.com/en-us/#/player?id=3fe7752ba09141f0b0f7756d1981c6b2',
+ 'only_matching': True
+ }, {
+ 'url': 'https://watch.pokemon.com/de-de/player.html?id=b3c402e111a4459eb47e12160ab0ba07',
+ 'only_matching': True
+ }]
+
+ def _extract_media(self, channel_array, video_id):
+ for channel in channel_array:
+ for media in channel.get('media'):
+ if media.get('id') == video_id:
+ return media
+ return None
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ info = {
+ '_type': 'url',
+ 'id': video_id,
+ 'url': 'limelight:media:%s' % video_id,
+ 'ie_key': 'LimelightMedia',
+ }
+
+ # API call can be avoided entirely if we are listing formats
+ if self.get_param('listformats', False):
+ return info
+
+ webpage = self._download_webpage(url, video_id)
+ build_vars = self._parse_json(self._search_regex(
+ r'(?s)buildVars\s*=\s*({.*?})', webpage, 'build vars'),
+ video_id, transform_source=js_to_json)
+ region = build_vars.get('region')
+ channel_array = self._download_json(self._API_URL.format(region), video_id)
+ video_data = self._extract_media(channel_array, video_id)
+
+ if video_data is None:
+ raise ExtractorError(
+ 'Video %s does not exist' % video_id, expected=True)
+
+ info['_type'] = 'url_transparent'
+ images = video_data.get('images')
+
+ return merge_dicts(info, {
+ 'title': video_data.get('title'),
+ 'description': video_data.get('description'),
+ 'thumbnail': images.get('medium') or images.get('small'),
+ 'series': 'Pokémon',
+ 'season_number': int_or_none(video_data.get('season')),
+ 'episode': video_data.get('title'),
+ 'episode_number': int_or_none(video_data.get('episode')),
+ })
diff --git a/hypervideo_dl/extractor/polskieradio.py b/hypervideo_dl/extractor/polskieradio.py
index 978d6f8..53fe034 100644
--- a/hypervideo_dl/extractor/polskieradio.py
+++ b/hypervideo_dl/extractor/polskieradio.py
@@ -15,12 +15,13 @@ from ..utils import (
int_or_none,
strip_or_none,
unified_timestamp,
+ unescapeHTML,
)
class PolskieRadioIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
- _TESTS = [{
+ _TESTS = [{ # Old-style single broadcast.
'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie',
'info_dict': {
'id': '1587943',
@@ -39,14 +40,41 @@ class PolskieRadioIE(InfoExtractor):
'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$'
},
}],
- }, {
- 'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal',
+ }, { # New-style single broadcast.
+ 'url': 'https://www.polskieradio.pl/8/2382/Artykul/2534482,Zagarysci-Poezja-jak-spoiwo',
+ 'info_dict': {
+ 'id': '2534482',
+ 'title': 'Żagaryści. Poezja jak spoiwo',
+ 'description': 'md5:f18d95d5dcba747a09b635e21a4c0695',
+ },
+ 'playlist': [{
+ 'md5': 'd07559829f61d5a93a75755987ded760',
+ 'info_dict': {
+ 'id': '2516679',
+ 'ext': 'mp3',
+ 'title': 'md5:c6e1234e0b747ad883cb91b7ad06b98c',
+ 'timestamp': 1592654400,
+ 'upload_date': '20200620',
+ 'duration': 1430,
+ 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$'
+ },
+ }],
+ }, { # Old-style multiple broadcast playlist.
+ 'url': 'https://www.polskieradio.pl/8/4346/Artykul/2487823,Marek-Kondrat-czyta-Mistrza-i-Malgorzate',
+ 'info_dict': {
+ 'id': '2487823',
+ 'title': 'Marek Kondrat czyta "Mistrza i Małgorzatę"',
+ 'description': 'md5:8422a95cc83834f2aaeff9d82e9c8f39',
+ },
+ 'playlist_mincount': 50,
+ }, { # New-style multiple broadcast playlist.
+ 'url': 'https://www.polskieradio.pl/8/4346/Artykul/2541317,Czytamy-Kalendarz-i-klepsydre-Tadeusza-Konwickiego',
'info_dict': {
- 'id': '1635803',
- 'title': 'Euro 2016: nie ma miejsca na błąd. Polacy grają ze Szwajcarią o ćwierćfinał',
- 'description': 'md5:01cb7d0cad58664095d72b51a1ebada2',
+ 'id': '2541317',
+ 'title': 'Czytamy "Kalendarz i klepsydrę" Tadeusza Konwickiego',
+ 'description': 'md5:0baeaa46d877f1351fb2eeed3e871f9f',
},
- 'playlist_mincount': 12,
+ 'playlist_mincount': 15,
}, {
'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis',
'only_matching': True,
@@ -78,8 +106,8 @@ class PolskieRadioIE(InfoExtractor):
media_urls = set()
- for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content):
- media = self._parse_json(data_media, playlist_id, fatal=False)
+ for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', content):
+ media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False)
if not media.get('file') or not media.get('desc'):
continue
media_url = self._proto_relative_url(media['file'], 'http:')
@@ -98,6 +126,7 @@ class PolskieRadioIE(InfoExtractor):
title = self._og_search_title(webpage).strip()
description = strip_or_none(self._og_search_description(webpage))
+ description = description.replace('\xa0', ' ') if description is not None else None
return self.playlist_result(entries, playlist_id, title, description)
diff --git a/hypervideo_dl/extractor/popcorntimes.py b/hypervideo_dl/extractor/popcorntimes.py
index 7bf7f98..5f9d0e7 100644
--- a/hypervideo_dl/extractor/popcorntimes.py
+++ b/hypervideo_dl/extractor/popcorntimes.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import (
@@ -33,7 +32,7 @@ class PopcorntimesIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id, display_id = mobj.group('id', 'display_id')
webpage = self._download_webpage(url, display_id)
diff --git a/hypervideo_dl/extractor/popcorntv.py b/hypervideo_dl/extractor/popcorntv.py
index 9f834fb..66d2e50 100644
--- a/hypervideo_dl/extractor/popcorntv.py
+++ b/hypervideo_dl/extractor/popcorntv.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -33,7 +32,7 @@ class PopcornTVIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
display_id, video_id = mobj.group('display_id', 'id')
webpage = self._download_webpage(url, display_id)
diff --git a/hypervideo_dl/extractor/porncom.py b/hypervideo_dl/extractor/porncom.py
index 5726cab..83df221 100644
--- a/hypervideo_dl/extractor/porncom.py
+++ b/hypervideo_dl/extractor/porncom.py
@@ -35,7 +35,7 @@ class PornComIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
diff --git a/hypervideo_dl/extractor/pornflip.py b/hypervideo_dl/extractor/pornflip.py
new file mode 100644
index 0000000..d0aefa2
--- /dev/null
+++ b/hypervideo_dl/extractor/pornflip.py
@@ -0,0 +1,82 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_duration,
+ parse_iso8601
+)
+
+
+class PornFlipIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?pornflip\.com/(?:(embed|sv|v)/)?(?P<id>[^/]+)'
+ _TESTS = [
+ {
+ 'url': 'https://www.pornflip.com/dzv9Mtw1qj2/sv/brazzers-double-dare-two-couples-fucked-jenna-reid-maya-bijou',
+ 'info_dict': {
+ 'id': 'dzv9Mtw1qj2',
+ 'ext': 'mp4',
+ 'title': 'Brazzers - Double Dare Two couples fucked Jenna Reid Maya Bijou',
+ 'description': 'md5:d2b69e6cc743c5fd158e162aa7f05821',
+ 'duration': 476,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'view_count': int,
+ 'timestamp': 1617846819,
+ 'upload_date': '20210408',
+ 'uploader': 'Brazzers',
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'https://www.pornflip.com/v/IrJEC40i21L',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'https://www.pornflip.com/Z3jzbChC5-P/sexintaxi-e-sereyna-gomez-czech-naked-couple',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'https://www.pornflip.com/embed/bLcDFxnrZnU',
+ 'only_matching': True,
+ },
+ ]
+ _HOST = 'www.pornflip.com'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(
+ 'https://{}/sv/{}'.format(self._HOST, video_id), video_id, headers={'host': self._HOST})
+ description = self._html_search_regex(r'&p\[summary\]=(.*?)\s*&p', webpage, 'description', fatal=False)
+ duration = self._search_regex(r'"duration":\s+"([^"]+)",', webpage, 'duration', fatal=False)
+ view_count = self._search_regex(r'"interactionCount":\s+"([^"]+)"', webpage, 'view_count', fatal=False)
+ title = self._html_search_regex(r'id="mediaPlayerTitleLink"[^>]*>(.+)</a>', webpage, 'title', fatal=False)
+ uploader = self._html_search_regex(r'class="title-chanel"[^>]*>[^<]*<a[^>]*>([^<]+)<', webpage, 'uploader', fatal=False)
+ upload_date = self._search_regex(r'"uploadDate":\s+"([^"]+)",', webpage, 'upload_date', fatal=False)
+ likes = self._html_search_regex(
+ r'class="btn btn-up-rating[^>]*>[^<]*<i[^>]*>[^<]*</i>[^>]*<span[^>]*>[^0-9]*([0-9]+)[^<0-9]*<', webpage, 'like_count', fatal=False)
+ dislikes = self._html_search_regex(
+ r'class="btn btn-down-rating[^>]*>[^<]*<i[^>]*>[^<]*</i>[^>]*<span[^>]*>[^0-9]*([0-9]+)[^<0-9]*<', webpage, 'dislike_count', fatal=False)
+ mpd_url = self._search_regex(r'"([^"]+userscontent.net/dash/[0-9]+/manifest.mpd[^"]*)"', webpage, 'mpd_url').replace('&amp;', '&')
+ formats = self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash')
+ self._sort_formats(formats)
+
+ return {
+ 'age_limit': 18,
+ 'description': description,
+ 'dislike_count': int_or_none(dislikes),
+ 'duration': parse_duration(duration),
+ 'formats': formats,
+ 'id': video_id,
+ 'like_count': int_or_none(likes),
+ 'timestamp': parse_iso8601(upload_date),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'title': title,
+ 'uploader': uploader,
+ 'view_count': int_or_none(view_count),
+ }
diff --git a/hypervideo_dl/extractor/pornhd.py b/hypervideo_dl/extractor/pornhd.py
index c6052ac..9dbd72f 100644
--- a/hypervideo_dl/extractor/pornhd.py
+++ b/hypervideo_dl/extractor/pornhd.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -47,7 +46,7 @@ class PornHdIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id')
diff --git a/hypervideo_dl/extractor/pornhub.py b/hypervideo_dl/extractor/pornhub.py
index 0314546..6d894af 100644
--- a/hypervideo_dl/extractor/pornhub.py
+++ b/hypervideo_dl/extractor/pornhub.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
import functools
import itertools
+import math
import operator
import re
@@ -14,6 +15,7 @@ from ..compat import (
)
from .openload import PhantomJSwrapper
from ..utils import (
+ clean_html,
determine_ext,
ExtractorError,
int_or_none,
@@ -30,6 +32,7 @@ from ..utils import (
class PornHubBaseIE(InfoExtractor):
_NETRC_MACHINE = 'pornhub'
+ _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubthbh7ap3u\.onion)'
def _download_webpage_handle(self, *args, **kwargs):
def dl(*args, **kwargs):
@@ -122,11 +125,13 @@ class PornHubIE(PornHubBaseIE):
_VALID_URL = r'''(?x)
https?://
(?:
- (?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
+ (?:[^/]+\.)?
+ %s
+ /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
(?:www\.)?thumbzilla\.com/video/
)
(?P<id>[\da-z]+)
- '''
+ ''' % PornHubBaseIE._PORNHUB_HOST_RE
_TESTS = [{
'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
'md5': 'a6391306d050e4547f62b3f485dd9ba9',
@@ -145,6 +150,7 @@ class PornHubIE(PornHubBaseIE):
'age_limit': 18,
'tags': list,
'categories': list,
+ 'cast': list,
},
}, {
# non-ASCII title
@@ -236,6 +242,13 @@ class PornHubIE(PornHubBaseIE):
}, {
'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3',
'only_matching': True,
+ }, {
+ # geo restricted
+ 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://pornhubthbh7ap3u.onion/view_video.php?viewkey=ph5a9813bfa7156',
+ 'only_matching': True,
}]
@staticmethod
@@ -249,7 +262,7 @@ class PornHubIE(PornHubBaseIE):
pattern, webpage, '%s count' % name, fatal=False))
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
host = mobj.group('host') or 'pornhub.com'
video_id = mobj.group('id')
@@ -275,6 +288,11 @@ class PornHubIE(PornHubBaseIE):
'PornHub said: %s' % error_msg,
expected=True, video_id=video_id)
+ if any(re.search(p, webpage) for p in (
+ r'class=["\']geoBlocked["\']',
+ r'>\s*This content is unavailable in your country')):
+ self.raise_geo_restricted()
+
# video_title from flashvars contains whitespace instead of non-ASCII (see
# http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
# on that anymore.
@@ -408,17 +426,14 @@ class PornHubIE(PornHubBaseIE):
format_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
return
- tbr = None
- mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', format_url)
- if mobj:
- if not height:
- height = int(mobj.group('height'))
- tbr = int(mobj.group('tbr'))
+ if not height:
+ height = int_or_none(self._search_regex(
+ r'(?P<height>\d+)[pP]?_\d+[kK]', format_url, 'height',
+ default=None))
formats.append({
'url': format_url,
'format_id': '%dp' % height if height else None,
'height': height,
- 'tbr': tbr,
})
for video_url, height in video_urls:
@@ -440,7 +455,10 @@ class PornHubIE(PornHubBaseIE):
add_format(video_url, height)
continue
add_format(video_url)
- self._sort_formats(formats)
+
+ # field_preference is unnecessary here, but kept for code-similarity with youtube-dl
+ self._sort_formats(
+ formats, field_preference=('height', 'width', 'fps', 'format_id'))
video_uploader = self._html_search_regex(
r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
@@ -464,7 +482,7 @@ class PornHubIE(PornHubBaseIE):
r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>'
% meta_key, webpage, meta_key, default=None)
if div:
- return re.findall(r'<a[^>]+\bhref=[^>]+>([^<]+)', div)
+ return [clean_html(x).strip() for x in re.findall(r'(?s)<a[^>]+\bhref=[^>]+>.+?</a>', div)]
info = self._search_json_ld(webpage, video_id, default={})
# description provided in JSON-LD is irrelevant
@@ -485,6 +503,7 @@ class PornHubIE(PornHubBaseIE):
'age_limit': 18,
'tags': extract_list('tags'),
'categories': extract_list('categories'),
+ 'cast': extract_list('pornstars'),
'subtitles': subtitles,
}, info)
@@ -513,7 +532,7 @@ class PornHubPlaylistBaseIE(PornHubBaseIE):
class PornHubUserIE(PornHubPlaylistBaseIE):
- _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
+ _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' % PornHubBaseIE._PORNHUB_HOST_RE
_TESTS = [{
'url': 'https://www.pornhub.com/model/zoe_ph',
'playlist_mincount': 118,
@@ -542,10 +561,13 @@ class PornHubUserIE(PornHubPlaylistBaseIE):
# Same as before, multi page
'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau',
'only_matching': True,
+ }, {
+ 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
user_id = mobj.group('id')
videos_url = '%s/videos' % mobj.group('url')
page = self._extract_page(url)
@@ -607,7 +629,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
break
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
host = mobj.group('host')
item_id = mobj.group('id')
@@ -617,7 +639,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
- _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?P<id>(?:[^/]+/)*[^/?#&]+)'
+ _VALID_URL = r'https?://(?:[^/]+\.)?%s/(?!playlist/)(?P<id>(?:[^/]+/)*[^/?#&]+)' % PornHubBaseIE._PORNHUB_HOST_RE
_TESTS = [{
'url': 'https://www.pornhub.com/model/zoe_ph/videos',
'only_matching': True,
@@ -711,16 +733,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn',
'only_matching': True,
}, {
- 'url': 'https://www.pornhub.com/playlist/44121572',
- 'info_dict': {
- 'id': 'playlist/44121572',
- },
- 'playlist_mincount': 132,
- }, {
- 'url': 'https://www.pornhub.com/playlist/4667351',
- 'only_matching': True,
- }, {
- 'url': 'https://de.pornhub.com/playlist/4667351',
+ 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph/videos',
'only_matching': True,
}]
@@ -732,7 +745,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
- _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
+ _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' % PornHubBaseIE._PORNHUB_HOST_RE
_TESTS = [{
'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
'info_dict': {
@@ -742,4 +755,63 @@ class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
}, {
'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload',
'only_matching': True,
+ }, {
+ 'url': 'http://pornhubthbh7ap3u.onion/pornstar/jenny-blighe/videos/upload',
+ 'only_matching': True,
}]
+
+
+class PornHubPlaylistIE(PornHubPlaylistBaseIE):
+ _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/playlist/(?P<id>[^/?#&]+))' % PornHubBaseIE._PORNHUB_HOST_RE
+ _TESTS = [{
+ 'url': 'https://www.pornhub.com/playlist/44121572',
+ 'info_dict': {
+ 'id': '44121572',
+ },
+ 'playlist_count': 77,
+ }, {
+ 'url': 'https://www.pornhub.com/playlist/4667351',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://de.pornhub.com/playlist/4667351',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://de.pornhub.com/playlist/4667351?page=2',
+ 'only_matching': True,
+ }]
+
+    def _entries(self, url, host, item_id):  # yield playlist entries, paginating via the viewChunked endpoint
+        webpage = self._download_webpage(url, item_id, 'Downloading page 1')
+        playlist_id = self._search_regex(r'var\s+playlistId\s*=\s*"([^"]+)"', webpage, 'playlist_id')
+        video_count = int_or_none(
+            self._search_regex(r'var\s+itemsCount\s*=\s*([0-9]+)\s*\|\|', webpage, 'video_count'))
+        token = self._search_regex(r'var\s+token\s*=\s*"([^"]+)"', webpage, 'token')  # NOTE(review): token required by viewChunked — presumably anti-CSRF; confirm
+        page_count = math.ceil((video_count - 36) / 40.) + 1  # assumes page 1 holds 36 items, later chunks 40 — inferred from these constants, TODO confirm
+        page_entries = self._extract_entries(webpage, host)
+
+        def download_page(page_num):
+            note = 'Downloading page {}'.format(page_num)
+            page_url = 'https://www.{}/playlist/viewChunked'.format(host)
+            return self._download_webpage(page_url, item_id, note, query={
+                'id': playlist_id,
+                'page': page_num,
+                'token': token,
+            })
+
+        for page_num in range(1, page_count + 1):
+            if page_num > 1:
+                webpage = download_page(page_num)
+                page_entries = self._extract_entries(webpage, host)  # page 1 entries were already extracted above
+            if not page_entries:
+                break
+            for e in page_entries:
+                yield e
+
+ def _real_extract(self, url):
+ mobj = self._match_valid_url(url)
+ host = mobj.group('host')
+ item_id = mobj.group('id')
+
+ self._login(host)
+
+ return self.playlist_result(self._entries(mobj.group('url'), host, item_id), item_id)
diff --git a/hypervideo_dl/extractor/pornovoisines.py b/hypervideo_dl/extractor/pornovoisines.py
index b6b7106..18459fc 100644
--- a/hypervideo_dl/extractor/pornovoisines.py
+++ b/hypervideo_dl/extractor/pornovoisines.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -39,7 +38,7 @@ class PornoVoisinesIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id')
diff --git a/hypervideo_dl/extractor/pornoxo.py b/hypervideo_dl/extractor/pornoxo.py
index 2831368..489dc2b 100644
--- a/hypervideo_dl/extractor/pornoxo.py
+++ b/hypervideo_dl/extractor/pornoxo.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -26,7 +25,7 @@ class PornoXOIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id, display_id = mobj.groups()
webpage = self._download_webpage(url, video_id)
diff --git a/hypervideo_dl/extractor/presstv.py b/hypervideo_dl/extractor/presstv.py
index b5c2792..bfb2eb7 100644
--- a/hypervideo_dl/extractor/presstv.py
+++ b/hypervideo_dl/extractor/presstv.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import remove_start
@@ -25,7 +24,7 @@ class PressTVIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
diff --git a/hypervideo_dl/extractor/projectveritas.py b/hypervideo_dl/extractor/projectveritas.py
new file mode 100644
index 0000000..1d832a6
--- /dev/null
+++ b/hypervideo_dl/extractor/projectveritas.py
@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ traverse_obj,
+ unified_strdate,
+)
+
+
+class ProjectVeritasIE(InfoExtractor):
+ _VALID_URL = r'(?:https?://)(?:www\.)?projectveritas\.com/(?P<type>news|video)/(?P<id>[^/?#]+)'
+ _TESTS = [{
+ 'url': 'https://www.projectveritas.com/news/exclusive-inside-the-new-york-and-new-jersey-hospitals-battling-coronavirus/',
+ 'info_dict': {
+ 'id': '51910aab-365a-5cf1-88f2-8eb1ca5fd3c6',
+ 'ext': 'mp4',
+ 'title': 'Exclusive: Inside The New York and New Jersey Hospitals Battling Coronavirus',
+ 'upload_date': '20200327',
+ 'thumbnail': 'md5:6076477fe50b03eb8708be9415e18e1c',
+ }
+ }, {
+ 'url': 'https://www.projectveritas.com/video/ilhan-omar-connected-ballot-harvester-in-cash-for-ballots-scheme-car-is-full/',
+ 'info_dict': {
+ 'id': 'c5aab304-a56b-54b1-9f0b-03b77bc5f2f6',
+ 'ext': 'mp4',
+ 'title': 'Ilhan Omar connected Ballot Harvester in cash-for-ballots scheme: "Car is full" of absentee ballots',
+ 'upload_date': '20200927',
+ 'thumbnail': 'md5:194b8edf0e2ba64f25500ff4378369a4',
+ }
+ }]
+
+ def _real_extract(self, url):
+ id, type = self._match_valid_url(url).group('id', 'type')
+ api_url = f'https://www.projectveritas.com/page-data/{type}/{id}/page-data.json'
+ data_json = self._download_json(api_url, id)['result']['data']
+ main_data = traverse_obj(data_json, 'video', 'post')
+ video_id = main_data['id']
+ thumbnail = traverse_obj(main_data, ('image', 'ogImage', 'src'))
+ mux_asset = traverse_obj(main_data,
+ 'muxAsset', ('body', 'json', 'content', ..., 'data', 'target', 'fields', 'muxAsset'),
+ get_all=False, expected_type=dict)
+ if not mux_asset:
+ raise ExtractorError('No video on the provided url.', expected=True)
+ playback_id = traverse_obj(mux_asset, 'playbackId', ('en-US', 'playbackId'))
+ formats = self._extract_m3u8_formats(f'https://stream.mux.com/{playback_id}.m3u8', video_id)
+ self._sort_formats(formats)
+ return {
+ 'id': video_id,
+ 'title': main_data['title'],
+ 'upload_date': unified_strdate(main_data.get('date')),
+ 'thumbnail': thumbnail.replace('//', ''),
+ 'formats': formats,
+ }
diff --git a/hypervideo_dl/extractor/prosiebensat1.py b/hypervideo_dl/extractor/prosiebensat1.py
index e470882..e89bbfd 100644
--- a/hypervideo_dl/extractor/prosiebensat1.py
+++ b/hypervideo_dl/extractor/prosiebensat1.py
@@ -34,8 +34,8 @@ class ProSiebenSat1BaseIE(InfoExtractor):
'ids': clip_id,
})[0]
- if video.get('is_protected') is True:
- raise ExtractorError('This video is DRM protected.', expected=True)
+ if not self.get_param('allow_unplayable_formats') and video.get('is_protected') is True:
+ self.report_drm(clip_id)
formats = []
if self._ACCESS_ID:
diff --git a/hypervideo_dl/extractor/pyvideo.py b/hypervideo_dl/extractor/pyvideo.py
index b8ac93a..8696197 100644
--- a/hypervideo_dl/extractor/pyvideo.py
+++ b/hypervideo_dl/extractor/pyvideo.py
@@ -27,7 +27,7 @@ class PyvideoIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
category = mobj.group('category')
video_id = mobj.group('id')
diff --git a/hypervideo_dl/extractor/qqmusic.py b/hypervideo_dl/extractor/qqmusic.py
index 084308a..0106d16 100644
--- a/hypervideo_dl/extractor/qqmusic.py
+++ b/hypervideo_dl/extractor/qqmusic.py
@@ -121,7 +121,7 @@ class QQMusicIE(InfoExtractor):
% (details['prefix'], mid, details['ext'], vkey, guid),
'format': format_id,
'format_id': format_id,
- 'preference': details['preference'],
+ 'quality': details['preference'],
'abr': details.get('abr'),
})
self._check_formats(formats, mid)
diff --git a/hypervideo_dl/extractor/radiko.py b/hypervideo_dl/extractor/radiko.py
new file mode 100644
index 0000000..1e60de1
--- /dev/null
+++ b/hypervideo_dl/extractor/radiko.py
@@ -0,0 +1,234 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import base64
+import calendar
+import datetime
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ update_url_query,
+ clean_html,
+ unified_timestamp,
+)
+from ..compat import compat_urllib_parse
+
+
+class RadikoBaseIE(InfoExtractor):
+ _FULL_KEY = None
+
+ def _auth_client(self):
+ auth_cache = self._downloader.cache.load('radiko', 'auth_data')
+ if auth_cache:
+ return auth_cache
+
+ _, auth1_handle = self._download_webpage_handle(
+ 'https://radiko.jp/v2/api/auth1', None, 'Downloading authentication page',
+ headers={
+ 'x-radiko-app': 'pc_html5',
+ 'x-radiko-app-version': '0.0.1',
+ 'x-radiko-device': 'pc',
+ 'x-radiko-user': 'dummy_user',
+ })
+ auth1_header = auth1_handle.info()
+
+ auth_token = auth1_header['X-Radiko-AuthToken']
+ kl = int(auth1_header['X-Radiko-KeyLength'])
+ ko = int(auth1_header['X-Radiko-KeyOffset'])
+ raw_partial_key = self._extract_full_key()[ko:ko + kl]
+ partial_key = base64.b64encode(raw_partial_key).decode()
+
+ area_id = self._download_webpage(
+ 'https://radiko.jp/v2/api/auth2', None, 'Authenticating',
+ headers={
+ 'x-radiko-device': 'pc',
+ 'x-radiko-user': 'dummy_user',
+ 'x-radiko-authtoken': auth_token,
+ 'x-radiko-partialkey': partial_key,
+ }).split(',')[0]
+
+ auth_data = (auth_token, area_id)
+ self._downloader.cache.store('radiko', 'auth_data', auth_data)
+ return auth_data
+
+ def _extract_full_key(self):
+ if self._FULL_KEY:
+ return self._FULL_KEY
+
+ jscode = self._download_webpage(
+ 'https://radiko.jp/apps/js/playerCommon.js', None,
+ note='Downloading player js code')
+ full_key = self._search_regex(
+ (r"RadikoJSPlayer\([^,]*,\s*(['\"])pc_html5\1,\s*(['\"])(?P<fullkey>[0-9a-f]+)\2,\s*{"),
+ jscode, 'full key', fatal=False, group='fullkey')
+
+ if full_key:
+ full_key = full_key.encode()
+ else: # use full key ever known
+ full_key = b'bcd151073c03b352e1ef2fd66c32209da9ca0afa'
+
+ self._FULL_KEY = full_key
+ return full_key
+
+ def _find_program(self, video_id, station, cursor):
+ station_program = self._download_xml(
+ 'https://radiko.jp/v3/program/station/weekly/%s.xml' % station, video_id,
+ note='Downloading radio program for %s station' % station)
+
+ prog = None
+ for p in station_program.findall('.//prog'):
+ ft_str, to_str = p.attrib['ft'], p.attrib['to']
+ ft = unified_timestamp(ft_str, False)
+ to = unified_timestamp(to_str, False)
+ if ft <= cursor and cursor < to:
+ prog = p
+ break
+ if not prog:
+ raise ExtractorError('Cannot identify radio program to download!')
+ assert ft, to
+ return prog, station_program, ft, ft_str, to_str
+
+ def _extract_formats(self, video_id, station, is_onair, ft, cursor, auth_token, area_id, query):
+ m3u8_playlist_data = self._download_xml(
+ 'https://radiko.jp/v3/station/stream/pc_html5/%s.xml' % station, video_id,
+ note='Downloading m3u8 information')
+ m3u8_urls = m3u8_playlist_data.findall('.//url')
+
+ formats = []
+ found = set()
+ for url_tag in m3u8_urls:
+ pcu = url_tag.find('playlist_create_url')
+ url_attrib = url_tag.attrib
+ playlist_url = update_url_query(pcu.text, {
+ 'station_id': station,
+ **query,
+ 'l': '15',
+ 'lsid': '77d0678df93a1034659c14d6fc89f018',
+ 'type': 'b',
+ })
+ if playlist_url in found:
+ continue
+ else:
+ found.add(playlist_url)
+
+ time_to_skip = None if is_onair else cursor - ft
+
+ subformats = self._extract_m3u8_formats(
+ playlist_url, video_id, ext='m4a',
+ live=True, fatal=False, m3u8_id=None,
+ headers={
+ 'X-Radiko-AreaId': area_id,
+ 'X-Radiko-AuthToken': auth_token,
+ })
+ for sf in subformats:
+ domain = sf['format_id'] = compat_urllib_parse.urlparse(sf['url']).netloc
+ if re.match(r'^[cf]-radiko\.smartstream\.ne\.jp$', domain):
+ # Prioritize live radio vs playback based on extractor
+ sf['preference'] = 100 if is_onair else -100
+ if not is_onair and url_attrib['timefree'] == '1' and time_to_skip:
+ sf['_ffmpeg_args'] = ['-ss', time_to_skip]
+ formats.extend(subformats)
+
+ self._sort_formats(formats)
+ return formats
+
+
+class RadikoIE(RadikoBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?radiko\.jp/#!/ts/(?P<station>[A-Z0-9-]+)/(?P<id>\d+)'
+
+ _TESTS = [{
+ # QRR (文化放送) station provides <desc>
+ 'url': 'https://radiko.jp/#!/ts/QRR/20210425101300',
+ 'only_matching': True,
+ }, {
+ # FMT (TOKYO FM) station does not provide <desc>
+ 'url': 'https://radiko.jp/#!/ts/FMT/20210810150000',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://radiko.jp/#!/ts/JOAK-FM/20210509090000',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ station, video_id = self._match_valid_url(url).groups()
+ vid_int = unified_timestamp(video_id, False)
+
+ auth_token, area_id = self._auth_client()
+
+ prog, station_program, ft, radio_begin, radio_end = self._find_program(video_id, station, vid_int)
+
+ title = prog.find('title').text
+ description = clean_html(prog.find('info').text)
+ station_name = station_program.find('.//name').text
+
+ formats = self._extract_formats(
+ video_id=video_id, station=station, is_onair=False,
+ ft=ft, cursor=vid_int, auth_token=auth_token, area_id=area_id,
+ query={
+ 'start_at': radio_begin,
+ 'ft': radio_begin,
+ 'end_at': radio_end,
+ 'to': radio_end,
+ 'seek': video_id,
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'uploader': station_name,
+ 'uploader_id': station,
+ 'timestamp': vid_int,
+ 'formats': formats,
+ 'is_live': True,
+ }
+
+
+class RadikoRadioIE(RadikoBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?radiko\.jp/#!/live/(?P<id>[A-Z0-9-]+)'
+
+ _TESTS = [{
+ # QRR (文化放送) station provides <desc>
+ 'url': 'https://radiko.jp/#!/live/QRR',
+ 'only_matching': True,
+ }, {
+ # FMT (TOKYO FM) station does not provide <desc>
+ 'url': 'https://radiko.jp/#!/live/FMT',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://radiko.jp/#!/live/JOAK-FM',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ station = self._match_id(url)
+ self.report_warning('Downloader will not stop at the end of the program! Press Ctrl+C to stop')
+
+ auth_token, area_id = self._auth_client()
+ # get current time in JST (GMT+9:00 w/o DST)
+ vid_now = datetime.datetime.now(datetime.timezone(datetime.timedelta(hours=9)))
+ vid_now = calendar.timegm(vid_now.timetuple())
+
+ prog, station_program, ft, _, _ = self._find_program(station, station, vid_now)
+
+ title = prog.find('title').text
+ description = clean_html(prog.find('info').text)
+ station_name = station_program.find('.//name').text
+
+ formats = self._extract_formats(
+ video_id=station, station=station, is_onair=True,
+ ft=ft, cursor=vid_now, auth_token=auth_token, area_id=area_id,
+ query={})
+
+ return {
+ 'id': station,
+ 'title': title,
+ 'description': description,
+ 'uploader': station_name,
+ 'uploader_id': station,
+ 'timestamp': ft,
+ 'formats': formats,
+ 'is_live': True,
+ }
diff --git a/hypervideo_dl/extractor/radiocanada.py b/hypervideo_dl/extractor/radiocanada.py
index a28b1a2..4b4445c 100644
--- a/hypervideo_dl/extractor/radiocanada.py
+++ b/hypervideo_dl/extractor/radiocanada.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import compat_HTTPError
@@ -143,7 +142,7 @@ class RadioCanadaIE(InfoExtractor):
}
def _real_extract(self, url):
- return self._extract_info(*re.match(self._VALID_URL, url).groups())
+ return self._extract_info(*self._match_valid_url(url).groups())
class RadioCanadaAudioVideoIE(InfoExtractor):
diff --git a/hypervideo_dl/extractor/radiofrance.py b/hypervideo_dl/extractor/radiofrance.py
index a8afc00..082238b 100644
--- a/hypervideo_dl/extractor/radiofrance.py
+++ b/hypervideo_dl/extractor/radiofrance.py
@@ -23,7 +23,7 @@ class RadioFranceIE(InfoExtractor):
}
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
+ m = self._match_valid_url(url)
video_id = m.group('id')
webpage = self._download_webpage(url, video_id)
@@ -43,7 +43,7 @@ class RadioFranceIE(InfoExtractor):
'format_id': fm[0],
'url': fm[1],
'vcodec': 'none',
- 'preference': i,
+ 'quality': i,
}
for i, fm in
enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str))
diff --git a/hypervideo_dl/extractor/radlive.py b/hypervideo_dl/extractor/radlive.py
new file mode 100644
index 0000000..2de7ab0
--- /dev/null
+++ b/hypervideo_dl/extractor/radlive.py
@@ -0,0 +1,179 @@
+import json
+
+from ..utils import ExtractorError, traverse_obj, try_get, unified_timestamp
+from .common import InfoExtractor
+
+
+class RadLiveIE(InfoExtractor):
+ IE_NAME = 'radlive'
+ _VALID_URL = r'https?://(?:www\.)?rad\.live/content/(?P<content_type>feature|episode)/(?P<id>[a-f0-9-]+)'
+ _TESTS = [{
+ 'url': 'https://rad.live/content/feature/dc5acfbc-761b-4bec-9564-df999905116a',
+ 'md5': '6219d5d31d52de87d21c9cf5b7cb27ff',
+ 'info_dict': {
+ 'id': 'dc5acfbc-761b-4bec-9564-df999905116a',
+ 'ext': 'mp4',
+ 'title': 'Deathpact - Digital Mirage 2 [Full Set]',
+ 'language': 'en',
+ 'thumbnail': 'https://static.12core.net/cb65ae077a079c68380e38f387fbc438.png',
+ 'description': '',
+ 'release_timestamp': 1600185600.0,
+ 'channel': 'Proximity',
+ 'channel_id': '9ce6dd01-70a4-4d59-afb6-d01f807cd009',
+ 'channel_url': 'https://rad.live/content/channel/9ce6dd01-70a4-4d59-afb6-d01f807cd009',
+ }
+ }, {
+ 'url': 'https://rad.live/content/episode/bbcf66ec-0d02-4ca0-8dc0-4213eb2429bf',
+ 'md5': '40b2175f347592125d93e9a344080125',
+ 'info_dict': {
+ 'id': 'bbcf66ec-0d02-4ca0-8dc0-4213eb2429bf',
+ 'ext': 'mp4',
+ 'title': 'E01: Bad Jokes 1',
+ 'language': 'en',
+ 'thumbnail': 'https://lsp.littlstar.com/channels/WHISTLE/BAD_JOKES/SEASON_1/BAD_JOKES_101/poster.jpg',
+ 'description': 'Bad Jokes - Champions, Adam Pally, Super Troopers, Team Edge and 2Hype',
+ 'release_timestamp': None,
+ 'channel': None,
+ 'channel_id': None,
+ 'channel_url': None,
+ 'episode': 'E01: Bad Jokes 1',
+ 'episode_number': 1,
+ 'episode_id': '336',
+ },
+ }]
+
+ def _real_extract(self, url):
+ content_type, video_id = self._match_valid_url(url).groups()
+
+ webpage = self._download_webpage(url, video_id)
+
+ content_info = json.loads(self._search_regex(
+ r'<script[^>]*type=([\'"])application/json\1[^>]*>(?P<json>{.+?})</script>',
+ webpage, 'video info', group='json'))['props']['pageProps']['initialContentData']
+ video_info = content_info[content_type]
+
+ if not video_info:
+ raise ExtractorError('Unable to extract video info, make sure the URL is valid')
+
+ formats = self._extract_m3u8_formats(video_info['assets']['videos'][0]['url'], video_id)
+ self._sort_formats(formats)
+
+ data = video_info.get('structured_data', {})
+
+ release_date = unified_timestamp(traverse_obj(data, ('releasedEvent', 'startDate')))
+ channel = next(iter(content_info.get('channels', [])), {})
+ channel_id = channel.get('lrn', '').split(':')[-1] or None
+
+ result = {
+ 'id': video_id,
+ 'title': video_info['title'],
+ 'formats': formats,
+ 'language': traverse_obj(data, ('potentialAction', 'target', 'inLanguage')),
+ 'thumbnail': traverse_obj(data, ('image', 'contentUrl')),
+ 'description': data.get('description'),
+ 'release_timestamp': release_date,
+ 'channel': channel.get('name'),
+ 'channel_id': channel_id,
+ 'channel_url': f'https://rad.live/content/channel/{channel_id}' if channel_id else None,
+
+ }
+ if content_type == 'episode':
+ result.update({
+ # TODO: Get season number when downloading single episode
+ 'episode': video_info.get('title'),
+ 'episode_number': video_info.get('number'),
+ 'episode_id': video_info.get('id'),
+ })
+
+ return result
+
+
+class RadLiveSeasonIE(RadLiveIE):
+ IE_NAME = 'radlive:season'
+ _VALID_URL = r'https?://(?:www\.)?rad\.live/content/season/(?P<id>[a-f0-9-]+)'
+ _TESTS = [{
+ 'url': 'https://rad.live/content/season/08a290f7-c9ef-4e22-9105-c255995a2e75',
+ 'md5': '40b2175f347592125d93e9a344080125',
+ 'info_dict': {
+ 'id': '08a290f7-c9ef-4e22-9105-c255995a2e75',
+ 'title': 'Bad Jokes - Season 1',
+ },
+ 'playlist_mincount': 5,
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if RadLiveIE.suitable(url) else super(RadLiveSeasonIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ season_id = self._match_id(url)
+ webpage = self._download_webpage(url, season_id)
+
+ content_info = json.loads(self._search_regex(
+ r'<script[^>]*type=([\'"])application/json\1[^>]*>(?P<json>{.+?})</script>',
+ webpage, 'video info', group='json'))['props']['pageProps']['initialContentData']
+ video_info = content_info['season']
+
+ entries = [{
+ '_type': 'url_transparent',
+ 'id': episode['structured_data']['url'].split('/')[-1],
+ 'url': episode['structured_data']['url'],
+ 'series': try_get(content_info, lambda x: x['series']['title']),
+ 'season': video_info['title'],
+ 'season_number': video_info.get('number'),
+ 'season_id': video_info.get('id'),
+ 'ie_key': RadLiveIE.ie_key(),
+ } for episode in video_info['episodes']]
+
+ return self.playlist_result(entries, season_id, video_info.get('title'))
+
+
+class RadLiveChannelIE(RadLiveIE):
+ IE_NAME = 'radlive:channel'
+ _VALID_URL = r'https?://(?:www\.)?rad\.live/content/channel/(?P<id>[a-f0-9-]+)'
+ _TESTS = [{
+ 'url': 'https://rad.live/content/channel/5c4d8df4-6fa0-413c-81e3-873479b49274',
+ 'md5': '625156a08b7f2b0b849f234e664457ac',
+ 'info_dict': {
+ 'id': '5c4d8df4-6fa0-413c-81e3-873479b49274',
+ 'title': 'Whistle Sports',
+ },
+ 'playlist_mincount': 7,
+ }]
+
+ _QUERY = '''
+query WebChannelListing ($lrn: ID!) {
+ channel (id:$lrn) {
+ name
+ features {
+ structured_data
+ }
+ }
+}'''
+
+ @classmethod
+ def suitable(cls, url):
+ return False if RadLiveIE.suitable(url) else super(RadLiveChannelIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+
+ graphql = self._download_json(
+ 'https://content.mhq.12core.net/graphql', channel_id,
+ headers={'Content-Type': 'application/json'},
+ data=json.dumps({
+ 'query': self._QUERY,
+ 'variables': {'lrn': f'lrn:12core:media:content:channel:{channel_id}'}
+ }).encode('utf-8'))
+
+ data = traverse_obj(graphql, ('data', 'channel'))
+ if not data:
+ raise ExtractorError('Unable to extract video info, make sure the URL is valid')
+
+ entries = [{
+ '_type': 'url_transparent',
+ 'url': feature['structured_data']['url'],
+ 'ie_key': RadLiveIE.ie_key(),
+ } for feature in data['features']]
+
+ return self.playlist_result(entries, channel_id, data.get('name'))
diff --git a/hypervideo_dl/extractor/rai.py b/hypervideo_dl/extractor/rai.py
index 67b86fc..27cd018 100644
--- a/hypervideo_dl/extractor/rai.py
+++ b/hypervideo_dl/extractor/rai.py
@@ -5,15 +5,16 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_urlparse,
compat_str,
+ compat_urlparse,
)
from ..utils import (
- ExtractorError,
determine_ext,
+ ExtractorError,
find_xpath_attr,
fix_xml_ampersands,
GeoRestrictedError,
+ HEADRequest,
int_or_none,
parse_duration,
remove_start,
@@ -94,7 +95,9 @@ class RaiBaseIE(InfoExtractor):
})
if not formats and geoprotection is True:
- self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+ self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
+
+ formats.extend(self._create_http_urls(relinker_url, formats))
return dict((k, v) for k, v in {
'is_live': is_live,
@@ -102,6 +105,92 @@ class RaiBaseIE(InfoExtractor):
'formats': formats,
}.items() if v is not None)
+ def _create_http_urls(self, relinker_url, fmts):
+ _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\d+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?'
+ _MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s'
+ _QUALITY = {
+ # tbr: w, h
+ '250': [352, 198],
+ '400': [512, 288],
+ '700': [512, 288],
+ '800': [700, 394],
+ '1200': [736, 414],
+ '1800': [1024, 576],
+ '2400': [1280, 720],
+ '3200': [1440, 810],
+ '3600': [1440, 810],
+ '5000': [1920, 1080],
+ '10000': [1920, 1080],
+ }
+
+ def test_url(url):
+ resp = self._request_webpage(
+ HEADRequest(url), None, headers={'User-Agent': 'Rai'},
+ fatal=False, errnote=False, note=False)
+
+ if resp is False:
+ return False
+
+ if resp.code == 200:
+ return False if resp.url == url else resp.url
+ return None
+
+ def get_format_info(tbr):
+ import math
+ br = int_or_none(tbr)
+ if len(fmts) == 1 and not br:
+ br = fmts[0].get('tbr')
+ if br > 300:
+ tbr = compat_str(math.floor(br / 100) * 100)
+ else:
+ tbr = '250'
+
+ # try extracting info from available m3u8 formats
+ format_copy = None
+ for f in fmts:
+ if f.get('tbr'):
+ br_limit = math.floor(br / 100)
+ if br_limit - 1 <= math.floor(f['tbr'] / 100) <= br_limit + 1:
+ format_copy = f.copy()
+ return {
+ 'width': format_copy.get('width'),
+ 'height': format_copy.get('height'),
+ 'tbr': format_copy.get('tbr'),
+ 'vcodec': format_copy.get('vcodec'),
+ 'acodec': format_copy.get('acodec'),
+ 'fps': format_copy.get('fps'),
+ 'format_id': 'https-%s' % tbr,
+ } if format_copy else {
+ 'width': _QUALITY[tbr][0],
+ 'height': _QUALITY[tbr][1],
+ 'format_id': 'https-%s' % tbr,
+ 'tbr': int(tbr),
+ }
+
+ loc = test_url(_MP4_TMPL % (relinker_url, '*'))
+ if not isinstance(loc, compat_str):
+ return []
+
+ mobj = re.match(
+ _RELINKER_REG,
+ test_url(relinker_url) or '')
+ if not mobj:
+ return []
+
+ available_qualities = mobj.group('quality').split(',') if mobj.group('quality') else ['*']
+ available_qualities = [i for i in available_qualities if i]
+
+ formats = []
+ for q in available_qualities:
+ fmt = {
+ 'url': _MP4_TMPL % (relinker_url, q),
+ 'protocol': 'https',
+ 'ext': 'mp4',
+ }
+ fmt.update(get_format_info(q))
+ formats.append(fmt)
+ return formats
+
@staticmethod
def _extract_subtitles(url, video_data):
STL_EXT = 'stl'
@@ -152,22 +241,49 @@ class RaiPlayIE(RaiBaseIE):
'skip_download': True,
},
}, {
+ # 1080p direct mp4 url
+ 'url': 'https://www.raiplay.it/video/2021/03/Leonardo-S1E1-b5703b02-82ee-475a-85b6-c9e4a8adf642.html',
+ 'md5': '2e501e8651d72f05ffe8f5d286ad560b',
+ 'info_dict': {
+ 'id': 'b5703b02-82ee-475a-85b6-c9e4a8adf642',
+ 'ext': 'mp4',
+ 'title': 'Leonardo - S1E1',
+ 'alt_title': 'St 1 Ep 1 - Episodio 1',
+ 'description': 'md5:f5360cd267d2de146e4e3879a5a47d31',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Rai 1',
+ 'duration': 3229,
+ 'series': 'Leonardo',
+ 'season': 'Season 1',
+ },
+ }, {
'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?',
'only_matching': True,
}, {
# subtitles at 'subtitlesArray' key (see #27698)
'url': 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html',
'only_matching': True,
+ }, {
+ # DRM protected
+ 'url': 'https://www.raiplay.it/video/2020/09/Lo-straordinario-mondo-di-Zoey-S1E1-Lo-straordinario-potere-di-Zoey-ed493918-1d32-44b7-8454-862e473d00ff.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- base, video_id = re.match(self._VALID_URL, url).groups()
+ base, video_id = self._match_valid_url(url).groups()
media = self._download_json(
base + '.json', video_id, 'Downloading video JSON')
- title = media['name']
+ if not self.get_param('allow_unplayable_formats'):
+ if try_get(
+ media,
+ (lambda x: x['rights_management']['rights']['drm'],
+ lambda x: x['program_info']['rights_management']['rights']['drm']),
+ dict):
+ self.report_drm(video_id)
+ title = media['name']
video = media['video']
relinker_info = self._extract_relinker_info(video['content_url'], video_id)
@@ -247,7 +363,7 @@ class RaiPlayPlaylistIE(InfoExtractor):
}]
def _real_extract(self, url):
- base, playlist_id = re.match(self._VALID_URL, url).groups()
+ base, playlist_id = self._match_valid_url(url).groups()
program = self._download_json(
base + '.json', playlist_id, 'Downloading program JSON')
@@ -307,7 +423,7 @@ class RaiIE(RaiBaseIE):
}, {
# with ContentItem in og:url
'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html',
- 'md5': '6865dd00cf0bbf5772fdd89d59bd768a',
+ 'md5': '06345bd97c932f19ffb129973d07a020',
'info_dict': {
'id': 'efb17665-691c-45d5-a60c-5301333cbb0c',
'ext': 'mp4',
@@ -340,22 +456,6 @@ class RaiIE(RaiBaseIE):
'skip_download': True,
},
}, {
- # ContentItem in iframe (see #12652) and subtitle at 'subtitlesUrl' key
- 'url': 'http://www.presadiretta.rai.it/dl/portali/site/puntata/ContentItem-3ed19d13-26c2-46ff-a551-b10828262f1b.html',
- 'info_dict': {
- 'id': '1ad6dc64-444a-42a4-9bea-e5419ad2f5fd',
- 'ext': 'mp4',
- 'title': 'Partiti acchiappavoti - Presa diretta del 13/09/2015',
- 'description': 'md5:d291b03407ec505f95f27970c0b025f4',
- 'upload_date': '20150913',
- 'subtitles': {
- 'it': 'count:2',
- },
- },
- 'params': {
- 'skip_download': True,
- },
- }, {
# Direct MMS URL
'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html',
'only_matching': True,
diff --git a/hypervideo_dl/extractor/raywenderlich.py b/hypervideo_dl/extractor/raywenderlich.py
index 5411ece..f04d51f 100644
--- a/hypervideo_dl/extractor/raywenderlich.py
+++ b/hypervideo_dl/extractor/raywenderlich.py
@@ -72,7 +72,7 @@ class RayWenderlichIE(InfoExtractor):
return compat_str(video_id)
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
course_id, lesson_id = mobj.group('course_id', 'id')
display_id = '%s/%s' % (course_id, lesson_id)
diff --git a/hypervideo_dl/extractor/rbmaradio.py b/hypervideo_dl/extractor/rbmaradio.py
index ae7413f..9642fbb 100644
--- a/hypervideo_dl/extractor/rbmaradio.py
+++ b/hypervideo_dl/extractor/rbmaradio.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import compat_str
@@ -30,7 +29,7 @@ class RBMARadioIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
show_id = mobj.group('show_id')
episode_id = mobj.group('id')
diff --git a/hypervideo_dl/extractor/rcs.py b/hypervideo_dl/extractor/rcs.py
new file mode 100644
index 0000000..ace611b
--- /dev/null
+++ b/hypervideo_dl/extractor/rcs.py
@@ -0,0 +1,427 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ ExtractorError,
+ js_to_json,
+ base_url,
+ url_basename,
+ urljoin,
+)
+
+
+class RCSBaseIE(InfoExtractor):
+ # based on VideoPlayerLoader.prototype.getVideoSrc
+ # and VideoPlayerLoader.prototype.transformSrc from
+ # https://js2.corriereobjects.it/includes2013/LIBS/js/corriere_video.sjs
+ _ALL_REPLACE = {
+ 'media2vam.corriere.it.edgesuite.net':
+ 'media2vam-corriere-it.akamaized.net',
+ 'media.youreporter.it.edgesuite.net':
+ 'media-youreporter-it.akamaized.net',
+ 'corrierepmd.corriere.it.edgesuite.net':
+ 'corrierepmd-corriere-it.akamaized.net',
+ 'media2vam-corriere-it.akamaized.net/fcs.quotidiani/vr/videos/':
+ 'video.corriere.it/vr360/videos/',
+ '.net//': '.net/',
+ }
+ _MP4_REPLACE = {
+ 'media2vam.corbologna.corriere.it.edgesuite.net':
+ 'media2vam-bologna-corriere-it.akamaized.net',
+ 'media2vam.corfiorentino.corriere.it.edgesuite.net':
+ 'media2vam-fiorentino-corriere-it.akamaized.net',
+ 'media2vam.cormezzogiorno.corriere.it.edgesuite.net':
+ 'media2vam-mezzogiorno-corriere-it.akamaized.net',
+ 'media2vam.corveneto.corriere.it.edgesuite.net':
+ 'media2vam-veneto-corriere-it.akamaized.net',
+ 'media2.oggi.it.edgesuite.net':
+ 'media2-oggi-it.akamaized.net',
+ 'media2.quimamme.it.edgesuite.net':
+ 'media2-quimamme-it.akamaized.net',
+ 'media2.amica.it.edgesuite.net':
+ 'media2-amica-it.akamaized.net',
+ 'media2.living.corriere.it.edgesuite.net':
+ 'media2-living-corriere-it.akamaized.net',
+ 'media2.style.corriere.it.edgesuite.net':
+ 'media2-style-corriere-it.akamaized.net',
+ 'media2.iodonna.it.edgesuite.net':
+ 'media2-iodonna-it.akamaized.net',
+ 'media2.leitv.it.edgesuite.net':
+ 'media2-leitv-it.akamaized.net',
+ }
+ _MIGRATION_MAP = {
+ 'videoamica-vh.akamaihd': 'amica',
+ 'media2-amica-it.akamaized': 'amica',
+ 'corrierevam-vh.akamaihd': 'corriere',
+ 'media2vam-corriere-it.akamaized': 'corriere',
+ 'cormezzogiorno-vh.akamaihd': 'corrieredelmezzogiorno',
+ 'media2vam-mezzogiorno-corriere-it.akamaized': 'corrieredelmezzogiorno',
+ 'corveneto-vh.akamaihd': 'corrieredelveneto',
+ 'media2vam-veneto-corriere-it.akamaized': 'corrieredelveneto',
+ 'corbologna-vh.akamaihd': 'corrieredibologna',
+ 'media2vam-bologna-corriere-it.akamaized': 'corrieredibologna',
+ 'corfiorentino-vh.akamaihd': 'corrierefiorentino',
+ 'media2vam-fiorentino-corriere-it.akamaized': 'corrierefiorentino',
+ 'corinnovazione-vh.akamaihd': 'corriereinnovazione',
+ 'media2-gazzanet-gazzetta-it.akamaized': 'gazzanet',
+ 'videogazzanet-vh.akamaihd': 'gazzanet',
+ 'videogazzaworld-vh.akamaihd': 'gazzaworld',
+ 'gazzettavam-vh.akamaihd': 'gazzetta',
+ 'media2vam-gazzetta-it.akamaized': 'gazzetta',
+ 'videoiodonna-vh.akamaihd': 'iodonna',
+ 'media2-leitv-it.akamaized': 'leitv',
+ 'videoleitv-vh.akamaihd': 'leitv',
+ 'videoliving-vh.akamaihd': 'living',
+ 'media2-living-corriere-it.akamaized': 'living',
+ 'media2-oggi-it.akamaized': 'oggi',
+ 'videooggi-vh.akamaihd': 'oggi',
+ 'media2-quimamme-it.akamaized': 'quimamme',
+ 'quimamme-vh.akamaihd': 'quimamme',
+ 'videorunning-vh.akamaihd': 'running',
+ 'media2-style-corriere-it.akamaized': 'style',
+ 'style-vh.akamaihd': 'style',
+ 'videostyle-vh.akamaihd': 'style',
+ 'media2-stylepiccoli-it.akamaized': 'stylepiccoli',
+ 'stylepiccoli-vh.akamaihd': 'stylepiccoli',
+ 'doveviaggi-vh.akamaihd': 'viaggi',
+ 'media2-doveviaggi-it.akamaized': 'viaggi',
+ 'media2-vivimilano-corriere-it.akamaized': 'vivimilano',
+ 'vivimilano-vh.akamaihd': 'vivimilano',
+ 'media2-youreporter-it.akamaized': 'youreporter'
+ }
+ _MIGRATION_MEDIA = {
+ 'advrcs-vh.akamaihd': '',
+ 'corriere-f.akamaihd': '',
+ 'corrierepmd-corriere-it.akamaized': '',
+ 'corrprotetto-vh.akamaihd': '',
+ 'gazzetta-f.akamaihd': '',
+ 'gazzettapmd-gazzetta-it.akamaized': '',
+ 'gazzprotetto-vh.akamaihd': '',
+ 'periodici-f.akamaihd': '',
+ 'periodicisecure-vh.akamaihd': '',
+ 'videocoracademy-vh.akamaihd': ''
+ }
+
+ def _get_video_src(self, video):
+ mediaFiles = video.get('mediaProfile').get('mediaFile')
+ src = {}
+ # audio
+ if video.get('mediaType') == 'AUDIO':
+ for aud in mediaFiles:
+ # todo: check
+ src['mp3'] = aud.get('value')
+ # video
+ else:
+ for vid in mediaFiles:
+ if vid.get('mimeType') == 'application/vnd.apple.mpegurl':
+ src['m3u8'] = vid.get('value')
+ if vid.get('mimeType') == 'video/mp4':
+ src['mp4'] = vid.get('value')
+
+ # replace host
+ for t in src:
+ for s, r in self._ALL_REPLACE.items():
+ src[t] = src[t].replace(s, r)
+ for s, r in self._MP4_REPLACE.items():
+ src[t] = src[t].replace(s, r)
+
+ # switch cdn
+ if 'mp4' in src and 'm3u8' in src:
+ if ('-lh.akamaihd' not in src.get('m3u8')
+ and 'akamai' in src.get('mp4')):
+ if 'm3u8' in src:
+ matches = re.search(r'(?:https*:)?\/\/(?P<host>.*)\.net\/i(?P<path>.*)$', src.get('m3u8'))
+ src['m3u8'] = 'https://vod.rcsobjects.it/hls/%s%s' % (
+ self._MIGRATION_MAP[matches.group('host')],
+ matches.group('path').replace(
+ '///', '/').replace(
+ '//', '/').replace(
+ '.csmil', '.urlset'
+ )
+ )
+ if 'mp4' in src:
+ matches = re.search(r'(?:https*:)?\/\/(?P<host>.*)\.net\/i(?P<path>.*)$', src.get('mp4'))
+ if matches:
+ if matches.group('host') in self._MIGRATION_MEDIA:
+ vh_stream = 'https://media2.corriereobjects.it'
+ if src.get('mp4').find('fcs.quotidiani_!'):
+ vh_stream = 'https://media2-it.corriereobjects.it'
+ src['mp4'] = '%s%s' % (
+ vh_stream,
+ matches.group('path').replace(
+ '///', '/').replace(
+ '//', '/').replace(
+ '/fcs.quotidiani/mediacenter', '').replace(
+ '/fcs.quotidiani_!/mediacenter', '').replace(
+ 'corriere/content/mediacenter/', '').replace(
+ 'gazzetta/content/mediacenter/', '')
+ )
+ else:
+ src['mp4'] = 'https://vod.rcsobjects.it/%s%s' % (
+ self._MIGRATION_MAP[matches.group('host')],
+ matches.group('path').replace('///', '/').replace('//', '/')
+ )
+
+ if 'mp3' in src:
+ src['mp3'] = src.get('mp3').replace(
+ 'media2vam-corriere-it.akamaized.net',
+ 'vod.rcsobjects.it/corriere')
+ if 'mp4' in src:
+ if src.get('mp4').find('fcs.quotidiani_!'):
+ src['mp4'] = src.get('mp4').replace('vod.rcsobjects', 'vod-it.rcsobjects')
+ if 'm3u8' in src:
+ if src.get('m3u8').find('fcs.quotidiani_!'):
+ src['m3u8'] = src.get('m3u8').replace('vod.rcsobjects', 'vod-it.rcsobjects')
+
+ if 'geoblocking' in video.get('mediaProfile'):
+ if 'm3u8' in src:
+ src['m3u8'] = src.get('m3u8').replace('vod.rcsobjects', 'vod-it.rcsobjects')
+ if 'mp4' in src:
+ src['mp4'] = src.get('mp4').replace('vod.rcsobjects', 'vod-it.rcsobjects')
+ if 'm3u8' in src:
+ if src.get('m3u8').find('csmil') and src.get('m3u8').find('vod'):
+ src['m3u8'] = src.get('m3u8').replace('.csmil', '.urlset')
+
+ return src
+
+ def _create_formats(self, urls, video_id):
+ formats = []
+ formats = self._extract_m3u8_formats(
+ urls.get('m3u8'), video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False)
+
+ if urls.get('mp4'):
+ formats.append({
+ 'format_id': 'http-mp4',
+ 'url': urls['mp4']
+ })
+ self._sort_formats(formats)
+ return formats
+
+ def _real_extract(self, url):
+ mobj = self._match_valid_url(url)
+ video_id = mobj.group('id')
+
+ if 'cdn' not in mobj.groupdict():
+ raise ExtractorError('CDN not found in url: %s' % url)
+
+ # for leitv/youreporter/viaggi don't use the embed page
+ if ((mobj.group('cdn') not in ['leitv.it', 'youreporter.it'])
+ and (mobj.group('vid') == 'video')):
+ url = 'https://video.%s/video-embed/%s' % (mobj.group('cdn'), video_id)
+
+ page = self._download_webpage(url, video_id)
+
+ video_data = None
+ # look for json video data url
+ json = self._search_regex(
+ r'''(?x)url\s*=\s*(["'])
+ (?P<url>
+ (?:https?:)?//video\.rcs\.it
+ /fragment-includes/video-includes/.+?\.json
+ )\1;''',
+ page, video_id, group='url', default=None)
+ if json:
+ if json.startswith('//'):
+ json = 'https:%s' % json
+ video_data = self._download_json(json, video_id)
+
+ # if json url not found, look for json video data directly in the page
+ else:
+ # RCS normal pages and most of the embeds
+ json = self._search_regex(
+ r'[\s;]video\s*=\s*({[\s\S]+?})(?:;|,playlist=)',
+ page, video_id, default=None)
+ if not json and 'video-embed' in url:
+ page = self._download_webpage(url.replace('video-embed', 'video-json'), video_id)
+ json = self._search_regex(
+ r'##start-video##({[\s\S]+?})##end-video##',
+ page, video_id, default=None)
+ if not json:
+ # if no video data found try search for iframes
+ emb = RCSEmbedsIE._extract_url(page)
+ if emb:
+ return {
+ '_type': 'url_transparent',
+ 'url': emb,
+ 'ie_key': RCSEmbedsIE.ie_key()
+ }
+ if json:
+ video_data = self._parse_json(
+ json, video_id, transform_source=js_to_json)
+
+ if not video_data:
+ raise ExtractorError('Video data not found in the page')
+
+ formats = self._create_formats(
+ self._get_video_src(video_data), video_id)
+
+ description = (video_data.get('description')
+ or clean_html(video_data.get('htmlDescription'))
+ or self._html_search_meta('description', page))
+ uploader = video_data.get('provider') or mobj.group('cdn')
+
+ return {
+ 'id': video_id,
+ 'title': video_data.get('title'),
+ 'description': description,
+ 'uploader': uploader,
+ 'formats': formats
+ }
+
+
+class RCSEmbedsIE(RCSBaseIE):
+ _VALID_URL = r'''(?x)
+ https?://(?P<vid>video)\.
+ (?P<cdn>
+ (?:
+ rcs|
+ (?:corriere\w+\.)?corriere|
+ (?:gazzanet\.)?gazzetta
+ )\.it)
+ /video-embed/(?P<id>[^/=&\?]+?)(?:$|\?)'''
+ _TESTS = [{
+ 'url': 'https://video.rcs.it/video-embed/iodonna-0001585037',
+ 'md5': '623ecc8ffe7299b2d0c1046d8331a9df',
+ 'info_dict': {
+ 'id': 'iodonna-0001585037',
+ 'ext': 'mp4',
+ 'title': 'Sky Arte racconta Madonna nella serie "Artist to icon"',
+ 'description': 'md5:65b09633df9ffee57f48b39e34c9e067',
+ 'uploader': 'rcs.it',
+ }
+ }, {
+ # redownload the page changing 'video-embed' in 'video-json'
+ 'url': 'https://video.gazzanet.gazzetta.it/video-embed/gazzanet-mo05-0000260789',
+ 'md5': 'a043e3fecbe4d9ed7fc5d888652a5440',
+ 'info_dict': {
+ 'id': 'gazzanet-mo05-0000260789',
+ 'ext': 'mp4',
+ 'title': 'Valentino Rossi e papà Graziano si divertono col drifting',
+ 'description': 'md5:a8bf90d6adafd9815f70fc74c0fc370a',
+ 'uploader': 'rcd',
+ }
+ }, {
+ 'url': 'https://video.corriere.it/video-embed/b727632a-f9d0-11ea-91b0-38d50a849abb?player',
+ 'match_only': True
+ }, {
+ 'url': 'https://video.gazzetta.it/video-embed/49612410-00ca-11eb-bcd8-30d4253e0140',
+ 'match_only': True
+ }]
+
+ @staticmethod
+ def _sanitize_urls(urls):
+ # add protocol if missing
+ for i, e in enumerate(urls):
+ if e.startswith('//'):
+ urls[i] = 'https:%s' % e
+ # clean iframes urls
+ for i, e in enumerate(urls):
+ urls[i] = urljoin(base_url(e), url_basename(e))
+ return urls
+
+ @staticmethod
+ def _extract_urls(webpage):
+ entries = [
+ mobj.group('url')
+ for mobj in re.finditer(r'''(?x)
+ (?:
+ data-frame-src=|
+ <iframe[^\n]+src=
+ )
+ (["'])
+ (?P<url>(?:https?:)?//video\.
+ (?:
+ rcs|
+ (?:corriere\w+\.)?corriere|
+ (?:gazzanet\.)?gazzetta
+ )
+ \.it/video-embed/.+?)
+ \1''', webpage)]
+ return RCSEmbedsIE._sanitize_urls(entries)
+
+ @staticmethod
+ def _extract_url(webpage):
+ urls = RCSEmbedsIE._extract_urls(webpage)
+ return urls[0] if urls else None
+
+
+class RCSIE(RCSBaseIE):
+ _VALID_URL = r'''(?x)https?://(?P<vid>video|viaggi)\.
+ (?P<cdn>
+ (?:
+ corrieredelmezzogiorno\.
+ |corrieredelveneto\.
+ |corrieredibologna\.
+ |corrierefiorentino\.
+ )?corriere\.it
+ |(?:gazzanet\.)?gazzetta\.it)
+ /(?!video-embed/).+?/(?P<id>[^/\?]+)(?=\?|/$|$)'''
+ _TESTS = [{
+ 'url': 'https://video.corriere.it/sport/formula-1/vettel-guida-ferrari-sf90-mugello-suo-fianco-c-elecrerc-bendato-video-esilarante/b727632a-f9d0-11ea-91b0-38d50a849abb',
+ 'md5': '0f4ededc202b0f00b6e509d831e2dcda',
+ 'info_dict': {
+ 'id': 'b727632a-f9d0-11ea-91b0-38d50a849abb',
+ 'ext': 'mp4',
+ 'title': 'Vettel guida la Ferrari SF90 al Mugello e al suo fianco c\'è Leclerc (bendato): il video è esilarante',
+ 'description': 'md5:93b51c9161ac8a64fb2f997b054d0152',
+ 'uploader': 'Corriere Tv',
+ }
+ }, {
+ # video data inside iframe
+ 'url': 'https://viaggi.corriere.it/video/norvegia-il-nuovo-ponte-spettacolare-sopra-la-cascata-di-voringsfossen/',
+ 'md5': 'da378e4918d2afbf7d61c35abb948d4c',
+ 'info_dict': {
+ 'id': '5b7cd134-e2c1-11ea-89b3-b56dd0df2aa2',
+ 'ext': 'mp4',
+ 'title': 'La nuova spettacolare attrazione in Norvegia: il ponte sopra Vøringsfossen',
+ 'description': 'md5:18b35a291f6746c0c8dacd16e5f5f4f8',
+ 'uploader': 'DOVE Viaggi',
+ }
+ }, {
+ 'url': 'https://video.gazzetta.it/video-motogp-catalogna-cadute-dovizioso-vale-rossi/49612410-00ca-11eb-bcd8-30d4253e0140?vclk=Videobar',
+ 'md5': 'eedc1b5defd18e67383afef51ff7bdf9',
+ 'info_dict': {
+ 'id': '49612410-00ca-11eb-bcd8-30d4253e0140',
+ 'ext': 'mp4',
+ 'title': 'Dovizioso, il contatto con Zarco e la caduta. E anche Vale finisce a terra',
+ 'description': 'md5:8c6e905dc3b9413218beca11ebd69778',
+ 'uploader': 'AMorici',
+ }
+ }, {
+ 'url': 'https://video.corriere.it/video-360/metro-copenaghen-tutta-italiana/a248a7f0-e2db-11e9-9830-af2de6b1f945',
+ 'match_only': True
+ }]
+
+
+class RCSVariousIE(RCSBaseIE):
+ _VALID_URL = r'''(?x)https?://www\.
+ (?P<cdn>
+ leitv\.it|
+ youreporter\.it
+ )/(?:[^/]+/)?(?P<id>[^/]+?)(?:$|\?|/)'''
+ _TESTS = [{
+ 'url': 'https://www.leitv.it/benessere/mal-di-testa-come-combatterlo-ed-evitarne-la-comparsa/',
+ 'md5': '92b4e63667b8f95acb0a04da25ae28a1',
+ 'info_dict': {
+ 'id': 'mal-di-testa-come-combatterlo-ed-evitarne-la-comparsa',
+ 'ext': 'mp4',
+ 'title': 'Cervicalgia e mal di testa, il video con i suggerimenti dell\'esperto',
+ 'description': 'md5:ae21418f34cee0b8d02a487f55bcabb5',
+ 'uploader': 'leitv.it',
+ }
+ }, {
+ 'url': 'https://www.youreporter.it/fiume-sesia-3-ottobre-2020/',
+ 'md5': '8dccd436b47a830bab5b4a88232f391a',
+ 'info_dict': {
+ 'id': 'fiume-sesia-3-ottobre-2020',
+ 'ext': 'mp4',
+ 'title': 'Fiume Sesia 3 ottobre 2020',
+ 'description': 'md5:0070eef1cc884d13c970a4125063de55',
+ 'uploader': 'youreporter.it',
+ }
+ }]
diff --git a/hypervideo_dl/extractor/rcti.py b/hypervideo_dl/extractor/rcti.py
new file mode 100644
index 0000000..31d9779
--- /dev/null
+++ b/hypervideo_dl/extractor/rcti.py
@@ -0,0 +1,354 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+import json
+import random
+import time
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+ dict_get,
+ ExtractorError,
+ strip_or_none,
+ try_get
+)
+
+
class RCTIPlusBaseIE(InfoExtractor):
    def _real_initialize(self):
        # Obtain a visitor access token used to authorize every API call.
        # platform can be web, mweb, android, ios
        self._AUTH_KEY = self._download_json(
            'https://api.rctiplus.com/api/v1/visitor?platform=web',
            None, 'Fetching authorization key')['data']['access_token']

    def _call_api(self, url, video_id, note=None):
        """GET *url* with the visitor token and return (data, meta).

        Raises ExtractorError when the API reports a non-zero status code.
        (Local renamed from `json` to avoid shadowing the stdlib module name.)
        """
        resp = self._download_json(
            url, video_id, note=note, headers={'Authorization': self._AUTH_KEY})
        if resp.get('status', {}).get('code', 0) != 0:
            raise ExtractorError('%s said: %s' % (self.IE_NAME, resp['status']['message_client']), cause=resp)
        return resp.get('data'), resp.get('meta')
+
+
+class RCTIPlusIE(RCTIPlusBaseIE):
+ _VALID_URL = r'https://www\.rctiplus\.com/(?:programs/\d+?/.*?/)?(?P<type>episode|clip|extra|live-event|missed-event)/(?P<id>\d+)/(?P<display_id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://www.rctiplus.com/programs/1259/kiko-untuk-lola/episode/22124/untuk-lola',
+ 'md5': '56ed45affad45fa18d5592a1bc199997',
+ 'info_dict': {
+ 'id': 'v_e22124',
+ 'title': 'Untuk Lola',
+ 'display_id': 'untuk-lola',
+ 'description': 'md5:2b809075c0b1e071e228ad6d13e41deb',
+ 'ext': 'mp4',
+ 'duration': 1400,
+ 'timestamp': 1615978800,
+ 'upload_date': '20210317',
+ 'series': 'Kiko : Untuk Lola',
+ 'season_number': 1,
+ 'episode_number': 1,
+ 'channel': 'RCTI',
+ },
+ 'params': {
+ 'fixup': 'never',
+ },
+ }, { # Clip; Series title doesn't appear on metadata JSON
+ 'url': 'https://www.rctiplus.com/programs/316/cahaya-terindah/clip/3921/make-a-wish',
+ 'md5': 'd179b2ff356f0e91a53bcc6a4d8504f0',
+ 'info_dict': {
+ 'id': 'v_c3921',
+ 'title': 'Make A Wish',
+ 'display_id': 'make-a-wish',
+ 'description': 'Make A Wish',
+ 'ext': 'mp4',
+ 'duration': 288,
+ 'timestamp': 1571652600,
+ 'upload_date': '20191021',
+ 'series': 'Cahaya Terindah',
+ 'channel': 'RCTI',
+ },
+ 'params': {
+ 'fixup': 'never',
+ },
+ }, { # Extra
+ 'url': 'https://www.rctiplus.com/programs/616/inews-malam/extra/9438/diungkapkan-melalui-surat-terbuka-ceo-ruangguru-belva-devara-mundur-dari-staf-khusus-presiden',
+ 'md5': 'c48106afdbce609749f5e0c007d9278a',
+ 'info_dict': {
+ 'id': 'v_ex9438',
+ 'title': 'md5:2ede828c0f8bde249e0912be150314ca',
+ 'display_id': 'md5:62b8d4e9ff096db527a1ad797e8a9933',
+ 'description': 'md5:2ede828c0f8bde249e0912be150314ca',
+ 'ext': 'mp4',
+ 'duration': 93,
+ 'timestamp': 1587561540,
+ 'upload_date': '20200422',
+ 'series': 'iNews Malam',
+ 'channel': 'INews',
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ }, { # Missed event/replay
+ 'url': 'https://www.rctiplus.com/missed-event/2507/mou-signing-ceremony-27-juli-2021-1400-wib',
+ 'md5': '649c5f27250faed1452ca8b91e06922d',
+ 'info_dict': {
+ 'id': 'v_pe2507',
+ 'title': 'MOU Signing Ceremony | 27 Juli 2021 | 14.00 WIB',
+ 'display_id': 'mou-signing-ceremony-27-juli-2021-1400-wib',
+ 'ext': 'mp4',
+ 'timestamp': 1627142400,
+ 'upload_date': '20210724',
+ 'was_live': True,
+ 'release_timestamp': 1627369200,
+ },
+ 'params': {
+ 'fixup': 'never',
+ },
+ }, { # Live event; Cloudfront CDN
+ 'url': 'https://www.rctiplus.com/live-event/2530/dai-muda-charging-imun-dengan-iman-4-agustus-2021-1600-wib',
+ 'info_dict': {
+ 'id': 'v_le2530',
+ 'title': 'Dai Muda : Charging Imun dengan Iman | 4 Agustus 2021 | 16.00 WIB',
+ 'display_id': 'dai-muda-charging-imun-dengan-iman-4-agustus-2021-1600-wib',
+ 'ext': 'mp4',
+ 'timestamp': 1627898400,
+ 'upload_date': '20210802',
+ 'release_timestamp': 1628067600,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'This live event has ended.',
+ }, { # TV; live_at is null
+ 'url': 'https://www.rctiplus.com/live-event/1/rcti',
+ 'info_dict': {
+ 'id': 'v_lt1',
+ 'title': 'RCTI',
+ 'display_id': 'rcti',
+ 'ext': 'mp4',
+ 'timestamp': 1546344000,
+ 'upload_date': '20190101',
+ 'is_live': True,
+ },
+ 'params': {
+ 'skip_download': True,
+ 'format': 'bestvideo',
+ },
+ }]
+ _CONVIVA_JSON_TEMPLATE = {
+ 't': 'CwsSessionHb',
+ 'cid': 'ff84ae928c3b33064b76dec08f12500465e59a6f',
+ 'clid': '0',
+ 'sid': 0,
+ 'seq': 0,
+ 'caps': 0,
+ 'sf': 7,
+ 'sdk': True,
+ }
+
+ def _real_extract(self, url):
+ match = self._match_valid_url(url).groupdict()
+ video_type, video_id, display_id = match['type'], match['id'], match['display_id']
+
+ url_api_version = 'v2' if video_type == 'missed-event' else 'v1'
+ appier_id = '23984824_' + str(random.randint(0, 10000000000)) # Based on the webpage's uuidRandom generator
+ video_json = self._call_api(
+ f'https://api.rctiplus.com/api/{url_api_version}/{video_type}/{video_id}/url?appierid={appier_id}', display_id, 'Downloading video URL JSON')[0]
+ video_url = video_json['url']
+
+ is_upcoming = try_get(video_json, lambda x: x['current_date'] < x['live_at'])
+ if is_upcoming is None:
+ is_upcoming = try_get(video_json, lambda x: x['current_date'] < x['start_date'])
+ if is_upcoming:
+ self.raise_no_formats(
+ 'This event will start at %s.' % video_json['live_label'] if video_json.get('live_label') else 'This event has not started yet.', expected=True)
+ if 'akamaized' in video_url:
+ # For some videos hosted on Akamai's CDN (possibly AES-encrypted ones?), a session needs to at least be made via Conviva's API
+ conviva_json_data = {
+ **self._CONVIVA_JSON_TEMPLATE,
+ 'url': video_url,
+ 'sst': int(time.time())
+ }
+ conviva_json_res = self._download_json(
+ 'https://ff84ae928c3b33064b76dec08f12500465e59a6f.cws.conviva.com/0/wsg', display_id,
+ 'Creating Conviva session', 'Failed to create Conviva session',
+ fatal=False, data=json.dumps(conviva_json_data).encode('utf-8'))
+ if conviva_json_res and conviva_json_res.get('err') != 'ok':
+ self.report_warning('Conviva said: %s' % str(conviva_json_res.get('err')))
+
+ video_meta, meta_paths = self._call_api(
+ 'https://api.rctiplus.com/api/v1/%s/%s' % (video_type, video_id), display_id, 'Downloading video metadata')
+
+ thumbnails, image_path = [], meta_paths.get('image_path', 'https://rstatic.akamaized.net/media/')
+ if video_meta.get('portrait_image'):
+ thumbnails.append({
+ 'id': 'portrait_image',
+ 'url': '%s%d%s' % (image_path, 2000, video_meta['portrait_image']) # 2000px seems to be the highest resolution that can be given
+ })
+ if video_meta.get('landscape_image'):
+ thumbnails.append({
+ 'id': 'landscape_image',
+ 'url': '%s%d%s' % (image_path, 2000, video_meta['landscape_image'])
+ })
+ try:
+ formats = self._extract_m3u8_formats(video_url, display_id, 'mp4', headers={'Referer': 'https://www.rctiplus.com/'})
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ self.raise_geo_restricted(countries=['ID'], metadata_available=True)
+ else:
+ raise e
+ for f in formats:
+ if 'akamaized' in f['url'] or 'cloudfront' in f['url']:
+ f.setdefault('http_headers', {})['Referer'] = 'https://www.rctiplus.com/' # Referer header is required for akamai/cloudfront CDNs
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_meta.get('product_id') or video_json.get('product_id'),
+ 'title': dict_get(video_meta, ('title', 'name')) or dict_get(video_json, ('content_name', 'assets_name')),
+ 'display_id': display_id,
+ 'description': video_meta.get('summary'),
+ 'timestamp': video_meta.get('release_date') or video_json.get('start_date'),
+ 'duration': video_meta.get('duration'),
+ 'categories': [video_meta['genre']] if video_meta.get('genre') else None,
+ 'average_rating': video_meta.get('star_rating'),
+ 'series': video_meta.get('program_title') or video_json.get('program_title'),
+ 'season_number': video_meta.get('season'),
+ 'episode_number': video_meta.get('episode'),
+ 'channel': video_json.get('tv_name'),
+ 'channel_id': video_json.get('tv_id'),
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ 'is_live': video_type == 'live-event' and not is_upcoming,
+ 'was_live': video_type == 'missed-event',
+ 'live_status': 'is_upcoming' if is_upcoming else None,
+ 'release_timestamp': video_json.get('live_at'),
+ }
+
+
+class RCTIPlusSeriesIE(RCTIPlusBaseIE):
+ _VALID_URL = r'https://www\.rctiplus\.com/programs/(?P<id>\d+)/(?P<display_id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://www.rctiplus.com/programs/540/upin-ipin',
+ 'playlist_mincount': 417,
+ 'info_dict': {
+ 'id': '540',
+ 'title': 'Upin & Ipin',
+ 'description': 'md5:22cc912381f389664416844e1ec4f86b',
+ },
+ }, {
+ 'url': 'https://www.rctiplus.com/programs/540/upin-ipin/episodes?utm_source=Rplusdweb&utm_medium=share_copy&utm_campaign=programsupin-ipin',
+ 'only_matching': True,
+ }]
+ _AGE_RATINGS = { # Based off https://id.wikipedia.org/wiki/Sistem_rating_konten_televisi with additional ratings
+ 'S-SU': 2,
+ 'SU': 2,
+ 'P': 2,
+ 'A': 7,
+ 'R': 13,
+ 'R-R/1': 17, # Labelled as 17+ despite being R
+ 'D': 18,
+ }
+
+ @classmethod
+ def suitable(cls, url):
+ return False if RCTIPlusIE.suitable(url) else super(RCTIPlusSeriesIE, cls).suitable(url)
+
+ def _entries(self, url, display_id=None, note='Downloading entries JSON', metadata={}):
+ total_pages = 0
+ try:
+ total_pages = self._call_api(
+ '%s&length=20&page=0' % url,
+ display_id, note)[1]['pagination']['total_page']
+ except ExtractorError as e:
+ if 'not found' in str(e):
+ return []
+ raise e
+ if total_pages <= 0:
+ return []
+
+ for page_num in range(1, total_pages + 1):
+ episode_list = self._call_api(
+ '%s&length=20&page=%s' % (url, page_num),
+ display_id, '%s page %s' % (note, page_num))[0] or []
+
+ for video_json in episode_list:
+ link = video_json['share_link']
+ url_res = self.url_result(link, 'RCTIPlus', video_json.get('product_id'), video_json.get('title'))
+ url_res.update(metadata)
+ yield url_res
+
+ def _real_extract(self, url):
+ series_id, display_id = self._match_valid_url(url).groups()
+
+ series_meta, meta_paths = self._call_api(
+ 'https://api.rctiplus.com/api/v1/program/%s/detail' % series_id, display_id, 'Downloading series metadata')
+ metadata = {
+ 'age_limit': try_get(series_meta, lambda x: self._AGE_RATINGS[x['age_restriction'][0]['code']])
+ }
+
+ cast = []
+ for star in series_meta.get('starring', []):
+ cast.append(strip_or_none(star.get('name')))
+ for star in series_meta.get('creator', []):
+ cast.append(strip_or_none(star.get('name')))
+ for star in series_meta.get('writer', []):
+ cast.append(strip_or_none(star.get('name')))
+ metadata['cast'] = cast
+
+ tags = []
+ for tag in series_meta.get('tag', []):
+ tags.append(strip_or_none(tag.get('name')))
+ metadata['tag'] = tags
+
+ entries = []
+ seasons_list = self._call_api(
+ 'https://api.rctiplus.com/api/v1/program/%s/season' % series_id, display_id, 'Downloading seasons list JSON')[0]
+ for season in seasons_list:
+ entries.append(self._entries('https://api.rctiplus.com/api/v2/program/%s/episode?season=%s' % (series_id, season['season']),
+ display_id, 'Downloading season %s episode entries' % season['season'], metadata))
+
+ entries.append(self._entries('https://api.rctiplus.com/api/v2/program/%s/clip?content_id=0' % series_id,
+ display_id, 'Downloading clip entries', metadata))
+ entries.append(self._entries('https://api.rctiplus.com/api/v2/program/%s/extra?content_id=0' % series_id,
+ display_id, 'Downloading extra entries', metadata))
+
+ return self.playlist_result(itertools.chain(*entries), series_id, series_meta.get('title'), series_meta.get('summary'), **metadata)
+
+
+class RCTIPlusTVIE(RCTIPlusBaseIE):
+ _VALID_URL = r'https://www\.rctiplus\.com/((tv/(?P<tvname>\w+))|(?P<eventname>live-event|missed-event))'
+ _TESTS = [{
+ 'url': 'https://www.rctiplus.com/tv/rcti',
+ 'info_dict': {
+ 'id': 'v_lt1',
+ 'title': 'RCTI',
+ 'ext': 'mp4',
+ 'timestamp': 1546344000,
+ 'upload_date': '20190101',
+ },
+ 'params': {
+ 'skip_download': True,
+ 'format': 'bestvideo',
+ }
+ }, {
+ # Returned video will always change
+ 'url': 'https://www.rctiplus.com/live-event',
+ 'only_matching': True,
+ }, {
+ # Returned video will also always change
+ 'url': 'https://www.rctiplus.com/missed-event',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if RCTIPlusIE.suitable(url) else super(RCTIPlusTVIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ match = self._match_valid_url(url).groupdict()
+ tv_id = match.get('tvname') or match.get('eventname')
+ webpage = self._download_webpage(url, tv_id)
+ video_type, video_id = self._search_regex(
+ r'url\s*:\s*["\']https://api\.rctiplus\.com/api/v./(?P<type>[^/]+)/(?P<id>\d+)/url', webpage, 'video link', group=('type', 'id'))
+ return self.url_result(f'https://www.rctiplus.com/{video_type}/{video_id}/{tv_id}', 'RCTIPlus')
diff --git a/hypervideo_dl/extractor/redbulltv.py b/hypervideo_dl/extractor/redbulltv.py
index 6d000b3..e7fdcce 100644
--- a/hypervideo_dl/extractor/redbulltv.py
+++ b/hypervideo_dl/extractor/redbulltv.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import compat_HTTPError
@@ -161,7 +160,7 @@ class RedBullTVRrnContentIE(InfoExtractor):
}]
def _real_extract(self, url):
- region, lang, rrn_id = re.search(self._VALID_URL, url).groups()
+ region, lang, rrn_id = self._match_valid_url(url).groups()
rrn_id += ':%s-%s' % (lang, region.upper())
return self.url_result(
'https://www.redbull.com/embed/' + rrn_id,
@@ -204,7 +203,7 @@ class RedBullIE(InfoExtractor):
_LAT_FALLBACK_MAP = ['ar', 'bo', 'car', 'cl', 'co', 'mx', 'pe']
def _real_extract(self, url):
- region, lang, filter_type, display_id = re.search(self._VALID_URL, url).groups()
+ region, lang, filter_type, display_id = self._match_valid_url(url).groups()
if filter_type == 'episodes':
filter_type = 'episode-videos'
elif filter_type == 'live':
diff --git a/hypervideo_dl/extractor/reddit.py b/hypervideo_dl/extractor/reddit.py
index 222fa01..c75d95a 100644
--- a/hypervideo_dl/extractor/reddit.py
+++ b/hypervideo_dl/extractor/reddit.py
@@ -1,6 +1,4 @@
-from __future__ import unicode_literals
-
-import re
+import random
from .common import InfoExtractor
from ..utils import (
@@ -50,7 +48,7 @@ class RedditIE(InfoExtractor):
class RedditRIE(InfoExtractor):
- _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/?#&]+))'
+ _VALID_URL = r'https?://(?P<subdomain>[^/]+\.)?reddit(?:media)?\.com/r/(?P<slug>[^/]+/comments/(?P<id>[^/?#&]+))'
_TESTS = [{
'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/',
'info_dict': {
@@ -95,17 +93,27 @@ class RedditRIE(InfoExtractor):
# reddit video @ nm reddit
'url': 'https://nm.reddit.com/r/Cricket/comments/8idvby/lousy_cameraman_finds_himself_in_cairns_line_of/',
'only_matching': True,
+ }, {
+ 'url': 'https://www.redditmedia.com/r/serbia/comments/pu9wbx/ako_vu%C4%8Di%C4%87_izgubi_izbore_ja_%C4%87u_da_crknem/',
+ 'only_matching': True,
}]
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- url, video_id = mobj.group('url', 'id')
-
- video_id = self._match_id(url)
-
- data = self._download_json(
- url + '/.json', video_id)[0]['data']['children'][0]['data']
+ @staticmethod
+ def _gen_session_id():
+ id_length = 16
+ rand_max = 1 << (id_length * 4)
+ return '%0.*x' % (id_length, random.randrange(rand_max))
+ def _real_extract(self, url):
+ subdomain, slug, video_id = self._match_valid_url(url).group('subdomain', 'slug', 'id')
+
+ self._set_cookie('.reddit.com', 'reddit_session', self._gen_session_id())
+ self._set_cookie('.reddit.com', '_options', '%7B%22pref_quarantine_optin%22%3A%20true%7D')
+ data = self._download_json(f'https://{subdomain}reddit.com/r/{slug}/.json', video_id, fatal=False)
+ if not data:
+ # Fall back to old.reddit.com in case the requested subdomain fails
+ data = self._download_json(f'https://old.reddit.com/r/{slug}/.json', video_id)
+ data = data[0]['data']['children'][0]['data']
video_url = data['url']
# Avoid recursing into the same reddit URL
diff --git a/hypervideo_dl/extractor/redtube.py b/hypervideo_dl/extractor/redtube.py
index a1ca791..747ce51 100644
--- a/hypervideo_dl/extractor/redtube.py
+++ b/hypervideo_dl/extractor/redtube.py
@@ -98,13 +98,14 @@ class RedTubeIE(InfoExtractor):
format_id = media.get('quality')
formats.append({
'url': format_url,
+ 'ext': 'mp4',
'format_id': format_id,
'height': int_or_none(format_id),
})
if not formats:
video_url = self._html_search_regex(
r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
- formats.append({'url': video_url})
+ formats.append({'url': video_url, 'ext': 'mp4'})
self._sort_formats(formats)
thumbnail = self._og_search_thumbnail(webpage)
diff --git a/hypervideo_dl/extractor/rice.py b/hypervideo_dl/extractor/rice.py
index f855719..cf2bb1b 100644
--- a/hypervideo_dl/extractor/rice.py
+++ b/hypervideo_dl/extractor/rice.py
@@ -30,7 +30,7 @@ class RICEIE(InfoExtractor):
_NS = 'http://schemas.datacontract.org/2004/07/ensembleVideo.Data.Service.Contracts.Models.Player.Config'
def _real_extract(self, url):
- qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query'))
+ qs = compat_parse_qs(self._match_valid_url(url).group('query'))
if not qs.get('PortalID') or not qs.get('DestinationID') or not qs.get('ContentID'):
raise ExtractorError('Invalid URL', expected=True)
diff --git a/hypervideo_dl/extractor/rmcdecouverte.py b/hypervideo_dl/extractor/rmcdecouverte.py
index c3623ed..422d47a 100644
--- a/hypervideo_dl/extractor/rmcdecouverte.py
+++ b/hypervideo_dl/extractor/rmcdecouverte.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from .brightcove import BrightcoveLegacyIE
@@ -13,9 +12,24 @@ from ..utils import smuggle_url
class RMCDecouverteIE(InfoExtractor):
- _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/(?:(?:[^/]+/)*program_(?P<id>\d+)|(?P<live_id>mediaplayer-direct))'
+ _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/(?:[^?#]*_(?P<id>\d+)|mediaplayer-direct)/?(?:[#?]|$)'
_TESTS = [{
+ 'url': 'https://rmcdecouverte.bfmtv.com/vestiges-de-guerre_22240/les-bunkers-secrets-domaha-beach_25303/',
+ 'info_dict': {
+ 'id': '6250879771001',
+ 'ext': 'mp4',
+ 'title': 'LES BUNKERS SECRETS D´OMAHA BEACH',
+ 'uploader_id': '1969646226001',
+ 'description': 'md5:aed573ca24abde62a148e0eba909657d',
+ 'timestamp': 1619622984,
+ 'upload_date': '20210428',
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
+ },
+ }, {
'url': 'https://rmcdecouverte.bfmtv.com/wheeler-dealers-occasions-a-saisir/program_2566/',
'info_dict': {
'id': '5983675500001',
@@ -31,6 +45,13 @@ class RMCDecouverteIE(InfoExtractor):
},
'skip': 'only available for a week',
}, {
+ 'url': 'https://rmcdecouverte.bfmtv.com/avions-furtifs-la-technologie-de-lextreme_10598',
+ 'only_matching': True,
+ }, {
+ # The website accepts any URL as long as it has _\d+ at the end
+ 'url': 'https://rmcdecouverte.bfmtv.com/any/thing/can/go/here/_10598',
+ 'only_matching': True,
+ }, {
# live, geo restricted, bypassable
'url': 'https://rmcdecouverte.bfmtv.com/mediaplayer-direct/',
'only_matching': True,
@@ -38,8 +59,8 @@ class RMCDecouverteIE(InfoExtractor):
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s'
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('id') or mobj.group('live_id')
+ mobj = self._match_valid_url(url)
+ display_id = mobj.group('id') or 'direct'
webpage = self._download_webpage(url, display_id)
brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
if brightcove_legacy_url:
diff --git a/hypervideo_dl/extractor/roosterteeth.py b/hypervideo_dl/extractor/roosterteeth.py
index 8883639..2c815bd 100644
--- a/hypervideo_dl/extractor/roosterteeth.py
+++ b/hypervideo_dl/extractor/roosterteeth.py
@@ -31,6 +31,19 @@ class RoosterTeethIE(InfoExtractor):
'episode': 'Million Dollars, But... The Game Announcement',
},
}, {
+ 'url': 'https://roosterteeth.com/watch/rwby-bonus-25',
+ 'md5': 'fe8d9d976b272c18a24fe7f1f5830084',
+ 'info_dict': {
+ 'id': '31',
+ 'display_id': 'rwby-bonus-25',
+ 'title': 'Volume 2, World of Remnant 3',
+ 'description': 'md5:8d58d3270292ea11da00ea712bbfb009',
+ 'episode': 'Volume 2, World of Remnant 3',
+ 'channel_id': 'fab60c1c-29cb-43bc-9383-5c3538d9e246',
+ 'thumbnail': r're:^https?://.*\.(png|jpe?g)$',
+ 'ext': 'mp4',
+ },
+ }, {
'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31',
'only_matching': True,
}, {
@@ -50,7 +63,7 @@ class RoosterTeethIE(InfoExtractor):
'url': 'https://roosterteeth.com/watch/million-dollars-but-season-2-million-dollars-but-the-game-announcement',
'only_matching': True,
}]
- _EPISODE_BASE_URL = 'https://svod-be.roosterteeth.com/api/v1/episodes/'
+ _EPISODE_BASE_URL = 'https://svod-be.roosterteeth.com/api/v1/watch/'
def _login(self):
username, password = self._get_login_info()
@@ -86,9 +99,11 @@ class RoosterTeethIE(InfoExtractor):
api_episode_url = self._EPISODE_BASE_URL + display_id
try:
- m3u8_url = self._download_json(
+ video_data = self._download_json(
api_episode_url + '/videos', display_id,
- 'Downloading video JSON metadata')['data'][0]['attributes']['url']
+ 'Downloading video JSON metadata')['data'][0]
+ m3u8_url = video_data['attributes']['url']
+ # XXX: additional URL at video_data['links']['download']
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
if self._parse_json(e.cause.read().decode(), display_id).get('access') is False:
@@ -96,7 +111,7 @@ class RoosterTeethIE(InfoExtractor):
'%s is only available for FIRST members' % display_id)
raise
- formats = self._extract_m3u8_formats(
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls')
self._sort_formats(formats)
@@ -109,7 +124,7 @@ class RoosterTeethIE(InfoExtractor):
thumbnails = []
for image in episode.get('included', {}).get('images', []):
- if image.get('type') == 'episode_image':
+ if image.get('type') in ('episode_image', 'bonus_feature_image'):
img_attributes = image.get('attributes') or {}
for k in ('thumb', 'small', 'medium', 'large'):
img_url = img_attributes.get(k)
@@ -134,4 +149,5 @@ class RoosterTeethIE(InfoExtractor):
'formats': formats,
'channel_id': attributes.get('channel_id'),
'duration': int_or_none(attributes.get('length')),
+ 'subtitles': subtitles
}
diff --git a/hypervideo_dl/extractor/roxwel.py b/hypervideo_dl/extractor/roxwel.py
index 6528464..84bb1aa 100644
--- a/hypervideo_dl/extractor/roxwel.py
+++ b/hypervideo_dl/extractor/roxwel.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import unified_strdate, determine_ext
@@ -27,7 +26,7 @@ class RoxwelIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
filename = mobj.group('filename')
info_url = 'http://www.roxwel.com/api/videos/%s' % filename
info = self._download_json(info_url, filename)
diff --git a/hypervideo_dl/extractor/rtbf.py b/hypervideo_dl/extractor/rtbf.py
index 3b0f308..f9979d0 100644
--- a/hypervideo_dl/extractor/rtbf.py
+++ b/hypervideo_dl/extractor/rtbf.py
@@ -68,7 +68,7 @@ class RTBFIE(InfoExtractor):
]
def _real_extract(self, url):
- live, media_id = re.match(self._VALID_URL, url).groups()
+ live, media_id = self._match_valid_url(url).groups()
embed_page = self._download_webpage(
'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'),
media_id, query={'id': media_id})
@@ -125,7 +125,7 @@ class RTBFIE(InfoExtractor):
})
mpd_url = data.get('urlDash')
- if not data.get('drm') and mpd_url:
+ if mpd_url and (self.get_param('allow_unplayable_formats') or not data.get('drm')):
formats.extend(self._extract_mpd_formats(
mpd_url, media_id, mpd_id='dash', fatal=False))
diff --git a/hypervideo_dl/extractor/rtl2.py b/hypervideo_dl/extractor/rtl2.py
index 70f000c..4e3aa03 100644
--- a/hypervideo_dl/extractor/rtl2.py
+++ b/hypervideo_dl/extractor/rtl2.py
@@ -51,7 +51,7 @@ class RTL2IE(InfoExtractor):
}]
def _real_extract(self, url):
- vico_id, vivi_id, display_id = re.match(self._VALID_URL, url).groups()
+ vico_id, vivi_id, display_id = self._match_valid_url(url).groups()
if not vico_id:
webpage = self._download_webpage(url, display_id)
@@ -93,7 +93,7 @@ class RTL2IE(InfoExtractor):
'flash_version': 'LNX 11,2,202,429',
'rtmp_conn': rtmp_conn,
'no_resume': True,
- 'preference': 1,
+ 'quality': 1,
})
m3u8_url = video_info.get('streamurl_hls')
diff --git a/hypervideo_dl/extractor/rtp.py b/hypervideo_dl/extractor/rtp.py
index 02986f4..c165ade 100644
--- a/hypervideo_dl/extractor/rtp.py
+++ b/hypervideo_dl/extractor/rtp.py
@@ -2,10 +2,11 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import (
- determine_ext,
- js_to_json,
-)
+from ..utils import js_to_json
+import re
+import json
+import urllib.parse
+import base64
class RTPIE(InfoExtractor):
@@ -25,6 +26,22 @@ class RTPIE(InfoExtractor):
'only_matching': True,
}]
+ _RX_OBFUSCATION = re.compile(r'''(?xs)
+ atob\s*\(\s*decodeURIComponent\s*\(\s*
+ (\[[0-9A-Za-z%,'"]*\])
+ \s*\.\s*join\(\s*(?:""|'')\s*\)\s*\)\s*\)
+ ''')
+
+ def __unobfuscate(self, data, *, video_id):
+ if data.startswith('{'):
+ data = self._RX_OBFUSCATION.sub(
+ lambda m: json.dumps(
+ base64.b64decode(urllib.parse.unquote(
+ ''.join(self._parse_json(m.group(1), video_id))
+ )).decode('iso-8859-1')),
+ data)
+ return js_to_json(data)
+
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -32,30 +49,46 @@ class RTPIE(InfoExtractor):
title = self._html_search_meta(
'twitter:title', webpage, display_name='title', fatal=True)
- config = self._parse_json(self._search_regex(
- r'(?s)RTPPlayer\(({.+?})\);', webpage,
- 'player config'), video_id, js_to_json)
- file_url = config['file']
- ext = determine_ext(file_url)
- if ext == 'm3u8':
- file_key = config.get('fileKey')
- formats = self._extract_m3u8_formats(
- file_url, video_id, 'mp4', 'm3u8_native',
- m3u8_id='hls', fatal=file_key)
- if file_key:
- formats.append({
- 'url': 'https://cdn-ondemand.rtp.pt' + file_key,
- 'preference': 1,
- })
- self._sort_formats(formats)
+ f, config = self._search_regex(
+ r'''(?sx)
+ var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s*
+ var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/)
+ ''', webpage,
+ 'player config', group=('f', 'config'))
+
+ f = self._parse_json(
+ f, video_id,
+ lambda data: self.__unobfuscate(data, video_id=video_id))
+ config = self._parse_json(
+ config, video_id,
+ lambda data: self.__unobfuscate(data, video_id=video_id))
+
+ formats = []
+ if isinstance(f, dict):
+ f_hls = f.get('hls')
+ if f_hls is not None:
+ formats.extend(self._extract_m3u8_formats(
+ f_hls, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'))
+
+ f_dash = f.get('dash')
+ if f_dash is not None:
+ formats.extend(self._extract_mpd_formats(f_dash, video_id, mpd_id='dash'))
else:
- formats = [{
- 'url': file_url,
- 'ext': ext,
- }]
- if config.get('mediaType') == 'audio':
- for f in formats:
- f['vcodec'] = 'none'
+ formats.append({
+ 'format_id': 'f',
+ 'url': f,
+ 'vcodec': 'none' if config.get('mediaType') == 'audio' else None,
+ })
+
+ subtitles = {}
+
+ vtt = config.get('vtt')
+ if vtt is not None:
+ for lcode, lname, url in vtt:
+ subtitles.setdefault(lcode, []).append({
+ 'name': lname,
+ 'url': url,
+ })
return {
'id': video_id,
@@ -63,4 +96,5 @@ class RTPIE(InfoExtractor):
'formats': formats,
'description': self._html_search_meta(['description', 'twitter:description'], webpage),
'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage),
+ 'subtitles': subtitles,
}
diff --git a/hypervideo_dl/extractor/rts.py b/hypervideo_dl/extractor/rts.py
index aed35f8..865a730 100644
--- a/hypervideo_dl/extractor/rts.py
+++ b/hypervideo_dl/extractor/rts.py
@@ -116,7 +116,7 @@ class RTSIE(SRGSSRIE):
]
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
+ m = self._match_valid_url(url)
media_id = m.group('rts_id') or m.group('id')
display_id = m.group('display_id') or media_id
diff --git a/hypervideo_dl/extractor/rtve.py b/hypervideo_dl/extractor/rtve.py
index d2fb754..59832ee 100644
--- a/hypervideo_dl/extractor/rtve.py
+++ b/hypervideo_dl/extractor/rtve.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
import base64
import io
-import re
import sys
from .common import InfoExtractor
@@ -216,7 +215,7 @@ class RTVELiveIE(RTVEALaCartaIE):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
diff --git a/hypervideo_dl/extractor/rumble.py b/hypervideo_dl/extractor/rumble.py
index 4a02251..49c1f44 100644
--- a/hypervideo_dl/extractor/rumble.py
+++ b/hypervideo_dl/extractor/rumble.py
@@ -1,13 +1,17 @@
# coding: utf-8
from __future__ import unicode_literals
+import itertools
+import re
+
from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import compat_str, compat_HTTPError
from ..utils import (
determine_ext,
int_or_none,
parse_iso8601,
try_get,
+ ExtractorError,
)
@@ -28,6 +32,14 @@ class RumbleEmbedIE(InfoExtractor):
'only_matching': True,
}]
+ @staticmethod
+ def _extract_urls(webpage):
+ return [
+ mobj.group('url')
+ for mobj in re.finditer(
+ r'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>%s)' % RumbleEmbedIE._VALID_URL,
+ webpage)]
+
def _real_extract(self, url):
video_id = self._match_id(url)
video = self._download_json(
@@ -65,3 +77,36 @@ class RumbleEmbedIE(InfoExtractor):
'channel_url': author.get('url'),
'duration': int_or_none(video.get('duration')),
}
+
+
+class RumbleChannelIE(InfoExtractor):
+ _VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c|user)/(?P<id>[^&?#$/]+))'
+
+ _TESTS = [{
+ 'url': 'https://rumble.com/c/Styxhexenhammer666',
+ 'playlist_mincount': 1160,
+ 'info_dict': {
+ 'id': 'Styxhexenhammer666',
+ },
+ }, {
+ 'url': 'https://rumble.com/user/goldenpoodleharleyeuna',
+ 'playlist_count': 4,
+ 'info_dict': {
+ 'id': 'goldenpoodleharleyeuna',
+ },
+ }]
+
+ def entries(self, url, playlist_id):
+ for page in itertools.count(1):
+ try:
+ webpage = self._download_webpage(f'{url}?page={page}', playlist_id, note='Downloading page %d' % page)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+ break
+ raise
+ for video_url in re.findall(r'class=video-item--a\s?href=([^>]+\.html)', webpage):
+ yield self.url_result('https://rumble.com' + video_url)
+
+ def _real_extract(self, url):
+ url, playlist_id = self._match_valid_url(url).groups()
+ return self.playlist_result(self.entries(url, playlist_id), playlist_id=playlist_id)
diff --git a/hypervideo_dl/extractor/rutube.py b/hypervideo_dl/extractor/rutube.py
index 8f54d56..d027412 100644
--- a/hypervideo_dl/extractor/rutube.py
+++ b/hypervideo_dl/extractor/rutube.py
@@ -7,13 +7,12 @@ import itertools
from .common import InfoExtractor
from ..compat import (
compat_str,
- compat_parse_qs,
- compat_urllib_parse_urlparse,
)
from ..utils import (
determine_ext,
bool_or_none,
int_or_none,
+ parse_qs,
try_get,
unified_timestamp,
url_or_none,
@@ -178,7 +177,7 @@ class RutubeEmbedIE(RutubeBaseIE):
embed_id = self._match_id(url)
# Query may contain private videos token and should be passed to API
# requests (see #19163)
- query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ query = parse_qs(url)
options = self._download_api_options(embed_id, query)
video_id = options['effective_video']
formats = self._extract_formats(options, video_id)
@@ -298,16 +297,18 @@ class RutubePlaylistIE(RutubePlaylistBaseIE):
@classmethod
def suitable(cls, url):
+ from ..utils import int_or_none, parse_qs
+
if not super(RutubePlaylistIE, cls).suitable(url):
return False
- params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ params = parse_qs(url)
return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0])
def _next_page_url(self, page_num, playlist_id, item_kind):
return self._PAGE_TEMPLATE % (item_kind, playlist_id, page_num)
def _real_extract(self, url):
- qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ qs = parse_qs(url)
playlist_kind = qs['pl_type'][0]
playlist_id = qs['pl_id'][0]
return self._extract_playlist(playlist_id, item_kind=playlist_kind)
diff --git a/hypervideo_dl/extractor/rutv.py b/hypervideo_dl/extractor/rutv.py
index d2713c1..7e0de99 100644
--- a/hypervideo_dl/extractor/rutv.py
+++ b/hypervideo_dl/extractor/rutv.py
@@ -123,7 +123,7 @@ class RUTVIE(InfoExtractor):
return mobj.group('url')
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
video_path = mobj.group('path')
@@ -139,7 +139,7 @@ class RUTVIE(InfoExtractor):
is_live = video_type == 'live'
json_data = self._download_json(
- 'http://player.rutv.ru/iframe/data%s/id/%s' % ('live' if is_live else 'video', video_id),
+ 'http://player.vgtrk.com/iframe/data%s/id/%s' % ('live' if is_live else 'video', video_id),
video_id, 'Downloading JSON')
if json_data['errors']:
@@ -180,11 +180,11 @@ class RUTVIE(InfoExtractor):
'rtmp_live': True,
'ext': 'flv',
'vbr': int(quality),
- 'preference': preference,
+ 'quality': preference,
}
elif transport == 'm3u8':
formats.extend(self._extract_m3u8_formats(
- url, video_id, 'mp4', preference=preference, m3u8_id='hls'))
+ url, video_id, 'mp4', quality=preference, m3u8_id='hls'))
continue
else:
fmt = {
diff --git a/hypervideo_dl/extractor/ruutu.py b/hypervideo_dl/extractor/ruutu.py
index c50cd3e..d9cf39d 100644
--- a/hypervideo_dl/extractor/ruutu.py
+++ b/hypervideo_dl/extractor/ruutu.py
@@ -200,9 +200,9 @@ class RuutuIE(InfoExtractor):
return node.get('value')
if not formats:
- drm = xpath_text(video_xml, './Clip/DRM', default=None)
- if drm:
- raise ExtractorError('This video is DRM protected.', expected=True)
+ if (not self.get_param('allow_unplayable_formats')
+ and xpath_text(video_xml, './Clip/DRM', default=None)):
+ self.report_drm(video_id)
ns_st_cds = pv('ns_st_cds')
if ns_st_cds != 'free':
raise ExtractorError('This video is %s.' % ns_st_cds, expected=True)
diff --git a/hypervideo_dl/extractor/safari.py b/hypervideo_dl/extractor/safari.py
index 2cc6651..cca4464 100644
--- a/hypervideo_dl/extractor/safari.py
+++ b/hypervideo_dl/extractor/safari.py
@@ -127,7 +127,7 @@ class SafariIE(SafariBaseIE):
_UICONF_ID = '29375172'
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
reference_id = mobj.group('reference_id')
if reference_id:
@@ -189,11 +189,16 @@ class SafariApiIE(SafariBaseIE):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
part = self._download_json(
url, '%s/%s' % (mobj.group('course_id'), mobj.group('part')),
'Downloading part JSON')
- return self.url_result(part['web_url'], SafariIE.ie_key())
+ web_url = part['web_url']
+ if 'library/view' in web_url:
+ web_url = web_url.replace('library/view', 'videos')
+ natural_keys = part['natural_key']
+ web_url = f'{web_url.rsplit("/", 1)[0]}/{natural_keys[0]}-{natural_keys[1][:-5]}'
+ return self.url_result(web_url, SafariIE.ie_key())
class SafariCourseIE(SafariBaseIE):
diff --git a/hypervideo_dl/extractor/saitosan.py b/hypervideo_dl/extractor/saitosan.py
new file mode 100644
index 0000000..621335c
--- /dev/null
+++ b/hypervideo_dl/extractor/saitosan.py
@@ -0,0 +1,78 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError, try_get
+
+
+class SaitosanIE(InfoExtractor):
+ IE_NAME = 'Saitosan'
+ _VALID_URL = r'https?://(?:www\.)?saitosan\.net/bview.html\?id=(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://www.saitosan.net/bview.html?id=10031846',
+ 'info_dict': {
+ 'id': '10031846',
+ 'ext': 'mp4',
+ 'title': '井下原 和弥',
+ 'uploader': '井下原 和弥',
+ 'thumbnail': 'http://111.171.196.85:8088/921f916f-7f55-4c97-b92e-5d9d0fef8f5f/thumb',
+ 'is_live': True,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ 'skip': 'Broadcasts are ephemeral',
+ },
+ {
+ 'url': 'http://www.saitosan.net/bview.html?id=10031795',
+ 'info_dict': {
+ 'id': '10031795',
+ 'ext': 'mp4',
+ 'title': '橋本',
+ 'uploader': '橋本',
+ 'thumbnail': 'http://111.171.196.85:8088/1a3933e1-a01a-483b-8931-af15f37f8082/thumb',
+ 'is_live': True,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ 'skip': 'Broadcasts are ephemeral',
+ }]
+
+ def _real_extract(self, url):
+ b_id = self._match_id(url)
+
+ base = 'http://hankachi.saitosan-api.net:8002/socket.io/?transport=polling&EIO=3'
+ sid = self._download_socket_json(base, b_id, note='Opening socket').get('sid')
+ base += '&sid=' + sid
+
+ self._download_webpage(base, b_id, note='Polling socket')
+ payload = '420["room_start_join",{"room_id":"%s"}]' % b_id
+ payload = '%s:%s' % (len(payload), payload)
+
+ self._download_webpage(base, b_id, data=payload, note='Polling socket with payload')
+ response = self._download_socket_json(base, b_id, note='Polling socket')
+ if not response.get('ok'):
+ err = response.get('error') or {}
+ raise ExtractorError(
+ '%s said: %s - %s' % (self.IE_NAME, err.get('code', '?'), err.get('msg', 'Unknown')) if err
+ else 'The socket reported that the broadcast could not be joined. Maybe it\'s offline or the URL is incorrect',
+ expected=True, video_id=b_id)
+
+ self._download_webpage(base, b_id, data='26:421["room_finish_join",{}]', note='Polling socket')
+ b_data = self._download_socket_json(base, b_id, note='Getting broadcast metadata from socket')
+ m3u8_url = b_data.get('url')
+
+ self._download_webpage(base, b_id, data='1:1', note='Closing socket', fatal=False)
+
+ return {
+ 'id': b_id,
+ 'title': b_data.get('name'),
+ 'formats': self._extract_m3u8_formats(m3u8_url, b_id, 'mp4', live=True),
+ 'thumbnail': m3u8_url.replace('av.m3u8', 'thumb'),
+ 'uploader': try_get(b_data, lambda x: x['broadcast_user']['name']), # same as title
+ 'is_live': True
+ }
diff --git a/hypervideo_dl/extractor/sapo.py b/hypervideo_dl/extractor/sapo.py
index 49a9b31..df202a3 100644
--- a/hypervideo_dl/extractor/sapo.py
+++ b/hypervideo_dl/extractor/sapo.py
@@ -63,7 +63,7 @@ class SapoIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
item = self._download_xml(
diff --git a/hypervideo_dl/extractor/savefrom.py b/hypervideo_dl/extractor/savefrom.py
index 21e44b6..98efdc2 100644
--- a/hypervideo_dl/extractor/savefrom.py
+++ b/hypervideo_dl/extractor/savefrom.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import os.path
-import re
from .common import InfoExtractor
@@ -28,7 +27,7 @@ class SaveFromIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = os.path.splitext(url.split('/')[-1])[0]
return self.url_result(mobj.group('url'), video_id=video_id)
diff --git a/hypervideo_dl/extractor/scrippsnetworks.py b/hypervideo_dl/extractor/scrippsnetworks.py
index b40b4c4..84918b6 100644
--- a/hypervideo_dl/extractor/scrippsnetworks.py
+++ b/hypervideo_dl/extractor/scrippsnetworks.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
import json
import hashlib
-import re
from .aws import AWSIE
from .anvato import AnvatoIE
@@ -55,7 +54,7 @@ class ScrippsNetworksWatchIE(AWSIE):
_AWS_USER_AGENT = 'aws-sdk-js/2.80.0 callback'
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
site_id, video_id = mobj.group('site', 'id')
aws_identity_id_json = json.dumps({
@@ -146,7 +145,7 @@ class ScrippsNetworksIE(InfoExtractor):
_TP_TEMPL = 'https://link.theplatform.com/s/ip77QC/media/guid/%d/%s?mbr=true'
def _real_extract(self, url):
- site, guid = re.match(self._VALID_URL, url).groups()
+ site, guid = self._match_valid_url(url).groups()
return self.url_result(smuggle_url(
self._TP_TEMPL % (self._ACCOUNT_MAP[site], guid),
{'force_smil_url': True}), 'ThePlatform', guid)
diff --git a/hypervideo_dl/extractor/seeker.py b/hypervideo_dl/extractor/seeker.py
index 7872dc8..e5c18c7 100644
--- a/hypervideo_dl/extractor/seeker.py
+++ b/hypervideo_dl/extractor/seeker.py
@@ -46,7 +46,7 @@ class SeekerIE(InfoExtractor):
}]
def _real_extract(self, url):
- display_id, article_id = re.match(self._VALID_URL, url).groups()
+ display_id, article_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, display_id)
entries = []
for jwp_id in re.findall(r'data-video-id="([a-zA-Z0-9]{8})"', webpage):
diff --git a/hypervideo_dl/extractor/senateisvp.py b/hypervideo_dl/extractor/senateisvp.py
index db5ef8b..8794d47 100644
--- a/hypervideo_dl/extractor/senateisvp.py
+++ b/hypervideo_dl/extractor/senateisvp.py
@@ -102,7 +102,7 @@ class SenateISVPIE(InfoExtractor):
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
- qs = compat_parse_qs(re.match(self._VALID_URL, url).group('qs'))
+ qs = compat_parse_qs(self._match_valid_url(url).group('qs'))
if not qs.get('filename') or not qs.get('type') or not qs.get('comm'):
raise ExtractorError('Invalid URL', expected=True)
diff --git a/hypervideo_dl/extractor/sendtonews.py b/hypervideo_dl/extractor/sendtonews.py
index 9d96529..bc38a0f 100644
--- a/hypervideo_dl/extractor/sendtonews.py
+++ b/hypervideo_dl/extractor/sendtonews.py
@@ -80,7 +80,9 @@ class SendtoNewsIE(InfoExtractor):
'format_id': '%s-%d' % (determine_protocol(f), tbr),
'tbr': tbr,
})
- self._sort_formats(info_dict['formats'], ('tbr', 'height', 'width', 'format_id'))
+ # 'tbr' was explicitly set to be preferred over 'height' originally,
+ # so this is being kept unless someone can confirm this is unnecessary
+ self._sort_formats(info_dict['formats'], ('tbr', 'res'))
thumbnails = []
if video.get('thumbnailUrl'):
diff --git a/hypervideo_dl/extractor/sevenplus.py b/hypervideo_dl/extractor/sevenplus.py
index 240afc1..210c44a 100644
--- a/hypervideo_dl/extractor/sevenplus.py
+++ b/hypervideo_dl/extractor/sevenplus.py
@@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
+import json
import re
from .brightcove import BrightcoveNewIE
@@ -42,8 +43,51 @@ class SevenPlusIE(BrightcoveNewIE):
'only_matching': True,
}]
+ def _real_initialize(self):
+ self.token = None
+
+ cookies = self._get_cookies('https://7plus.com.au')
+ api_key = next((x for x in cookies if x.startswith('glt_')), '')[4:]
+ if not api_key: # Cookies are signed out, skip login
+ return
+
+ login_resp = self._download_json(
+ 'https://login.7plus.com.au/accounts.getJWT', None, 'Logging in', fatal=False,
+ query={
+ 'APIKey': api_key,
+ 'sdk': 'js_latest',
+ 'login_token': cookies[f'glt_{api_key}'].value,
+ 'authMode': 'cookie',
+ 'pageURL': 'https://7plus.com.au/',
+ 'sdkBuild': '12471',
+ 'format': 'json',
+ }) or {}
+
+ if 'errorMessage' in login_resp:
+ self.report_warning(f'Unable to login: 7plus said: {login_resp["errorMessage"]}')
+ return
+ id_token = login_resp.get('id_token')
+ if not id_token:
+ self.report_warning('Unable to login: Could not extract id token')
+ return
+
+ token_resp = self._download_json(
+ 'https://7plus.com.au/auth/token', None, 'Getting auth token', fatal=False,
+ headers={'Content-Type': 'application/json'}, data=json.dumps({
+ 'idToken': id_token,
+ 'platformId': 'web',
+ 'regSource': '7plus',
+ }).encode('utf-8')) or {}
+ self.token = token_resp.get('token')
+ if not self.token:
+ self.report_warning('Unable to log in: Could not extract auth token')
+
def _real_extract(self, url):
- path, episode_id = re.match(self._VALID_URL, url).groups()
+ path, episode_id = self._match_valid_url(url).groups()
+
+ headers = {}
+ if self.token:
+ headers['Authorization'] = f'Bearer {self.token}'
try:
media = self._download_json(
@@ -55,7 +99,7 @@ class SevenPlusIE(BrightcoveNewIE):
'referenceId': 'ref:' + episode_id,
'deliveryId': 'csai',
'videoType': 'vod',
- })['media']
+ }, headers=headers)['media']
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
raise ExtractorError(self._parse_json(
diff --git a/hypervideo_dl/extractor/seznamzpravy.py b/hypervideo_dl/extractor/seznamzpravy.py
index 7a1c7e3..eef4975 100644
--- a/hypervideo_dl/extractor/seznamzpravy.py
+++ b/hypervideo_dl/extractor/seznamzpravy.py
@@ -5,7 +5,6 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_parse_qs,
compat_str,
compat_urllib_parse_urlparse,
)
@@ -13,6 +12,7 @@ from ..utils import (
urljoin,
int_or_none,
parse_codecs,
+ parse_qs,
try_get,
)
@@ -108,7 +108,7 @@ class SeznamZpravyIE(InfoExtractor):
return formats
def _real_extract(self, url):
- params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ params = parse_qs(url)
src = params['src'][0]
title = params['title'][0]
diff --git a/hypervideo_dl/extractor/shahid.py b/hypervideo_dl/extractor/shahid.py
index 88b938e..42de41a 100644
--- a/hypervideo_dl/extractor/shahid.py
+++ b/hypervideo_dl/extractor/shahid.py
@@ -111,15 +111,15 @@ class ShahidIE(ShahidBaseIE):
}))
def _real_extract(self, url):
- page_type, video_id = re.match(self._VALID_URL, url).groups()
+ page_type, video_id = self._match_valid_url(url).groups()
if page_type == 'clip':
page_type = 'episode'
playout = self._call_api(
'playout/new/url/' + video_id, video_id)['playout']
- if playout.get('drm'):
- raise ExtractorError('This video is DRM protected.', expected=True)
+ if not self.get_param('allow_unplayable_formats') and playout.get('drm'):
+ self.report_drm(video_id)
formats = self._extract_m3u8_formats(re.sub(
# https://docs.aws.amazon.com/mediapackage/latest/ug/manifest-filtering.html
diff --git a/hypervideo_dl/extractor/shemaroome.py b/hypervideo_dl/extractor/shemaroome.py
new file mode 100644
index 0000000..142d5dc
--- /dev/null
+++ b/hypervideo_dl/extractor/shemaroome.py
@@ -0,0 +1,104 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..aes import aes_cbc_decrypt
+from ..compat import (
+ compat_b64decode,
+ compat_ord,
+)
+from ..utils import (
+ bytes_to_intlist,
+ ExtractorError,
+ intlist_to_bytes,
+ unified_strdate,
+)
+
+
+class ShemarooMeIE(InfoExtractor):
+ _VALID_URL = r'(?:https?://)(?:www\.)?shemaroome\.com/(?:movies|shows)/(?P<id>[^?#]+)'
+ _TESTS = [{
+ 'url': 'https://www.shemaroome.com/movies/dil-hai-tumhaara',
+ 'info_dict': {
+ 'id': 'dil-hai-tumhaara',
+ 'ext': 'mp4',
+ 'title': 'Dil Hai Tumhaara',
+ 'release_date': '20020906',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'description': 'md5:2782c4127807103cf5a6ae2ca33645ce',
+ },
+ 'params': {
+ 'skip_download': True
+ }
+ }, {
+ 'url': 'https://www.shemaroome.com/shows/jurm-aur-jazbaat/laalach',
+ 'info_dict': {
+ 'id': 'jurm-aur-jazbaat_laalach',
+ 'ext': 'mp4',
+ 'title': 'Laalach',
+ 'description': 'md5:92b79c2dcb539b0ab53f9fa5a048f53c',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'release_date': '20210507',
+ },
+ 'params': {
+ 'skip_download': True
+ },
+ 'skip': 'Premium videos cannot be downloaded yet.'
+ }, {
+ 'url': 'https://www.shemaroome.com/shows/jai-jai-jai-bajrang-bali/jai-jai-jai-bajrang-bali-episode-99',
+ 'info_dict': {
+ 'id': 'jai-jai-jai-bajrang-bali_jai-jai-jai-bajrang-bali-episode-99',
+ 'ext': 'mp4',
+ 'title': 'Jai Jai Jai Bajrang Bali Episode 99',
+ 'description': 'md5:850d127a18ee3f9529d7fbde2f49910d',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'release_date': '20110101',
+ },
+ 'params': {
+ 'skip_download': True
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url).replace('/', '_')
+ webpage = self._download_webpage(url, video_id)
+ title = self._search_regex(r'id=\"ma_title\" value=\"([^\"]+)', webpage, 'title')
+ thumbnail = self._og_search_thumbnail(webpage)
+ content_def = self._search_regex(r'id=\"content_definition\" value=\"([^\"]+)', webpage, 'content_def')
+ catalog_id = self._search_regex(r'id=\"catalog_id\" value=\"([^\"]+)', webpage, 'catalog_id')
+ item_category = self._search_regex(r'id=\"item_category\" value=\"([^\"]+)', webpage, 'item_category')
+ content_id = self._search_regex(r'id=\"content_id\" value=\"([^\"]+)', webpage, 'content_id')
+
+ data = f'catalog_id={catalog_id}&content_id={content_id}&category={item_category}&content_def={content_def}'
+ data_json = self._download_json('https://www.shemaroome.com/users/user_all_lists', video_id, data=data.encode())
+ if not data_json.get('status'):
+ raise ExtractorError('Premium videos cannot be downloaded yet.', expected=True)
+ url_data = bytes_to_intlist(compat_b64decode(data_json['new_play_url']))
+ key = bytes_to_intlist(compat_b64decode(data_json['key']))
+ iv = [0] * 16
+ m3u8_url = intlist_to_bytes(aes_cbc_decrypt(url_data, key, iv))
+ m3u8_url = m3u8_url[:-compat_ord((m3u8_url[-1]))].decode('ascii')
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, fatal=False, headers={'stream_key': data_json['stream_key']})
+ self._sort_formats(formats)
+
+ release_date = self._html_search_regex(
+ (r'itemprop="uploadDate">\s*([\d-]+)', r'id="release_date" value="([\d-]+)'),
+ webpage, 'release date', fatal=False)
+
+ subtitles = {}
+ sub_url = data_json.get('subtitle')
+ if sub_url:
+ subtitles.setdefault('EN', []).append({
+ 'url': self._proto_relative_url(sub_url),
+ })
+ description = self._html_search_regex(r'(?s)>Synopsis(</.+?)</', webpage, 'description', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'release_date': unified_strdate(release_date),
+ 'description': description,
+ 'subtitles': subtitles,
+ }
diff --git a/hypervideo_dl/extractor/simplecast.py b/hypervideo_dl/extractor/simplecast.py
index 2d0b3c0..857e941 100644
--- a/hypervideo_dl/extractor/simplecast.py
+++ b/hypervideo_dl/extractor/simplecast.py
@@ -122,7 +122,7 @@ class SimplecastEpisodeIE(SimplecastBaseIE):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
episode = self._call_search_api(
'episode', mobj.group(1), mobj.group(0))
return self._parse_episode(episode)
diff --git a/hypervideo_dl/extractor/sina.py b/hypervideo_dl/extractor/sina.py
index 07b766b..b62b0c3 100644
--- a/hypervideo_dl/extractor/sina.py
+++ b/hypervideo_dl/extractor/sina.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -18,7 +17,7 @@ from ..utils import (
class SinaIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/
(?:
- (?:view/|.*\#)(?P<video_id>\d+)|
+ (?:view/|.*\#)(?P<id>\d+)|
.+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)|
# This is used by external sites like Weibo
api/sinawebApi/outplay.php/(?P<token>.+?)\.swf
@@ -56,9 +55,9 @@ class SinaIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
- video_id = mobj.group('video_id')
+ video_id = mobj.group('id')
if not video_id:
if mobj.group('token') is not None:
# The video id is in the redirected url
@@ -99,7 +98,7 @@ class SinaIE(InfoExtractor):
formats.append({
'format_id': quality_id,
'url': update_url_query(file_api, {'vid': file_id}),
- 'preference': preference(quality_id),
+ 'quality': preference(quality_id),
'ext': 'mp4',
})
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/sixplay.py b/hypervideo_dl/extractor/sixplay.py
index 7ec66ec..fd747f5 100644
--- a/hypervideo_dl/extractor/sixplay.py
+++ b/hypervideo_dl/extractor/sixplay.py
@@ -1,17 +1,15 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import (
- compat_parse_qs,
compat_str,
- compat_urllib_parse_urlparse,
)
from ..utils import (
determine_ext,
int_or_none,
+ parse_qs,
try_get,
qualities,
)
@@ -41,7 +39,7 @@ class SixPlayIE(InfoExtractor):
}]
def _real_extract(self, url):
- domain, video_id = re.search(self._VALID_URL, url).groups()
+ domain, video_id = self._match_valid_url(url).groups()
service, consumer_name = {
'6play.fr': ('6play', 'm6web'),
'rtlplay.be': ('rtlbe_rtl_play', 'rtlbe'),
@@ -79,7 +77,7 @@ class SixPlayIE(InfoExtractor):
continue
if container == 'm3u8' or ext == 'm3u8':
if protocol == 'usp':
- if compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]:
+ if parse_qs(asset_url).get('token', [None])[0]:
urlh = self._request_webpage(
asset_url, video_id, fatal=False,
headers=self.geo_verification_headers())
diff --git a/hypervideo_dl/extractor/skynewsau.py b/hypervideo_dl/extractor/skynewsau.py
new file mode 100644
index 0000000..b1d7795
--- /dev/null
+++ b/hypervideo_dl/extractor/skynewsau.py
@@ -0,0 +1,46 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ try_get,
+ unified_strdate,
+)
+
+
+class SkyNewsAUIE(InfoExtractor):
+ _VALID_URL = r'(?:https?://)(?:www\.)?skynews\.com\.au/[^/]+/[^/]+/[^/]+/video/(?P<id>[a-z0-9]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.skynews.com.au/world-news/united-states/incredible-vision-shows-lava-overflowing-from-spains-la-palma-volcano/video/0f4c6243d6903502c01251f228b91a71',
+ 'info_dict': {
+ 'id': '6277184925001',
+ 'ext': 'mp4',
+ 'title': 'md5:60594f1ea6d5ae93e292900f4d34e9ae',
+ 'description': 'md5:60594f1ea6d5ae93e292900f4d34e9ae',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 76.394,
+ 'timestamp': 1634271300,
+ 'uploader_id': '5348771529001',
+ 'tags': ['fblink', 'msn', 'usa', 'world', 'yt'],
+ 'upload_date': '20211015',
+ },
+ 'params': {'skip_download': True, 'format': 'bv'}
+ }]
+
+ _API_KEY = '6krsj3w249nk779d8fukqx9f'
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ embedcode = self._search_regex(r'embedcode\s?=\s?\"([^\"]+)\"', webpage, 'embedcode')
+ data_json = self._download_json(
+ f'https://content.api.news/v3/videos/brightcove/{embedcode}?api_key={self._API_KEY}', id)['content']
+ return {
+ 'id': id,
+ '_type': 'url_transparent',
+ 'url': 'https://players.brightcove.net/%s/default_default/index.html?videoId=%s' % tuple(embedcode.split('-')),
+ 'ie_key': 'BrightcoveNew',
+ 'title': data_json.get('caption'),
+ 'upload_date': unified_strdate(try_get(data_json, lambda x: x['date']['created'])),
+ }
diff --git a/hypervideo_dl/extractor/slideshare.py b/hypervideo_dl/extractor/slideshare.py
index e89ebeb..9b3ad0a 100644
--- a/hypervideo_dl/extractor/slideshare.py
+++ b/hypervideo_dl/extractor/slideshare.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
import json
from .common import InfoExtractor
@@ -27,7 +26,7 @@ class SlideshareIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
page_title = mobj.group('title')
webpage = self._download_webpage(url, page_title)
slideshare_obj = self._search_regex(
diff --git a/hypervideo_dl/extractor/snotr.py b/hypervideo_dl/extractor/snotr.py
index f773547..0bb5482 100644
--- a/hypervideo_dl/extractor/snotr.py
+++ b/hypervideo_dl/extractor/snotr.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -39,7 +38,7 @@ class SnotrIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
diff --git a/hypervideo_dl/extractor/sohu.py b/hypervideo_dl/extractor/sohu.py
index 9d73650..3bff5c5 100644
--- a/hypervideo_dl/extractor/sohu.py
+++ b/hypervideo_dl/extractor/sohu.py
@@ -77,7 +77,7 @@ class SohuIE(InfoExtractor):
'info_dict': {
'id': '78932792',
'ext': 'mp4',
- 'title': 'hypervideo testing video',
+ 'title': 'youtube-dl testing video',
},
'params': {
'skip_download': True
@@ -97,7 +97,7 @@ class SohuIE(InfoExtractor):
'Downloading JSON data for %s' % vid_id,
headers=self.geo_verification_headers())
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
mytv = mobj.group('mytv') is not None
diff --git a/hypervideo_dl/extractor/sonyliv.py b/hypervideo_dl/extractor/sonyliv.py
index fedfceb..c3ed442 100644
--- a/hypervideo_dl/extractor/sonyliv.py
+++ b/hypervideo_dl/extractor/sonyliv.py
@@ -9,15 +9,22 @@ from ..compat import compat_HTTPError
from ..utils import (
ExtractorError,
int_or_none,
+ try_get,
)
class SonyLIVIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/(?:s(?:how|port)s/[^/]+|movies|clip|trailer|music-videos)/[^/?#&]+-(?P<id>\d+)'
+ _VALID_URL = r'''(?x)
+ (?:
+ sonyliv:|
+ https?://(?:www\.)?sonyliv\.com/(?:s(?:how|port)s/[^/]+|movies|clip|trailer|music-videos)/[^/?#&]+-
+ )
+ (?P<id>\d+)
+ '''
_TESTS = [{
'url': 'https://www.sonyliv.com/shows/bachelors-delight-1700000113/achaari-cheese-toast-1000022678?watch=true',
'info_dict': {
- 'title': 'Bachelors Delight - Achaari Cheese Toast',
+ 'title': 'Achaari Cheese Toast',
'id': '1000022678',
'ext': 'mp4',
'upload_date': '20200411',
@@ -25,7 +32,7 @@ class SonyLIVIE(InfoExtractor):
'timestamp': 1586632091,
'duration': 185,
'season_number': 1,
- 'episode': 'Achaari Cheese Toast',
+ 'series': 'Bachelors Delight',
'episode_number': 1,
'release_year': 2016,
},
@@ -75,8 +82,8 @@ class SonyLIVIE(InfoExtractor):
video_id = self._match_id(url)
content = self._call_api(
'1.5', 'IN/CONTENT/VIDEOURL/VOD/' + video_id, video_id)
- if content.get('isEncrypted'):
- raise ExtractorError('This video is DRM protected.', expected=True)
+ if not self.get_param('allow_unplayable_formats') and content.get('isEncrypted'):
+ self.report_drm(video_id)
dash_url = content['videoURL']
headers = {
'x-playback-session-id': '%s-%d' % (uuid.uuid4().hex, time.time() * 1000)
@@ -92,11 +99,15 @@ class SonyLIVIE(InfoExtractor):
metadata = self._call_api(
'1.6', 'IN/DETAIL/' + video_id, video_id)['containers'][0]['metadata']
- title = metadata['title']
- episode = metadata.get('episodeTitle')
- if episode and title != episode:
- title += ' - ' + episode
-
+ title = metadata['episodeTitle']
+ subtitles = {}
+ for sub in content.get('subtitle', []):
+ sub_url = sub.get('subtitleUrl')
+ if not sub_url:
+ continue
+ subtitles.setdefault(sub.get('subtitleLanguageName', 'ENG'), []).append({
+ 'url': sub_url,
+ })
return {
'id': video_id,
'title': title,
@@ -106,7 +117,46 @@ class SonyLIVIE(InfoExtractor):
'timestamp': int_or_none(metadata.get('creationDate'), 1000),
'duration': int_or_none(metadata.get('duration')),
'season_number': int_or_none(metadata.get('season')),
- 'episode': episode,
+ 'series': metadata.get('title'),
'episode_number': int_or_none(metadata.get('episodeNumber')),
'release_year': int_or_none(metadata.get('year')),
+ 'subtitles': subtitles,
+ }
+
+
+class SonyLIVSeriesIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/shows/[^/?#&]+-(?P<id>\d{10})$'
+ _TESTS = [{
+ 'url': 'https://www.sonyliv.com/shows/adaalat-1700000091',
+ 'playlist_mincount': 456,
+ 'info_dict': {
+ 'id': '1700000091',
+ },
+ }]
+ _API_SHOW_URL = "https://apiv2.sonyliv.com/AGL/1.9/R/ENG/WEB/IN/DL/DETAIL/{}?kids_safe=false&from=0&to=49"
+ _API_EPISODES_URL = "https://apiv2.sonyliv.com/AGL/1.4/R/ENG/WEB/IN/CONTENT/DETAIL/BUNDLE/{}?from=0&to=1000&orderBy=episodeNumber&sortOrder=asc"
+ _API_SECURITY_URL = 'https://apiv2.sonyliv.com/AGL/1.4/A/ENG/WEB/ALL/GETTOKEN'
+
+ def _entries(self, show_id):
+ headers = {
+ 'Accept': 'application/json, text/plain, */*',
+ 'Referer': 'https://www.sonyliv.com',
}
+ headers['security_token'] = self._download_json(
+ self._API_SECURITY_URL, video_id=show_id, headers=headers,
+ note='Downloading security token')['resultObj']
+ seasons = try_get(
+ self._download_json(self._API_SHOW_URL.format(show_id), video_id=show_id, headers=headers),
+ lambda x: x['resultObj']['containers'][0]['containers'], list)
+ for season in seasons or []:
+ season_id = season['id']
+ episodes = try_get(
+ self._download_json(self._API_EPISODES_URL.format(season_id), video_id=season_id, headers=headers),
+ lambda x: x['resultObj']['containers'][0]['containers'], list)
+ for episode in episodes or []:
+ video_id = episode.get('id')
+ yield self.url_result('sonyliv:%s' % video_id, ie=SonyLIVIE.ie_key(), video_id=video_id)
+
+ def _real_extract(self, url):
+ show_id = self._match_id(url)
+ return self.playlist_result(self._entries(show_id), playlist_id=show_id)
diff --git a/hypervideo_dl/extractor/soundcloud.py b/hypervideo_dl/extractor/soundcloud.py
index abb85e1..78fecd1 100644
--- a/hypervideo_dl/extractor/soundcloud.py
+++ b/hypervideo_dl/extractor/soundcloud.py
@@ -3,6 +3,8 @@ from __future__ import unicode_literals
import itertools
import re
+import json
+# import random
from .common import (
InfoExtractor,
@@ -12,7 +14,6 @@ from ..compat import (
compat_HTTPError,
compat_kwargs,
compat_str,
- compat_urlparse,
)
from ..utils import (
error_to_compat_str,
@@ -22,12 +23,15 @@ from ..utils import (
int_or_none,
KNOWN_EXTENSIONS,
mimetype2ext,
+ remove_end,
+ parse_qs,
str_or_none,
try_get,
unified_timestamp,
update_url_query,
url_or_none,
urlhandle_detect_ext,
+ sanitized_Request,
)
@@ -46,8 +50,7 @@ class SoundcloudEmbedIE(InfoExtractor):
webpage)]
def _real_extract(self, url):
- query = compat_urlparse.parse_qs(
- compat_urlparse.urlparse(url).query)
+ query = parse_qs(url)
api_url = query['url'][0]
secret_token = query.get('secret_token')
if secret_token:
@@ -161,23 +164,11 @@ class SoundcloudIE(InfoExtractor):
},
# downloadable song
{
- 'url': 'https://soundcloud.com/oddsamples/bus-brakes',
- 'md5': '7624f2351f8a3b2e7cd51522496e7631',
+ 'url': 'https://soundcloud.com/the80m/the-following',
+ 'md5': '9ffcddb08c87d74fb5808a3c183a1d04',
'info_dict': {
- 'id': '128590877',
- 'ext': 'mp3',
- 'title': 'Bus Brakes',
- 'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66',
- 'uploader': 'oddsamples',
- 'uploader_id': '73680509',
- 'timestamp': 1389232924,
- 'upload_date': '20140109',
- 'duration': 17.346,
- 'license': 'cc-by-sa',
- 'view_count': int,
- 'like_count': int,
- 'comment_count': int,
- 'repost_count': int,
+ 'id': '343609555',
+ 'ext': 'wav',
},
},
# private link, downloadable format
@@ -248,10 +239,15 @@ class SoundcloudIE(InfoExtractor):
},
},
{
- # with AAC HQ format available via OAuth token
+ # AAC HQ format available (account with active subscription needed)
'url': 'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1',
'only_matching': True,
},
+ {
+ # Go+ (account with active subscription needed)
+ 'url': 'https://soundcloud.com/taylorswiftofficial/look-what-you-made-me-do',
+ 'only_matching': True,
+ },
]
_API_V2_BASE = 'https://api-v2.soundcloud.com/'
@@ -299,17 +295,110 @@ class SoundcloudIE(InfoExtractor):
try:
return super(SoundcloudIE, self)._download_json(*args, **compat_kwargs(kwargs))
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403):
self._store_client_id(None)
self._update_client_id()
continue
elif non_fatal:
- self._downloader.report_warning(error_to_compat_str(e))
+ self.report_warning(error_to_compat_str(e))
return False
raise
def _real_initialize(self):
- self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'YUKXoArFcqrlQn9tfNHvvyfnDISj04zk'
+ self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf'
+ self._login()
+
+ _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
+ _API_AUTH_QUERY_TEMPLATE = '?client_id=%s'
+ _API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s'
+ _API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s'
+ _access_token = None
+ _HEADERS = {}
+ _NETRC_MACHINE = 'soundcloud'
+
+ def _login(self):
+ username, password = self._get_login_info()
+ if username is None:
+ return
+
+ if username == 'oauth' and password is not None:
+ self._access_token = password
+ query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
+ payload = {'session': {'access_token': self._access_token}}
+ token_verification = sanitized_Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8'))
+ response = self._download_json(token_verification, None, note='Verifying login token...', fatal=False)
+ if response is not False:
+ self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
+ self.report_login()
+ else:
+ self.report_warning('Provided authorization token seems to be invalid. Continue as guest')
+ elif username is not None:
+ self.report_warning(
+ 'Login using username and password is not currently supported. '
+ 'Use "--user oauth --password <oauth_token>" to login using an oauth token')
+
+ r'''
+ def genDevId():
+ def genNumBlock():
+ return ''.join([str(random.randrange(10)) for i in range(6)])
+ return '-'.join([genNumBlock() for i in range(4)])
+
+ payload = {
+ 'client_id': self._CLIENT_ID,
+ 'recaptcha_pubkey': 'null',
+ 'recaptcha_response': 'null',
+ 'credentials': {
+ 'identifier': username,
+ 'password': password
+ },
+ 'signature': self.sign(username, password, self._CLIENT_ID),
+ 'device_id': genDevId(),
+ 'user_agent': self._USER_AGENT
+ }
+
+ query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
+ login = sanitized_Request(self._API_AUTH_URL_PW % query, json.dumps(payload).encode('utf-8'))
+ response = self._download_json(login, None)
+ self._access_token = response.get('session').get('access_token')
+ if not self._access_token:
+            self.report_warning('Unable to get access token, login may have failed')
+ else:
+ self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
+ '''
+
+ # signature generation
+ def sign(self, user, pw, clid):
+ a = 33
+ i = 1
+ s = 440123
+ w = 117
+ u = 1800000
+ l = 1042
+ b = 37
+ k = 37
+ c = 5
+ n = '0763ed7314c69015fd4a0dc16bbf4b90' # _KEY
+ y = '8' # _REV
+ r = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' # _USER_AGENT
+ e = user # _USERNAME
+ t = clid # _CLIENT_ID
+
+ d = '-'.join([str(mInt) for mInt in [a, i, s, w, u, l, b, k]])
+ p = n + y + d + r + e + t + d + n
+ h = p
+
+ m = 8011470
+ f = 0
+
+ for f in range(f, len(h)):
+ m = (m >> 1) + ((1 & m) << 23)
+ m += ord(h[f])
+ m &= 16777215
+
+ # c is not even needed
+ out = str(y) + ':' + str(d) + ':' + format(m, 'x') + ':' + str(c)
+
+ return out
@classmethod
def _resolv_url(cls, url):
@@ -340,7 +429,7 @@ class SoundcloudIE(InfoExtractor):
'ext': urlhandle_detect_ext(urlh) or 'mp3',
'filesize': int_or_none(urlh.headers.get('Content-Length')),
'url': format_url,
- 'preference': 10,
+ 'quality': 10,
})
def invalid_url(url):
@@ -389,7 +478,7 @@ class SoundcloudIE(InfoExtractor):
if not format_url:
continue
stream = self._download_json(
- format_url, track_id, query=query, fatal=False)
+ format_url, track_id, query=query, fatal=False, headers=self._HEADERS)
if not isinstance(stream, dict):
continue
stream_url = url_or_none(stream.get('url'))
@@ -416,7 +505,7 @@ class SoundcloudIE(InfoExtractor):
f['vcodec'] = 'none'
if not formats and info.get('policy') == 'BLOCK':
- self.raise_geo_restricted()
+ self.raise_geo_restricted(metadata_available=True)
self._sort_formats(formats)
user = info.get('user') or {}
@@ -468,7 +557,7 @@ class SoundcloudIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
track_id = mobj.group('track_id')
@@ -487,7 +576,7 @@ class SoundcloudIE(InfoExtractor):
info_json_url = self._resolv_url(self._BASE_URL + resolve_title)
info = self._download_json(
- info_json_url, full_title, 'Downloading info JSON', query=query)
+ info_json_url, full_title, 'Downloading info JSON', query=query, headers=self._HEADERS)
return self._extract_info_dict(info, full_title, token)
@@ -503,7 +592,7 @@ class SoundcloudPlaylistBaseIE(SoundcloudIE):
'ids': ','.join([compat_str(t['id']) for t in tracks]),
'playlistId': playlist_id,
'playlistSecretToken': token,
- })
+ }, headers=self._HEADERS)
entries = []
for track in tracks:
track_id = str_or_none(track.get('id'))
@@ -523,7 +612,7 @@ class SoundcloudPlaylistBaseIE(SoundcloudIE):
class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
- _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?'
+ _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[:\w\d-]+)(?:/(?P<token>[^?/]+))?'
IE_NAME = 'soundcloud:set'
_TESTS = [{
'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep',
@@ -536,10 +625,19 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
}, {
'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token',
'only_matching': True,
+ }, {
+ 'url': 'https://soundcloud.com/discover/sets/weekly::flacmatic',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://soundcloud.com/discover/sets/charts-top:all-music:de',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://soundcloud.com/discover/sets/charts-top:hiphoprap:kr',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
full_title = '%s/sets/%s' % mobj.group('uploader', 'slug_title')
token = mobj.group('token')
@@ -547,7 +645,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
full_title += '/' + token
info = self._download_json(self._resolv_url(
- self._BASE_URL + full_title), full_title)
+ self._BASE_URL + full_title), full_title, headers=self._HEADERS)
if 'errors' in info:
msgs = (compat_str(err['error_message']) for err in info['errors'])
@@ -558,64 +656,60 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
class SoundcloudPagedPlaylistBaseIE(SoundcloudIE):
def _extract_playlist(self, base_url, playlist_id, playlist_title):
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'title': playlist_title,
+ 'entries': self._entries(base_url, playlist_id),
+ }
+
+ def _entries(self, url, playlist_id):
# Per the SoundCloud documentation, the maximum limit for a linked partitioning query is 200.
# https://developers.soundcloud.com/blog/offset-pagination-deprecated
- COMMON_QUERY = {
+ query = {
'limit': 200,
'linked_partitioning': '1',
+ 'offset': 0,
}
- query = COMMON_QUERY.copy()
- query['offset'] = 0
-
- next_href = base_url
+ retries = self.get_param('extractor_retries', 3)
- entries = []
for i in itertools.count():
- response = self._download_json(
- next_href, playlist_id,
- 'Downloading track page %s' % (i + 1), query=query)
-
- collection = response['collection']
-
- if not isinstance(collection, list):
- collection = []
-
- # Empty collection may be returned, in this case we proceed
- # straight to next_href
-
- def resolve_entry(candidates):
+ attempt, last_error = -1, None
+ while attempt < retries:
+ attempt += 1
+ if last_error:
+ self.report_warning('%s. Retrying ...' % remove_end(last_error, '.'), playlist_id)
+ try:
+ response = self._download_json(
+ url, playlist_id, query=query, headers=self._HEADERS,
+ note='Downloading track page %s%s' % (i + 1, f' (retry #{attempt})' if attempt else ''))
+ break
+ except ExtractorError as e:
+ # Downloading page may result in intermittent 502 HTTP error
+ # See https://github.com/hypervideo/hypervideo/issues/872
+ if attempt >= retries or not isinstance(e.cause, compat_HTTPError) or e.cause.code != 502:
+ raise
+ last_error = str(e.cause or e.msg)
+
+ def resolve_entry(*candidates):
for cand in candidates:
if not isinstance(cand, dict):
continue
permalink_url = url_or_none(cand.get('permalink_url'))
- if not permalink_url:
- continue
- return self.url_result(
- permalink_url,
- SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None,
- str_or_none(cand.get('id')), cand.get('title'))
-
- for e in collection:
- entry = resolve_entry((e, e.get('track'), e.get('playlist')))
- if entry:
- entries.append(entry)
-
- next_href = response.get('next_href')
- if not next_href:
- break
+ if permalink_url:
+ return self.url_result(
+ permalink_url,
+ SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None,
+ str_or_none(cand.get('id')), cand.get('title'))
- next_href = response['next_href']
- parsed_next_href = compat_urlparse.urlparse(next_href)
- query = compat_urlparse.parse_qs(parsed_next_href.query)
- query.update(COMMON_QUERY)
+ for e in response['collection'] or []:
+ yield resolve_entry(e, e.get('track'), e.get('playlist'))
- return {
- '_type': 'playlist',
- 'id': playlist_id,
- 'title': playlist_title,
- 'entries': entries,
- }
+ url = response.get('next_href')
+ if not url:
+ break
+ query.pop('offset', None)
class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
@@ -691,12 +785,12 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
uploader = mobj.group('user')
user = self._download_json(
self._resolv_url(self._BASE_URL + uploader),
- uploader, 'Downloading user info')
+ uploader, 'Downloading user info', headers=self._HEADERS)
resource = mobj.group('rsrc') or 'all'
@@ -721,7 +815,7 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
def _real_extract(self, url):
track_name = self._match_id(url)
- track = self._download_json(self._resolv_url(url), track_name)
+ track = self._download_json(self._resolv_url(url), track_name, headers=self._HEADERS)
track_id = self._search_regex(
r'soundcloud:track-stations:(\d+)', track['id'], 'track id')
@@ -744,7 +838,7 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
playlist_id = mobj.group('id')
query = {}
@@ -754,7 +848,7 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
data = self._download_json(
self._API_V2_BASE + 'playlists/' + playlist_id,
- playlist_id, 'Downloading playlist', query=query)
+ playlist_id, 'Downloading playlist', query=query, headers=self._HEADERS)
return self._extract_set(data, token)
@@ -786,25 +880,14 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
})
next_url = update_url_query(self._API_V2_BASE + endpoint, query)
- collected_results = 0
-
for i in itertools.count(1):
response = self._download_json(
- next_url, collection_id, 'Downloading page {0}'.format(i),
- 'Unable to download API page')
-
- collection = response.get('collection', [])
- if not collection:
- break
+ next_url, collection_id, f'Downloading page {i}',
+ 'Unable to download API page', headers=self._HEADERS)
- collection = list(filter(bool, collection))
- collected_results += len(collection)
-
- for item in collection:
- yield self.url_result(item['uri'], SoundcloudIE.ie_key())
-
- if not collection or collected_results >= limit:
- break
+ for item in response.get('collection') or []:
+ if item:
+ yield self.url_result(item['uri'], SoundcloudIE.ie_key())
next_url = response.get('next_href')
if not next_url:
@@ -812,4 +895,4 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
def _get_n_results(self, query, n):
tracks = self._get_collection('search/tracks', query, limit=n, q=query)
- return self.playlist_result(tracks, playlist_title=query)
+ return self.playlist_result(tracks, query, query)
diff --git a/hypervideo_dl/extractor/soundgasm.py b/hypervideo_dl/extractor/soundgasm.py
index 3d78a9d..d608eb7 100644
--- a/hypervideo_dl/extractor/soundgasm.py
+++ b/hypervideo_dl/extractor/soundgasm.py
@@ -22,7 +22,7 @@ class SoundgasmIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
display_id = mobj.group('display_id')
webpage = self._download_webpage(url, display_id)
diff --git a/hypervideo_dl/extractor/southpark.py b/hypervideo_dl/extractor/southpark.py
index 0774da0..d497494 100644
--- a/hypervideo_dl/extractor/southpark.py
+++ b/hypervideo_dl/extractor/southpark.py
@@ -56,40 +56,62 @@ class SouthParkEsIE(SouthParkIE):
class SouthParkDeIE(SouthParkIE):
IE_NAME = 'southpark.de'
- _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden|collections)/(?P<id>.+?)(\?|#|$))'
- _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/'
-
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:(en/(videoclip|collections|episodes|video-clips))|(videoclip|collections|folgen))/(?P<id>(?P<unique_id>.+?)/.+?)(?:\?|#|$))'
_TESTS = [{
- 'url': 'http://www.southpark.de/clips/uygssh/the-government-wont-respect-my-privacy#tab=featured',
+ 'url': 'https://www.southpark.de/videoclip/rsribv/south-park-rueckzug-zum-gummibonbon-wald',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.southpark.de/folgen/jiru42/south-park-verkabelung-staffel-23-ep-9',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.southpark.de/collections/zzno5a/south-park-good-eats/7q26gp',
+ 'only_matching': True,
+ }, {
+ # clip
+ 'url': 'https://www.southpark.de/en/video-clips/ct46op/south-park-tooth-fairy-cartman',
'info_dict': {
- 'id': '85487c96-b3b9-4e39-9127-ad88583d9bf2',
+ 'id': 'e99d45ea-ed00-11e0-aca6-0026b9414f30',
'ext': 'mp4',
- 'title': 'South Park|The Government Won\'t Respect My Privacy',
- 'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.',
- 'timestamp': 1380160800,
- 'upload_date': '20130926',
+ 'title': 'Tooth Fairy Cartman',
+ 'description': 'md5:db02e23818b4dc9cb5f0c5a7e8833a68',
},
}, {
- # non-ASCII characters in initial URL
- 'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen',
+ # episode
+ 'url': 'https://www.southpark.de/en/episodes/yy0vjs/south-park-the-pandemic-special-season-24-ep-1',
'info_dict': {
- 'title': 'Hashtag „Aufwärmen“',
- 'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.',
+ 'id': 'f5fbd823-04bc-11eb-9b1b-0e40cf2fc285',
+ 'ext': 'mp4',
+ 'title': 'South Park',
+ 'description': 'md5:ae0d875eff169dcbed16b21531857ac1',
},
- 'playlist_count': 3,
}, {
- # non-ASCII characters in redirect URL
- 'url': 'http://www.southpark.de/alle-episoden/s18e09',
+ # clip
+ 'url': 'https://www.southpark.de/videoclip/ct46op/south-park-zahnfee-cartman',
'info_dict': {
- 'title': 'Hashtag „Aufwärmen“',
- 'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.',
+ 'id': 'e99d45ea-ed00-11e0-aca6-0026b9414f30',
+ 'ext': 'mp4',
+ 'title': 'Zahnfee Cartman',
+ 'description': 'md5:b917eec991d388811d911fd1377671ac'
},
- 'playlist_count': 3,
}, {
- 'url': 'http://www.southpark.de/collections/2476/superhero-showdown/1',
- 'only_matching': True,
+ # episode
+ 'url': 'https://www.southpark.de/folgen/242csn/south-park-her-mit-dem-hirn-staffel-1-ep-7',
+ 'info_dict': {
+ 'id': '607115f3-496f-40c3-8647-2b0bcff486c0',
+ 'ext': 'mp4',
+ 'title': 'md5:South Park | Pink Eye | E 0107 | HDSS0107X deu | Version: 634312 | Comedy Central S1',
+ },
}]
+ def _get_feed_url(self, uri, url=None):
+ video_id = self._id_from_uri(uri)
+ config = self._download_json(
+ 'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge&ref=%s' % (uri, url), video_id)
+ return self._remove_template_parameter(config['feedWithQueryParams'])
+
+ def _get_feed_query(self, uri):
+ return
+
class SouthParkNlIE(SouthParkIE):
IE_NAME = 'southpark.nl'
diff --git a/hypervideo_dl/extractor/sovietscloset.py b/hypervideo_dl/extractor/sovietscloset.py
new file mode 100644
index 0000000..7df2375
--- /dev/null
+++ b/hypervideo_dl/extractor/sovietscloset.py
@@ -0,0 +1,221 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ js_to_json,
+ try_get,
+ unified_timestamp
+)
+
+
+class SovietsClosetBaseIE(InfoExtractor):
+ MEDIADELIVERY_REFERER = {'Referer': 'https://iframe.mediadelivery.net/'}
+
+ def parse_nuxt_jsonp(self, nuxt_jsonp_url, video_id, name):
+ nuxt_jsonp = self._download_webpage(nuxt_jsonp_url, video_id, note=f'Downloading {name} __NUXT_JSONP__')
+ js, arg_keys, arg_vals = self._search_regex(
+ r'__NUXT_JSONP__\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)',
+ nuxt_jsonp, '__NUXT_JSONP__', group=['js', 'arg_keys', 'arg_vals'])
+
+ args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
+
+ for key, val in args.items():
+ if val in ('undefined', 'void 0'):
+ args[key] = 'null'
+
+ return self._parse_json(js_to_json(js, args), video_id)['data'][0]
+
+ def video_meta(self, video_id, game_name, category_name, episode_number, stream_date):
+ title = game_name
+ if category_name and category_name != 'Misc':
+ title += f' - {category_name}'
+ if episode_number:
+ title += f' #{episode_number}'
+
+ timestamp = unified_timestamp(stream_date)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'http_headers': self.MEDIADELIVERY_REFERER,
+ 'uploader': 'SovietWomble',
+ 'creator': 'SovietWomble',
+ 'release_timestamp': timestamp,
+ 'timestamp': timestamp,
+ 'uploader_id': 'SovietWomble',
+ 'uploader_url': 'https://www.twitch.tv/SovietWomble',
+ 'was_live': True,
+ 'availability': 'public',
+ 'series': game_name,
+ 'season': category_name,
+ 'episode_number': episode_number,
+ }
+
+
+class SovietsClosetIE(SovietsClosetBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?sovietscloset\.com/video/(?P<id>[0-9]+)/?'
+ _TESTS = [
+ {
+ 'url': 'https://sovietscloset.com/video/1337',
+ 'md5': '11e58781c4ca5b283307aa54db5b3f93',
+ 'info_dict': {
+ 'id': '1337',
+ 'ext': 'mp4',
+ 'title': 'The Witcher #13',
+ 'thumbnail': r're:^https?://.*\.b-cdn\.net/2f0cfbf4-3588-43a9-a7d6-7c9ea3755e67/thumbnail\.jpg$',
+ 'uploader': 'SovietWomble',
+ 'creator': 'SovietWomble',
+ 'release_timestamp': 1492091580,
+ 'release_date': '20170413',
+ 'timestamp': 1492091580,
+ 'upload_date': '20170413',
+ 'uploader_id': 'SovietWomble',
+ 'uploader_url': 'https://www.twitch.tv/SovietWomble',
+ 'duration': 7007,
+ 'was_live': True,
+ 'availability': 'public',
+ 'series': 'The Witcher',
+ 'season': 'Misc',
+ 'episode_number': 13,
+ },
+ },
+ {
+ 'url': 'https://sovietscloset.com/video/1105',
+ 'md5': '578b1958a379e7110ba38697042e9efb',
+ 'info_dict': {
+ 'id': '1105',
+ 'ext': 'mp4',
+ 'title': 'Arma 3 - Zeus Games #3',
+ 'uploader': 'SovietWomble',
+ 'thumbnail': r're:^https?://.*\.b-cdn\.net/c0e5e76f-3a93-40b4-bf01-12343c2eec5d/thumbnail\.jpg$',
+ 'uploader': 'SovietWomble',
+ 'creator': 'SovietWomble',
+ 'release_timestamp': 1461157200,
+ 'release_date': '20160420',
+ 'timestamp': 1461157200,
+ 'upload_date': '20160420',
+ 'uploader_id': 'SovietWomble',
+ 'uploader_url': 'https://www.twitch.tv/SovietWomble',
+ 'duration': 8804,
+ 'was_live': True,
+ 'availability': 'public',
+ 'series': 'Arma 3',
+ 'season': 'Zeus Games',
+ 'episode_number': 3,
+ },
+ },
+ ]
+
+ def _extract_bunnycdn_iframe(self, video_id, bunnycdn_id):
+ iframe = self._download_webpage(
+ f'https://iframe.mediadelivery.net/embed/5105/{bunnycdn_id}',
+ video_id, note='Downloading BunnyCDN iframe', headers=self.MEDIADELIVERY_REFERER)
+
+ m3u8_url = self._search_regex(r'(https?://.*?\.m3u8)', iframe, 'm3u8 url')
+ thumbnail_url = self._search_regex(r'(https?://.*?thumbnail\.jpg)', iframe, 'thumbnail url')
+
+ m3u8_formats = self._extract_m3u8_formats(m3u8_url, video_id, headers=self.MEDIADELIVERY_REFERER)
+ self._sort_formats(m3u8_formats)
+
+ if not m3u8_formats:
+ duration = None
+ else:
+ duration = self._extract_m3u8_vod_duration(
+ m3u8_formats[0]['url'], video_id, headers=self.MEDIADELIVERY_REFERER)
+
+ return {
+ 'formats': m3u8_formats,
+ 'thumbnail': thumbnail_url,
+ 'duration': duration,
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ static_assets_base = self._search_regex(r'staticAssetsBase:\"(.*?)\"', webpage, 'staticAssetsBase')
+ static_assets_base = f'https://sovietscloset.com{static_assets_base}'
+
+ stream = self.parse_nuxt_jsonp(f'{static_assets_base}/video/{video_id}/payload.js', video_id, 'video')['stream']
+
+ return {
+ **self.video_meta(
+ video_id=video_id, game_name=stream['game']['name'],
+ category_name=try_get(stream, lambda x: x['subcategory']['name'], str),
+ episode_number=stream.get('number'), stream_date=stream.get('date')),
+ **self._extract_bunnycdn_iframe(video_id, stream['bunnyId']),
+ }
+
+
+class SovietsClosetPlaylistIE(SovietsClosetBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?sovietscloset\.com/(?!video)(?P<id>[^#?]+)'
+ _TESTS = [
+
+ {
+ 'url': 'https://sovietscloset.com/The-Witcher',
+ 'info_dict': {
+ 'id': 'The-Witcher',
+ 'title': 'The Witcher',
+ },
+ 'playlist_mincount': 31,
+ },
+ {
+ 'url': 'https://sovietscloset.com/Arma-3/Zeus-Games',
+ 'info_dict': {
+ 'id': 'Arma-3/Zeus-Games',
+ 'title': 'Arma 3 - Zeus Games',
+ },
+ 'playlist_mincount': 3,
+ },
+ {
+ 'url': 'https://sovietscloset.com/arma-3/zeus-games/',
+ 'info_dict': {
+ 'id': 'arma-3/zeus-games',
+ 'title': 'Arma 3 - Zeus Games',
+ },
+ 'playlist_mincount': 3,
+ },
+ {
+ 'url': 'https://sovietscloset.com/Total-War-Warhammer',
+ 'info_dict': {
+ 'id': 'Total-War-Warhammer',
+ 'title': 'Total War: Warhammer - Greenskins',
+ },
+ 'playlist_mincount': 33,
+ },
+ ]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+ if playlist_id.endswith('/'):
+ playlist_id = playlist_id[:-1]
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ static_assets_base = self._search_regex(r'staticAssetsBase:\"(.*?)\"', webpage, 'staticAssetsBase')
+ static_assets_base = f'https://sovietscloset.com{static_assets_base}'
+
+ sovietscloset = self.parse_nuxt_jsonp(f'{static_assets_base}/payload.js', playlist_id, 'global')['games']
+
+ if '/' in playlist_id:
+ game_slug, category_slug = playlist_id.lower().split('/')
+ else:
+ game_slug = playlist_id.lower()
+ category_slug = 'misc'
+
+ game = next(game for game in sovietscloset if game['slug'].lower() == game_slug)
+ category = next((cat for cat in game['subcategories'] if cat.get('slug', '').lower() == category_slug),
+ game['subcategories'][0])
+ category_slug = category.get('slug', '').lower() or category_slug
+ playlist_title = game.get('name') or game_slug
+ if category_slug != 'misc':
+ playlist_title += f' - {category.get("name") or category_slug}'
+ entries = [{
+ **self.url_result(f'https://sovietscloset.com/video/{stream["id"]}', ie=SovietsClosetIE.ie_key()),
+ **self.video_meta(
+ video_id=stream['id'], game_name=game['name'], category_name=category.get('name'),
+ episode_number=i + 1, stream_date=stream.get('date')),
+ } for i, stream in enumerate(category['streams'])]
+
+ return self.playlist_result(entries, playlist_id, playlist_title)
diff --git a/hypervideo_dl/extractor/spankbang.py b/hypervideo_dl/extractor/spankbang.py
index 37cb8c8..dd849ae 100644
--- a/hypervideo_dl/extractor/spankbang.py
+++ b/hypervideo_dl/extractor/spankbang.py
@@ -26,17 +26,18 @@ class SpankBangIE(InfoExtractor):
)
'''
_TESTS = [{
- 'url': 'http://spankbang.com/3vvn/video/fantasy+solo',
- 'md5': '1cc433e1d6aa14bc376535b8679302f7',
+ 'url': 'https://spankbang.com/56b3d/video/the+slut+maker+hmv',
+ 'md5': '2D13903DE4ECC7895B5D55930741650A',
'info_dict': {
- 'id': '3vvn',
+ 'id': '56b3d',
'ext': 'mp4',
- 'title': 'fantasy solo',
- 'description': 'dillion harper masturbates on a bed',
+ 'title': 'The Slut Maker HMV',
+ 'description': 'Girls getting converted into cock slaves.',
'thumbnail': r're:^https?://.*\.jpg$',
- 'uploader': 'silly2587',
- 'timestamp': 1422571989,
- 'upload_date': '20150129',
+ 'uploader': 'Mindself',
+ 'uploader_id': 'mindself',
+ 'timestamp': 1617109572,
+ 'upload_date': '20210330',
'age_limit': 18,
}
}, {
@@ -70,7 +71,7 @@ class SpankBangIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id') or mobj.group('id_2')
webpage = self._download_webpage(
url.replace('/%s/embed' % video_id, '/%s/video' % video_id),
@@ -129,20 +130,20 @@ class SpankBangIE(InfoExtractor):
format_url = format_url[0]
extract_format(format_id, format_url)
- self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'tbr', 'format_id'))
+ self._sort_formats(formats)
info = self._search_json_ld(webpage, video_id, default={})
title = self._html_search_regex(
- r'(?s)<h1[^>]*>(.+?)</h1>', webpage, 'title', default=None)
+ r'(?s)<h1[^>]+\btitle=["\']([^"]+)["\']>', webpage, 'title', default=None)
description = self._search_regex(
r'<div[^>]+\bclass=["\']bottom[^>]+>\s*<p>[^<]*</p>\s*<p>([^<]+)',
webpage, 'description', default=None)
thumbnail = self._og_search_thumbnail(webpage, default=None)
uploader = self._html_search_regex(
- (r'(?s)<li[^>]+class=["\']profile[^>]+>(.+?)</a>',
- r'class="user"[^>]*><img[^>]+>([^<]+)'),
- webpage, 'uploader', default=None)
+ r'<svg[^>]+\bclass="(?:[^"]*?user[^"]*?)">.*?</svg>([^<]+)', webpage, 'uploader', default=None)
+ uploader_id = self._html_search_regex(
+ r'<a[^>]+href="/profile/([^"]+)"', webpage, 'uploader_id', default=None)
duration = parse_duration(self._search_regex(
r'<div[^>]+\bclass=["\']right_side[^>]+>\s*<span>([^<]+)',
webpage, 'duration', default=None))
@@ -157,6 +158,7 @@ class SpankBangIE(InfoExtractor):
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
+ 'uploader_id': uploader_id,
'duration': duration,
'view_count': view_count,
'formats': formats,
@@ -177,7 +179,7 @@ class SpankBangPlaylistIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
playlist_id = mobj.group('id')
display_id = mobj.group('display_id')
diff --git a/hypervideo_dl/extractor/spankwire.py b/hypervideo_dl/extractor/spankwire.py
index 35ab9ec..e97c1d2 100644
--- a/hypervideo_dl/extractor/spankwire.py
+++ b/hypervideo_dl/extractor/spankwire.py
@@ -108,7 +108,7 @@ class SpankwireIE(InfoExtractor):
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
- self._sort_formats(formats, ('height', 'tbr', 'width', 'format_id'))
+ self._sort_formats(formats)
view_count = str_to_int(video.get('viewed'))
diff --git a/hypervideo_dl/extractor/spiegeltv.py b/hypervideo_dl/extractor/spiegeltv.py
new file mode 100644
index 0000000..6ccf4c3
--- /dev/null
+++ b/hypervideo_dl/extractor/spiegeltv.py
@@ -0,0 +1,17 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .nexx import NexxIE
+
+
+class SpiegeltvIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/',
+ 'only_matching': True,
+ }
+
+ def _real_extract(self, url):
+ return self.url_result(
+ 'https://api.nexx.cloud/v3/748/videos/byid/%s'
+ % self._match_id(url), ie=NexxIE.ie_key())
diff --git a/hypervideo_dl/extractor/sport5.py b/hypervideo_dl/extractor/sport5.py
index a417b5a..35c57d6 100644
--- a/hypervideo_dl/extractor/sport5.py
+++ b/hypervideo_dl/extractor/sport5.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import ExtractorError
@@ -36,7 +35,7 @@ class Sport5IE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
media_id = mobj.group('id')
webpage = self._download_webpage(url, media_id)
diff --git a/hypervideo_dl/extractor/sportdeutschland.py b/hypervideo_dl/extractor/sportdeutschland.py
index 3e497a9..94bcaba 100644
--- a/hypervideo_dl/extractor/sportdeutschland.py
+++ b/hypervideo_dl/extractor/sportdeutschland.py
@@ -2,15 +2,12 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import (
- compat_parse_qs,
- compat_urllib_parse_urlparse,
-)
from ..utils import (
clean_html,
float_or_none,
int_or_none,
parse_iso8601,
+ parse_qs,
strip_or_none,
try_get,
)
@@ -61,9 +58,9 @@ class SportDeutschlandIE(InfoExtractor):
}
videos = asset.get('videos') or []
if len(videos) > 1:
- playlist_id = compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('playlistId', [None])[0]
+ playlist_id = parse_qs(url).get('playlistId', [None])[0]
if playlist_id:
- if self._downloader.params.get('noplaylist'):
+ if self.get_param('noplaylist'):
videos = [videos[int(playlist_id)]]
self.to_screen('Downloading just a single video because of --no-playlist')
else:
@@ -77,7 +74,7 @@ class SportDeutschlandIE(InfoExtractor):
continue
formats = self._extract_m3u8_formats(
video_url.replace('.smil', '.m3u8'), video_id, 'mp4', fatal=False)
- if not formats:
+ if not formats and not self.get_param('ignore_no_formats'):
continue
yield {
'id': video_id,
diff --git a/hypervideo_dl/extractor/springboardplatform.py b/hypervideo_dl/extractor/springboardplatform.py
index 07d99b5..49ac1f5 100644
--- a/hypervideo_dl/extractor/springboardplatform.py
+++ b/hypervideo_dl/extractor/springboardplatform.py
@@ -57,7 +57,7 @@ class SpringboardPlatformIE(InfoExtractor):
webpage)]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id') or mobj.group('id_2')
index = mobj.group('index') or mobj.group('index_2')
diff --git a/hypervideo_dl/extractor/srgssr.py b/hypervideo_dl/extractor/srgssr.py
index ac018e7..cbc1c47 100644
--- a/hypervideo_dl/extractor/srgssr.py
+++ b/hypervideo_dl/extractor/srgssr.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -82,11 +81,12 @@ class SRGSSRIE(InfoExtractor):
return media_data
def _real_extract(self, url):
- bu, media_type, media_id = re.match(self._VALID_URL, url).groups()
+ bu, media_type, media_id = self._match_valid_url(url).groups()
media_data = self._get_media_data(bu, media_type, media_id)
title = media_data['title']
formats = []
+ subtitles = {}
q = qualities(['SD', 'HD'])
for source in (media_data.get('resourceList') or []):
format_url = source.get('url')
@@ -104,12 +104,16 @@ class SRGSSRIE(InfoExtractor):
if source.get('tokenType') == 'AKAMAI':
format_url = self._get_tokenized_src(
format_url, media_id, format_id)
- formats.extend(self._extract_akamai_formats(
- format_url, media_id))
+ fmts, subs = self._extract_akamai_formats_and_subtitles(
+ format_url, media_id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
elif protocol == 'HLS':
- formats.extend(self._extract_m3u8_formats(
+ m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
format_url, media_id, 'mp4', 'm3u8_native',
- m3u8_id=format_id, fatal=False))
+ m3u8_id=format_id, fatal=False)
+ formats.extend(m3u8_fmts)
+ subtitles = self._merge_subtitles(subtitles, m3u8_subs)
elif protocol in ('HTTP', 'HTTPS'):
formats.append({
'format_id': format_id,
@@ -133,7 +137,6 @@ class SRGSSRIE(InfoExtractor):
})
self._sort_formats(formats)
- subtitles = {}
if media_type == 'video':
for sub in (media_data.get('subtitleList') or []):
sub_url = sub.get('url')
@@ -245,7 +248,7 @@ class SRGSSRPlayIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
bu = mobj.group('bu')
media_type = mobj.group('type') or mobj.group('type_2')
media_id = mobj.group('id')
diff --git a/hypervideo_dl/extractor/stanfordoc.py b/hypervideo_dl/extractor/stanfordoc.py
index ae3dd13..0003075 100644
--- a/hypervideo_dl/extractor/stanfordoc.py
+++ b/hypervideo_dl/extractor/stanfordoc.py
@@ -25,7 +25,7 @@ class StanfordOpenClassroomIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
if mobj.group('course') and mobj.group('video'): # A specific video
course = mobj.group('course')
diff --git a/hypervideo_dl/extractor/startv.py b/hypervideo_dl/extractor/startv.py
new file mode 100644
index 0000000..411320e
--- /dev/null
+++ b/hypervideo_dl/extractor/startv.py
@@ -0,0 +1,103 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+)
+from ..utils import (
+ clean_html,
+ ExtractorError,
+ traverse_obj,
+ int_or_none,
+)
+
+
+class StarTVIE(InfoExtractor):
+ _VALID_URL = r"""(?x)
+ https?://(?:www\.)?startv\.com\.tr/
+ (?:
+ (?:dizi|program)/(?:[^/?#&]+)/(?:bolumler|fragmanlar|ekstralar)|
+ video/arsiv/(?:dizi|program)/(?:[^/?#&]+)
+ )/
+ (?P<id>[^/?#&]+)
+ """
+ IE_NAME = 'startv'
+ _TESTS = [
+ {
+ 'url': 'https://www.startv.com.tr/dizi/cocuk/bolumler/3-bolum',
+ 'md5': '72381a32bcc2e2eb5841e8c8bf68f127',
+ 'info_dict': {
+ 'id': '904972',
+ 'display_id': '3-bolum',
+ 'ext': 'mp4',
+ 'title': '3. Bölüm',
+ 'description': 'md5:3a8049f05a75c2e8747116a673275de4',
+ 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$',
+ 'timestamp': 1569281400,
+ 'upload_date': '20190923'
+ },
+ },
+ {
+ 'url': 'https://www.startv.com.tr/video/arsiv/dizi/avlu/44-bolum',
+ 'only_matching': True
+ },
+ {
+ 'url': 'https://www.startv.com.tr/dizi/cocuk/fragmanlar/5-bolum-fragmani',
+ 'only_matching': True
+ },
+ {
+ 'url': 'https://www.startv.com.tr/dizi/cocuk/ekstralar/5-bolumun-nefes-kesen-final-sahnesi',
+ 'only_matching': True
+ },
+ {
+ 'url': 'https://www.startv.com.tr/program/burcu-ile-haftasonu/bolumler/1-bolum',
+ 'only_matching': True
+ },
+ {
+ 'url': 'https://www.startv.com.tr/program/burcu-ile-haftasonu/fragmanlar/2-fragman',
+ 'only_matching': True
+ },
+ {
+ 'url': 'https://www.startv.com.tr/video/arsiv/program/buyukrisk/14-bolumde-hangi-unlu-ne-sordu-',
+ 'only_matching': True
+ },
+ {
+ 'url': 'https://www.startv.com.tr/video/arsiv/program/buyukrisk/buyuk-risk-334-bolum',
+ 'only_matching': True
+ },
+ {
+ 'url': 'https://www.startv.com.tr/video/arsiv/program/dada/dada-58-bolum',
+ 'only_matching': True
+ }
+ ]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ info_url = self._search_regex(
+ r'(["\'])videoUrl\1\s*:\s*\1(?P<url>(?:(?!\1).)+)\1\s*',
+ webpage, 'video info url', group='url')
+
+ info = traverse_obj(self._download_json(info_url, display_id), 'data', expected_type=dict)
+ if not info:
+ raise ExtractorError('Failed to extract API data')
+
+ video_id = compat_str(info.get('id'))
+ title = info.get('title') or self._og_search_title(webpage)
+ description = clean_html(info.get('description')) or self._og_search_description(webpage, default=None)
+ thumbnail = self._proto_relative_url(
+ self._og_search_thumbnail(webpage), scheme='http:')
+
+ formats = self._extract_m3u8_formats(
+ traverse_obj(info, ('flavors', 'hls')), video_id, entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': int_or_none(info.get('release_date')),
+ 'formats': formats
+ }
diff --git a/hypervideo_dl/extractor/steam.py b/hypervideo_dl/extractor/steam.py
index a6a191c..7f777c4 100644
--- a/hypervideo_dl/extractor/steam.py
+++ b/hypervideo_dl/extractor/steam.py
@@ -66,7 +66,7 @@ class SteamIE(InfoExtractor):
}]
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
+ m = self._match_valid_url(url)
fileID = m.group('fileID')
if fileID:
videourl = url
@@ -139,7 +139,7 @@ class SteamIE(InfoExtractor):
'format_id': ext + quality,
'url': video_url,
})
- if not formats:
+ if not formats and not self.get_param('ignore_no_formats'):
continue
entry['formats'] = formats
entries.append(entry)
diff --git a/hypervideo_dl/extractor/streamable.py b/hypervideo_dl/extractor/streamable.py
index 3472527..8081296 100644
--- a/hypervideo_dl/extractor/streamable.py
+++ b/hypervideo_dl/extractor/streamable.py
@@ -8,6 +8,8 @@ from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
+ try_get,
+ parse_codecs,
)
@@ -29,7 +31,7 @@ class StreamableIE(InfoExtractor):
'view_count': int,
}
},
- # older video without bitrate, width/height, etc. info
+ # older video without bitrate, width/height, codecs, etc. info
{
'url': 'https://streamable.com/moo',
'md5': '2cf6923639b87fba3279ad0df3a64e73',
@@ -95,7 +97,9 @@ class StreamableIE(InfoExtractor):
'height': int_or_none(info.get('height')),
'filesize': int_or_none(info.get('size')),
'fps': int_or_none(info.get('framerate')),
- 'vbr': float_or_none(info.get('bitrate'), 1000)
+ 'vbr': float_or_none(info.get('bitrate'), 1000),
+ 'vcodec': parse_codecs(try_get(info, lambda x: x['input_metadata']['video_codec_name'])).get('vcodec'),
+ 'acodec': parse_codecs(try_get(info, lambda x: x['input_metadata']['audio_codec_name'])).get('acodec'),
})
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/streamanity.py b/hypervideo_dl/extractor/streamanity.py
new file mode 100644
index 0000000..2e2d5ee
--- /dev/null
+++ b/hypervideo_dl/extractor/streamanity.py
@@ -0,0 +1,51 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class StreamanityIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?streamanity\.com/video/(?P<id>[A-Za-z0-9]+)'
+ _TESTS = [{
+ 'url': 'https://streamanity.com/video/9DFPTnuYi8f2',
+ 'md5': '6ab171e8d4a02ad5dcbff6bea44cf5a1',
+ 'info_dict': {
+ 'id': '9DFPTnuYi8f2',
+ 'ext': 'mp4',
+ 'title': 'Bitcoin vs The Lighting Network',
+ 'thumbnail': r're:https://res\.cloudinary\.com/.+\.png',
+ 'description': '',
+ 'uploader': 'Tom Bombadil (Freddy78)',
+ }
+ }, {
+ 'url': 'https://streamanity.com/video/JktOUjSlfzTD',
+ 'md5': '31f131e28abd3377c38be586a59532dc',
+ 'info_dict': {
+ 'id': 'JktOUjSlfzTD',
+ 'ext': 'mp4',
+ 'title': 'Share data when you see it',
+ 'thumbnail': r're:https://res\.cloudinary\.com/.+\.png',
+ 'description': 'Reposting as data should be public and stored on blockchain',
+ 'uploader': 'digitalcurrencydaily',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ video_info = self._download_json(
+ f'https://app.streamanity.com/api/video/{video_id}', video_id)['data']['video']
+
+ formats = self._extract_m3u8_formats(
+ f'https://stream.mux.com/{video_info["play_id"]}.m3u8?token={video_info["token"]}',
+ video_id, ext='mp4', m3u8_id='hls')
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video_info['title'],
+ 'description': video_info.get('description'),
+ 'uploader': video_info.get('author_name'),
+ 'is_live': False,
+ 'thumbnail': video_info.get('thumb'),
+ 'formats': formats,
+ }
diff --git a/hypervideo_dl/extractor/streamcloud.py b/hypervideo_dl/extractor/streamcloud.py
index 984dea4..b97bb43 100644
--- a/hypervideo_dl/extractor/streamcloud.py
+++ b/hypervideo_dl/extractor/streamcloud.py
@@ -15,12 +15,12 @@ class StreamcloudIE(InfoExtractor):
_VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)(?:/(?P<fname>[^#?]*)\.html)?'
_TESTS = [{
- 'url': 'http://streamcloud.eu/skp9j99s4bpz/youtube_dl_test_video_____________-BaW_jenozKc.mp4.html',
+ 'url': 'http://streamcloud.eu/skp9j99s4bpz/youtube-dl_test_video_____________-BaW_jenozKc.mp4.html',
'md5': '6bea4c7fa5daaacc2a946b7146286686',
'info_dict': {
'id': 'skp9j99s4bpz',
'ext': 'mp4',
- 'title': 'hypervideo test video \'/\\ ä ↭',
+ 'title': 'youtube-dl test video \'/\\ ä ↭',
},
'skip': 'Only available from the EU'
}, {
diff --git a/hypervideo_dl/extractor/stv.py b/hypervideo_dl/extractor/stv.py
index 539220a..d36a4b6 100644
--- a/hypervideo_dl/extractor/stv.py
+++ b/hypervideo_dl/extractor/stv.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -43,7 +42,7 @@ class STVPlayerIE(InfoExtractor):
}
def _real_extract(self, url):
- ptype, video_id = re.match(self._VALID_URL, url).groups()
+ ptype, video_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, video_id, fatal=False) or ''
props = (self._parse_json(self._search_regex(
diff --git a/hypervideo_dl/extractor/svt.py b/hypervideo_dl/extractor/svt.py
index a5bb6da..38e0086 100644
--- a/hypervideo_dl/extractor/svt.py
+++ b/hypervideo_dl/extractor/svt.py
@@ -49,7 +49,7 @@ class SVTBaseIE(InfoExtractor):
if not formats and rights.get('geoBlockedSweden'):
self.raise_geo_restricted(
'This video is only available in Sweden',
- countries=self._GEO_COUNTRIES)
+ countries=self._GEO_COUNTRIES, metadata_available=True)
self._sort_formats(formats)
subtitles = {}
@@ -119,7 +119,7 @@ class SVTIE(SVTBaseIE):
return mobj.group('url')
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
widget_id = mobj.group('widget_id')
article_id = mobj.group('id')
@@ -225,7 +225,7 @@ class SVTPlayIE(SVTPlayBaseIE):
return info_dict
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
svt_id = mobj.group('svt_id') or mobj.group('modal_id')
@@ -301,7 +301,7 @@ class SVTSeriesIE(SVTPlayBaseIE):
return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTSeriesIE, cls).suitable(url)
def _real_extract(self, url):
- series_slug, season_id = re.match(self._VALID_URL, url).groups()
+ series_slug, season_id = self._match_valid_url(url).groups()
series = self._download_json(
'https://api.svt.se/contento/graphql', series_slug,
@@ -400,7 +400,7 @@ class SVTPageIE(InfoExtractor):
return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPageIE, cls).suitable(url)
def _real_extract(self, url):
- path, display_id = re.match(self._VALID_URL, url).groups()
+ path, display_id = self._match_valid_url(url).groups()
article = self._download_json(
'https://api.svt.se/nss-api/page/' + path, display_id,
diff --git a/hypervideo_dl/extractor/tagesschau.py b/hypervideo_dl/extractor/tagesschau.py
index 8ceab7e..25c2004 100644
--- a/hypervideo_dl/extractor/tagesschau.py
+++ b/hypervideo_dl/extractor/tagesschau.py
@@ -78,7 +78,7 @@ class TagesschauPlayerIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
# kind = mobj.group('kind').lower()
@@ -263,7 +263,7 @@ class TagesschauIE(InfoExtractor):
return formats
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id') or mobj.group('path')
display_id = video_id.lstrip('-')
diff --git a/hypervideo_dl/extractor/tastytrade.py b/hypervideo_dl/extractor/tastytrade.py
new file mode 100644
index 0000000..7fe96bd
--- /dev/null
+++ b/hypervideo_dl/extractor/tastytrade.py
@@ -0,0 +1,43 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+
+
+class TastyTradeIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?tastytrade\.com/tt/shows/[^/]+/episodes/(?P<id>[^/?#&]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.tastytrade.com/tt/shows/market-measures/episodes/correlation-in-short-volatility-06-28-2017',
+ 'info_dict': {
+ 'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM',
+ 'ext': 'mp4',
+ 'title': 'A History of Teaming',
+ 'description': 'md5:2a9033db8da81f2edffa4c99888140b3',
+ 'duration': 422.255,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': ['Ooyala'],
+ }, {
+ 'url': 'https://www.tastytrade.com/tt/shows/daily-dose/episodes/daily-dose-06-30-2017',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ ooyala_code = self._search_regex(
+ r'data-media-id=(["\'])(?P<code>(?:(?!\1).)+)\1',
+ webpage, 'ooyala code', group='code')
+
+ info = self._search_json_ld(webpage, display_id, fatal=False)
+ info.update({
+ '_type': 'url_transparent',
+ 'ie_key': OoyalaIE.ie_key(),
+ 'url': 'ooyala:%s' % ooyala_code,
+ 'display_id': display_id,
+ })
+ return info
diff --git a/hypervideo_dl/extractor/tbs.py b/hypervideo_dl/extractor/tbs.py
index e8a7c65..c7d62ff 100644
--- a/hypervideo_dl/extractor/tbs.py
+++ b/hypervideo_dl/extractor/tbs.py
@@ -16,7 +16,7 @@ from ..utils import (
class TBSIE(TurnerBaseIE):
- _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com(?P<path>/(?:movies|shows/[^/]+/(?:clips|season-\d+/episode-\d+))/(?P<id>[^/?#]+))'
+ _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com(?P<path>/(?:movies|watchtnt|watchtbs|shows/[^/]+/(?:clips|season-\d+/episode-\d+))/(?P<id>[^/?#]+))'
_TESTS = [{
'url': 'http://www.tntdrama.com/shows/the-alienist/clips/monster',
'info_dict': {
@@ -40,12 +40,13 @@ class TBSIE(TurnerBaseIE):
}]
def _real_extract(self, url):
- site, path, display_id = re.match(self._VALID_URL, url).groups()
+ site, path, display_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, display_id)
drupal_settings = self._parse_json(self._search_regex(
r'<script[^>]+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})</script>',
webpage, 'drupal setting'), display_id)
- video_data = next(v for v in drupal_settings['turner_playlist'] if v.get('url') == path)
+ isLive = 'watchtnt' in path or 'watchtbs' in path
+ video_data = next(v for v in drupal_settings['turner_playlist'] if isLive or v.get('url') == path)
media_id = video_data['mediaID']
title = video_data['title']
@@ -56,7 +57,8 @@ class TBSIE(TurnerBaseIE):
media_id, tokenizer_query, {
'url': url,
'site_name': site[:3].upper(),
- 'auth_required': video_data.get('authRequired') == '1',
+ 'auth_required': video_data.get('authRequired') == '1' or isLive,
+ 'is_live': isLive
})
thumbnails = []
@@ -85,5 +87,6 @@ class TBSIE(TurnerBaseIE):
'season_number': int_or_none(video_data.get('season')),
'episode_number': int_or_none(video_data.get('episode')),
'thumbnails': thumbnails,
+ 'is_live': isLive
})
return info
diff --git a/hypervideo_dl/extractor/teachable.py b/hypervideo_dl/extractor/teachable.py
index 2394f86..37eae82 100644
--- a/hypervideo_dl/extractor/teachable.py
+++ b/hypervideo_dl/extractor/teachable.py
@@ -151,7 +151,7 @@ class TeachableIE(TeachableBaseIE):
return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url)
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
site = mobj.group('site') or mobj.group('site_t')
video_id = mobj.group('id')
@@ -248,7 +248,7 @@ class TeachableCourseIE(TeachableBaseIE):
TeachableCourseIE, cls).suitable(url)
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
site = mobj.group('site') or mobj.group('site_t')
course_id = mobj.group('id')
diff --git a/hypervideo_dl/extractor/teachertube.py b/hypervideo_dl/extractor/teachertube.py
index 1272078..e22f011 100644
--- a/hypervideo_dl/extractor/teachertube.py
+++ b/hypervideo_dl/extractor/teachertube.py
@@ -111,7 +111,7 @@ class TeacherTubeUserIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
user_id = mobj.group('user')
urls = []
diff --git a/hypervideo_dl/extractor/techtalks.py b/hypervideo_dl/extractor/techtalks.py
index a5b62c7..78f0731 100644
--- a/hypervideo_dl/extractor/techtalks.py
+++ b/hypervideo_dl/extractor/techtalks.py
@@ -44,7 +44,7 @@ class TechTalksIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
talk_id = mobj.group('id')
webpage = self._download_webpage(url, talk_id)
rtmp_url = self._search_regex(
diff --git a/hypervideo_dl/extractor/tele13.py b/hypervideo_dl/extractor/tele13.py
index a29a64b..f8a2755 100644
--- a/hypervideo_dl/extractor/tele13.py
+++ b/hypervideo_dl/extractor/tele13.py
@@ -70,7 +70,7 @@ class Tele13IE(InfoExtractor):
formats.append({
'url': format_url,
'format_id': f.get('label'),
- 'preference': preference(f.get('label')),
+ 'quality': preference(f.get('label')),
'ext': ext,
})
urls.append(format_url)
diff --git a/hypervideo_dl/extractor/tele5.py b/hypervideo_dl/extractor/tele5.py
index 3e1a7a9..0d9cf75 100644
--- a/hypervideo_dl/extractor/tele5.py
+++ b/hypervideo_dl/extractor/tele5.py
@@ -6,9 +6,9 @@ import re
from .common import InfoExtractor
from .jwplatform import JWPlatformIE
from .nexx import NexxIE
-from ..compat import compat_urlparse
from ..utils import (
NO_DEFAULT,
+ parse_qs,
smuggle_url,
)
@@ -64,7 +64,7 @@ class Tele5IE(InfoExtractor):
}]
def _real_extract(self, url):
- qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ qs = parse_qs(url)
video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0]
NEXX_ID_RE = r'\d{6,}'
diff --git a/hypervideo_dl/extractor/telemb.py b/hypervideo_dl/extractor/telemb.py
index 9bcac4e..ac2d603 100644
--- a/hypervideo_dl/extractor/telemb.py
+++ b/hypervideo_dl/extractor/telemb.py
@@ -38,7 +38,7 @@ class TeleMBIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id')
@@ -57,7 +57,7 @@ class TeleMBIE(InfoExtractor):
'app': rtmp.group('app'),
'player_url': 'http://p.jwpcdn.com/6/10/jwplayer.flash.swf',
'page_url': 'http://www.telemb.be',
- 'preference': -1,
+ 'preference': -10,
})
formats.append(fmt)
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/telemundo.py b/hypervideo_dl/extractor/telemundo.py
new file mode 100644
index 0000000..18552a0
--- /dev/null
+++ b/hypervideo_dl/extractor/telemundo.py
@@ -0,0 +1,58 @@
+# coding=utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ try_get,
+ unified_timestamp,
+ HEADRequest,
+)
+
+
+class TelemundoIE(InfoExtractor):
+
+ _VALID_URL = r'https?:\/\/(?:www\.)?telemundo\.com\/.+?video\/[^\/]+(?P<id>tmvo\d{7})'
+ _TESTS = [{
+ 'url': 'https://www.telemundo.com/noticias/noticias-telemundo-en-la-noche/empleo/video/esta-aplicacion-gratuita-esta-ayudando-los-latinos-encontrar-trabajo-en-estados-unidos-tmvo9829325',
+ 'info_dict': {
+ 'id': 'tmvo9829325',
+ 'timestamp': 1621396800,
+ 'title': 'Esta aplicación gratuita está ayudando a los latinos a encontrar trabajo en Estados Unidos',
+ 'uploader': 'Telemundo',
+ 'uploader_id': 'NBCU_Telemundo',
+ 'ext': 'mp4',
+ 'upload_date': '20210519',
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'https://www.telemundo.com/shows/al-rojo-vivo/empleo/video/personajes-de-times-square-piden-que-la-ciudad-de-nueva-york-los-deje-volver-trabajar-tmvo9816272',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ metadata = self._parse_json(
+ self._search_regex(r'<[^>]+id="__NEXT_DATA__"[^>]+>([^<]+)', webpage, 'JSON metadata'), video_id)
+ redirect_url = try_get(
+ metadata,
+ lambda x: x['props']['initialState']['video']['associatedPlaylists'][0]['videos'][0]['videoAssets'][0]['publicUrl'])
+
+ m3u8_url = self._request_webpage(HEADRequest(
+ redirect_url + '?format=redirect&manifest=m3u&format=redirect&Tracking=true&Embedded=true&formats=MPEG4'),
+ video_id, 'Processing m3u8').geturl()
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
+ self._sort_formats(formats)
+ date = unified_timestamp(try_get(
+ metadata, lambda x: x['props']['initialState']['video']['associatedPlaylists'][0]['videos'][0]['datePublished'].split(' ', 1)[1]))
+ return {
+ 'url': url,
+ 'id': video_id,
+ 'title': self._search_regex(r'<h1[^>]+>([^<]+)', webpage, 'title', fatal=False),
+ 'formats': formats,
+ 'timestamp': date,
+ 'uploader': 'Telemundo',
+ 'uploader_id': self._search_regex(r'https?:\/\/(?:[^/]+\/){3}video\/(?P<id>[^\/]+)', m3u8_url, 'Akamai account', fatal=False)
+ }
diff --git a/hypervideo_dl/extractor/tennistv.py b/hypervideo_dl/extractor/tennistv.py
index a586f30..a39a2fc 100644
--- a/hypervideo_dl/extractor/tennistv.py
+++ b/hypervideo_dl/extractor/tennistv.py
@@ -69,7 +69,7 @@ class TennisTVIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- internal_id = self._search_regex(r'video=([0-9]+)', webpage, 'internal video id')
+ internal_id = self._search_regex(r'video=([\w-]+)', webpage, 'internal video id')
headers = {
'Origin': 'https://www.tennistv.com',
@@ -79,16 +79,18 @@ class TennisTVIE(InfoExtractor):
}
check_data = {
'videoID': internal_id,
- 'VideoUrlType': 'HLSV3',
+ 'VideoUrlType': 'HLS',
}
check_json = json.dumps(check_data).encode('utf-8')
check_result = self._download_json(
'https://www.tennistv.com/api/users/v1/entitlementchecknondiva',
video_id, note='Checking video authorization', headers=headers, data=check_json)
formats = self._extract_m3u8_formats(check_result['contentUrl'], video_id, ext='mp4')
+ self._sort_formats(formats)
- vdata_url = 'https://www.tennistv.com/api/channels/v1/de/none/video/%s' % video_id
- vdata = self._download_json(vdata_url, video_id)
+ vdata = self._download_json(
+ 'https://www.tennistv.com/api/en/v2/none/common/video/%s' % video_id,
+ video_id, headers=headers)
timestamp = unified_timestamp(vdata['timestamp'])
thumbnail = vdata['video']['thumbnailUrl']
diff --git a/hypervideo_dl/extractor/tenplay.py b/hypervideo_dl/extractor/tenplay.py
index cd30d57..c810cfd 100644
--- a/hypervideo_dl/extractor/tenplay.py
+++ b/hypervideo_dl/extractor/tenplay.py
@@ -1,70 +1,90 @@
# coding: utf-8
from __future__ import unicode_literals
+from datetime import datetime
+import base64
+
from .common import InfoExtractor
from ..utils import (
HEADRequest,
- parse_age_limit,
- parse_iso8601,
- # smuggle_url,
+ urlencode_postdata,
)
class TenPlayIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/]+/)+(?P<id>tpv\d{6}[a-z]{5})'
+ _NETRC_MACHINE = '10play'
_TESTS = [{
- 'url': 'https://10play.com.au/masterchef/episodes/season-1/masterchef-s1-ep-1/tpv190718kwzga',
+ 'url': 'https://10play.com.au/todd-sampsons-body-hack/episodes/season-4/episode-7/tpv200921kvngh',
'info_dict': {
- 'id': '6060533435001',
+ 'id': '6192880312001',
'ext': 'mp4',
- 'title': 'MasterChef - S1 Ep. 1',
- 'description': 'md5:4fe7b78e28af8f2d900cd20d900ef95c',
- 'age_limit': 10,
- 'timestamp': 1240828200,
- 'upload_date': '20090427',
- 'uploader_id': '2199827728001',
+ 'title': "Todd Sampson's Body Hack - S4 Ep. 2",
+ 'description': 'md5:fa278820ad90f08ea187f9458316ac74',
+ 'age_limit': 15,
+ 'timestamp': 1600770600,
+ 'upload_date': '20200922',
+ 'uploader': 'Channel 10',
+ 'uploader_id': '2199827728001'
},
'params': {
- # 'format': 'bestvideo',
'skip_download': True,
}
}, {
'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc',
'only_matching': True,
}]
- # BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s'
_GEO_BYPASS = False
- _FASTLY_URL_TEMPL = 'https://10-selector.global.ssl.fastly.net/s/kYEXFC/media/%s?mbr=true&manifest=m3u&format=redirect'
+
+ _AUS_AGES = {
+ 'G': 0,
+ 'PG': 15,
+ 'M': 15,
+ 'MA': 15,
+ 'MA15+': 15,
+ 'R': 18,
+ 'X': 18
+ }
+
+ def _get_bearer_token(self, video_id):
+ username, password = self._get_login_info()
+ if username is None or password is None:
+ self.raise_login_required('Your 10play account\'s details must be provided with --username and --password.')
+ _timestamp = datetime.now().strftime('%Y%m%d000000')
+ _auth_header = base64.b64encode(_timestamp.encode('ascii')).decode('ascii')
+ data = self._download_json('https://10play.com.au/api/user/auth', video_id, 'Getting bearer token', headers={
+ 'X-Network-Ten-Auth': _auth_header,
+ }, data=urlencode_postdata({
+ 'email': username,
+ 'password': password,
+ }))
+ return "Bearer " + data['jwt']['accessToken']
def _real_extract(self, url):
content_id = self._match_id(url)
+ _token = self._get_bearer_token(content_id)
data = self._download_json(
- 'https://10play.com.au/api/video/' + content_id, content_id)
- video = data.get('video') or {}
- metadata = data.get('metaData') or {}
- brightcove_id = video.get('videoId') or metadata['showContentVideoId']
- # brightcove_url = smuggle_url(
- # self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
- # {'geo_countries': ['AU']})
+ 'https://10play.com.au/api/v1/videos/' + content_id, content_id)
+ _video_url = self._download_json(
+ data.get('playbackApiEndpoint'), content_id, 'Downloading video JSON',
+ headers={'Authorization': _token}).get('source')
m3u8_url = self._request_webpage(HEADRequest(
- self._FASTLY_URL_TEMPL % brightcove_id), brightcove_id).geturl()
+ _video_url), content_id).geturl()
if '10play-not-in-oz' in m3u8_url:
self.raise_geo_restricted(countries=['AU'])
- formats = self._extract_m3u8_formats(m3u8_url, brightcove_id, 'mp4')
+ formats = self._extract_m3u8_formats(m3u8_url, content_id, 'mp4')
self._sort_formats(formats)
return {
- # '_type': 'url_transparent',
- # 'url': brightcove_url,
'formats': formats,
- 'id': brightcove_id,
- 'title': video.get('title') or metadata.get('pageContentName') or metadata['showContentName'],
- 'description': video.get('description'),
- 'age_limit': parse_age_limit(video.get('showRatingClassification') or metadata.get('showProgramClassification')),
- 'series': metadata.get('showName'),
- 'season': metadata.get('showContentSeason'),
- 'timestamp': parse_iso8601(metadata.get('contentPublishDate') or metadata.get('pageContentPublishDate')),
- 'thumbnail': video.get('poster'),
+ 'id': data.get('altId') or content_id,
+ 'title': data.get('title'),
+ 'description': data.get('description'),
+ 'age_limit': self._AUS_AGES.get(data.get('classification')),
+ 'series': data.get('showName'),
+ 'season': data.get('showContentSeason'),
+ 'timestamp': data.get('published'),
+ 'thumbnail': data.get('imageUrl'),
+ 'uploader': 'Channel 10',
'uploader_id': '2199827728001',
- # 'ie_key': 'BrightcoveNew',
}
diff --git a/hypervideo_dl/extractor/testurl.py b/hypervideo_dl/extractor/testurl.py
index 84a14a0..8bc512a 100644
--- a/hypervideo_dl/extractor/testurl.py
+++ b/hypervideo_dl/extractor/testurl.py
@@ -15,7 +15,7 @@ class TestURLIE(InfoExtractor):
def _real_extract(self, url):
from ..extractor import gen_extractors
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
extractor_id = mobj.group('extractor')
all_extractors = gen_extractors()
diff --git a/hypervideo_dl/extractor/tf1.py b/hypervideo_dl/extractor/tf1.py
index 23c2808..669eb50 100644
--- a/hypervideo_dl/extractor/tf1.py
+++ b/hypervideo_dl/extractor/tf1.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..utils import (
@@ -41,7 +40,7 @@ class TF1IE(InfoExtractor):
}]
def _real_extract(self, url):
- program_slug, slug = re.match(self._VALID_URL, url).groups()
+ program_slug, slug = self._match_valid_url(url).groups()
video = self._download_json(
'https://www.tf1.fr/graphql/web', slug, query={
'id': '9b80783950b85247541dd1d851f9cc7fa36574af015621f853ab111a679ce26f',
diff --git a/hypervideo_dl/extractor/theplatform.py b/hypervideo_dl/extractor/theplatform.py
index adfe11e..c2729f1 100644
--- a/hypervideo_dl/extractor/theplatform.py
+++ b/hypervideo_dl/extractor/theplatform.py
@@ -10,15 +10,12 @@ import hashlib
from .once import OnceIE
from .adobepass import AdobePassIE
-from ..compat import (
- compat_parse_qs,
- compat_urllib_parse_urlparse,
-)
from ..utils import (
determine_ext,
ExtractorError,
float_or_none,
int_or_none,
+ parse_qs,
sanitized_Request,
unsmuggle_url,
update_url_query,
@@ -238,7 +235,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
'countries': smuggled_data.get('geo_countries'),
})
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
provider_id = mobj.group('provider_id')
video_id = mobj.group('id')
@@ -250,7 +247,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
path += mobj.group('media')
path += video_id
- qs_dict = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ qs_dict = parse_qs(url)
if 'guid' in qs_dict:
webpage = self._download_webpage(url, video_id)
scripts = re.findall(r'<script[^>]+src="([^"]+)"', webpage)
@@ -359,7 +356,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
if first_video_id is None:
first_video_id = cur_video_id
duration = float_or_none(item.get('plfile$duration'))
- file_asset_types = item.get('plfile$assetTypes') or compat_parse_qs(compat_urllib_parse_urlparse(smil_url).query)['assetTypes']
+ file_asset_types = item.get('plfile$assetTypes') or parse_qs(smil_url)['assetTypes']
for asset_type in file_asset_types:
if asset_type in asset_types:
continue
@@ -404,7 +401,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
return ret
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
provider_id = mobj.group('provider_id')
diff --git a/hypervideo_dl/extractor/theta.py b/hypervideo_dl/extractor/theta.py
new file mode 100644
index 0000000..3b65436
--- /dev/null
+++ b/hypervideo_dl/extractor/theta.py
@@ -0,0 +1,87 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import try_get
+
+
+class ThetaStreamIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?theta\.tv/(?!video/)(?P<id>[a-z0-9]+)'
+ _TESTS = [{
+ 'url': 'https://www.theta.tv/davirus',
+ 'skip': 'The live may have ended',
+ 'info_dict': {
+ 'id': 'DaVirus',
+ 'ext': 'mp4',
+ 'title': 'I choose you - My Community is King -👀 - YO HABLO ESPANOL - CODE DAVIRUS',
+ 'thumbnail': r're:https://live-thumbnails-prod-theta-tv\.imgix\.net/thumbnail/.+\.jpg',
+ }
+ }, {
+ 'url': 'https://www.theta.tv/mst3k',
+ 'note': 'This channel is live 24/7',
+ 'info_dict': {
+ 'id': 'MST3K',
+ 'ext': 'mp4',
+ 'title': 'Mystery Science Theatre 3000 24/7 Powered by the THETA Network.',
+ 'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+\.jpg',
+ }
+ }]
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+ info = self._download_json(f'https://api.theta.tv/v1/channel?alias={channel_id}', channel_id)['body']
+
+ m3u8_playlist = next(
+ data['url'] for data in info['live_stream']['video_urls']
+ if data.get('type') != 'embed' and data.get('resolution') in ('master', 'source'))
+
+ formats = self._extract_m3u8_formats(m3u8_playlist, channel_id, 'mp4', m3u8_id='hls', live=True)
+ self._sort_formats(formats)
+
+ channel = try_get(info, lambda x: x['user']['username']) # using this field instead of channel_id due to capitalization
+
+ return {
+ 'id': channel,
+ 'title': try_get(info, lambda x: x['live_stream']['title']),
+ 'channel': channel,
+ 'view_count': try_get(info, lambda x: x['live_stream']['view_count']),
+ 'is_live': True,
+ 'formats': formats,
+ 'thumbnail': try_get(info, lambda x: x['live_stream']['thumbnail_url']),
+ }
+
+
+class ThetaVideoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?theta\.tv/video/(?P<id>vid[a-z0-9]+)'
+ _TEST = {
+ 'url': 'https://www.theta.tv/video/vidiq6aaet3kzf799p0',
+ 'md5': '633d8c29eb276bb38a111dbd591c677f',
+ 'info_dict': {
+ 'id': 'vidiq6aaet3kzf799p0',
+ 'ext': 'mp4',
+ 'title': 'Theta EdgeCast Tutorial',
+ 'uploader': 'Pixiekittie',
+ 'description': 'md5:e316253f5bdced8b5a46bb50ae60a09f',
+ 'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+/vod_thumb/.+.jpg',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ info = self._download_json(f'https://api.theta.tv/v1/video/{video_id}/raw', video_id)['body']
+
+ m3u8_playlist = try_get(info, lambda x: x['video_urls'][0]['url'])
+
+ formats = self._extract_m3u8_formats(m3u8_playlist, video_id, 'mp4', m3u8_id='hls')
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': info.get('title'),
+ 'uploader': try_get(info, lambda x: x['user']['username']),
+ 'description': info.get('description'),
+ 'view_count': info.get('view_count'),
+ 'like_count': info.get('like_count'),
+ 'formats': formats,
+ 'thumbnail': info.get('thumbnail_url'),
+ }
diff --git a/hypervideo_dl/extractor/theweatherchannel.py b/hypervideo_dl/extractor/theweatherchannel.py
index b2a8c37..9e506c9 100644
--- a/hypervideo_dl/extractor/theweatherchannel.py
+++ b/hypervideo_dl/extractor/theweatherchannel.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import json
-import re
from .theplatform import ThePlatformIE
from ..utils import (
@@ -33,7 +32,7 @@ class TheWeatherChannelIE(ThePlatformIE):
}]
def _real_extract(self, url):
- asset_name, locale, display_id = re.match(self._VALID_URL, url).groups()
+ asset_name, locale, display_id = self._match_valid_url(url).groups()
if not locale:
locale = 'en-US'
video_data = list(self._download_json(
diff --git a/hypervideo_dl/extractor/thisav.py b/hypervideo_dl/extractor/thisav.py
index dc3dd03..4af286e 100644
--- a/hypervideo_dl/extractor/thisav.py
+++ b/hypervideo_dl/extractor/thisav.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import remove_end
@@ -34,7 +33,7 @@ class ThisAVIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
diff --git a/hypervideo_dl/extractor/threeqsdn.py b/hypervideo_dl/extractor/threeqsdn.py
index f6d37bb..bb76103 100644
--- a/hypervideo_dl/extractor/threeqsdn.py
+++ b/hypervideo_dl/extractor/threeqsdn.py
@@ -99,16 +99,21 @@ class ThreeQSDNIE(InfoExtractor):
aspect = float_or_none(config.get('aspect'))
formats = []
+ subtitles = {}
for source_type, source in (config.get('sources') or {}).items():
if not source:
continue
if source_type == 'dash':
- formats.extend(self._extract_mpd_formats(
- source, video_id, mpd_id='mpd', fatal=False))
+ fmts, subs = self._extract_mpd_formats_and_subtitles(
+ source, video_id, mpd_id='mpd', fatal=False)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
elif source_type == 'hls':
- formats.extend(self._extract_m3u8_formats(
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
source, video_id, 'mp4', 'm3u8' if live else 'm3u8_native',
- m3u8_id='hls', fatal=False))
+ m3u8_id='hls', fatal=False)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
elif source_type == 'progressive':
for s in source:
src = s.get('src')
@@ -133,14 +138,11 @@ class ThreeQSDNIE(InfoExtractor):
'vcodec': 'none' if height == 0 else None,
'width': width,
})
- for f in formats:
- if f.get('acodec') == 'none':
- f['preference'] = -40
- elif f.get('vcodec') == 'none':
- f['preference'] = -50
- self._sort_formats(formats, ('preference', 'width', 'height', 'source_preference', 'tbr', 'vbr', 'abr', 'ext', 'format_id'))
+ # It seems like this would be correctly handled by default
+ # However, unless someone can confirm this, the old
+ # behaviour is being kept as-is
+ self._sort_formats(formats, ('res', 'source_preference'))
- subtitles = {}
for subtitle in (config.get('subtitles') or []):
src = subtitle.get('src')
if not src:
diff --git a/hypervideo_dl/extractor/tiktok.py b/hypervideo_dl/extractor/tiktok.py
index 4faa6de..1db6327 100644
--- a/hypervideo_dl/extractor/tiktok.py
+++ b/hypervideo_dl/extractor/tiktok.py
@@ -1,147 +1,563 @@
# coding: utf-8
from __future__ import unicode_literals
+import itertools
+import random
+import string
+import time
+import json
+
from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
from ..utils import (
- compat_str,
ExtractorError,
- float_or_none,
int_or_none,
str_or_none,
+ traverse_obj,
try_get,
url_or_none,
+ qualities,
)
class TikTokBaseIE(InfoExtractor):
- def _extract_video(self, data, video_id=None):
- video = data['video']
- description = str_or_none(try_get(data, lambda x: x['desc']))
- width = int_or_none(try_get(data, lambda x: video['width']))
- height = int_or_none(try_get(data, lambda x: video['height']))
+ _APP_VERSION = '20.9.3'
+ _MANIFEST_APP_VERSION = '291'
+ _APP_NAME = 'trill'
+ _AID = 1180
+ _API_HOSTNAME = 'api-t2.tiktokv.com'
+ _UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s'
+ QUALITIES = ('360p', '540p', '720p')
- format_urls = set()
- formats = []
- for format_id in ('download', 'play'):
- format_url = url_or_none(video.get('%sAddr' % format_id))
- if not format_url:
- continue
- if format_url in format_urls:
- continue
- format_urls.add(format_url)
- formats.append({
- 'url': format_url,
+ def _call_api(self, ep, query, video_id, fatal=True,
+ note='Downloading API JSON', errnote='Unable to download API page'):
+ real_query = {
+ **query,
+ 'version_name': self._APP_VERSION,
+ 'version_code': self._MANIFEST_APP_VERSION,
+ 'build_number': self._APP_VERSION,
+ 'manifest_version_code': self._MANIFEST_APP_VERSION,
+ 'update_version_code': self._MANIFEST_APP_VERSION,
+ 'openudid': ''.join(random.choice('0123456789abcdef') for i in range(16)),
+ 'uuid': ''.join([random.choice(string.digits) for num in range(16)]),
+ '_rticket': int(time.time() * 1000),
+ 'ts': int(time.time()),
+ 'device_brand': 'Google',
+ 'device_type': 'Pixel 4',
+ 'device_platform': 'android',
+ 'resolution': '1080*1920',
+ 'dpi': 420,
+ 'os_version': '10',
+ 'os_api': '29',
+ 'carrier_region': 'US',
+ 'sys_region': 'US',
+ 'region': 'US',
+ 'app_name': self._APP_NAME,
+ 'app_language': 'en',
+ 'language': 'en',
+ 'timezone_name': 'America/New_York',
+ 'timezone_offset': '-14400',
+ 'channel': 'googleplay',
+ 'ac': 'wifi',
+ 'mcc_mnc': '310260',
+ 'is_my_cn': 0,
+ 'aid': self._AID,
+ 'ssmix': 'a',
+ 'as': 'a1qwert123',
+ 'cp': 'cbfhckdckkde1',
+ }
+ self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for i in range(160)))
+ return self._download_json(
+ 'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id,
+ fatal=fatal, note=note, errnote=errnote, headers={
+ 'User-Agent': f'com.ss.android.ugc.trill/{self._MANIFEST_APP_VERSION} (Linux; U; Android 10; en_US; Pixel 4; Build/QQ3A.200805.001; Cronet/58.0.2991.0)',
+ 'Accept': 'application/json',
+ }, query=real_query)
+
+ def _parse_aweme_video_app(self, aweme_detail):
+ aweme_id = aweme_detail['aweme_id']
+ video_info = aweme_detail['video']
+
+ def parse_url_key(url_key):
+ format_id, codec, res, bitrate = self._search_regex(
+ r'v[^_]+_(?P<id>(?P<codec>[^_]+)_(?P<res>\d+p)_(?P<bitrate>\d+))', url_key,
+ 'url key', default=(None, None, None, None), group=('id', 'codec', 'res', 'bitrate'))
+ if not format_id:
+ return {}, None
+ return {
+ 'format_id': format_id,
+ 'vcodec': 'h265' if codec == 'bytevc1' else codec,
+ 'tbr': int_or_none(bitrate, scale=1000) or None,
+ 'quality': qualities(self.QUALITIES)(res),
+ }, res
+
+ known_resolutions = {}
+
+ def extract_addr(addr, add_meta={}):
+ parsed_meta, res = parse_url_key(addr.get('url_key', ''))
+ if res:
+ known_resolutions.setdefault(res, {}).setdefault('height', add_meta.get('height'))
+ known_resolutions[res].setdefault('width', add_meta.get('width'))
+ parsed_meta.update(known_resolutions.get(res, {}))
+ add_meta.setdefault('height', int_or_none(res[:-1]))
+ return [{
+ 'url': url,
+ 'filesize': int_or_none(addr.get('data_size')),
'ext': 'mp4',
- 'height': height,
- 'width': width,
- 'http_headers': {
- 'Referer': 'https://www.tiktok.com/',
- }
- })
- self._sort_formats(formats)
+ 'acodec': 'aac',
+ 'source_preference': -2 if 'aweme/v1' in url else -1, # Downloads from API might get blocked
+ **add_meta, **parsed_meta,
+ 'format_note': ' '.join(filter(None, (
+ add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else '')))
+ } for url in addr.get('url_list') or []]
- thumbnail = url_or_none(video.get('cover'))
- duration = float_or_none(video.get('duration'))
+ # Hack: Add direct video links first to prioritize them when removing duplicate formats
+ formats = []
+ if video_info.get('play_addr'):
+ formats.extend(extract_addr(video_info['play_addr'], {
+ 'format_id': 'play_addr',
+ 'format_note': 'Direct video',
+ 'vcodec': 'h265' if traverse_obj(
+ video_info, 'is_bytevc1', 'is_h265') else 'h264', # Always h264?
+ 'width': video_info.get('width'),
+ 'height': video_info.get('height'),
+ }))
+ if video_info.get('download_addr'):
+ formats.extend(extract_addr(video_info['download_addr'], {
+ 'format_id': 'download_addr',
+ 'format_note': 'Download video%s' % (', watermarked' if video_info.get('has_watermark') else ''),
+ 'vcodec': 'h264',
+ 'width': video_info.get('width'),
+ 'height': video_info.get('height'),
+ 'preference': -2 if video_info.get('has_watermark') else -1,
+ }))
+ if video_info.get('play_addr_h264'):
+ formats.extend(extract_addr(video_info['play_addr_h264'], {
+ 'format_id': 'play_addr_h264',
+ 'format_note': 'Direct video',
+ 'vcodec': 'h264',
+ }))
+ if video_info.get('play_addr_bytevc1'):
+ formats.extend(extract_addr(video_info['play_addr_bytevc1'], {
+ 'format_id': 'play_addr_bytevc1',
+ 'format_note': 'Direct video',
+ 'vcodec': 'h265',
+ }))
+
+ for bitrate in video_info.get('bit_rate', []):
+ if bitrate.get('play_addr'):
+ formats.extend(extract_addr(bitrate['play_addr'], {
+ 'format_id': bitrate.get('gear_name'),
+ 'format_note': 'Playback video',
+ 'tbr': try_get(bitrate, lambda x: x['bit_rate'] / 1000),
+ 'vcodec': 'h265' if traverse_obj(
+ bitrate, 'is_bytevc1', 'is_h265') else 'h264',
+ 'fps': bitrate.get('FPS'),
+ }))
- uploader = try_get(data, lambda x: x['author']['nickname'], compat_str)
- uploader_id = try_get(data, lambda x: x['author']['id'], compat_str)
+ self._remove_duplicate_formats(formats)
+ self._sort_formats(formats, ('quality', 'codec', 'size', 'br'))
- timestamp = int_or_none(data.get('createTime'))
+ thumbnails = []
+ for cover_id in ('cover', 'ai_dynamic_cover', 'animated_cover', 'ai_dynamic_cover_bak',
+ 'origin_cover', 'dynamic_cover'):
+ cover = video_info.get(cover_id)
+ if cover:
+ for cover_url in cover['url_list']:
+ thumbnails.append({
+ 'id': cover_id,
+ 'url': cover_url,
+ })
- def stats(key):
- return int_or_none(try_get(
- data, lambda x: x['stats']['%sCount' % key]))
+ stats_info = aweme_detail.get('statistics', {})
+ author_info = aweme_detail.get('author', {})
+ music_info = aweme_detail.get('music', {})
+ user_url = self._UPLOADER_URL_FORMAT % (traverse_obj(author_info,
+ 'sec_uid', 'id', 'uid', 'unique_id',
+ expected_type=str_or_none, get_all=False))
- view_count = stats('play')
- like_count = stats('digg')
- comment_count = stats('comment')
- repost_count = stats('share')
+ contained_music_track = traverse_obj(
+ music_info, ('matched_song', 'title'), ('matched_pgc_sound', 'title'), expected_type=str)
+ contained_music_author = traverse_obj(
+ music_info, ('matched_song', 'author'), ('matched_pgc_sound', 'author'), 'author', expected_type=str)
- aweme_id = data.get('id') or video_id
+ is_generic_og_trackname = music_info.get('is_original_sound') and music_info.get('title') == 'original sound - %s' % music_info.get('owner_handle')
+ if is_generic_og_trackname:
+ music_track, music_author = contained_music_track or 'original sound', contained_music_author
+ else:
+ music_track, music_author = music_info.get('title'), music_info.get('author')
return {
'id': aweme_id,
- 'title': uploader or aweme_id,
- 'description': description,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'timestamp': timestamp,
- 'view_count': view_count,
- 'like_count': like_count,
- 'comment_count': comment_count,
- 'repost_count': repost_count,
+ 'title': aweme_detail['desc'],
+ 'description': aweme_detail['desc'],
+ 'view_count': int_or_none(stats_info.get('play_count')),
+ 'like_count': int_or_none(stats_info.get('digg_count')),
+ 'repost_count': int_or_none(stats_info.get('share_count')),
+ 'comment_count': int_or_none(stats_info.get('comment_count')),
+ 'uploader': str_or_none(author_info.get('unique_id')),
+ 'creator': str_or_none(author_info.get('nickname')),
+ 'uploader_id': str_or_none(author_info.get('uid')),
+ 'uploader_url': user_url,
+ 'track': music_track,
+ 'album': str_or_none(music_info.get('album')) or None,
+ 'artist': music_author,
+ 'timestamp': int_or_none(aweme_detail.get('create_time')),
'formats': formats,
+ 'thumbnails': thumbnails,
+ 'duration': int_or_none(traverse_obj(video_info, 'duration', ('download_addr', 'duration')), scale=1000)
+ }
+
+ def _parse_aweme_video_web(self, aweme_detail, webpage_url):
+ video_info = aweme_detail['video']
+ author_info = traverse_obj(aweme_detail, 'author', 'authorInfo', default={})
+ music_info = aweme_detail.get('music') or {}
+ stats_info = aweme_detail.get('stats') or {}
+ user_url = self._UPLOADER_URL_FORMAT % (traverse_obj(author_info,
+ 'secUid', 'id', 'uid', 'uniqueId',
+ expected_type=str_or_none, get_all=False))
+
+ formats = []
+ play_url = video_info.get('playAddr')
+ width = video_info.get('width')
+ height = video_info.get('height')
+ if isinstance(play_url, str):
+ formats = [{
+ 'url': self._proto_relative_url(play_url),
+ 'ext': 'mp4',
+ 'width': width,
+ 'height': height,
+ }]
+ elif isinstance(play_url, list):
+ formats = [{
+ 'url': self._proto_relative_url(url),
+ 'ext': 'mp4',
+ 'width': width,
+ 'height': height,
+ } for url in traverse_obj(play_url, (..., 'src'), expected_type=url_or_none, default=[]) if url]
+
+ download_url = url_or_none(video_info.get('downloadAddr')) or traverse_obj(video_info, ('download', 'url'), expected_type=url_or_none)
+ if download_url:
+ formats.append({
+ 'format_id': 'download',
+ 'url': self._proto_relative_url(download_url),
+ 'ext': 'mp4',
+ 'width': width,
+ 'height': height,
+ })
+ self._remove_duplicate_formats(formats)
+ self._sort_formats(formats)
+
+ thumbnails = []
+ for thumbnail_name in ('thumbnail', 'cover', 'dynamicCover', 'originCover'):
+ if aweme_detail.get(thumbnail_name):
+ thumbnails = [{
+ 'url': self._proto_relative_url(aweme_detail[thumbnail_name]),
+ 'width': width,
+ 'height': height
+ }]
+
+ return {
+ 'id': traverse_obj(aweme_detail, 'id', 'awemeId', expected_type=str_or_none),
+ 'title': aweme_detail.get('desc'),
+ 'duration': try_get(aweme_detail, lambda x: x['video']['duration'], int),
+ 'view_count': int_or_none(stats_info.get('playCount')),
+ 'like_count': int_or_none(stats_info.get('diggCount')),
+ 'repost_count': int_or_none(stats_info.get('shareCount')),
+ 'comment_count': int_or_none(stats_info.get('commentCount')),
+ 'timestamp': int_or_none(aweme_detail.get('createTime')),
+ 'creator': str_or_none(author_info.get('nickname')),
+ 'uploader': str_or_none(author_info.get('uniqueId')),
+ 'uploader_id': str_or_none(author_info.get('id')),
+ 'uploader_url': user_url,
+ 'track': str_or_none(music_info.get('title')),
+ 'album': str_or_none(music_info.get('album')) or None,
+ 'artist': str_or_none(music_info.get('authorName')),
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ 'description': str_or_none(aweme_detail.get('desc')),
+ 'http_headers': {
+ 'Referer': webpage_url
+ }
}
class TikTokIE(TikTokBaseIE):
- _VALID_URL = r'https?://(?:www\.)?tiktok\.com/@[^/]+/video/(?P<id>\d+)'
+ _VALID_URL = r'https?://www\.tiktok\.com/@[\w\.-]+/video/(?P<id>\d+)'
+
_TESTS = [{
- 'url': 'https://www.tiktok.com/@zureeal/video/6606727368545406213',
- 'md5': '163ceff303bb52de60e6887fe399e6cd',
+ 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610',
+ 'md5': '736bb7a466c6f0a6afeb597da1e6f5b7',
'info_dict': {
- 'id': '6606727368545406213',
+ 'id': '6748451240264420610',
'ext': 'mp4',
- 'title': 'Zureeal',
- 'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay',
- 'thumbnail': r're:^https?://.*',
- 'duration': 15,
- 'uploader': 'Zureeal',
- 'uploader_id': '188294915489964032',
- 'timestamp': 1538248586,
- 'upload_date': '20180929',
+ 'title': '#jassmanak #lehanga #leenabhushan',
+ 'description': '#jassmanak #lehanga #leenabhushan',
+ 'duration': 13,
+ 'height': 1024,
+ 'width': 576,
+ 'uploader': 'leenabhushan',
+ 'uploader_id': '6691488002098119685',
+ 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA_Eb4t1vodM1IuTy_cvp9CY22RAb59xqrO0Xtz9CYQJvgXaDvZxYnZYRzDWhhgJmy',
+ 'creator': 'facestoriesbyleenabh',
+ 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
+ 'upload_date': '20191016',
+ 'timestamp': 1571246252,
'view_count': int,
'like_count': int,
+ 'repost_count': int,
'comment_count': int,
+ }
+ }, {
+ 'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en',
+ 'md5': '6f3cf8cdd9b28cb8363fe0a9a160695b',
+ 'info_dict': {
+ 'id': '6742501081818877190',
+ 'ext': 'mp4',
+ 'title': 'md5:5e2a23877420bb85ce6521dbee39ba94',
+ 'description': 'md5:5e2a23877420bb85ce6521dbee39ba94',
+ 'duration': 27,
+ 'height': 960,
+ 'width': 540,
+ 'uploader': 'patrox',
+ 'uploader_id': '18702747',
+ 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
+ 'creator': 'patroX',
+ 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
+ 'upload_date': '20190930',
+ 'timestamp': 1569860870,
+ 'view_count': int,
+ 'like_count': int,
'repost_count': int,
+ 'comment_count': int,
}
+ }, {
+ # Promoted content/ad
+ 'url': 'https://www.tiktok.com/@MS4wLjABAAAAAR29F6J2Ktu0Daw03BJyXPNoRQ-W7U5a0Mn3lVCq2rQhjOd_WNLclHUoFgwX8Eno/video/6932675057474981122',
+ 'only_matching': True,
}]
- def _real_initialize(self):
- # Setup session (will set necessary cookies)
- self._request_webpage(
- 'https://www.tiktok.com/', None, note='Setting up session')
+ def _extract_aweme_app(self, aweme_id):
+ aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id,
+ note='Downloading video details', errnote='Unable to download video details')['aweme_detail']
+ return self._parse_aweme_video_app(aweme_detail)
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- page_props = self._parse_json(self._search_regex(
- r'<script[^>]+\bid=["\']__NEXT_DATA__[^>]+>\s*({.+?})\s*</script',
- webpage, 'data'), video_id)['props']['pageProps']
- data = try_get(page_props, lambda x: x['itemInfo']['itemStruct'], dict)
- if not data and page_props.get('statusCode') == 10216:
+
+ try:
+ return self._extract_aweme_app(video_id)
+ except ExtractorError as e:
+ self.report_warning(f'{e}; Retrying with webpage')
+
+        # If we only call once, we get a 403 when downloading the video.
+ self._download_webpage(url, video_id)
+ webpage = self._download_webpage(url, video_id, note='Downloading video webpage')
+ json_string = self._search_regex(
+ r'id=\"__NEXT_DATA__\"\s+type=\"application\/json\"\s*[^>]+>\s*(?P<json_string_ld>[^<]+)',
+ webpage, 'json_string', group='json_string_ld')
+ json_data = self._parse_json(json_string, video_id)
+ props_data = try_get(json_data, lambda x: x['props'], expected_type=dict)
+
+        # Check statusCode for success
+ status = props_data.get('pageProps').get('statusCode')
+ if status == 0:
+ return self._parse_aweme_video_web(props_data['pageProps']['itemInfo']['itemStruct'], url)
+ elif status == 10216:
raise ExtractorError('This video is private', expected=True)
- return self._extract_video(data, video_id)
+
+ raise ExtractorError('Video not available', video_id=video_id)
class TikTokUserIE(TikTokBaseIE):
- _VALID_URL = r'https://(?:www\.)?tiktok\.com/@(?P<id>[^/?#&]+)'
+ IE_NAME = 'tiktok:user'
+ _VALID_URL = r'https?://(?:www\.)?tiktok\.com/@(?P<id>[\w\.-]+)/?(?:$|[#?])'
_TESTS = [{
- 'url': 'https://www.tiktok.com/@zureeal',
+ 'url': 'https://tiktok.com/@corgibobaa?lang=en',
+ 'playlist_mincount': 45,
+ 'info_dict': {
+ 'id': '6935371178089399301',
+ 'title': 'corgibobaa',
+ },
+ 'expected_warnings': ['Retrying']
+ }, {
+ 'url': 'https://www.tiktok.com/@meme',
+ 'playlist_mincount': 593,
'info_dict': {
- 'id': '188294915489964032',
+ 'id': '79005827461758976',
+ 'title': 'meme',
},
- 'playlist_mincount': 24,
+ 'expected_warnings': ['Retrying']
}]
- _WORKING = False
- @classmethod
- def suitable(cls, url):
- return False if TikTokIE.suitable(url) else super(TikTokUserIE, cls).suitable(url)
+ r''' # TODO: Fix by adding _signature to api_url
+ def _entries(self, webpage, user_id, username):
+ secuid = self._search_regex(r'\"secUid\":\"(?P<secUid>[^\"]+)', webpage, username)
+ verifyfp_cookie = self._get_cookies('https://www.tiktok.com').get('s_v_web_id')
+ if not verifyfp_cookie:
+ raise ExtractorError('Improper cookies (missing s_v_web_id).', expected=True)
+ api_url = f'https://m.tiktok.com/api/post/item_list/?aid=1988&cookie_enabled=true&count=30&verifyFp={verifyfp_cookie.value}&secUid={secuid}&cursor='
+ cursor = '0'
+ for page in itertools.count():
+ data_json = self._download_json(api_url + cursor, username, note='Downloading Page %d' % page)
+ for video in data_json.get('itemList', []):
+ video_id = video['id']
+ video_url = f'https://www.tiktok.com/@{user_id}/video/{video_id}'
+ yield self._url_result(video_url, 'TikTok', video_id, str_or_none(video.get('desc')))
+ if not data_json.get('hasMore'):
+ break
+ cursor = data_json['cursor']
+ '''
+
+ def _entries_api(self, webpage, user_id, username):
+ query = {
+ 'user_id': user_id,
+ 'count': 21,
+ 'max_cursor': 0,
+ 'min_cursor': 0,
+ 'retry_type': 'no_retry',
+ 'device_id': ''.join(random.choice(string.digits) for i in range(19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api.
+ }
+
+ max_retries = self.get_param('extractor_retries', 3)
+ for page in itertools.count(1):
+ for retries in itertools.count():
+ try:
+ post_list = self._call_api('aweme/post', query, username,
+ note='Downloading user video list page %d%s' % (page, f' (attempt {retries})' if retries != 0 else ''),
+ errnote='Unable to download user video list')
+ except ExtractorError as e:
+ if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0 and retries != max_retries:
+ self.report_warning('%s. Retrying...' % str(e.cause or e.msg))
+ continue
+ raise
+ break
+ for video in post_list.get('aweme_list', []):
+ yield {
+ **self._parse_aweme_video_app(video),
+ 'ie_key': TikTokIE.ie_key(),
+ 'extractor': 'TikTok',
+ }
+ if not post_list.get('has_more'):
+ break
+ query['max_cursor'] = post_list['max_cursor']
+
+ def _real_extract(self, url):
+ user_name = self._match_id(url)
+ webpage = self._download_webpage(url, user_name, headers={
+ 'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)'
+ })
+ user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID')
+ return self.playlist_result(self._entries_api(webpage, user_id, user_name), user_id, user_name)
+
+
+class DouyinIE(TikTokIE):
+ _VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'https://www.douyin.com/video/6961737553342991651',
+ 'md5': '10523312c8b8100f353620ac9dc8f067',
+ 'info_dict': {
+ 'id': '6961737553342991651',
+ 'ext': 'mp4',
+ 'title': '#杨超越 小小水手带你去远航❤️',
+ 'uploader': '杨超越',
+ 'upload_date': '20210513',
+ 'timestamp': 1620905839,
+ 'uploader_id': '110403406559',
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ }
+ }, {
+ 'url': 'https://www.douyin.com/video/6982497745948921092',
+ 'md5': 'd78408c984b9b5102904cf6b6bc2d712',
+ 'info_dict': {
+ 'id': '6982497745948921092',
+ 'ext': 'mp4',
+ 'title': '这个夏日和小羊@杨超越 一起遇见白色幻想',
+ 'uploader': '杨超越工作室',
+ 'upload_date': '20210708',
+ 'timestamp': 1625739481,
+ 'uploader_id': '408654318141572',
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ }
+ }, {
+ 'url': 'https://www.douyin.com/video/6953975910773099811',
+ 'md5': '72e882e24f75064c218b76c8b713c185',
+ 'info_dict': {
+ 'id': '6953975910773099811',
+ 'ext': 'mp4',
+ 'title': '#一起看海 出现在你的夏日里',
+ 'uploader': '杨超越',
+ 'upload_date': '20210422',
+ 'timestamp': 1619098692,
+ 'uploader_id': '110403406559',
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ }
+ }, {
+ 'url': 'https://www.douyin.com/video/6950251282489675042',
+ 'md5': 'b4db86aec367ef810ddd38b1737d2fed',
+ 'info_dict': {
+ 'id': '6950251282489675042',
+ 'ext': 'mp4',
+ 'title': '哈哈哈,成功了哈哈哈哈哈哈',
+ 'uploader': '杨超越',
+ 'upload_date': '20210412',
+ 'timestamp': 1618231483,
+ 'uploader_id': '110403406559',
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ }
+ }, {
+ 'url': 'https://www.douyin.com/video/6963263655114722595',
+ 'md5': '1abe1c477d05ee62efb40bf2329957cf',
+ 'info_dict': {
+ 'id': '6963263655114722595',
+ 'ext': 'mp4',
+ 'title': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
+ 'uploader': '杨超越',
+ 'upload_date': '20210517',
+ 'timestamp': 1621261163,
+ 'uploader_id': '110403406559',
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ }
+ }]
+ _APP_VERSION = '9.6.0'
+ _MANIFEST_APP_VERSION = '960'
+ _APP_NAME = 'aweme'
+ _AID = 1128
+ _API_HOSTNAME = 'aweme.snssdk.com'
+ _UPLOADER_URL_FORMAT = 'https://www.douyin.com/user/%s'
def _real_extract(self, url):
- user_id = self._match_id(url)
- data = self._download_json(
- 'https://m.tiktok.com/h5/share/usr/list/%s/' % user_id, user_id,
- query={'_signature': '_'})
- entries = []
- for aweme in data['aweme_list']:
- try:
- entry = self._extract_video(aweme)
- except ExtractorError:
- continue
- entry['extractor_key'] = TikTokIE.ie_key()
- entries.append(entry)
- return self.playlist_result(entries, user_id)
+ video_id = self._match_id(url)
+
+ try:
+ return self._extract_aweme_app(video_id)
+ except ExtractorError as e:
+ self.report_warning(f'{e}; Retrying with webpage')
+
+ webpage = self._download_webpage(url, video_id)
+ render_data_json = self._search_regex(
+ r'<script [^>]*\bid=[\'"]RENDER_DATA[\'"][^>]*>(%7B.+%7D)</script>',
+ webpage, 'render data', default=None)
+ if not render_data_json:
+ # TODO: Run verification challenge code to generate signature cookies
+ raise ExtractorError('Fresh cookies (not necessarily logged in) are needed')
+
+ render_data = self._parse_json(
+ render_data_json, video_id, transform_source=compat_urllib_parse_unquote)
+ return self._parse_aweme_video_web(
+ traverse_obj(render_data, (..., 'aweme', 'detail'), get_all=False), url)
diff --git a/hypervideo_dl/extractor/tinypic.py b/hypervideo_dl/extractor/tinypic.py
index bc2def5..39056e5 100644
--- a/hypervideo_dl/extractor/tinypic.py
+++ b/hypervideo_dl/extractor/tinypic.py
@@ -28,7 +28,7 @@ class TinyPicIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id, 'Downloading page')
diff --git a/hypervideo_dl/extractor/tmz.py b/hypervideo_dl/extractor/tmz.py
index 3d1bf75..aee2273 100644
--- a/hypervideo_dl/extractor/tmz.py
+++ b/hypervideo_dl/extractor/tmz.py
@@ -1,111 +1,157 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
-from .jwplatform import JWPlatformIE
-from .kaltura import KalturaIE
from ..utils import (
- int_or_none,
- unified_timestamp,
+ ExtractorError,
+ get_element_by_attribute,
)
class TMZIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P<id>[^/?#&]+)'
- _TESTS = [{
- 'url': 'http://www.tmz.com/videos/0-cegprt2p/',
- 'md5': '31f9223e20eef55954973359afa61a20',
- 'info_dict': {
- 'id': 'P6YjLBLk',
- 'ext': 'mp4',
- 'title': "No Charges Against Hillary Clinton? Harvey Says It Ain't Over Yet",
- 'description': 'md5:b714359fc18607715ebccbd2da8ff488',
- 'timestamp': 1467831837,
- 'upload_date': '20160706',
+ _VALID_URL = r"https?://(?:www\.)?tmz\.com/.*"
+ _TESTS = [
+ {
+ "url": "http://www.tmz.com/videos/0-cegprt2p/",
+ "info_dict": {
+ "id": "http://www.tmz.com/videos/0-cegprt2p/",
+ "ext": "mp4",
+ "title": "No Charges Against Hillary Clinton? Harvey Says It Ain't Over Yet",
+ "description": "Harvey talks about Director Comey’s decision not to prosecute Hillary Clinton.",
+ "timestamp": 1467831837,
+ "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}",
+ "upload_date": "20160706",
+ },
},
- 'add_ie': [JWPlatformIE.ie_key()],
- }, {
- 'url': 'http://www.tmz.com/videos/0_okj015ty/',
- 'only_matching': True,
- }, {
- 'url': 'https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/',
- 'only_matching': True,
- }, {
- 'url': 'https://www.tmz.com/videos/2021-02-19-021921-floyd-mayweather-1043872/',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- video_id = self._match_id(url).replace('-', '_')
-
- webpage = self._download_webpage(url, video_id, fatal=False)
- if webpage:
- tmz_video_id = self._search_regex(
- r'nodeRef\s*:\s*["\']tmz:video:([\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12})',
- webpage, 'video id', default=None)
- video = self._download_json(
- 'https://www.tmz.com/_/video/%s' % tmz_video_id, video_id,
- fatal=False)
- if video:
- message = video['message']
- info = {
- '_type': 'url_transparent',
- 'title': message.get('title'),
- 'description': message.get('description'),
- 'timestamp': unified_timestamp(message.get('published_at')),
- 'duration': int_or_none(message.get('duration')),
- }
- jwplatform_id = message.get('jwplayer_media_id')
- if jwplatform_id:
- info.update({
- 'url': 'jwplatform:%s' % jwplatform_id,
- 'ie_key': JWPlatformIE.ie_key(),
- })
- else:
- kaltura_entry_id = message.get('kaltura_entry_id') or video_id
- kaltura_partner_id = message.get('kaltura_partner_id') or '591531'
- info.update({
- 'url': 'kaltura:%s:%s' % (kaltura_partner_id, kaltura_entry_id),
- 'ie_key': KalturaIE.ie_key(),
- })
- return info
-
- return self.url_result(
- 'kaltura:591531:%s' % video_id, KalturaIE.ie_key(), video_id)
-
-
-class TMZArticleIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P<id>[^/?#&]+)'
- _TEST = {
- 'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert',
- 'info_dict': {
- 'id': 'PAKZa97W',
- 'ext': 'mp4',
- 'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake',
- 'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."',
- 'timestamp': 1429466400,
- 'upload_date': '20150419',
+ {
+ "url": "https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/",
+ "info_dict": {
+ "id": "https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/",
+ "ext": "mp4",
+ "title": "Angry Bagel Shop Guy Says He Doesn't Trust Women",
+ "description": "The enraged man who went viral for ranting about women on dating sites before getting ragdolled in a bagel shop is defending his misogyny ... he says it's women's fault in the first place.",
+ "timestamp": 1562889485,
+ "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}",
+ "upload_date": "20190711",
+ },
+ },
+ {
+ "url": "http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert",
+ "md5": "5429c85db8bde39a473a56ca8c4c5602",
+ "info_dict": {
+ "id": "http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert",
+ "ext": "mp4",
+ "title": "Bobby Brown Tells Crowd ... Bobbi Kristina is Awake",
+ "description": 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."',
+ "timestamp": 1429467813,
+ "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}",
+ "upload_date": "20150419",
+ },
},
- 'params': {
- 'skip_download': True,
+ {
+ "url": "http://www.tmz.com/2015/09/19/patti-labelle-concert-fan-stripping-kicked-out-nicki-minaj/",
+ "info_dict": {
+ "id": "http://www.tmz.com/2015/09/19/patti-labelle-concert-fan-stripping-kicked-out-nicki-minaj/",
+ "ext": "mp4",
+ "title": "Patti LaBelle -- Goes Nuclear On Stripping Fan",
+ "description": "Patti LaBelle made it known loud and clear last night ... NO "
+ "ONE gets on her stage and strips down.",
+ "timestamp": 1442683746,
+ "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}",
+ "upload_date": "20150919",
+ },
},
- 'add_ie': [JWPlatformIE.ie_key()],
- }
+ {
+ "url": "http://www.tmz.com/2016/01/28/adam-silver-sting-drake-blake-griffin/",
+ "info_dict": {
+ "id": "http://www.tmz.com/2016/01/28/adam-silver-sting-drake-blake-griffin/",
+ "ext": "mp4",
+ "title": "NBA's Adam Silver -- Blake Griffin's a Great Guy ... He'll Learn from This",
+ "description": "Two pretty parts of this video with NBA Commish Adam Silver.",
+ "timestamp": 1454010989,
+ "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}",
+ "upload_date": "20160128",
+ },
+ },
+ {
+ "url": "http://www.tmz.com/2016/10/27/donald-trump-star-vandal-arrested-james-otis/",
+ "info_dict": {
+ "id": "http://www.tmz.com/2016/10/27/donald-trump-star-vandal-arrested-james-otis/",
+ "ext": "mp4",
+ "title": "Trump Star Vandal -- I'm Not Afraid of Donald or the Cops!",
+ "description": "James Otis is the the guy who took a pickaxe to Donald Trump's star on the Walk of Fame, and he tells TMZ .. he's ready and willing to go to jail for the crime.",
+ "timestamp": 1477500095,
+ "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}",
+ "upload_date": "20161026",
+ },
+ },
+ {
+ "url": "https://www.tmz.com/videos/2020-10-31-103120-beverly-hills-protest-4878209/",
+ "info_dict": {
+ "id": "https://www.tmz.com/videos/2020-10-31-103120-beverly-hills-protest-4878209/",
+ "ext": "mp4",
+ "title": "Cops Use Billy Clubs Against Pro-Trump and Anti-Fascist "
+ "Demonstrators",
+ "description": "Beverly Hills may be an omen of what's coming next week, "
+ "because things got crazy on the streets and cops started "
+ "swinging their billy clubs at both Anti-Fascist and Pro-Trump "
+ "demonstrators.",
+ "timestamp": 1604182772,
+ "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}",
+ "upload_date": "20201031",
+ },
+ },
+ {
+ "url": "https://www.tmz.com/2020/11/05/gervonta-davis-car-crash-hit-and-run-police/",
+ "info_dict": {
+ "id": "Dddb6IGe-ws",
+ "ext": "mp4",
+ "title": "SICK LAMBO GERVONTA DAVIS IN HIS NEW RIDE RIGHT AFTER KO AFTER LEO EsNews Boxing",
+ "uploader": "ESNEWS",
+ "description": "md5:49675bc58883ccf80474b8aa701e1064",
+ "upload_date": "20201101",
+ "uploader_id": "ESNEWS",
+ },
+ },
+ {
+ "url": "https://www.tmz.com/2020/11/19/conor-mcgregor-dustin-poirier-contract-fight-ufc-257-fight-island/",
+ "info_dict": {
+ "id": "1329450007125225473",
+ "ext": "mp4",
+ "title": "TheMacLife - BREAKING: Conor McGregor (@thenotoriousmma) has signed his bout agreement for his rematch with Dustin Poirier for January 23.",
+ "uploader": "TheMacLife",
+ "description": "md5:56e6009bbc3d12498e10d08a8e1f1c69",
+ "upload_date": "20201119",
+ "uploader_id": "Maclifeofficial",
+ "timestamp": 1605800556,
+ },
+ },
+ ]
def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
-
- tmz_url = self._search_regex(
- r'clickLink\s*\(\s*["\'](?P<url>%s)' % TMZIE._VALID_URL, webpage,
- 'video id', default=None, group='url')
- if tmz_url:
- return self.url_result(tmz_url, ie=TMZIE.ie_key())
-
- embedded_video_info = self._parse_json(self._html_search_regex(
- r'tmzVideoEmbed\(({.+?})\);', webpage, 'embedded video info'),
- video_id)
- return self.url_result(
- 'http://www.tmz.com/videos/%s/' % embedded_video_info['id'],
- ie=TMZIE.ie_key())
+ webpage = self._download_webpage(url, url)
+ jsonld = self._search_json_ld(webpage, url)
+ if not jsonld or "url" not in jsonld:
+ # try to extract from YouTube Player API
+ # see https://developers.google.com/youtube/iframe_api_reference#Video_Queueing_Functions
+ match_obj = re.search(r'\.cueVideoById\(\s*(?P<quote>[\'"])(?P<id>.*?)(?P=quote)', webpage)
+ if match_obj:
+ res = self.url_result(match_obj.group("id"))
+ return res
+ # try to extract from twitter
+ blockquote_el = get_element_by_attribute("class", "twitter-tweet", webpage)
+ if blockquote_el:
+ matches = re.findall(
+ r'<a[^>]+href=\s*(?P<quote>[\'"])(?P<link>.*?)(?P=quote)',
+ blockquote_el)
+ if matches:
+ for _, match in matches:
+ if "/status/" in match:
+ res = self.url_result(match)
+ return res
+ raise ExtractorError("No video found!")
+ if id not in jsonld:
+ jsonld["id"] = url
+ return jsonld
diff --git a/hypervideo_dl/extractor/tnaflix.py b/hypervideo_dl/extractor/tnaflix.py
index b3573c6..d7617f7 100644
--- a/hypervideo_dl/extractor/tnaflix.py
+++ b/hypervideo_dl/extractor/tnaflix.py
@@ -73,7 +73,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor):
} for i in range(first, last + 1)]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
for display_id_key in ('display_id', 'display_id_2'):
if display_id_key in mobj.groupdict():
diff --git a/hypervideo_dl/extractor/toggle.py b/hypervideo_dl/extractor/toggle.py
index 270c84d..eb87349 100644
--- a/hypervideo_dl/extractor/toggle.py
+++ b/hypervideo_dl/extractor/toggle.py
@@ -7,7 +7,6 @@ import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
- ExtractorError,
float_or_none,
int_or_none,
parse_iso8601,
@@ -154,11 +153,10 @@ class ToggleIE(InfoExtractor):
})
if not formats:
for meta in (info.get('Metas') or []):
- if meta.get('Key') == 'Encryption' and meta.get('Value') == '1':
- raise ExtractorError(
- 'This video is DRM protected.', expected=True)
- # Most likely because geo-blocked
- raise ExtractorError('No downloadable videos found', expected=True)
+ if (not self.get_param('allow_unplayable_formats')
+ and meta.get('Key') == 'Encryption' and meta.get('Value') == '1'):
+ self.report_drm(video_id)
+ # Most likely because geo-blocked if no formats and no DRM
self._sort_formats(formats)
thumbnails = []
diff --git a/hypervideo_dl/extractor/tokentube.py b/hypervideo_dl/extractor/tokentube.py
new file mode 100644
index 0000000..d636211
--- /dev/null
+++ b/hypervideo_dl/extractor/tokentube.py
@@ -0,0 +1,152 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import functools
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_count,
+ unified_strdate,
+ js_to_json,
+ OnDemandPagedList,
+)
+
+
+class TokentubeIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?tokentube\.net/(?:view\?[vl]=|[vl]/)(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://tokentube.net/l/3236632011/Praise-A-Thon-Pastori-Chrisin-ja-Pastori-Bennyn-kanssa-27-8-2021',
+ 'info_dict': {
+ 'id': '3236632011',
+ 'ext': 'mp4',
+ 'title': 'Praise-A-Thon Pastori Chrisin ja Pastori Bennyn kanssa 27.8.2021',
+ 'description': '',
+ 'uploader': 'Pastori Chris - Rapsodia.fi',
+ 'upload_date': '20210827',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://tokentube.net/v/3950239124/Linux-Ubuntu-Studio-perus-k%C3%A4ytt%C3%B6',
+ 'md5': '0e1f00421f501f5eada9890d38fcfb56',
+ 'info_dict': {
+ 'id': '3950239124',
+ 'ext': 'mp4',
+ 'title': 'Linux Ubuntu Studio perus käyttö',
+ 'description': 'md5:854ff1dc732ff708976de2880ea32050',
+ 'uploader': 'jyrilehtonen',
+ 'upload_date': '20210825',
+ },
+ }, {
+ 'url': 'https://tokentube.net/view?v=3582463289',
+ 'info_dict': {
+ 'id': '3582463289',
+ 'ext': 'mp4',
+ 'title': 'Police for Freedom - toiminta aloitetaan Suomessa ❤️??',
+ 'description': 'md5:cd92e620d7f5fa162e8410d0fc9a08be',
+ 'uploader': 'Voitontie',
+ 'upload_date': '20210428',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_regex(r'<h1\s*class=["\']title-text["\']>(.+?)</h1>', webpage, 'title')
+
+ data_json = self._html_search_regex(r'({["\']html5["\'].+?}}}+)', webpage, 'data json')
+ data_json = self._parse_json(js_to_json(data_json), video_id, fatal=False)
+
+ sources = data_json.get('sources') or self._parse_json(
+ self._html_search_regex(r'updateSrc\(([^\)]+)\)', webpage, 'sources'),
+ video_id, transform_source=js_to_json)
+
+ formats = [{
+ 'url': format.get('src'),
+ 'format_id': format.get('label'),
+ 'height': format.get('res'),
+ } for format in sources]
+
+ view_count = parse_count(self._html_search_regex(
+ r'<p\s*class=["\']views_counter["\']>\s*([\d\.,]+)\s*<span>views?</span></p>',
+ webpage, 'view_count', fatal=False))
+
+ like_count = parse_count(self._html_search_regex(
+ r'<div\s*class="sh_button\s*likes_count">\s*(\d+)\s*</div>',
+ webpage, 'like count', fatal=False))
+
+ dislike_count = parse_count(self._html_search_regex(
+ r'<div\s*class="sh_button\s*dislikes_count">\s*(\d+)\s*</div>',
+ webpage, 'dislike count', fatal=False))
+
+ upload_date = unified_strdate(self._html_search_regex(
+ r'<span\s*class="p-date">Published\s*on\s+([^<]+)',
+ webpage, 'upload date', fatal=False))
+
+ uploader = self._html_search_regex(
+ r'<a\s*class="place-left"[^>]+>(.+?)</a>',
+ webpage, 'uploader', fatal=False)
+
+ description = self._html_search_meta('description', webpage)
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': title,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'dislike_count': dislike_count,
+ 'upload_date': upload_date,
+ 'description': description,
+ 'uploader': uploader,
+ }
+
+
+class TokentubeChannelIE(InfoExtractor):
+ _PAGE_SIZE = 20
+ IE_NAME = 'Tokentube:channel'
+ _VALID_URL = r'https?://(?:www\.)?tokentube\.net/channel/(?P<id>\d+)/[^/]+(?:/videos)?'
+ _TESTS = [{
+ 'url': 'https://tokentube.net/channel/3697658904/TokenTube',
+ 'info_dict': {
+ 'id': '3697658904',
+ },
+ 'playlist_mincount': 7,
+ }, {
+ 'url': 'https://tokentube.net/channel/3353234420/Linux/videos',
+ 'info_dict': {
+ 'id': '3353234420',
+ },
+ 'playlist_mincount': 20,
+ }, {
+ 'url': 'https://tokentube.net/channel/3475834195/Voitontie',
+ 'info_dict': {
+ 'id': '3475834195',
+ },
+ 'playlist_mincount': 150,
+ }]
+
+ def _fetch_page(self, channel_id, page):
+ page += 1
+ videos_info = self._download_webpage(
+ f'https://tokentube.net/videos?p=0&m=1&sort=recent&u={channel_id}&page={page}',
+ channel_id, headers={'X-Requested-With': 'XMLHttpRequest'},
+ note=f'Downloading page {page}', fatal=False)
+ if '</i> Sorry, no results were found.' not in videos_info:
+ for path, media_id in re.findall(
+ r'<a[^>]+\bhref=["\']([^"\']+/[lv]/(\d+)/\S+)["\'][^>]+>',
+ videos_info):
+ yield self.url_result(path, ie=TokentubeIE.ie_key(), video_id=media_id)
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+
+ entries = OnDemandPagedList(functools.partial(
+ self._fetch_page, channel_id), self._PAGE_SIZE)
+
+ return self.playlist_result(entries, channel_id)
diff --git a/hypervideo_dl/extractor/toongoggles.py b/hypervideo_dl/extractor/toongoggles.py
index b5ba1c0..df13d64 100644
--- a/hypervideo_dl/extractor/toongoggles.py
+++ b/hypervideo_dl/extractor/toongoggles.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -63,7 +62,7 @@ class ToonGogglesIE(InfoExtractor):
}
def _real_extract(self, url):
- show_id, episode_id = re.match(self._VALID_URL, url).groups()
+ show_id, episode_id = self._match_valid_url(url).groups()
if episode_id:
episode_data = self._call_api('search', episode_id, {
'filter': 'episode',
diff --git a/hypervideo_dl/extractor/toutv.py b/hypervideo_dl/extractor/toutv.py
index 44b022f..6c84c21 100644
--- a/hypervideo_dl/extractor/toutv.py
+++ b/hypervideo_dl/extractor/toutv.py
@@ -74,7 +74,7 @@ class TouTvIE(RadioCanadaIE):
})
# IsDrm does not necessarily mean the video is DRM protected (see
# https://github.com/ytdl-org/youtube-dl/issues/13994).
- if metadata.get('IsDrm'):
+ if not self.get_param('allow_unplayable_formats') and metadata.get('IsDrm'):
self.report_warning('This video is probably DRM protected.', path)
video_id = metadata['IdMedia']
details = metadata['Details']
diff --git a/hypervideo_dl/extractor/traileraddict.py b/hypervideo_dl/extractor/traileraddict.py
index 747370d..10100fb 100644
--- a/hypervideo_dl/extractor/traileraddict.py
+++ b/hypervideo_dl/extractor/traileraddict.py
@@ -20,7 +20,7 @@ class TrailerAddictIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
name = mobj.group('movie') + '/' + mobj.group('trailer_name')
webpage = self._download_webpage(url, name)
diff --git a/hypervideo_dl/extractor/trovo.py b/hypervideo_dl/extractor/trovo.py
index de0107a..ec55f41 100644
--- a/hypervideo_dl/extractor/trovo.py
+++ b/hypervideo_dl/extractor/trovo.py
@@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
+import itertools
import json
from .common import InfoExtractor
@@ -14,6 +15,7 @@ from ..utils import (
class TrovoBaseIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.)?trovo\.live/'
+ _HEADERS = {'Origin': 'https://trovo.live'}
def _extract_streamer_info(self, data):
streamer_info = data.get('streamerInfo') or {}
@@ -35,7 +37,7 @@ class TrovoIE(TrovoBaseIE):
'query': '''{
getLiveInfo(params: {userName: "%s"}) {
isLive
- programInfo {
+ programInfo {
coverUrl
id
streamInfo {
@@ -68,6 +70,7 @@ class TrovoIE(TrovoBaseIE):
'format_id': format_id,
'height': int_or_none(format_id[:-1]) if format_id else None,
'url': play_url,
+ 'http_headers': self._HEADERS,
})
self._sort_formats(formats)
@@ -153,7 +156,7 @@ class TrovoVodIE(TrovoBaseIE):
'protocol': 'm3u8_native',
'tbr': int_or_none(play_info.get('bitrate')),
'url': play_url,
- 'http_headers': {'Origin': 'https://trovo.live'},
+ 'http_headers': self._HEADERS,
})
self._sort_formats(formats)
@@ -192,3 +195,69 @@ class TrovoVodIE(TrovoBaseIE):
}
info.update(self._extract_streamer_info(vod_detail_info))
return info
+
+
+class TrovoChannelBaseIE(InfoExtractor):
+ def _get_vod_json(self, page, uid):
+ raise NotImplementedError('This method must be implemented by subclasses')
+
+ def _entries(self, uid):
+ for page in itertools.count(1):
+ vod_json = self._get_vod_json(page, uid)
+ vods = vod_json.get('vodInfos', [])
+ for vod in vods:
+ yield self.url_result(
+ 'https://trovo.live/%s/%s' % (self._TYPE, vod.get('vid')),
+ ie=TrovoVodIE.ie_key())
+ has_more = vod_json['hasMore']
+ if not has_more:
+ break
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ uid = str(self._download_json('https://gql.trovo.live/', id, query={
+ 'query': '{getLiveInfo(params:{userName:"%s"}){streamerInfo{uid}}}' % id
+ })['data']['getLiveInfo']['streamerInfo']['uid'])
+ return self.playlist_result(self._entries(uid), playlist_id=uid)
+
+
+class TrovoChannelVodIE(TrovoChannelBaseIE):
+ _VALID_URL = r'trovovod:(?P<id>[^\s]+)'
+ IE_DESC = 'All VODs of a trovo.live channel, "trovovod" keyword'
+
+ _TESTS = [{
+ 'url': 'trovovod:OneTappedYou',
+ 'playlist_mincount': 24,
+ 'info_dict': {
+ 'id': '100719456',
+ },
+ }]
+
+ _QUERY = '{getChannelLtvVideoInfos(params:{pageSize:99,currPage:%d,channelID:%s}){hasMore,vodInfos{vid}}}'
+ _TYPE = 'video'
+
+ def _get_vod_json(self, page, uid):
+ return self._download_json('https://gql.trovo.live/', uid, query={
+ 'query': self._QUERY % (page, uid)
+ })['data']['getChannelLtvVideoInfos']
+
+
+class TrovoChannelClipIE(TrovoChannelBaseIE):
+ _VALID_URL = r'trovoclip:(?P<id>[^\s]+)'
+ IE_DESC = 'All Clips of a trovo.live channel, "trovoclip" keyword'
+
+ _TESTS = [{
+ 'url': 'trovoclip:OneTappedYou',
+ 'playlist_mincount': 29,
+ 'info_dict': {
+ 'id': '100719456',
+ },
+ }]
+
+ _QUERY = '{getChannelClipVideoInfos(params:{pageSize:99,currPage:%d,channelID:%s,albumType:VOD_CLIP_ALBUM_TYPE_LATEST}){hasMore,vodInfos{vid}}}'
+ _TYPE = 'clip'
+
+ def _get_vod_json(self, page, uid):
+ return self._download_json('https://gql.trovo.live/', uid, query={
+ 'query': self._QUERY % (page, uid)
+ })['data']['getChannelClipVideoInfos']
diff --git a/hypervideo_dl/extractor/trutv.py b/hypervideo_dl/extractor/trutv.py
index ce892c8..c09ff89 100644
--- a/hypervideo_dl/extractor/trutv.py
+++ b/hypervideo_dl/extractor/trutv.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .turner import TurnerBaseIE
from ..utils import (
@@ -27,7 +26,7 @@ class TruTVIE(TurnerBaseIE):
}
def _real_extract(self, url):
- series_slug, clip_slug, video_id = re.match(self._VALID_URL, url).groups()
+ series_slug, clip_slug, video_id = self._match_valid_url(url).groups()
if video_id:
path = 'episode'
diff --git a/hypervideo_dl/extractor/tubitv.py b/hypervideo_dl/extractor/tubitv.py
index ebfb05c..2e9b325 100644
--- a/hypervideo_dl/extractor/tubitv.py
+++ b/hypervideo_dl/extractor/tubitv.py
@@ -7,13 +7,19 @@ from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
+ js_to_json,
sanitized_Request,
urlencode_postdata,
)
class TubiTvIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tubitv\.com/(?:video|movies|tv-shows)/(?P<id>[0-9]+)'
+ _VALID_URL = r'''(?x)
+ (?:
+ tubitv:|
+ https?://(?:www\.)?tubitv\.com/(?:video|movies|tv-shows)/
+ )
+ (?P<id>[0-9]+)'''
_LOGIN_URL = 'http://tubitv.com/login'
_NETRC_MACHINE = 'tubitv'
_GEO_COUNTRIES = ['US']
@@ -75,9 +81,13 @@ class TubiTvIE(InfoExtractor):
'http://tubitv.com/oz/videos/%s/content' % video_id, video_id)
title = video_data['title']
- formats = self._extract_m3u8_formats(
- self._proto_relative_url(video_data['url']),
- video_id, 'mp4', 'm3u8_native')
+ formats = []
+ url = video_data['url']
+            # URL can sometimes be empty. Does this only happen when there is DRM?
+ if url:
+ formats = self._extract_m3u8_formats(
+ self._proto_relative_url(url),
+ video_id, 'mp4', 'm3u8_native')
self._sort_formats(formats)
thumbnails = []
@@ -108,3 +118,28 @@ class TubiTvIE(InfoExtractor):
'uploader_id': video_data.get('publisher_id'),
'release_year': int_or_none(video_data.get('year')),
}
+
+
+class TubiTvShowIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?tubitv\.com/series/[0-9]+/(?P<show_name>[^/?#]+)'
+ _TESTS = [{
+ 'url': 'https://tubitv.com/series/3936/the-joy-of-painting-with-bob-ross?start=true',
+ 'playlist_mincount': 390,
+ 'info_dict': {
+ 'id': 'the-joy-of-painting-with-bob-ross',
+ }
+ }]
+
+ def _entries(self, show_url, show_name):
+ show_webpage = self._download_webpage(show_url, show_name)
+ show_json = self._parse_json(self._search_regex(
+ r"window\.__data\s*=\s*({.+?});\s*</script>",
+ show_webpage, 'data',), show_name, transform_source=js_to_json)['video']
+ for episode_id in show_json['fullContentById'].keys():
+ yield self.url_result(
+ 'tubitv:%s' % episode_id,
+ ie=TubiTvIE.ie_key(), video_id=episode_id)
+
+ def _real_extract(self, url):
+ show_name = self._match_valid_url(url).group('show_name')
+ return self.playlist_result(self._entries(url, show_name), playlist_id=show_name)
diff --git a/hypervideo_dl/extractor/tumblr.py b/hypervideo_dl/extractor/tumblr.py
index ae584ad..adc3701 100644
--- a/hypervideo_dl/extractor/tumblr.py
+++ b/hypervideo_dl/extractor/tumblr.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -143,7 +142,7 @@ class TumblrIE(InfoExtractor):
self.report_warning('Login has probably failed')
def _real_extract(self, url):
- m_url = re.match(self._VALID_URL, url)
+ m_url = self._match_valid_url(url)
video_id = m_url.group('id')
blog = m_url.group('blog_name')
diff --git a/hypervideo_dl/extractor/turbo.py b/hypervideo_dl/extractor/turbo.py
index be3eaa5..f6bbf25 100644
--- a/hypervideo_dl/extractor/turbo.py
+++ b/hypervideo_dl/extractor/turbo.py
@@ -30,7 +30,7 @@ class TurboIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
diff --git a/hypervideo_dl/extractor/turner.py b/hypervideo_dl/extractor/turner.py
index 81229a5..32125bc 100644
--- a/hypervideo_dl/extractor/turner.py
+++ b/hypervideo_dl/extractor/turner.py
@@ -144,7 +144,7 @@ class TurnerBaseIE(AdobePassIE):
m3u8_id=format_id or 'hls', fatal=False)
if '/secure/' in video_url and '?hdnea=' in video_url:
for f in m3u8_formats:
- f['_seekable'] = False
+ f['_ffmpeg_args'] = ['-seekable', '0']
formats.extend(m3u8_formats)
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
@@ -221,6 +221,7 @@ class TurnerBaseIE(AdobePassIE):
}
def _extract_ngtv_info(self, media_id, tokenizer_query, ap_data=None):
+ is_live = ap_data.get('is_live')
streams_data = self._download_json(
'http://medium.ngtv.io/media/%s/tv' % media_id,
media_id)['media']['tv']
@@ -237,11 +238,11 @@ class TurnerBaseIE(AdobePassIE):
'http://token.ngtv.io/token/token_spe',
m3u8_url, media_id, ap_data or {}, tokenizer_query)
formats.extend(self._extract_m3u8_formats(
- m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False))
+ m3u8_url, media_id, 'mp4', m3u8_id='hls', live=is_live, fatal=False))
duration = float_or_none(stream_data.get('totalRuntime'))
- if not chapters:
+ if not chapters and not is_live:
for chapter in stream_data.get('contentSegments', []):
start_time = float_or_none(chapter.get('start'))
chapter_duration = float_or_none(chapter.get('duration'))
diff --git a/hypervideo_dl/extractor/tv2.py b/hypervideo_dl/extractor/tv2.py
index 334b7d5..e085153 100644
--- a/hypervideo_dl/extractor/tv2.py
+++ b/hypervideo_dl/extractor/tv2.py
@@ -24,37 +24,34 @@ class TV2IE(InfoExtractor):
'url': 'http://www.tv2.no/v/916509/',
'info_dict': {
'id': '916509',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Se Frode Gryttens hyllest av Steven Gerrard',
'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.',
'timestamp': 1431715610,
'upload_date': '20150515',
- 'duration': 156.967,
+ 'duration': 157,
'view_count': int,
'categories': list,
},
}]
- _API_DOMAIN = 'sumo.tv2.no'
- _PROTOCOLS = ('HDS', 'HLS', 'DASH')
+ _PROTOCOLS = ('HLS', 'DASH')
_GEO_COUNTRIES = ['NO']
def _real_extract(self, url):
video_id = self._match_id(url)
- api_base = 'http://%s/api/web/asset/%s' % (self._API_DOMAIN, video_id)
-
- asset = self._download_json(
- api_base + '.json', video_id,
- 'Downloading metadata JSON')['asset']
- title = asset.get('subtitle') or asset['title']
+ asset = self._download_json('https://sumo.tv2.no/rest/assets/' + video_id, video_id,
+ 'Downloading metadata JSON')
+ title = asset['title']
is_live = asset.get('live') is True
formats = []
format_urls = []
for protocol in self._PROTOCOLS:
try:
- data = self._download_json(
- api_base + '/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % protocol,
- video_id, 'Downloading play JSON')['playback']
+ data = self._download_json('https://api.sumo.tv2.no/play/%s?stream=%s' % (video_id, protocol),
+                                      video_id, 'Downloading playback JSON',
+ headers={'content-type': 'application/json'},
+ data='{"device":{"id":"1-1-1","name":"Nettleser (HTML)"}}'.encode())['playback']
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
error = self._parse_json(e.cause.read().decode(), video_id)['error']
@@ -65,18 +62,12 @@ class TV2IE(InfoExtractor):
self.raise_login_required()
raise ExtractorError(error['description'])
raise
- items = try_get(data, lambda x: x['items']['item'])
- if not items:
- continue
- if not isinstance(items, list):
- items = [items]
+ items = data.get('streams', [])
for item in items:
- if not isinstance(item, dict):
- continue
video_url = item.get('url')
if not video_url or video_url in format_urls:
continue
- format_id = '%s-%s' % (protocol.lower(), item.get('mediaFormat'))
+ format_id = '%s-%s' % (protocol.lower(), item.get('type'))
if not self._is_valid_url(video_url, video_id, format_id):
continue
format_urls.append(video_url)
@@ -99,17 +90,15 @@ class TV2IE(InfoExtractor):
formats.append({
'url': video_url,
'format_id': format_id,
- 'tbr': int_or_none(item.get('bitrate')),
- 'filesize': int_or_none(item.get('fileSize')),
})
if not formats and data.get('drmProtected'):
- raise ExtractorError('This video is DRM protected.', expected=True)
+ self.report_drm(video_id)
self._sort_formats(formats)
thumbnails = [{
- 'id': thumbnail.get('@type'),
- 'url': thumbnail.get('url'),
- } for _, thumbnail in (asset.get('imageVersions') or {}).items()]
+ 'id': type,
+ 'url': thumb_url,
+ } for type, thumb_url in (asset.get('images') or {}).items()]
return {
'id': video_id,
@@ -117,10 +106,10 @@ class TV2IE(InfoExtractor):
'title': self._live_title(title) if is_live else title,
'description': strip_or_none(asset.get('description')),
'thumbnails': thumbnails,
- 'timestamp': parse_iso8601(asset.get('createTime')),
+ 'timestamp': parse_iso8601(asset.get('live_broadcast_time') or asset.get('update_time')),
'duration': float_or_none(asset.get('accurateDuration') or asset.get('duration')),
'view_count': int_or_none(asset.get('views')),
- 'categories': asset.get('keywords', '').split(','),
+ 'categories': asset.get('tags', '').split(','),
'formats': formats,
'is_live': is_live,
}
@@ -170,7 +159,7 @@ class TV2ArticleIE(InfoExtractor):
return self.playlist_result(entries, playlist_id, title, description)
-class KatsomoIE(TV2IE):
+class KatsomoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv(uutiset)?)\.fi/(?:sarja/[0-9a-z-]+-\d+/[0-9a-z-]+-|(?:#!/)?jakso/(?:\d+/[^/]+/)?|video/prog)(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.mtv.fi/sarja/mtv-uutiset-live-33001002003/lahden-pelicans-teki-kovan-ratkaisun-ville-nieminen-pihalle-1181321',
@@ -203,6 +192,93 @@ class KatsomoIE(TV2IE):
_PROTOCOLS = ('HLS', 'MPD')
_GEO_COUNTRIES = ['FI']
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ api_base = 'http://%s/api/web/asset/%s' % (self._API_DOMAIN, video_id)
+
+ asset = self._download_json(
+ api_base + '.json', video_id,
+ 'Downloading metadata JSON')['asset']
+ title = asset.get('subtitle') or asset['title']
+ is_live = asset.get('live') is True
+
+ formats = []
+ format_urls = []
+ for protocol in self._PROTOCOLS:
+ try:
+ data = self._download_json(
+ api_base + '/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % protocol,
+ video_id, 'Downloading play JSON')['playback']
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ error = self._parse_json(e.cause.read().decode(), video_id)['error']
+ error_code = error.get('code')
+ if error_code == 'ASSET_PLAYBACK_INVALID_GEO_LOCATION':
+ self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+ elif error_code == 'SESSION_NOT_AUTHENTICATED':
+ self.raise_login_required()
+ raise ExtractorError(error['description'])
+ raise
+ items = try_get(data, lambda x: x['items']['item'])
+ if not items:
+ continue
+ if not isinstance(items, list):
+ items = [items]
+ for item in items:
+ if not isinstance(item, dict):
+ continue
+ video_url = item.get('url')
+ if not video_url or video_url in format_urls:
+ continue
+ format_id = '%s-%s' % (protocol.lower(), item.get('mediaFormat'))
+ if not self._is_valid_url(video_url, video_id, format_id):
+ continue
+ format_urls.append(video_url)
+ ext = determine_ext(video_url)
+ if ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ video_url, video_id, f4m_id=format_id, fatal=False))
+ elif ext == 'm3u8':
+ if not data.get('drmProtected'):
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4',
+ 'm3u8' if is_live else 'm3u8_native',
+ m3u8_id=format_id, fatal=False))
+ elif ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ video_url, video_id, format_id, fatal=False))
+ elif ext == 'ism' or video_url.endswith('.ism/Manifest'):
+ pass
+ else:
+ formats.append({
+ 'url': video_url,
+ 'format_id': format_id,
+ 'tbr': int_or_none(item.get('bitrate')),
+ 'filesize': int_or_none(item.get('fileSize')),
+ })
+ if not formats and data.get('drmProtected'):
+ self.report_drm(video_id)
+ self._sort_formats(formats)
+
+ thumbnails = [{
+ 'id': thumbnail.get('@type'),
+ 'url': thumbnail.get('url'),
+ } for _, thumbnail in (asset.get('imageVersions') or {}).items()]
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': self._live_title(title) if is_live else title,
+ 'description': strip_or_none(asset.get('description')),
+ 'thumbnails': thumbnails,
+ 'timestamp': parse_iso8601(asset.get('createTime')),
+ 'duration': float_or_none(asset.get('accurateDuration') or asset.get('duration')),
+ 'view_count': int_or_none(asset.get('views')),
+ 'categories': asset.get('keywords', '').split(','),
+ 'formats': formats,
+ 'is_live': is_live,
+ }
+
class MTVUutisetArticleIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)mtvuutiset\.fi/artikkeli/[^/]+/(?P<id>\d+)'
diff --git a/hypervideo_dl/extractor/tv2hu.py b/hypervideo_dl/extractor/tv2hu.py
index 86017b7..f210435 100644
--- a/hypervideo_dl/extractor/tv2hu.py
+++ b/hypervideo_dl/extractor/tv2hu.py
@@ -2,61 +2,109 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+ traverse_obj,
+ UnsupportedError,
+)
class TV2HuIE(InfoExtractor):
- IE_NAME = 'tv2.hu'
- _VALID_URL = r'https?://(?:www\.)?tv2\.hu/(?:[^/]+/)+(?P<id>\d+)_[^/?#]+?\.html'
+ IE_NAME = 'tv2play.hu'
+ _VALID_URL = r'https?://(?:www\.)?tv2play\.hu/(?!szalag/)(?P<id>[^#&?]+)'
_TESTS = [{
- 'url': 'http://tv2.hu/ezek_megorultek/217679_ezek-megorultek---1.-adas-1.-resz.html',
- 'md5': '585e58e2e090f34603804bb2c48e98d8',
+ 'url': 'https://tv2play.hu/mintaapak/mintaapak_213_epizod_resz',
'info_dict': {
- 'id': '217679',
+ 'id': '249240',
'ext': 'mp4',
- 'title': 'Ezek megőrültek! - 1. adás 1. rész',
- 'upload_date': '20160826',
- 'thumbnail': r're:^https?://.*\.jpg$'
- }
- }, {
- 'url': 'http://tv2.hu/ezek_megorultek/teljes_adasok/217677_ezek-megorultek---1.-adas-2.-resz.html',
- 'only_matching': True
+ 'title': 'Mintaapák - 213. epizód',
+ 'series': 'Mintaapák',
+ 'duration': 2164,
+ 'description': 'md5:7350147e75485a59598e806c47967b07',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'release_date': '20210825',
+ 'season_number': None,
+ 'episode_number': 213,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}, {
- 'url': 'http://tv2.hu/musoraink/aktiv/aktiv_teljes_adas/217963_aktiv-teljes-adas---2016.08.30..html',
- 'only_matching': True
+ 'url': 'https://tv2play.hu/taxi_2',
+ 'md5': '585e58e2e090f34603804bb2c48e98d8',
+ 'info_dict': {
+ 'id': '199363',
+ 'ext': 'mp4',
+ 'title': 'Taxi 2',
+ 'series': 'Taxi 2',
+ 'duration': 5087,
+ 'description': 'md5:47762155dc9a50241797ded101b1b08c',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'release_date': '20210118',
+ 'season_number': None,
+ 'episode_number': None,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- json_url = self._search_regex(
- r'jsonUrl\s*=\s*"([^"]+)"', webpage, 'json url')
- json_data = self._download_json(json_url, video_id)
-
- formats = []
- for b in ('bitrates', 'backupBitrates'):
- bitrates = json_data.get(b, {})
- m3u8_url = bitrates.get('hls')
- if m3u8_url:
- formats.extend(self._extract_wowza_formats(
- m3u8_url, video_id, skip_protocols=['rtmp', 'rtsp']))
-
- for mp4_url in bitrates.get('mp4', []):
- height = int_or_none(self._search_regex(
- r'\.(\d+)p\.mp4', mp4_url, 'height', default=None))
- formats.append({
- 'format_id': 'http' + ('-%d' % height if height else ''),
- 'url': mp4_url,
- 'height': height,
- 'width': int_or_none(height / 9.0 * 16.0 if height else None),
- })
+ id = self._match_id(url)
+ json_data = self._download_json(f'https://tv2play.hu/api/search/{id}', id)
+
+ if json_data['contentType'] == 'showpage':
+ ribbon_ids = traverse_obj(json_data, ('pages', ..., 'tabs', ..., 'ribbonIds'), get_all=False, expected_type=list)
+ entries = [self.url_result(f'https://tv2play.hu/szalag/{ribbon_id}',
+ ie=TV2HuSeriesIE.ie_key(), video_id=ribbon_id) for ribbon_id in ribbon_ids]
+ return self.playlist_result(entries, playlist_id=id)
+ elif json_data['contentType'] != 'video':
+ raise UnsupportedError(url)
+
+ video_id = str(json_data['id'])
+ player_id = json_data.get('playerId')
+ series_json = json_data.get('seriesInfo', {})
+
+ video_json_url = self._download_json(f'https://tv2play.hu/api/streaming-url?playerId={player_id}', video_id)['url']
+ video_json = self._download_json(video_json_url, video_id)
+ m3u8_url = self._proto_relative_url(traverse_obj(video_json, ('bitrates', 'hls')))
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id)
self._sort_formats(formats)
return {
'id': video_id,
- 'title': self._og_search_title(webpage).strip(),
- 'thumbnail': self._og_search_thumbnail(webpage),
- 'upload_date': self._search_regex(
- r'/vod/(\d{8})/', json_url, 'upload_date', default=None),
+ 'title': json_data['title'],
+ 'series': json_data.get('seriesTitle'),
+ 'duration': json_data.get('length'),
+ 'description': json_data.get('description'),
+ 'thumbnail': 'https://tv2play.hu' + json_data.get('thumbnailUrl'),
+ 'release_date': json_data.get('uploadedAt').replace('.', ''),
+ 'season_number': series_json.get('seasonNr'),
+ 'episode_number': series_json.get('episodeNr'),
'formats': formats,
+ 'subtitles': subtitles,
}
+
+
+class TV2HuSeriesIE(InfoExtractor):
+ IE_NAME = 'tv2playseries.hu'
+ _VALID_URL = r'https?://(?:www\.)?tv2play\.hu/szalag/(?P<id>[^#&?]+)'
+
+ _TESTS = [{
+ 'url': 'https://tv2play.hu/szalag/59?rendezes=nepszeruseg',
+ 'playlist_mincount': 284,
+ 'info_dict': {
+ 'id': '59',
+ }
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ json_data = self._download_json(f'https://tv2play.hu/api/ribbons/{id}/0?size=100000', id)
+ entries = []
+ for card in json_data.get('cards', []):
+ video_id = card.get('slug')
+ if video_id:
+ entries.append(self.url_result(f'https://tv2play.hu/{video_id}',
+ ie=TV2HuIE.ie_key(), video_id=video_id))
+
+ return self.playlist_result(entries, playlist_id=id)
diff --git a/hypervideo_dl/extractor/tv4.py b/hypervideo_dl/extractor/tv4.py
index b73bab9..4043e63 100644
--- a/hypervideo_dl/extractor/tv4.py
+++ b/hypervideo_dl/extractor/tv4.py
@@ -93,21 +93,34 @@ class TV4IE(InfoExtractor):
'device': 'browser',
'protocol': 'hls',
})['playbackItem']['manifestUrl']
- formats = self._extract_m3u8_formats(
+ formats = []
+ subtitles = {}
+
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
manifest_url, video_id, 'mp4',
'm3u8_native', m3u8_id='hls', fatal=False)
- formats.extend(self._extract_mpd_formats(
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+
+ fmts, subs = self._extract_mpd_formats_and_subtitles(
manifest_url.replace('.m3u8', '.mpd'),
- video_id, mpd_id='dash', fatal=False))
- formats.extend(self._extract_f4m_formats(
+ video_id, mpd_id='dash', fatal=False)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+
+ fmts = self._extract_f4m_formats(
manifest_url.replace('.m3u8', '.f4m'),
- video_id, f4m_id='hds', fatal=False))
- formats.extend(self._extract_ism_formats(
+ video_id, f4m_id='hds', fatal=False)
+ formats.extend(fmts)
+
+ fmts, subs = self._extract_ism_formats_and_subtitles(
re.sub(r'\.ism/.*?\.m3u8', r'.ism/Manifest', manifest_url),
- video_id, ism_id='mss', fatal=False))
+ video_id, ism_id='mss', fatal=False)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
if not formats and info.get('is_geo_restricted'):
- self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+ self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
self._sort_formats(formats)
@@ -115,7 +128,7 @@ class TV4IE(InfoExtractor):
'id': video_id,
'title': title,
'formats': formats,
- # 'subtitles': subtitles,
+ 'subtitles': subtitles,
'description': info.get('description'),
'timestamp': parse_iso8601(info.get('broadcast_date_time')),
'duration': int_or_none(info.get('duration')),
diff --git a/hypervideo_dl/extractor/tv5mondeplus.py b/hypervideo_dl/extractor/tv5mondeplus.py
index b7fe082..a0832d2 100644
--- a/hypervideo_dl/extractor/tv5mondeplus.py
+++ b/hypervideo_dl/extractor/tv5mondeplus.py
@@ -7,6 +7,7 @@ from ..utils import (
extract_attributes,
int_or_none,
parse_duration,
+ try_get,
)
@@ -15,28 +16,28 @@ class TV5MondePlusIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?:tv5mondeplus|revoir\.tv5monde)\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)'
_TESTS = [{
# movie
- 'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/rendez-vous-a-atlit',
- 'md5': '8cbde5ea7b296cf635073e27895e227f',
+ 'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/ceux-qui-travaillent',
+ 'md5': '32fa0cde16a4480d1251502a66856d5f',
'info_dict': {
- 'id': '822a4756-0712-7329-1859-a13ac7fd1407',
- 'display_id': 'rendez-vous-a-atlit',
+ 'id': 'dc57a011-ec4b-4648-2a9a-4f03f8352ed3',
+ 'display_id': 'ceux-qui-travaillent',
'ext': 'mp4',
- 'title': 'Rendez-vous à Atlit',
- 'description': 'md5:2893a4c5e1dbac3eedff2d87956e4efb',
- 'upload_date': '20200130',
+ 'title': 'Ceux qui travaillent',
+ 'description': 'md5:570e8bb688036ace873b2d50d24c026d',
+ 'upload_date': '20210819',
},
}, {
# series episode
- 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/c-est-la-vie-ennemie-juree',
+ 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/vestiaires-caro-actrice',
'info_dict': {
- 'id': '0df7007c-4900-3936-c601-87a13a93a068',
- 'display_id': 'c-est-la-vie-ennemie-juree',
+ 'id': '9e9d599e-23af-6915-843e-ecbf62e97925',
+ 'display_id': 'vestiaires-caro-actrice',
'ext': 'mp4',
- 'title': "C'est la vie - Ennemie jurée",
- 'description': 'md5:dfb5c63087b6f35fe0cc0af4fe44287e',
- 'upload_date': '20200130',
- 'series': "C'est la vie",
- 'episode': 'Ennemie jurée',
+ 'title': "Vestiaires - Caro actrice",
+ 'description': 'md5:db15d2e1976641e08377f942778058ea',
+ 'upload_date': '20210819',
+ 'series': "Vestiaires",
+ 'episode': 'Caro actrice',
},
'params': {
'skip_download': True,
@@ -63,7 +64,7 @@ class TV5MondePlusIE(InfoExtractor):
webpage, 'video player loader'))
video_files = self._parse_json(
- vpl_data['data-broadcast'], display_id).get('files', [])
+ vpl_data['data-broadcast'], display_id)
formats = []
for video_file in video_files:
v_url = video_file.get('url')
@@ -81,6 +82,11 @@ class TV5MondePlusIE(InfoExtractor):
})
self._sort_formats(formats)
+ metadata = self._parse_json(
+ vpl_data['data-metadata'], display_id)
+ duration = (int_or_none(try_get(metadata, lambda x: x['content']['duration']))
+ or parse_duration(self._html_search_meta('duration', webpage)))
+
description = self._html_search_regex(
r'(?s)<div[^>]+class=["\']episode-texte[^>]+>(.+?)</div>', webpage,
'description', fatal=False)
@@ -109,7 +115,7 @@ class TV5MondePlusIE(InfoExtractor):
'title': title,
'description': description,
'thumbnail': vpl_data.get('data-image'),
- 'duration': int_or_none(vpl_data.get('data-duration')) or parse_duration(self._html_search_meta('duration', webpage)),
+ 'duration': duration,
'upload_date': upload_date,
'formats': formats,
'series': series,
diff --git a/hypervideo_dl/extractor/tv5unis.py b/hypervideo_dl/extractor/tv5unis.py
index eabdc22..398b85d 100644
--- a/hypervideo_dl/extractor/tv5unis.py
+++ b/hypervideo_dl/extractor/tv5unis.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -16,7 +15,7 @@ class TV5UnisBaseIE(InfoExtractor):
_GEO_COUNTRIES = ['CA']
def _real_extract(self, url):
- groups = re.match(self._VALID_URL, url).groups()
+ groups = self._match_valid_url(url).groups()
product = self._download_json(
'https://api.tv5unis.ca/graphql', groups[0], query={
'query': '''{
diff --git a/hypervideo_dl/extractor/tver.py b/hypervideo_dl/extractor/tver.py
index a4a30b1..943b3eb 100644
--- a/hypervideo_dl/extractor/tver.py
+++ b/hypervideo_dl/extractor/tver.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import compat_str
@@ -38,7 +37,7 @@ class TVerIE(InfoExtractor):
'https://tver.jp/api/access_token.php', None)['token']
def _real_extract(self, url):
- path, video_id = re.match(self._VALID_URL, url).groups()
+ path, video_id = self._match_valid_url(url).groups()
main = self._download_json(
'https://api.tver.jp/v4/' + path, video_id,
query={'token': self._TOKEN})['main']
diff --git a/hypervideo_dl/extractor/tvigle.py b/hypervideo_dl/extractor/tvigle.py
index 180259a..aa25ba0 100644
--- a/hypervideo_dl/extractor/tvigle.py
+++ b/hypervideo_dl/extractor/tvigle.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -54,7 +53,7 @@ class TvigleIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id')
diff --git a/hypervideo_dl/extractor/tvland.py b/hypervideo_dl/extractor/tvland.py
index 7911441..9ebf57f 100644
--- a/hypervideo_dl/extractor/tvland.py
+++ b/hypervideo_dl/extractor/tvland.py
@@ -1,10 +1,12 @@
# coding: utf-8
from __future__ import unicode_literals
-from .spike import ParamountNetworkIE
+from .mtv import MTVServicesInfoExtractor
+# TODO: Remove - reason: no longer used; service moved to YouTube
-class TVLandIE(ParamountNetworkIE):
+
+class TVLandIE(MTVServicesInfoExtractor):
IE_NAME = 'tvland.com'
_VALID_URL = r'https?://(?:www\.)?tvland\.com/(?:video-clips|(?:full-)?episodes)/(?P<id>[^/?#.]+)'
_FEED_URL = 'http://www.tvland.com/feeds/mrss/'
@@ -17,6 +19,7 @@ class TVLandIE(ParamountNetworkIE):
'title': 'The Dog',
},
'playlist_mincount': 5,
+ 'skip': '404 Not found',
}, {
'url': 'https://www.tvland.com/video-clips/4n87f2/younger-a-first-look-at-younger-season-6',
'md5': 'e2c6389401cf485df26c79c247b08713',
diff --git a/hypervideo_dl/extractor/tvnow.py b/hypervideo_dl/extractor/tvnow.py
index 9c8a8a0..b318184 100644
--- a/hypervideo_dl/extractor/tvnow.py
+++ b/hypervideo_dl/extractor/tvnow.py
@@ -7,10 +7,12 @@ from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
+ get_element_by_id,
int_or_none,
parse_iso8601,
parse_duration,
str_or_none,
+ try_get,
update_url_query,
urljoin,
)
@@ -67,7 +69,7 @@ class TVNowBaseIE(InfoExtractor):
if formats:
break
else:
- if info.get('isDrm'):
+ if not self.get_param('allow_unplayable_formats') and info.get('isDrm'):
raise ExtractorError(
'Video %s is DRM protected' % video_id, expected=True)
if info.get('geoblocked'):
@@ -167,7 +169,7 @@ class TVNowIE(TVNowBaseIE):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
display_id = '%s/%s' % mobj.group(2, 3)
info = self._call_api(
@@ -194,7 +196,7 @@ class TVNowNewIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
base_url = re.sub(r'(?:shows|serien)', '_', mobj.group('base_url'))
show, episode = mobj.group('show', 'episode')
return self.url_result(
@@ -204,6 +206,86 @@ class TVNowNewIE(InfoExtractor):
ie=TVNowIE.ie_key(), video_id=mobj.group('id'))
+class TVNowFilmIE(TVNowBaseIE):
+ _VALID_URL = r'''(?x)
+ (?P<base_url>https?://
+ (?:www\.)?tvnow\.(?:de|at|ch)/
+ (?:filme))/
+ (?P<title>[^/?$&]+)-(?P<id>\d+)
+ '''
+ _TESTS = [{
+ 'url': 'https://www.tvnow.de/filme/lord-of-war-haendler-des-todes-7959',
+ 'info_dict': {
+ 'id': '1426690',
+ 'display_id': 'lord-of-war-haendler-des-todes',
+ 'ext': 'mp4',
+ 'title': 'Lord of War',
+ 'description': 'md5:5eda15c0d5b8cb70dac724c8a0ff89a9',
+ 'timestamp': 1550010000,
+ 'upload_date': '20190212',
+ 'duration': 7016,
+ },
+ }, {
+ 'url': 'https://www.tvnow.de/filme/the-machinist-12157',
+ 'info_dict': {
+ 'id': '328160',
+ 'display_id': 'the-machinist',
+ 'ext': 'mp4',
+ 'title': 'The Machinist',
+ 'description': 'md5:9a0e363fdd74b3a9e1cdd9e21d0ecc28',
+ 'timestamp': 1496469720,
+ 'upload_date': '20170603',
+ 'duration': 5836,
+ },
+ }, {
+ 'url': 'https://www.tvnow.de/filme/horst-schlaemmer-isch-kandidiere-17777',
+ 'only_matching': True, # DRM protected
+ }]
+
+ def _real_extract(self, url):
+ mobj = self._match_valid_url(url)
+ display_id = mobj.group('title')
+
+ webpage = self._download_webpage(url, display_id, fatal=False)
+ if not webpage:
+ raise ExtractorError('Cannot download "%s"' % url, expected=True)
+
+ json_text = get_element_by_id('now-web-state', webpage)
+ if not json_text:
+ raise ExtractorError('Cannot read video data', expected=True)
+
+ json_data = self._parse_json(
+ json_text,
+ display_id,
+ transform_source=lambda x: x.replace('&q;', '"'),
+ fatal=False)
+ if not json_data:
+ raise ExtractorError('Cannot read video data', expected=True)
+
+ player_key = next(
+ (key for key in json_data.keys() if 'module/player' in key),
+ None)
+ page_key = next(
+ (key for key in json_data.keys() if 'page/filme' in key),
+ None)
+ movie_id = try_get(
+ json_data,
+ [
+ lambda x: x[player_key]['body']['id'],
+ lambda x: x[page_key]['body']['modules'][0]['id'],
+ lambda x: x[page_key]['body']['modules'][1]['id']],
+ int)
+ if not movie_id:
+ raise ExtractorError('Cannot extract movie ID', expected=True)
+
+ info = self._call_api(
+ 'movies/%d' % movie_id,
+ display_id,
+ query={'fields': ','.join(self._VIDEO_FIELDS)})
+
+ return self._extract_video(info, display_id)
+
+
class TVNowNewBaseIE(InfoExtractor):
def _call_api(self, path, video_id, query={}):
result = self._download_json(
@@ -342,9 +424,85 @@ class TVNowIE(TVNowNewBaseIE):
}
def _real_extract(self, url):
- display_id, video_id = re.match(self._VALID_URL, url).groups()
+ display_id, video_id = self._match_valid_url(url).groups()
info = self._call_api('player/' + video_id, video_id)
return self._extract_video(info, video_id, display_id)
+
+
+class TVNowFilmIE(TVNowIE):
+ _VALID_URL = r'''(?x)
+ (?P<base_url>https?://
+ (?:www\.)?tvnow\.(?:de|at|ch)/
+ (?:filme))/
+ (?P<title>[^/?$&]+)-(?P<id>\d+)
+ '''
+ _TESTS = [{
+ 'url': 'https://www.tvnow.de/filme/lord-of-war-haendler-des-todes-7959',
+ 'info_dict': {
+ 'id': '1426690',
+ 'display_id': 'lord-of-war-haendler-des-todes',
+ 'ext': 'mp4',
+ 'title': 'Lord of War',
+ 'description': 'md5:5eda15c0d5b8cb70dac724c8a0ff89a9',
+ 'timestamp': 1550010000,
+ 'upload_date': '20190212',
+ 'duration': 7016,
+ },
+ }, {
+ 'url': 'https://www.tvnow.de/filme/the-machinist-12157',
+ 'info_dict': {
+ 'id': '328160',
+ 'display_id': 'the-machinist',
+ 'ext': 'mp4',
+ 'title': 'The Machinist',
+ 'description': 'md5:9a0e363fdd74b3a9e1cdd9e21d0ecc28',
+ 'timestamp': 1496469720,
+ 'upload_date': '20170603',
+ 'duration': 5836,
+ },
+ }, {
+ 'url': 'https://www.tvnow.de/filme/horst-schlaemmer-isch-kandidiere-17777',
+ 'only_matching': True, # DRM protected
+ }]
+
+ def _real_extract(self, url):
+ mobj = self._match_valid_url(url)
+ display_id = mobj.group('title')
+
+ webpage = self._download_webpage(url, display_id, fatal=False)
+ if not webpage:
+ raise ExtractorError('Cannot download "%s"' % url, expected=True)
+
+ json_text = get_element_by_id('now-web-state', webpage)
+ if not json_text:
+ raise ExtractorError('Cannot read video data', expected=True)
+
+ json_data = self._parse_json(
+ json_text,
+ display_id,
+ transform_source=lambda x: x.replace('&q;', '"'),
+ fatal=False)
+ if not json_data:
+ raise ExtractorError('Cannot read video data', expected=True)
+
+ player_key = next(
+ (key for key in json_data.keys() if 'module/player' in key),
+ None)
+ page_key = next(
+ (key for key in json_data.keys() if 'page/filme' in key),
+ None)
+ movie_id = try_get(
+ json_data,
+ [
+ lambda x: x[player_key]['body']['id'],
+ lambda x: x[page_key]['body']['modules'][0]['id'],
+ lambda x: x[page_key]['body']['modules'][1]['id']],
+ int)
+ if not movie_id:
+ raise ExtractorError('Cannot extract movie ID', expected=True)
+
+ info = self._call_api('player/%d' % movie_id, display_id)
+ return self._extract_video(info, url, display_id)
"""
@@ -394,7 +552,7 @@ class TVNowSeasonIE(TVNowListBaseIE):
}]
def _real_extract(self, url):
- _, show_id, season_id = re.match(self._VALID_URL, url).groups()
+ _, show_id, season_id = self._match_valid_url(url).groups()
return self._extract_items(
url, show_id, season_id, {'season': season_id})
@@ -410,7 +568,7 @@ class TVNowAnnualIE(TVNowListBaseIE):
}]
def _real_extract(self, url):
- _, show_id, year, month = re.match(self._VALID_URL, url).groups()
+ _, show_id, year, month = self._match_valid_url(url).groups()
return self._extract_items(
url, show_id, '%s-%s' % (year, month), {
'year': int(year),
@@ -442,7 +600,7 @@ class TVNowShowIE(TVNowListBaseIE):
else super(TVNowShowIE, cls).suitable(url))
def _real_extract(self, url):
- base_url, show_id = re.match(self._VALID_URL, url).groups()
+ base_url, show_id = self._match_valid_url(url).groups()
result = self._call_api(
'teaserrow/format/navigation/' + show_id, show_id)
diff --git a/hypervideo_dl/extractor/tvp.py b/hypervideo_dl/extractor/tvp.py
index accff75..1e42b33 100644
--- a/hypervideo_dl/extractor/tvp.py
+++ b/hypervideo_dl/extractor/tvp.py
@@ -246,7 +246,7 @@ class TVPWebsiteIE(InfoExtractor):
video_id=video_id)
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
display_id, playlist_id = mobj.group('display_id', 'id')
return self.playlist_result(
self._entries(display_id, playlist_id), playlist_id)
diff --git a/hypervideo_dl/extractor/tvplay.py b/hypervideo_dl/extractor/tvplay.py
index 0d858c0..fbafb41 100644
--- a/hypervideo_dl/extractor/tvplay.py
+++ b/hypervideo_dl/extractor/tvplay.py
@@ -34,8 +34,8 @@ class TVPlayIE(InfoExtractor):
tvplay(?:\.skaties)?\.lv(?:/parraides)?|
(?:tv3play|play\.tv3)\.lt(?:/programos)?|
tv3play(?:\.tv3)?\.ee/sisu|
- (?:tv(?:3|6|8|10)play|viafree)\.se/program|
- (?:(?:tv3play|viasat4play|tv6play|viafree)\.no|(?:tv3play|viafree)\.dk)/programmer|
+ (?:tv(?:3|6|8|10)play)\.se/program|
+ (?:(?:tv3play|viasat4play|tv6play)\.no|(?:tv3play)\.dk)/programmer|
play\.nova(?:tv)?\.bg/programi
)
/(?:[^/]+/)+
@@ -224,10 +224,6 @@ class TVPlayIE(InfoExtractor):
'only_matching': True,
},
{
- 'url': 'http://www.viafree.se/program/underhallning/i-like-radio-live/sasong-1/676869',
- 'only_matching': True,
- },
- {
'url': 'mtg:418113',
'only_matching': True,
}
@@ -298,7 +294,8 @@ class TVPlayIE(InfoExtractor):
if not formats and video.get('is_geo_blocked'):
self.raise_geo_restricted(
- 'This content might not be available in your country due to copyright reasons')
+ 'This content might not be available in your country due to copyright reasons',
+ metadata_available=True)
self._sort_formats(formats)
@@ -339,8 +336,8 @@ class ViafreeIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:www\.)?
- viafree\.(?P<country>dk|no|se)
- /(?P<id>program(?:mer)?/(?:[^/]+/)+[^/?#&]+)
+ viafree\.(?P<country>dk|no|se|fi)
+ /(?P<id>(?:program(?:mer)?|ohjelmat)?/(?:[^/]+/)+[^/?#&]+)
'''
_TESTS = [{
'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1',
@@ -359,6 +356,23 @@ class ViafreeIE(InfoExtractor):
'skip_download': True,
},
}, {
+ 'url': 'https://www.viafree.dk/programmer/humor/comedy-central-roast-of-charlie-sheen/film/1047660',
+ 'info_dict': {
+ 'id': '1047660',
+ 'ext': 'mp4',
+ 'title': 'Comedy Central Roast of Charlie Sheen - Comedy Central Roast of Charlie Sheen',
+ 'description': 'md5:ec956d941ae9fd7c65a48fd64951dc6d',
+ 'series': 'Comedy Central Roast of Charlie Sheen',
+ 'season_number': 1,
+ 'duration': 3747,
+ 'timestamp': 1608246060,
+ 'upload_date': '20201217'
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True
+ }
+ }, {
# with relatedClips
'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-1',
'only_matching': True,
@@ -372,15 +386,17 @@ class ViafreeIE(InfoExtractor):
}, {
'url': 'http://www.viafree.dk/programmer/reality/paradise-hotel/saeson-7/episode-5',
'only_matching': True,
+ }, {
+ 'url': 'http://www.viafree.se/program/underhallning/i-like-radio-live/sasong-1/676869',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.viafree.fi/ohjelmat/entertainment/amazing-makeovers/kausi-7/jakso-2',
+ 'only_matching': True,
}]
_GEO_BYPASS = False
- @classmethod
- def suitable(cls, url):
- return False if TVPlayIE.suitable(url) else super(ViafreeIE, cls).suitable(url)
-
def _real_extract(self, url):
- country, path = re.match(self._VALID_URL, url).groups()
+ country, path = self._match_valid_url(url).groups()
content = self._download_json(
'https://viafree-content.mtg-api.com/viafree-content/v1/%s/path/%s' % (country, path), path)
program = content['_embedded']['viafreeBlocks'][0]['_embedded']['program']
@@ -397,16 +413,16 @@ class ViafreeIE(InfoExtractor):
self.raise_geo_restricted(countries=[country])
raise
- formats = self._extract_m3u8_formats(stream_href, guid, 'mp4')
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(stream_href, guid, 'mp4')
self._sort_formats(formats)
episode = program.get('episode') or {}
-
return {
'id': guid,
'title': title,
'thumbnail': meta.get('image'),
'description': meta.get('description'),
'series': episode.get('seriesTitle'),
+ 'subtitles': subtitles,
'episode_number': int_or_none(episode.get('episodeNumber')),
'season_number': int_or_none(episode.get('seasonNumber')),
'duration': int_or_none(try_get(program, lambda x: x['video']['duration']['milliseconds']), 1000),
diff --git a/hypervideo_dl/extractor/twentyfourvideo.py b/hypervideo_dl/extractor/twentyfourvideo.py
index 74d1404..ae19e11 100644
--- a/hypervideo_dl/extractor/twentyfourvideo.py
+++ b/hypervideo_dl/extractor/twentyfourvideo.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -68,7 +67,7 @@ class TwentyFourVideoIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
host = mobj.group('host')
diff --git a/hypervideo_dl/extractor/twentythreevideo.py b/hypervideo_dl/extractor/twentythreevideo.py
index dc56091..e8cf5a1 100644
--- a/hypervideo_dl/extractor/twentythreevideo.py
+++ b/hypervideo_dl/extractor/twentythreevideo.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import int_or_none
@@ -27,7 +26,7 @@ class TwentyThreeVideoIE(InfoExtractor):
}]
def _real_extract(self, url):
- domain, query, photo_id = re.match(self._VALID_URL, url).groups()
+ domain, query, photo_id = self._match_valid_url(url).groups()
base_url = 'https://%s' % domain
photo_data = self._download_json(
base_url + '/api/photo/list?' + query, photo_id, query={
diff --git a/hypervideo_dl/extractor/twitcasting.py b/hypervideo_dl/extractor/twitcasting.py
index 6596eef..3acf1b1 100644
--- a/hypervideo_dl/extractor/twitcasting.py
+++ b/hypervideo_dl/extractor/twitcasting.py
@@ -1,23 +1,29 @@
# coding: utf-8
from __future__ import unicode_literals
+import itertools
import re
from .common import InfoExtractor
+from ..downloader.websocket import has_websockets
from ..utils import (
clean_html,
float_or_none,
get_element_by_class,
get_element_by_id,
parse_duration,
+ qualities,
str_to_int,
+ try_get,
unified_timestamp,
urlencode_postdata,
+ urljoin,
+ ExtractorError,
)
class TwitCastingIE(InfoExtractor):
- _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/movie/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/(?:movie|twplayer)/(?P<id>\d+)'
_TESTS = [{
'url': 'https://twitcasting.tv/ivetesangalo/movie/2357609',
'md5': '745243cad58c4681dc752490f7540d7f',
@@ -57,19 +63,20 @@ class TwitCastingIE(InfoExtractor):
}]
def _real_extract(self, url):
- uploader_id, video_id = re.match(self._VALID_URL, url).groups()
+ uploader_id, video_id = self._match_valid_url(url).groups()
- video_password = self._downloader.params.get('videopassword')
+ video_password = self.get_param('videopassword')
request_data = None
if video_password:
request_data = urlencode_postdata({
'password': video_password,
})
- webpage = self._download_webpage(url, video_id, data=request_data)
+ webpage = self._download_webpage(
+ url, video_id, data=request_data,
+ headers={'Origin': 'https://twitcasting.tv'})
- title = clean_html(get_element_by_id(
- 'movietitle', webpage)) or self._html_search_meta(
- ['og:title', 'twitter:title'], webpage, fatal=True)
+ title = (clean_html(get_element_by_id('movietitle', webpage))
+ or self._html_search_meta(['og:title', 'twitter:title'], webpage, fatal=True))
video_js_data = {}
m3u8_url = self._search_regex(
@@ -77,13 +84,31 @@ class TwitCastingIE(InfoExtractor):
webpage, 'm3u8 url', group='url', default=None)
if not m3u8_url:
video_js_data = self._parse_json(self._search_regex(
- r"data-movie-playlist='(\[[^']+\])'",
- webpage, 'movie playlist'), video_id)[0]
- m3u8_url = video_js_data['source']['url']
+ r'data-movie-playlist=(["\'])(?P<url>(?:(?!\1).)+)',
+ webpage, 'movie playlist', group='url', default='[{}]'), video_id)
+ if isinstance(video_js_data, dict):
+ video_js_data = list(video_js_data.values())[0]
+ video_js_data = video_js_data[0]
+ m3u8_url = try_get(video_js_data, lambda x: x['source']['url'])
- # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol
- formats = self._extract_m3u8_formats(
- m3u8_url, video_id, 'mp4', m3u8_id='hls')
+ stream_server_data = self._download_json(
+ 'https://twitcasting.tv/streamserver.php?target=%s&mode=client' % uploader_id, video_id,
+ 'Downloading live info', fatal=False)
+
+ is_live = 'data-status="online"' in webpage
+ formats = []
+ if is_live and not m3u8_url:
+ m3u8_url = 'https://twitcasting.tv/%s/metastream.m3u8' % uploader_id
+ if is_live and has_websockets and stream_server_data:
+ qq = qualities(['base', 'mobilesource', 'main'])
+ for mode, ws_url in stream_server_data['llfmp4']['streams'].items():
+ formats.append({
+ 'url': ws_url,
+ 'format_id': 'ws-%s' % mode,
+ 'ext': 'mp4',
+ 'quality': qq(mode),
+ 'protocol': 'websocket_frag', # TwitCasting simply sends moof atom directly over WS
+ })
thumbnail = video_js_data.get('thumbnailUrl') or self._og_search_thumbnail(webpage)
description = clean_html(get_element_by_id(
@@ -98,6 +123,11 @@ class TwitCastingIE(InfoExtractor):
r'data-toggle="true"[^>]+datetime="([^"]+)"',
webpage, 'datetime', None))
+ if m3u8_url:
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', live=is_live))
+ self._sort_formats(formats)
+
return {
'id': video_id,
'title': title,
@@ -108,4 +138,59 @@ class TwitCastingIE(InfoExtractor):
'duration': duration,
'view_count': view_count,
'formats': formats,
+ 'is_live': is_live,
}
+
+
+class TwitCastingLiveIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<id>[^/]+)/?(?:[#?]|$)'
+ _TESTS = [{
+ 'url': 'https://twitcasting.tv/ivetesangalo',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ uploader_id = self._match_id(url)
+ self.to_screen(
+ 'Downloading live video of user {0}. '
+ 'Pass "https://twitcasting.tv/{0}/show" to download the history'.format(uploader_id))
+
+ webpage = self._download_webpage(url, uploader_id)
+ current_live = self._search_regex(
+ (r'data-type="movie" data-id="(\d+)">',
+ r'tw-sound-flag-open-link" data-id="(\d+)" style=',),
+ webpage, 'current live ID', default=None)
+ if not current_live:
+ raise ExtractorError('The user is not currently live')
+ return self.url_result('https://twitcasting.tv/%s/movie/%s' % (uploader_id, current_live))
+
+
+class TwitCastingUserIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<id>[^/]+)/show/?(?:[#?]|$)'
+ _TESTS = [{
+ 'url': 'https://twitcasting.tv/noriyukicas/show',
+ 'only_matching': True,
+ }]
+
+ def _entries(self, uploader_id):
+ base_url = next_url = 'https://twitcasting.tv/%s/show' % uploader_id
+ for page_num in itertools.count(1):
+ webpage = self._download_webpage(
+ next_url, uploader_id, query={'filter': 'watchable'}, note='Downloading page %d' % page_num)
+ matches = re.finditer(
+ r'''(?isx)<a\s+class="tw-movie-thumbnail"\s*href="(?P<url>/[^/]+/movie/\d+)"\s*>.+?</a>''',
+ webpage)
+ for mobj in matches:
+ yield self.url_result(urljoin(base_url, mobj.group('url')))
+
+ next_url = self._search_regex(
+ r'<a href="(/%s/show/%d-\d+)[?"]' % (re.escape(uploader_id), page_num),
+ webpage, 'next url', default=None)
+ next_url = urljoin(base_url, next_url)
+ if not next_url:
+ return
+
+ def _real_extract(self, url):
+ uploader_id = self._match_id(url)
+ return self.playlist_result(
+ self._entries(uploader_id), uploader_id, '%s - Live History' % uploader_id)
diff --git a/hypervideo_dl/extractor/twitch.py b/hypervideo_dl/extractor/twitch.py
index a378bd6..be70bee 100644
--- a/hypervideo_dl/extractor/twitch.py
+++ b/hypervideo_dl/extractor/twitch.py
@@ -11,7 +11,6 @@ from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_str,
- compat_urlparse,
compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
)
@@ -23,6 +22,7 @@ from ..utils import (
int_or_none,
parse_duration,
parse_iso8601,
+ parse_qs,
qualities,
try_get,
unified_timestamp,
@@ -376,7 +376,7 @@ def _make_video_result(node):
return {
'_type': 'url_transparent',
'ie_key': TwitchVodIE.ie_key(),
- 'id': video_id,
+ 'id': 'v' + video_id,
'url': 'https://www.twitch.tv/videos/%s' % video_id,
'title': node.get('title'),
'thumbnail': node.get('previewThumbnailURL'),
@@ -571,7 +571,7 @@ class TwitchVideosIE(TwitchPlaylistBaseIE):
def _real_extract(self, url):
channel_name = self._match_id(url)
- qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ qs = parse_qs(url)
filter = qs.get('filter', ['all'])[0]
sort = qs.get('sort', ['time'])[0]
broadcast = self._BROADCASTS.get(filter, self._DEFAULT_BROADCAST)
@@ -647,7 +647,7 @@ class TwitchVideosClipsIE(TwitchPlaylistBaseIE):
def _real_extract(self, url):
channel_name = self._match_id(url)
- qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ qs = parse_qs(url)
range = qs.get('range', ['7d'])[0]
clip = self._RANGE.get(range, self._DEFAULT_CLIP)
return self.playlist_result(
@@ -864,6 +864,7 @@ class TwitchClipsIE(TwitchBaseIE):
'md5': '761769e1eafce0ffebfb4089cb3847cd',
'info_dict': {
'id': '42850523',
+ 'display_id': 'FaintLightGullWholeWheat',
'ext': 'mp4',
'title': 'EA Play 2016 Live from the Novo Theatre',
'thumbnail': r're:^https?://.*\.jpg',
@@ -976,6 +977,7 @@ class TwitchClipsIE(TwitchBaseIE):
return {
'id': clip.get('id') or video_id,
+ 'display_id': video_id,
'title': clip.get('title') or video_id,
'formats': formats,
'duration': int_or_none(clip.get('durationSeconds')),
diff --git a/hypervideo_dl/extractor/twitter.py b/hypervideo_dl/extractor/twitter.py
index cfa7a73..485b781 100644
--- a/hypervideo_dl/extractor/twitter.py
+++ b/hypervideo_dl/extractor/twitter.py
@@ -37,9 +37,9 @@ class TwitterBaseIE(InfoExtractor):
def _extract_variant_formats(self, variant, video_id):
variant_url = variant.get('url')
if not variant_url:
- return []
+ return [], {}
elif '.m3u8' in variant_url:
- return self._extract_m3u8_formats(
+ return self._extract_m3u8_formats_and_subtitles(
variant_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False)
else:
@@ -50,7 +50,7 @@ class TwitterBaseIE(InfoExtractor):
'tbr': tbr,
}
self._search_dimensions_in_video_url(f, variant_url)
- return [f]
+ return [f], {}
def _extract_formats_from_vmap_url(self, vmap_url, video_id):
vmap_url = url_or_none(vmap_url)
@@ -58,17 +58,22 @@ class TwitterBaseIE(InfoExtractor):
return []
vmap_data = self._download_xml(vmap_url, video_id)
formats = []
+ subtitles = {}
urls = []
for video_variant in vmap_data.findall('.//{http://twitter.com/schema/videoVMapV2.xsd}videoVariant'):
video_variant.attrib['url'] = compat_urllib_parse_unquote(
video_variant.attrib['url'])
urls.append(video_variant.attrib['url'])
- formats.extend(self._extract_variant_formats(
- video_variant.attrib, video_id))
+ fmts, subs = self._extract_variant_formats(
+ video_variant.attrib, video_id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
video_url = strip_or_none(xpath_text(vmap_data, './/MediaFile'))
if video_url not in urls:
- formats.extend(self._extract_variant_formats({'url': video_url}, video_id))
- return formats
+ fmts, subs = self._extract_variant_formats({'url': video_url}, video_id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ return formats, subtitles
@staticmethod
def _search_dimensions_in_video_url(a_format, video_url):
@@ -475,8 +480,11 @@ class TwitterIE(TwitterBaseIE):
video_info = media.get('video_info') or {}
formats = []
+ subtitles = {}
for variant in video_info.get('variants', []):
- formats.extend(self._extract_variant_formats(variant, twid))
+ fmts, subs = self._extract_variant_formats(variant, twid)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ formats.extend(fmts)
self._sort_formats(formats)
thumbnails = []
@@ -495,6 +503,7 @@ class TwitterIE(TwitterBaseIE):
info.update({
'formats': formats,
+ 'subtitles': subtitles,
'thumbnails': thumbnails,
'duration': float_or_none(video_info.get('duration_millis'), 1000),
})
@@ -544,7 +553,7 @@ class TwitterIE(TwitterBaseIE):
is_amplify = card_name == 'amplify'
vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url')
content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player'))
- formats = self._extract_formats_from_vmap_url(vmap_url, content_id or twid)
+ formats, subtitles = self._extract_formats_from_vmap_url(vmap_url, content_id or twid)
self._sort_formats(formats)
thumbnails = []
@@ -562,6 +571,7 @@ class TwitterIE(TwitterBaseIE):
info.update({
'formats': formats,
+ 'subtitles': subtitles,
'thumbnails': thumbnails,
'duration': int_or_none(get_binding_value(
'content_duration_seconds')),
@@ -667,3 +677,21 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE):
info['formats'] = self._extract_pscp_m3u8_formats(
m3u8_url, broadcast_id, m3u8_id, state, width, height)
return info
+
+
+class TwitterShortenerIE(TwitterBaseIE):
+ IE_NAME = 'twitter:shortener'
+ _VALID_URL = r'https?://t.co/(?P<id>[^?]+)|tco:(?P<eid>[^?]+)'
+ _BASE_URL = 'https://t.co/'
+
+ def _real_extract(self, url):
+ mobj = self._match_valid_url(url)
+ eid, id = mobj.group('eid', 'id')
+ if eid:
+ id = eid
+ url = self._BASE_URL + id
+ new_url = self._request_webpage(url, id, headers={'User-Agent': 'curl'}).geturl()
+ __UNSAFE_LINK = "https://twitter.com/safety/unsafe_link_warning?unsafe_link="
+ if new_url.startswith(__UNSAFE_LINK):
+ new_url = new_url.replace(__UNSAFE_LINK, "")
+ return self.url_result(new_url)
diff --git a/hypervideo_dl/extractor/udemy.py b/hypervideo_dl/extractor/udemy.py
index bc5059b..74f638e 100644
--- a/hypervideo_dl/extractor/udemy.py
+++ b/hypervideo_dl/extractor/udemy.py
@@ -405,7 +405,7 @@ class UdemyIE(InfoExtractor):
if f.get('url'):
formats.append(f)
- self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/hypervideo_dl/extractor/ukcolumn.py b/hypervideo_dl/extractor/ukcolumn.py
new file mode 100644
index 0000000..d2626f0
--- /dev/null
+++ b/hypervideo_dl/extractor/ukcolumn.py
@@ -0,0 +1,72 @@
+from __future__ import unicode_literals
+
+from ..utils import (
+ unescapeHTML,
+ urljoin,
+ ExtractorError,
+)
+from .common import InfoExtractor
+from .vimeo import VimeoIE
+from .youtube import YoutubeIE
+
+
+class UkColumnIE(InfoExtractor):
+ IE_NAME = 'ukcolumn'
+ _VALID_URL = r'(?i)https?://(?:www\.)?ukcolumn\.org(/index\.php)?/(?:video|ukcolumn-news)/(?P<id>[-a-z0-9]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.ukcolumn.org/ukcolumn-news/uk-column-news-28th-april-2021',
+ 'info_dict': {
+ 'id': '541632443',
+ 'ext': 'mp4',
+ 'title': 'UK Column News - 28th April 2021',
+ 'uploader_id': 'ukcolumn',
+ 'uploader': 'UK Column',
+ },
+ 'add_ie': [VimeoIE.ie_key()],
+ 'expected_warnings': ['Unable to download JSON metadata'],
+ 'params': {
+ 'skip_download': 'Handled by Vimeo',
+ },
+ }, {
+ 'url': 'https://www.ukcolumn.org/video/insight-eu-military-unification',
+ 'info_dict': {
+ 'id': 'Fzbnb9t7XAw',
+ 'ext': 'mp4',
+ 'title': 'Insight: EU Military Unification',
+ 'uploader_id': 'ukcolumn',
+ 'description': 'md5:29a207965271af89baa0bc191f5de576',
+ 'uploader': 'UK Column',
+ 'upload_date': '20170514',
+ },
+ 'add_ie': [YoutubeIE.ie_key()],
+ 'params': {
+ 'skip_download': 'Handled by Youtube',
+ },
+ }, {
+ 'url': 'https://www.ukcolumn.org/index.php/ukcolumn-news/uk-column-news-30th-april-2021',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ oembed_url = urljoin(url, unescapeHTML(self._search_regex(
+ r'<iframe[^>]+src=(["\'])(?P<url>/media/oembed\?url=.+?)\1',
+ webpage, 'OEmbed URL', group='url')))
+ oembed_webpage = self._download_webpage(
+ oembed_url, display_id, note='Downloading OEmbed page')
+
+ ie, video_url = YoutubeIE, YoutubeIE._extract_url(oembed_webpage)
+ if not video_url:
+ ie, video_url = VimeoIE, VimeoIE._extract_url(url, oembed_webpage)
+ if not video_url:
+ raise ExtractorError('No embedded video found')
+
+ return {
+ '_type': 'url_transparent',
+ 'title': self._og_search_title(webpage),
+ 'url': video_url,
+ 'ie_key': ie.ie_key(),
+ }
diff --git a/hypervideo_dl/extractor/umg.py b/hypervideo_dl/extractor/umg.py
index d815cd9..c1b65d1 100644
--- a/hypervideo_dl/extractor/umg.py
+++ b/hypervideo_dl/extractor/umg.py
@@ -28,7 +28,7 @@ class UMGDeIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._download_json(
- 'https://api.universal-music.de/graphql',
+ 'https://graphql.universal-music.de/',
video_id, query={
'query': '''{
universalMusic(channel:16) {
@@ -56,11 +56,9 @@ class UMGDeIE(InfoExtractor):
formats = []
def add_m3u8_format(format_id):
- m3u8_formats = self._extract_m3u8_formats(
+ formats.extend(self._extract_m3u8_formats(
hls_url_template % format_id, video_id, 'mp4',
- 'm3u8_native', m3u8_id='hls', fatal='False')
- if m3u8_formats and m3u8_formats[0].get('height'):
- formats.extend(m3u8_formats)
+ 'm3u8_native', m3u8_id='hls', fatal=False))
for f in video_data.get('formats', []):
f_url = f.get('url')
@@ -91,7 +89,7 @@ class UMGDeIE(InfoExtractor):
if not formats:
for format_id in (867, 836, 940):
add_m3u8_format(format_id)
- self._sort_formats(formats, ('width', 'height', 'filesize', 'tbr'))
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/hypervideo_dl/extractor/unistra.py b/hypervideo_dl/extractor/unistra.py
index a724cdb..685d74f 100644
--- a/hypervideo_dl/extractor/unistra.py
+++ b/hypervideo_dl/extractor/unistra.py
@@ -33,7 +33,7 @@ class UnistraIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
diff --git a/hypervideo_dl/extractor/uol.py b/hypervideo_dl/extractor/uol.py
index 628adf2..4a2a97f 100644
--- a/hypervideo_dl/extractor/uol.py
+++ b/hypervideo_dl/extractor/uol.py
@@ -110,7 +110,6 @@ class UOLIE(InfoExtractor):
'format_id': format_id,
'url': f_url,
'quality': quality(format_id),
- 'preference': -1,
})
self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/uplynk.py b/hypervideo_dl/extractor/uplynk.py
index f06bf5b..9adb969 100644
--- a/hypervideo_dl/extractor/uplynk.py
+++ b/hypervideo_dl/extractor/uplynk.py
@@ -30,7 +30,7 @@ class UplynkIE(InfoExtractor):
def _extract_uplynk_info(self, uplynk_content_url):
path, external_id, video_id, session_id = re.match(UplynkIE._VALID_URL, uplynk_content_url).groups()
display_id = video_id or external_id
- formats = self._extract_m3u8_formats(
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
'http://content.uplynk.com/%s.m3u8' % path,
display_id, 'mp4', 'm3u8_native')
if session_id:
@@ -48,6 +48,7 @@ class UplynkIE(InfoExtractor):
'duration': float_or_none(asset.get('duration')),
'uploader_id': asset.get('owner'),
'formats': formats,
+ 'subtitles': subtitles,
}
def _real_extract(self, url):
@@ -60,7 +61,7 @@ class UplynkPreplayIE(UplynkIE):
_TEST = None
def _real_extract(self, url):
- path, external_id, video_id = re.match(self._VALID_URL, url).groups()
+ path, external_id, video_id = self._match_valid_url(url).groups()
display_id = video_id or external_id
preplay = self._download_json(url, display_id)
content_url = 'http://content.uplynk.com/%s.m3u8' % path
diff --git a/hypervideo_dl/extractor/urort.py b/hypervideo_dl/extractor/urort.py
index 8f6edab..020425f 100644
--- a/hypervideo_dl/extractor/urort.py
+++ b/hypervideo_dl/extractor/urort.py
@@ -44,7 +44,7 @@ class UrortIE(InfoExtractor):
'ext': f['FileType'],
'format_id': '%s-%s' % (f['FileType'], f.get('Quality', '')),
'url': 'http://p3urort.blob.core.windows.net/tracks/%s' % f['FileRef'],
- 'preference': 3 if f['FileType'] == 'mp3' else 2,
+ 'quality': 3 if f['FileType'] == 'mp3' else 2,
} for f in s['Files']]
self._sort_formats(formats)
e = {
diff --git a/hypervideo_dl/extractor/urplay.py b/hypervideo_dl/extractor/urplay.py
index d6c7914..753ffa4 100644
--- a/hypervideo_dl/extractor/urplay.py
+++ b/hypervideo_dl/extractor/urplay.py
@@ -56,13 +56,12 @@ class URPlayIE(InfoExtractor):
webpage, 'urplayer data'), video_id)['accessibleEpisodes']
urplayer_data = next(e for e in accessible_episodes if e.get('id') == vid)
episode = urplayer_data['title']
- raw_streaming_info = urplayer_data['streamingInfo']['raw']
- host = self._download_json(
- 'http://streaming-loadbalancer.ur.se/loadbalancer.json',
- video_id)['redirect']
+ host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect']
formats = []
- for k, v in raw_streaming_info.items():
+ urplayer_streams = urplayer_data.get('streamingInfo', {})
+
+ for k, v in urplayer_streams.get('raw', {}).items():
if not (k in ('sd', 'hd') and isinstance(v, dict)):
continue
file_http = v.get('location')
@@ -72,6 +71,13 @@ class URPlayIE(InfoExtractor):
video_id, skip_protocols=['f4m', 'rtmp', 'rtsp']))
self._sort_formats(formats)
+ subtitles = {}
+ subs = urplayer_streams.get("sweComplete", {}).get("tt", {}).get("location")
+ if subs:
+ subtitles.setdefault('Svenska', []).append({
+ 'url': subs,
+ })
+
image = urplayer_data.get('image') or {}
thumbnails = []
for k, v in image.items():
@@ -92,6 +98,7 @@ class URPlayIE(InfoExtractor):
return {
'id': video_id,
+ 'subtitles': subtitles,
'title': '%s : %s' % (series_title, episode) if series_title else episode,
'description': urplayer_data.get('description'),
'thumbnails': thumbnails,
diff --git a/hypervideo_dl/extractor/usanetwork.py b/hypervideo_dl/extractor/usanetwork.py
index e3784e5..d953e46 100644
--- a/hypervideo_dl/extractor/usanetwork.py
+++ b/hypervideo_dl/extractor/usanetwork.py
@@ -5,7 +5,7 @@ from .nbc import NBCIE
class USANetworkIE(NBCIE):
- _VALID_URL = r'https?(?P<permalink>://(?:www\.)?usanetwork\.com/[^/]+/video/[^/]+/(?P<id>\d+))'
+ _VALID_URL = r'https?(?P<permalink>://(?:www\.)?usanetwork\.com/(?:[^/]+/videos?|movies?)/(?:[^/]+/)?(?P<id>\d+))'
_TESTS = [{
'url': 'https://www.usanetwork.com/peacock-trailers/video/intelligence-trailer/4185302',
'info_dict': {
diff --git a/hypervideo_dl/extractor/ustream.py b/hypervideo_dl/extractor/ustream.py
index 1e29cbe..8b75879 100644
--- a/hypervideo_dl/extractor/ustream.py
+++ b/hypervideo_dl/extractor/ustream.py
@@ -165,7 +165,7 @@ class UstreamIE(InfoExtractor):
return formats
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
+ m = self._match_valid_url(url)
video_id = m.group('id')
# some sites use this embed format (see: https://github.com/ytdl-org/youtube-dl/issues/2990)
@@ -258,7 +258,7 @@ class UstreamChannelIE(InfoExtractor):
}
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
+ m = self._match_valid_url(url)
display_id = m.group('slug')
webpage = self._download_webpage(url, display_id)
channel_id = self._html_search_meta('ustream:channel_id', webpage)
diff --git a/hypervideo_dl/extractor/ustudio.py b/hypervideo_dl/extractor/ustudio.py
index 56509be..92509d1 100644
--- a/hypervideo_dl/extractor/ustudio.py
+++ b/hypervideo_dl/extractor/ustudio.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -29,7 +28,7 @@ class UstudioIE(InfoExtractor):
}
def _real_extract(self, url):
- video_id, display_id = re.match(self._VALID_URL, url).groups()
+ video_id, display_id = self._match_valid_url(url).groups()
config = self._download_xml(
'http://v1.ustudio.com/embed/%s/ustudio/config.xml' % video_id,
@@ -83,7 +82,7 @@ class UstudioEmbedIE(InfoExtractor):
}
def _real_extract(self, url):
- uploader_id, video_id = re.match(self._VALID_URL, url).groups()
+ uploader_id, video_id = self._match_valid_url(url).groups()
video_data = self._download_json(
'http://app.ustudio.com/embed/%s/%s/config.json' % (uploader_id, video_id),
video_id)['videos'][0]
diff --git a/hypervideo_dl/extractor/utreon.py b/hypervideo_dl/extractor/utreon.py
new file mode 100644
index 0000000..4a25f0c
--- /dev/null
+++ b/hypervideo_dl/extractor/utreon.py
@@ -0,0 +1,85 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ dict_get,
+ int_or_none,
+ str_or_none,
+ try_get,
+ unified_strdate,
+ url_or_none,
+)
+
+
+class UtreonIE(InfoExtractor):
+ _VALID_URL = r'(?:https?://)(?:www\.)?utreon.com/v/(?P<id>[a-zA-Z0-9_-]+)'
+ _TESTS = [{
+ 'url': 'https://utreon.com/v/z_I7ikQbuDw',
+ 'info_dict': {
+ 'id': 'z_I7ikQbuDw',
+ 'ext': 'mp4',
+ 'title': 'Freedom Friday meditation - Rising in the wind',
+ 'description': 'md5:a9bf15a42434a062fe313b938343ad1b',
+ 'uploader': 'Heather Dawn Elemental Health',
+ 'thumbnail': 'https://data-1.utreon.com/v/MG/M2/NT/z_I7ikQbuDw/z_I7ikQbuDw_preview.jpg',
+ 'release_date': '20210723',
+ }
+ }, {
+ 'url': 'https://utreon.com/v/jerJw5EOOVU',
+ 'info_dict': {
+ 'id': 'jerJw5EOOVU',
+ 'ext': 'mp4',
+ 'title': 'When I\'m alone, I love to reflect in peace, to make my dreams come true... [Quotes and Poems]',
+ 'description': 'md5:61ee6c2da98be51b04b969ca80273aaa',
+ 'uploader': 'Frases e Poemas Quotes and Poems',
+ 'thumbnail': 'https://data-1.utreon.com/v/Mz/Zh/ND/jerJw5EOOVU/jerJw5EOOVU_89af85470a4b16eededde7f8674c96d9_cover.jpg',
+ 'release_date': '20210723',
+ }
+ }, {
+ 'url': 'https://utreon.com/v/C4ZxXhYBBmE',
+ 'info_dict': {
+ 'id': 'C4ZxXhYBBmE',
+ 'ext': 'mp4',
+ 'title': 'Biden’s Capital Gains Tax Rate to Test World’s Highest',
+ 'description': 'md5:fb5a6c2e506f013cc76f133f673bc5c8',
+ 'uploader': 'Nomad Capitalist',
+ 'thumbnail': 'https://data-1.utreon.com/v/ZD/k1/Mj/C4ZxXhYBBmE/C4ZxXhYBBmE_628342076198c9c06dd6b2c665978584_cover.jpg',
+ 'release_date': '20210723',
+ }
+ }, {
+ 'url': 'https://utreon.com/v/Y-stEH-FBm8',
+ 'info_dict': {
+ 'id': 'Y-stEH-FBm8',
+ 'ext': 'mp4',
+ 'title': 'Creeper-Chan Pranks Steve! 💚 [MINECRAFT ANIME]',
+ 'description': 'md5:7a48450b0d761b96dec194be0c5ecb5f',
+ 'uploader': 'Merryweather Comics',
+ 'thumbnail': 'https://data-1.utreon.com/v/MT/E4/Zj/Y-stEH-FBm8/Y-stEH-FBm8_5290676a41a4a1096db133b09f54f77b_cover.jpg',
+ 'release_date': '20210718',
+ }},
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ json_data = self._download_json(
+ 'https://api.utreon.com/v1/videos/' + video_id,
+ video_id)
+ videos_json = json_data['videos']
+ formats = [{
+ 'url': format_url,
+ 'format_id': format_key.split('_')[1],
+ 'height': int(format_key.split('_')[1][:-1]),
+ } for format_key, format_url in videos_json.items() if url_or_none(format_url)]
+ self._sort_formats(formats)
+ thumbnail = url_or_none(dict_get(json_data, ('cover_image_url', 'preview_image_url')))
+ return {
+ 'id': video_id,
+ 'title': json_data['title'],
+ 'formats': formats,
+ 'description': str_or_none(json_data.get('description')),
+ 'duration': int_or_none(json_data.get('duration')),
+ 'uploader': str_or_none(try_get(json_data, lambda x: x['channel']['title'])),
+ 'thumbnail': thumbnail,
+ 'release_date': unified_strdate(json_data.get('published_datetime')),
+ }
diff --git a/hypervideo_dl/extractor/varzesh3.py b/hypervideo_dl/extractor/varzesh3.py
index f474ed7..81313dc 100644
--- a/hypervideo_dl/extractor/varzesh3.py
+++ b/hypervideo_dl/extractor/varzesh3.py
@@ -2,12 +2,9 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse_urlparse,
- compat_parse_qs,
-)
from ..utils import (
clean_html,
+ parse_qs,
remove_start,
)
@@ -59,7 +56,7 @@ class Varzesh3IE(InfoExtractor):
fb_sharer_url = self._search_regex(
r'<a[^>]+href="(https?://www\.facebook\.com/sharer/sharer\.php?[^"]+)"',
webpage, 'facebook sharer URL', fatal=False)
- sharer_params = compat_parse_qs(compat_urllib_parse_urlparse(fb_sharer_url).query)
+ sharer_params = parse_qs(fb_sharer_url)
thumbnail = sharer_params.get('p[images][0]', [None])[0]
video_id = self._search_regex(
diff --git a/hypervideo_dl/extractor/veo.py b/hypervideo_dl/extractor/veo.py
new file mode 100644
index 0000000..4e57a52
--- /dev/null
+++ b/hypervideo_dl/extractor/veo.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+from ..utils import (
+ int_or_none,
+ mimetype2ext,
+ unified_timestamp,
+ url_or_none,
+)
+
+
+class VeoIE(InfoExtractor):
+ _VALID_URL = r'https?://app\.veo\.co/matches/(?P<id>[0-9A-Za-z-]+)'
+
+ _TESTS = [{
+ 'url': 'https://app.veo.co/matches/20201027-last-period/',
+ 'info_dict': {
+ 'id': '20201027-last-period',
+ 'ext': 'mp4',
+ 'title': 'Akidemy u11s v Bradford Boys u11s (Game 3)',
+ 'thumbnail': 're:https://c.veocdn.com/.+/thumbnail.jpg',
+ 'upload_date': '20201028',
+ 'timestamp': 1603847208,
+ 'duration': 1916,
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ metadata = self._download_json(
+ 'https://app.veo.co/api/app/matches/%s' % video_id, video_id)
+
+ video_data = self._download_json(
+ 'https://app.veo.co/api/app/matches/%s/videos' % video_id, video_id, 'Downloading video data')
+
+ title = metadata.get('title')
+ thumbnail = url_or_none(metadata.get('thumbnail'))
+
+ timestamp = unified_timestamp(metadata.get('created'))
+ duration = int_or_none(metadata.get('duration'))
+ view_count = int_or_none(metadata.get('view_count'))
+
+ formats = []
+ for fmt in video_data:
+ mimetype = fmt.get('mime_type')
+ # skip configuration file for panoramic video
+ if mimetype == 'video/mp2t':
+ continue
+ height = int_or_none(fmt.get('height'))
+ bitrate = int_or_none(fmt.get('bit_rate'), scale=1000)
+ render_type = fmt.get('render_type')
+ formats.append({
+ 'url': url_or_none(fmt.get('url')),
+ 'format_id': '%s-%sp' % (render_type, height),
+ 'ext': mimetype2ext(mimetype),
+ 'width': int_or_none(fmt.get('width')),
+ 'height': height,
+ 'vbr': bitrate
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'view_count': view_count,
+ 'duration': duration
+ }
diff --git a/hypervideo_dl/extractor/vesti.py b/hypervideo_dl/extractor/vesti.py
index 5ab7168..002047d 100644
--- a/hypervideo_dl/extractor/vesti.py
+++ b/hypervideo_dl/extractor/vesti.py
@@ -101,7 +101,7 @@ class VestiIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
page = self._download_webpage(url, video_id, 'Downloading page')
diff --git a/hypervideo_dl/extractor/vevo.py b/hypervideo_dl/extractor/vevo.py
index 4ea9f1b..8a0f292 100644
--- a/hypervideo_dl/extractor/vevo.py
+++ b/hypervideo_dl/extractor/vevo.py
@@ -6,13 +6,13 @@ import json
from .common import InfoExtractor
from ..compat import (
compat_str,
- compat_urlparse,
compat_HTTPError,
)
from ..utils import (
ExtractorError,
int_or_none,
parse_iso8601,
+ parse_qs,
)
@@ -38,117 +38,7 @@ class VevoIE(VevoBaseIE):
vevo:)
(?P<id>[^&?#]+)'''
- _TESTS = [{
- 'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
- 'md5': '95ee28ee45e70130e3ab02b0f579ae23',
- 'info_dict': {
- 'id': 'GB1101300280',
- 'ext': 'mp4',
- 'title': 'Hurts - Somebody to Die For',
- 'timestamp': 1372057200,
- 'upload_date': '20130624',
- 'uploader': 'Hurts',
- 'track': 'Somebody to Die For',
- 'artist': 'Hurts',
- 'genre': 'Pop',
- },
- 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'],
- }, {
- 'note': 'v3 SMIL format',
- 'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923',
- 'md5': 'f6ab09b034f8c22969020b042e5ac7fc',
- 'info_dict': {
- 'id': 'USUV71302923',
- 'ext': 'mp4',
- 'title': 'Cassadee Pope - I Wish I Could Break Your Heart',
- 'timestamp': 1392796919,
- 'upload_date': '20140219',
- 'uploader': 'Cassadee Pope',
- 'track': 'I Wish I Could Break Your Heart',
- 'artist': 'Cassadee Pope',
- 'genre': 'Country',
- },
- 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'],
- }, {
- 'note': 'Age-limited video',
- 'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282',
- 'info_dict': {
- 'id': 'USRV81300282',
- 'ext': 'mp4',
- 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
- 'age_limit': 18,
- 'timestamp': 1372888800,
- 'upload_date': '20130703',
- 'uploader': 'Justin Timberlake',
- 'track': 'Tunnel Vision (Explicit)',
- 'artist': 'Justin Timberlake',
- 'genre': 'Pop',
- },
- 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'],
- }, {
- 'note': 'No video_info',
- 'url': 'http://www.vevo.com/watch/k-camp-1/Till-I-Die/USUV71503000',
- 'md5': '8b83cc492d72fc9cf74a02acee7dc1b0',
- 'info_dict': {
- 'id': 'USUV71503000',
- 'ext': 'mp4',
- 'title': 'K Camp ft. T.I. - Till I Die',
- 'age_limit': 18,
- 'timestamp': 1449468000,
- 'upload_date': '20151207',
- 'uploader': 'K Camp',
- 'track': 'Till I Die',
- 'artist': 'K Camp',
- 'genre': 'Hip-Hop',
- },
- 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'],
- }, {
- 'note': 'Featured test',
- 'url': 'https://www.vevo.com/watch/lemaitre/Wait/USUV71402190',
- 'md5': 'd28675e5e8805035d949dc5cf161071d',
- 'info_dict': {
- 'id': 'USUV71402190',
- 'ext': 'mp4',
- 'title': 'Lemaitre ft. LoLo - Wait',
- 'age_limit': 0,
- 'timestamp': 1413432000,
- 'upload_date': '20141016',
- 'uploader': 'Lemaitre',
- 'track': 'Wait',
- 'artist': 'Lemaitre',
- 'genre': 'Electronic',
- },
- 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'],
- }, {
- 'note': 'Only available via webpage',
- 'url': 'http://www.vevo.com/watch/GBUV71600656',
- 'md5': '67e79210613865b66a47c33baa5e37fe',
- 'info_dict': {
- 'id': 'GBUV71600656',
- 'ext': 'mp4',
- 'title': 'ABC - Viva Love',
- 'age_limit': 0,
- 'timestamp': 1461830400,
- 'upload_date': '20160428',
- 'uploader': 'ABC',
- 'track': 'Viva Love',
- 'artist': 'ABC',
- 'genre': 'Pop',
- },
- 'expected_warnings': ['Failed to download video versions info'],
- }, {
- # no genres available
- 'url': 'http://www.vevo.com/watch/INS171400764',
- 'only_matching': True,
- }, {
- # Another case available only via the webpage; using streams/streamsV3 formats
- # Geo-restricted to Netherlands/Germany
- 'url': 'http://www.vevo.com/watch/boostee/pop-corn-clip-officiel/FR1A91600909',
- 'only_matching': True,
- }, {
- 'url': 'https://embed.vevo.com/?isrc=USH5V1923499&partnerId=4d61b777-8023-4191-9ede-497ed6c24647&partnerAdCode=',
- 'only_matching': True,
- }]
+ _TESTS = []
_VERSIONS = {
0: 'youtube', # only in AuthenticateVideo videoVersions
1: 'level3',
@@ -310,13 +200,6 @@ class VevoPlaylistIE(VevoBaseIE):
_VALID_URL = r'https?://(?:www\.)?vevo\.com/watch/(?P<kind>playlist|genre)/(?P<id>[^/?#&]+)'
_TESTS = [{
- 'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29',
- 'info_dict': {
- 'id': 'dadbf4e7-b99f-4184-9670-6f0e547b6a29',
- 'title': 'Best-Of: Birdman',
- },
- 'playlist_count': 10,
- }, {
'url': 'http://www.vevo.com/watch/genre/rock',
'info_dict': {
'id': 'rock',
@@ -324,33 +207,18 @@ class VevoPlaylistIE(VevoBaseIE):
},
'playlist_count': 20,
}, {
- 'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29?index=0',
- 'md5': '32dcdfddddf9ec6917fc88ca26d36282',
- 'info_dict': {
- 'id': 'USCMV1100073',
- 'ext': 'mp4',
- 'title': 'Birdman - Y.U. MAD',
- 'timestamp': 1323417600,
- 'upload_date': '20111209',
- 'uploader': 'Birdman',
- 'track': 'Y.U. MAD',
- 'artist': 'Birdman',
- 'genre': 'Rap/Hip-Hop',
- },
- 'expected_warnings': ['Unable to download SMIL file'],
- }, {
'url': 'http://www.vevo.com/watch/genre/rock?index=0',
'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
playlist_id = mobj.group('id')
playlist_kind = mobj.group('kind')
webpage = self._download_webpage(url, playlist_id)
- qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ qs = parse_qs(url)
index = qs.get('index', [None])[0]
if index:
diff --git a/hypervideo_dl/extractor/vgtv.py b/hypervideo_dl/extractor/vgtv.py
index 22e99e8..b6131ff 100644
--- a/hypervideo_dl/extractor/vgtv.py
+++ b/hypervideo_dl/extractor/vgtv.py
@@ -165,7 +165,7 @@ class VGTVIE(XstreamIE):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
host = mobj.group('host')
appname = self._HOST_TO_APPNAME[host] if host else mobj.group('appname')
diff --git a/hypervideo_dl/extractor/vh1.py b/hypervideo_dl/extractor/vh1.py
index dff94a2..862c5c7 100644
--- a/hypervideo_dl/extractor/vh1.py
+++ b/hypervideo_dl/extractor/vh1.py
@@ -3,27 +3,29 @@ from __future__ import unicode_literals
from .mtv import MTVServicesInfoExtractor
+# TODO Remove - Reason: Outdated Site
+
class VH1IE(MTVServicesInfoExtractor):
IE_NAME = 'vh1.com'
_FEED_URL = 'http://www.vh1.com/feeds/mrss/'
_TESTS = [{
- 'url': 'http://www.vh1.com/episodes/0umwpq/hip-hop-squares-kent-jones-vs-nick-young-season-1-ep-120',
+ 'url': 'https://www.vh1.com/episodes/0aqivv/nick-cannon-presents-wild-n-out-foushee-season-16-ep-12',
'info_dict': {
- 'title': 'Kent Jones vs. Nick Young',
- 'description': 'Come to Play. Stay to Party. With Mike Epps, TIP, O’Shea Jackson Jr., T-Pain, Tisha Campbell-Martin and more.',
+ 'title': 'Fousheé',
+ 'description': 'Fousheé joins Team Evolutions fight against Nick and Team Revolution in Baby Daddy, Baby Mama; Kick Em Out the Classroom; Backseat of My Ride and Wildstyle; and Fousheé performs.',
},
'playlist_mincount': 4,
+ 'skip': '404 Not found',
}, {
# Clip
- 'url': 'http://www.vh1.com/video-clips/t74mif/scared-famous-scared-famous-extended-preview',
+ 'url': 'https://www.vh1.com/video-clips/e0sja0/nick-cannon-presents-wild-n-out-foushee-clap-for-him',
'info_dict': {
- 'id': '0a50c2d2-a86b-4141-9565-911c7e2d0b92',
+ 'id': 'a07563f7-a37b-4e7f-af68-85855c2c7cc3',
'ext': 'mp4',
- 'title': 'Scared Famous|October 9, 2017|1|NO-EPISODE#|Scared Famous + Extended Preview',
- 'description': 'md5:eff5551a274c473a29463de40f7b09da',
- 'upload_date': '20171009',
- 'timestamp': 1507574700,
+ 'title': 'Fousheé - "clap for him"',
+ 'description': 'Singer Fousheé hits the Wild N Out: In the Dark stage with a performance of the tongue-in-cheek track "clap for him" from her 2021 album "time machine."',
+ 'upload_date': '20210826',
},
'params': {
# m3u8 download
@@ -32,10 +34,3 @@ class VH1IE(MTVServicesInfoExtractor):
}]
_VALID_URL = r'https?://(?:www\.)?vh1\.com/(?:video-clips|episodes)/(?P<id>[^/?#.]+)'
-
- def _real_extract(self, url):
- playlist_id = self._match_id(url)
- webpage = self._download_webpage(url, playlist_id)
- mgid = self._extract_triforce_mgid(webpage)
- videos_info = self._get_videos_info(mgid)
- return videos_info
diff --git a/hypervideo_dl/extractor/vice.py b/hypervideo_dl/extractor/vice.py
index e374995..ca4d3ed 100644
--- a/hypervideo_dl/extractor/vice.py
+++ b/hypervideo_dl/extractor/vice.py
@@ -118,7 +118,7 @@ class ViceIE(ViceBaseIE, AdobePassIE):
return urls[0] if urls else None
def _real_extract(self, url):
- locale, video_id = re.match(self._VALID_URL, url).groups()
+ locale, video_id = self._match_valid_url(url).groups()
video = self._call_api('videos', 'id', video_id, locale, '''body
locked
@@ -225,7 +225,7 @@ class ViceShowIE(ViceBaseIE):
video['url'], ViceIE.ie_key(), video.get('id'))
def _real_extract(self, url):
- locale, display_id = re.match(self._VALID_URL, url).groups()
+ locale, display_id = self._match_valid_url(url).groups()
show = self._call_api('shows', 'slug', display_id, locale, '''dek
id
title''')[0]
@@ -302,7 +302,7 @@ class ViceArticleIE(ViceBaseIE):
}]
def _real_extract(self, url):
- locale, display_id = re.match(self._VALID_URL, url).groups()
+ locale, display_id = self._match_valid_url(url).groups()
article = self._call_api('articles', 'slug', display_id, locale, '''body
embed_code''')[0]
diff --git a/hypervideo_dl/extractor/viddler.py b/hypervideo_dl/extractor/viddler.py
index 6423584..ecc4824 100644
--- a/hypervideo_dl/extractor/viddler.py
+++ b/hypervideo_dl/extractor/viddler.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -75,7 +74,7 @@ class ViddlerIE(InfoExtractor):
}]
def _real_extract(self, url):
- video_id, secret = re.match(self._VALID_URL, url).groups()
+ video_id, secret = self._match_valid_url(url).groups()
query = {
'video_id': video_id,
diff --git a/hypervideo_dl/extractor/videa.py b/hypervideo_dl/extractor/videa.py
index ab2c15c..512ade7 100644
--- a/hypervideo_dl/extractor/videa.py
+++ b/hypervideo_dl/extractor/videa.py
@@ -11,7 +11,9 @@ from ..utils import (
int_or_none,
mimetype2ext,
parse_codecs,
+ parse_qs,
update_url_query,
+ urljoin,
xpath_element,
xpath_text,
)
@@ -45,10 +47,24 @@ class VideaIE(InfoExtractor):
},
}, {
'url': 'http://videa.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH',
- 'only_matching': True,
+ 'md5': 'd57ccd8812c7fd491d33b1eab8c99975',
+ 'info_dict': {
+ 'id': 'jAHDWfWSJH5XuFhH',
+ 'ext': 'mp4',
+ 'title': 'Supercars előzés',
+ 'thumbnail': r're:^https?://.*',
+ 'duration': 64,
+ },
}, {
'url': 'http://videa.hu/player?v=8YfIAjxwWGwT8HVQ',
- 'only_matching': True,
+ 'md5': '97a7af41faeaffd9f1fc864a7c7e7603',
+ 'info_dict': {
+ 'id': '8YfIAjxwWGwT8HVQ',
+ 'ext': 'mp4',
+ 'title': 'Az őrült kígyász 285 kígyót enged szabadon',
+ 'thumbnail': r're:^https?://.*',
+ 'duration': 21,
+ },
}, {
'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1',
'only_matching': True,
@@ -95,9 +111,17 @@ class VideaIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- query = {'v': video_id}
- player_page = self._download_webpage(
- 'https://videa.hu/player', video_id, query=query)
+
+ video_page = self._download_webpage(url, video_id)
+
+ if 'videa.hu/player' in url:
+ player_url = url
+ player_page = video_page
+ else:
+ player_url = self._search_regex(
+ r'<iframe.*?src="(/player\?[^"]+)"', video_page, 'player url')
+ player_url = urljoin(url, player_url)
+ player_page = self._download_webpage(player_url, video_id)
nonce = self._search_regex(
r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce')
@@ -107,6 +131,7 @@ class VideaIE(InfoExtractor):
for i in range(0, 32):
result += s[i - (self._STATIC_SECRET.index(l[i]) - 31)]
+ query = parse_qs(player_url)
random_seed = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(8))
query['_s'] = random_seed
query['_t'] = result[:16]
@@ -127,7 +152,7 @@ class VideaIE(InfoExtractor):
sources = xpath_element(
info, './video_sources', 'sources', fatal=True)
hash_values = xpath_element(
- info, './hash_values', 'hash values', fatal=True)
+ info, './hash_values', 'hash values', fatal=False)
title = xpath_text(video, './title', fatal=True)
@@ -136,15 +161,16 @@ class VideaIE(InfoExtractor):
source_url = source.text
source_name = source.get('name')
source_exp = source.get('exp')
- if not (source_url and source_name and source_exp):
+ if not (source_url and source_name):
continue
- hash_value = xpath_text(hash_values, 'hash_value_' + source_name)
- if not hash_value:
- continue
- source_url = update_url_query(source_url, {
- 'md5': hash_value,
- 'expires': source_exp,
- })
+ hash_value = None
+ if hash_values:
+ hash_value = xpath_text(hash_values, 'hash_value_' + source_name)
+ if hash_value and source_exp:
+ source_url = update_url_query(source_url, {
+ 'md5': hash_value,
+ 'expires': source_exp,
+ })
f = parse_codecs(source.get('codecs'))
f.update({
'url': self._proto_relative_url(source_url),
diff --git a/hypervideo_dl/extractor/videomore.py b/hypervideo_dl/extractor/videomore.py
index e0c10aa..17ef3b1 100644
--- a/hypervideo_dl/extractor/videomore.py
+++ b/hypervideo_dl/extractor/videomore.py
@@ -5,13 +5,11 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_parse_qs,
compat_str,
- compat_urllib_parse_urlparse,
)
from ..utils import (
- ExtractorError,
int_or_none,
+ parse_qs,
)
@@ -145,9 +143,9 @@ class VideomoreIE(InfoExtractor):
return mobj.group('url')
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('sid') or mobj.group('id')
- partner_id = mobj.group('partner_id') or compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('partner_id', [None])[0] or '97'
+ partner_id = mobj.group('partner_id') or parse_qs(url).get('partner_id', [None])[0] or '97'
item = self._download_json(
'https://siren.more.tv/player/config', video_id, query={
@@ -193,8 +191,8 @@ class VideomoreIE(InfoExtractor):
error = item.get('error')
if error:
if error in ('Данное видео недоступно для просмотра на территории этой страны', 'Данное видео доступно для просмотра только на территории России'):
- self.raise_geo_restricted(countries=['RU'])
- raise ExtractorError(error, expected=True)
+ self.raise_geo_restricted(countries=['RU'], metadata_available=True)
+ self.raise_no_formats(error, expected=True)
self._sort_formats(formats)
return {
diff --git a/hypervideo_dl/extractor/vidio.py b/hypervideo_dl/extractor/vidio.py
index b1243e8..571448b 100644
--- a/hypervideo_dl/extractor/vidio.py
+++ b/hypervideo_dl/extractor/vidio.py
@@ -1,19 +1,80 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
+ clean_html,
+ ExtractorError,
+ get_element_by_class,
int_or_none,
parse_iso8601,
+ smuggle_url,
str_or_none,
strip_or_none,
try_get,
+ unsmuggle_url,
+ urlencode_postdata,
)
-class VidioIE(InfoExtractor):
+class VidioBaseIE(InfoExtractor):
+ _LOGIN_URL = 'https://www.vidio.com/users/login'
+ _NETRC_MACHINE = 'vidio'
+
+ def _login(self):
+ username, password = self._get_login_info()
+ if username is None:
+ return
+
+ def is_logged_in():
+ res = self._download_json(
+ 'https://www.vidio.com/interactions.json', None, 'Checking if logged in', fatal=False) or {}
+ return bool(res.get('current_user'))
+
+ if is_logged_in():
+ return
+
+ login_page = self._download_webpage(
+ self._LOGIN_URL, None, 'Downloading log in page')
+
+ login_form = self._form_hidden_inputs("login-form", login_page)
+ login_form.update({
+ 'user[login]': username,
+ 'user[password]': password,
+ })
+ login_post, login_post_urlh = self._download_webpage_handle(
+ self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(login_form), expected_status=[302, 401])
+
+ if login_post_urlh.status == 401:
+ if get_element_by_class('onboarding-content-register-popup__title', login_post):
+ raise ExtractorError(
+ 'Unable to log in: The provided email has not registered yet.', expected=True)
+
+ reason = get_element_by_class('onboarding-form__general-error', login_post) or get_element_by_class('onboarding-modal__title', login_post)
+ if 'Akun terhubung ke' in reason:
+ raise ExtractorError(
+ 'Unable to log in: Your account is linked to a social media account. '
+ 'Use --cookies to provide account credentials instead', expected=True)
+ elif reason:
+ subreason = get_element_by_class('onboarding-modal__description-text', login_post) or ''
+ raise ExtractorError(
+ 'Unable to log in: %s. %s' % (reason, clean_html(subreason)), expected=True)
+ raise ExtractorError('Unable to log in')
+
+ def _real_initialize(self):
+ self._api_key = self._download_json(
+ 'https://www.vidio.com/auth', None, data=b'')['api_key']
+ self._login()
+
+ def _call_api(self, url, video_id, note=None):
+ return self._download_json(url, video_id, note=note, headers={
+ 'Content-Type': 'application/vnd.api+json',
+ 'X-API-KEY': self._api_key,
+ })
+
+
+class VidioIE(VidioBaseIE):
_VALID_URL = r'https?://(?:www\.)?vidio\.com/watch/(?P<id>\d+)-(?P<display_id>[^/?#&]+)'
_TESTS = [{
'url': 'http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015',
@@ -41,24 +102,43 @@ class VidioIE(InfoExtractor):
}, {
'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north',
'only_matching': True,
+ }, {
+ # Premier-exclusive video
+ 'url': 'https://www.vidio.com/watch/1550718-stand-by-me-doraemon',
+ 'only_matching': True
}]
- def _real_initialize(self):
- self._api_key = self._download_json(
- 'https://www.vidio.com/auth', None, data=b'')['api_key']
-
def _real_extract(self, url):
- video_id, display_id = re.match(self._VALID_URL, url).groups()
- data = self._download_json(
- 'https://api.vidio.com/videos/' + video_id, display_id, headers={
- 'Content-Type': 'application/vnd.api+json',
- 'X-API-KEY': self._api_key,
- })
+ match = self._match_valid_url(url).groupdict()
+ video_id, display_id = match.get('id'), match.get('display_id')
+ data = self._call_api('https://api.vidio.com/videos/' + video_id, display_id)
video = data['videos'][0]
title = video['title'].strip()
+ is_premium = video.get('is_premium')
+
+ if is_premium:
+ sources = self._download_json(
+ 'https://www.vidio.com/interactions_stream.json?video_id=%s&type=videos' % video_id,
+ display_id, note='Downloading premier API JSON')
+ if not (sources.get('source') or sources.get('source_dash')):
+ self.raise_login_required('This video is only available for registered users with the appropriate subscription')
+
+ formats, subs = [], {}
+ if sources.get('source'):
+ hls_formats, hls_subs = self._extract_m3u8_formats_and_subtitles(
+ sources['source'], display_id, 'mp4', 'm3u8_native')
+ formats.extend(hls_formats)
+ subs.update(hls_subs)
+ if sources.get('source_dash'): # TODO: Find video example with source_dash
+ dash_formats, dash_subs = self._extract_mpd_formats_and_subtitles(
+ sources['source_dash'], display_id, 'dash')
+ formats.extend(dash_formats)
+ subs.update(dash_subs)
+ else:
+ hls_url = data['clips'][0]['hls_url']
+ formats, subs = self._extract_m3u8_formats_and_subtitles(
+ hls_url, display_id, 'mp4', 'm3u8_native')
- formats = self._extract_m3u8_formats(
- data['clips'][0]['hls_url'], display_id, 'mp4', 'm3u8_native')
self._sort_formats(formats)
get_first = lambda x: try_get(data, lambda y: y[x + 's'][0], dict) or {}
@@ -76,6 +156,7 @@ class VidioIE(InfoExtractor):
'duration': int_or_none(video.get('duration')),
'like_count': get_count('likes'),
'formats': formats,
+ 'subtitles': subs,
'uploader': user.get('name'),
'timestamp': parse_iso8601(video.get('created_at')),
'uploader_id': username,
@@ -87,3 +168,128 @@ class VidioIE(InfoExtractor):
'comment_count': get_count('comments'),
'tags': video.get('tag_list'),
}
+
+
+class VidioPremierIE(VidioBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?vidio\.com/premier/(?P<id>\d+)/(?P<display_id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://www.vidio.com/premier/2885/badai-pasti-berlalu',
+ 'playlist_mincount': 14,
+ }, {
+ # Series with both free and premier-exclusive videos
+ 'url': 'https://www.vidio.com/premier/2567/sosmed',
+ 'only_matching': True,
+ }]
+
+ def _playlist_entries(self, playlist_url, display_id):
+ index = 1
+ while playlist_url:
+ playlist_json = self._call_api(playlist_url, display_id, 'Downloading API JSON page %s' % index)
+ for video_json in playlist_json.get('data', []):
+ link = video_json['links']['watchpage']
+ yield self.url_result(link, 'Vidio', video_json['id'])
+ playlist_url = try_get(playlist_json, lambda x: x['links']['next'])
+ index += 1
+
+ def _real_extract(self, url):
+ url, idata = unsmuggle_url(url, {})
+ playlist_id, display_id = self._match_valid_url(url).groups()
+
+ playlist_url = idata.get('url')
+ if playlist_url: # Smuggled data contains an API URL. Download only that playlist
+ playlist_id = idata['id']
+ return self.playlist_result(
+ self._playlist_entries(playlist_url, playlist_id),
+ playlist_id=playlist_id, playlist_title=idata.get('title'))
+
+ playlist_data = self._call_api('https://api.vidio.com/content_profiles/%s/playlists' % playlist_id, display_id)
+
+ return self.playlist_from_matches(
+ playlist_data.get('data', []), playlist_id=playlist_id, ie=self.ie_key(),
+ getter=lambda data: smuggle_url(url, {
+ 'url': data['relationships']['videos']['links']['related'],
+ 'id': data['id'],
+ 'title': try_get(data, lambda x: x['attributes']['name'])
+ }))
+
+
+class VidioLiveIE(VidioBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?vidio\.com/live/(?P<id>\d+)-(?P<display_id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://www.vidio.com/live/204-sctv',
+ 'info_dict': {
+ 'id': '204',
+ 'title': 'SCTV',
+ 'uploader': 'SCTV',
+ 'uploader_id': 'sctv',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ }, {
+ # Premier-exclusive livestream
+ 'url': 'https://www.vidio.com/live/6362-tvn',
+ 'only_matching': True,
+ }, {
+ # DRM premier-exclusive livestream
+ 'url': 'https://www.vidio.com/live/6299-bein-1',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id, display_id = self._match_valid_url(url).groups()
+ stream_data = self._call_api(
+ 'https://www.vidio.com/api/livestreamings/%s/detail' % video_id, display_id)
+ stream_meta = stream_data['livestreamings'][0]
+ user = stream_data.get('users', [{}])[0]
+
+ title = stream_meta.get('title')
+ username = user.get('username')
+
+ formats = []
+ if stream_meta.get('is_drm'):
+ if not self.get_param('allow_unplayable_formats'):
+ self.report_drm(video_id)
+ if stream_meta.get('is_premium'):
+ sources = self._download_json(
+ 'https://www.vidio.com/interactions_stream.json?video_id=%s&type=livestreamings' % video_id,
+ display_id, note='Downloading premier API JSON')
+ if not (sources.get('source') or sources.get('source_dash')):
+ self.raise_login_required('This video is only available for registered users with the appropriate subscription')
+
+ if str_or_none(sources.get('source')):
+ token_json = self._download_json(
+ 'https://www.vidio.com/live/%s/tokens' % video_id,
+ display_id, note='Downloading HLS token JSON', data=b'')
+ formats.extend(self._extract_m3u8_formats(
+ sources['source'] + '?' + token_json.get('token', ''), display_id, 'mp4', 'm3u8_native'))
+ if str_or_none(sources.get('source_dash')):
+ pass
+ else:
+ if stream_meta.get('stream_token_url'):
+ token_json = self._download_json(
+ 'https://www.vidio.com/live/%s/tokens' % video_id,
+ display_id, note='Downloading HLS token JSON', data=b'')
+ formats.extend(self._extract_m3u8_formats(
+ stream_meta['stream_token_url'] + '?' + token_json.get('token', ''),
+ display_id, 'mp4', 'm3u8_native'))
+ if stream_meta.get('stream_dash_url'):
+ pass
+ if stream_meta.get('stream_url'):
+ formats.extend(self._extract_m3u8_formats(
+ stream_meta['stream_url'], display_id, 'mp4', 'm3u8_native'))
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'is_live': True,
+ 'description': strip_or_none(stream_meta.get('description')),
+ 'thumbnail': stream_meta.get('image'),
+ 'like_count': int_or_none(stream_meta.get('like')),
+ 'dislike_count': int_or_none(stream_meta.get('dislike')),
+ 'formats': formats,
+ 'uploader': user.get('name'),
+ 'timestamp': parse_iso8601(stream_meta.get('start_time')),
+ 'uploader_id': username,
+ 'uploader_url': 'https://www.vidio.com/@' + username if username else None,
+ }
diff --git a/hypervideo_dl/extractor/vidzi.py b/hypervideo_dl/extractor/vidzi.py
new file mode 100644
index 0000000..42ea495
--- /dev/null
+++ b/hypervideo_dl/extractor/vidzi.py
@@ -0,0 +1,68 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ decode_packed_codes,
+ js_to_json,
+ NO_DEFAULT,
+ PACKED_CODES_RE,
+)
+
+
+class VidziIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si|nu)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
+ _TESTS = [{
+ 'url': 'http://vidzi.tv/cghql9yq6emu.html',
+ 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660',
+ 'info_dict': {
+ 'id': 'cghql9yq6emu',
+ 'ext': 'mp4',
+ 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://vidzi.tv/embed-4z2yb0rzphe9-600x338.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://vidzi.cc/cghql9yq6emu.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://vidzi.si/rph9gztxj1et.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://vidzi.nu/cghql9yq6emu.html',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://vidzi.tv/%s' % video_id, video_id)
+ title = self._html_search_regex(
+ r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title')
+
+ codes = [webpage]
+ codes.extend([
+ decode_packed_codes(mobj.group(0)).replace('\\\'', '\'')
+ for mobj in re.finditer(PACKED_CODES_RE, webpage)])
+ for num, code in enumerate(codes, 1):
+ jwplayer_data = self._parse_json(
+ self._search_regex(
+ r'setup\(([^)]+)\)', code, 'jwplayer data',
+ default=NO_DEFAULT if num == len(codes) else '{}'),
+ video_id, transform_source=lambda s: js_to_json(
+ re.sub(r'\s*\+\s*window\[.+?\]', '', s)))
+ if jwplayer_data:
+ break
+
+ info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False)
+ info_dict['title'] = title
+
+ return info_dict
diff --git a/hypervideo_dl/extractor/vier.py b/hypervideo_dl/extractor/vier.py
index dbd5ba9..94aa350 100644
--- a/hypervideo_dl/extractor/vier.py
+++ b/hypervideo_dl/extractor/vier.py
@@ -135,7 +135,7 @@ class VierIE(InfoExtractor):
self._logged_in = True
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
embed_id = mobj.group('embed_id')
display_id = mobj.group('display_id') or embed_id
video_id = mobj.group('id') or embed_id
@@ -234,7 +234,7 @@ class VierVideosIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
program = mobj.group('program')
site = mobj.group('site')
diff --git a/hypervideo_dl/extractor/viewlift.py b/hypervideo_dl/extractor/viewlift.py
index d6b92b1..c3b2e86 100644
--- a/hypervideo_dl/extractor/viewlift.py
+++ b/hypervideo_dl/extractor/viewlift.py
@@ -92,7 +92,7 @@ class ViewLiftEmbedIE(ViewLiftBaseIE):
return mobj.group('url')
def _real_extract(self, url):
- domain, film_id = re.match(self._VALID_URL, url).groups()
+ domain, film_id = self._match_valid_url(url).groups()
site = domain.split('.')[-2]
if site in self._SITE_MAP:
site = self._SITE_MAP[site]
@@ -134,7 +134,7 @@ class ViewLiftEmbedIE(ViewLiftBaseIE):
if hls_url:
formats.extend(self._extract_m3u8_formats(
hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
- self._sort_formats(formats, ('height', 'tbr', 'format_id'))
+ self._sort_formats(formats)
info = {
'id': film_id,
@@ -229,7 +229,7 @@ class ViewLiftIE(ViewLiftBaseIE):
return False if ViewLiftEmbedIE.suitable(url) else super(ViewLiftIE, cls).suitable(url)
def _real_extract(self, url):
- domain, path, display_id = re.match(self._VALID_URL, url).groups()
+ domain, path, display_id = self._match_valid_url(url).groups()
site = domain.split('.')[-2]
if site in self._SITE_MAP:
site = self._SITE_MAP[site]
diff --git a/hypervideo_dl/extractor/viidea.py b/hypervideo_dl/extractor/viidea.py
index a0abbae..0da0681 100644
--- a/hypervideo_dl/extractor/viidea.py
+++ b/hypervideo_dl/extractor/viidea.py
@@ -117,7 +117,7 @@ class ViideaIE(InfoExtractor):
}]
def _real_extract(self, url):
- lecture_slug, explicit_part_id = re.match(self._VALID_URL, url).groups()
+ lecture_slug, explicit_part_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, lecture_slug)
diff --git a/hypervideo_dl/extractor/viki.py b/hypervideo_dl/extractor/viki.py
index 2e9cbf1..acb5ae5 100644
--- a/hypervideo_dl/extractor/viki.py
+++ b/hypervideo_dl/extractor/viki.py
@@ -1,38 +1,28 @@
# coding: utf-8
from __future__ import unicode_literals
-
-import base64
import hashlib
import hmac
-import itertools
import json
-import re
import time
from .common import InfoExtractor
-from ..compat import (
- compat_parse_qs,
- compat_urllib_parse_urlparse,
-)
from ..utils import (
ExtractorError,
int_or_none,
parse_age_limit,
parse_iso8601,
- sanitized_Request,
- std_headers,
try_get,
)
class VikiBaseIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/'
- _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com'
- _API_URL_TEMPLATE = 'https://api.viki.io%s&sig=%s'
+ _API_URL_TEMPLATE = 'https://api.viki.io%s'
+ _DEVICE_ID = '86085977d' # used for android api
_APP = '100005a'
- _APP_VERSION = '6.0.0'
- _APP_SECRET = 'MM_d*yP@`&1@]@!AVrXf_o-HVEnoTnm$O-ti4[G~$JDI/Dc-&piU&z&5.;:}95=Iad'
+ _APP_VERSION = '6.11.3'
+ _APP_SECRET = 'd96704b180208dbb2efa30fe44c48bd8690441af9f567ba8fd710a72badc85198f7472'
_GEO_BYPASS = False
_NETRC_MACHINE = 'viki'
@@ -45,43 +35,57 @@ class VikiBaseIE(InfoExtractor):
'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers',
}
- def _prepare_call(self, path, timestamp=None, post_data=None):
+ def _stream_headers(self, timestamp, sig):
+ return {
+ 'X-Viki-manufacturer': 'vivo',
+ 'X-Viki-device-model': 'vivo 1606',
+ 'X-Viki-device-os-ver': '6.0.1',
+ 'X-Viki-connection-type': 'WIFI',
+ 'X-Viki-carrier': '',
+ 'X-Viki-as-id': '100005a-1625321982-3932',
+ 'timestamp': str(timestamp),
+ 'signature': str(sig),
+ 'x-viki-app-ver': self._APP_VERSION
+ }
+
+ def _api_query(self, path, version=4, **kwargs):
path += '?' if '?' not in path else '&'
- if not timestamp:
- timestamp = int(time.time())
- query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp)
+ query = f'/v{version}/{path}app={self._APP}'
if self._token:
query += '&token=%s' % self._token
+ return query + ''.join(f'&{name}={val}' for name, val in kwargs.items())
+
+ def _sign_query(self, path):
+ timestamp = int(time.time())
+ query = self._api_query(path, version=5)
sig = hmac.new(
- self._APP_SECRET.encode('ascii'),
- query.encode('ascii'),
- hashlib.sha1
- ).hexdigest()
- url = self._API_URL_TEMPLATE % (query, sig)
- return sanitized_Request(
- url, json.dumps(post_data).encode('utf-8')) if post_data else url
-
- def _call_api(self, path, video_id, note, timestamp=None, post_data=None):
+ self._APP_SECRET.encode('ascii'), f'{query}&t={timestamp}'.encode('ascii'), hashlib.sha1).hexdigest()
+ return timestamp, sig, self._API_URL_TEMPLATE % query
+
+ def _call_api(
+ self, path, video_id, note='Downloading JSON metadata', data=None, query=None, fatal=True):
+ if query is None:
+ timestamp, sig, url = self._sign_query(path)
+ else:
+ url = self._API_URL_TEMPLATE % self._api_query(path, version=4)
resp = self._download_json(
- self._prepare_call(path, timestamp, post_data), video_id, note,
- headers={'x-viki-app-ver': self._APP_VERSION})
-
- error = resp.get('error')
- if error:
- if error == 'invalid timestamp':
- resp = self._download_json(
- self._prepare_call(path, int(resp['current_timestamp']), post_data),
- video_id, '%s (retry)' % note)
- error = resp.get('error')
- if error:
- self._raise_error(resp['error'])
+ url, video_id, note, fatal=fatal, query=query,
+ data=json.dumps(data).encode('utf-8') if data else None,
+ headers=({'x-viki-app-ver': self._APP_VERSION} if data
+ else self._stream_headers(timestamp, sig) if query is None
+ else None), expected_status=400) or {}
+ self._raise_error(resp.get('error'), fatal)
return resp
- def _raise_error(self, error):
- raise ExtractorError(
- '%s returned error: %s' % (self.IE_NAME, error),
- expected=True)
+ def _raise_error(self, error, fatal=True):
+ if error is None:
+ return
+ msg = '%s said: %s' % (self.IE_NAME, error)
+ if fatal:
+ raise ExtractorError(msg, expected=True)
+ else:
+ self.report_warning(msg)
def _check_errors(self, data):
for reason, status in (data.get('blocking') or {}).items():
@@ -90,9 +94,10 @@ class VikiBaseIE(InfoExtractor):
if reason == 'geo':
self.raise_geo_restricted(msg=message)
elif reason == 'paywall':
+ if try_get(data, lambda x: x['paywallable']['tvod']):
+ self._raise_error('This video is for rent only or TVOD (Transactional Video On demand)')
self.raise_login_required(message)
- raise ExtractorError('%s said: %s' % (
- self.IE_NAME, message), expected=True)
+ self._raise_error(message)
def _real_initialize(self):
self._login()
@@ -102,35 +107,38 @@ class VikiBaseIE(InfoExtractor):
if username is None:
return
- login_form = {
- 'login_id': username,
- 'password': password,
- }
-
- login = self._call_api(
- 'sessions.json', None,
- 'Logging in', post_data=login_form)
-
- self._token = login.get('token')
+ self._token = self._call_api(
+ 'sessions.json', None, 'Logging in', fatal=False,
+ data={'username': username, 'password': password}).get('token')
if not self._token:
- self.report_warning('Unable to get session token, login has probably failed')
+ self.report_warning('Login Failed: Unable to get session token')
@staticmethod
- def dict_selection(dict_obj, preferred_key, allow_fallback=True):
+ def dict_selection(dict_obj, preferred_key):
if preferred_key in dict_obj:
- return dict_obj.get(preferred_key)
-
- if not allow_fallback:
- return
-
- filtered_dict = list(filter(None, [dict_obj.get(k) for k in dict_obj.keys()]))
- return filtered_dict[0] if filtered_dict else None
+ return dict_obj[preferred_key]
+ return (list(filter(None, dict_obj.values())) or [None])[0]
class VikiIE(VikiBaseIE):
IE_NAME = 'viki'
_VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE
_TESTS = [{
+ 'note': 'Free non-DRM video with storyboards in MPD',
+ 'url': 'https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1',
+ 'info_dict': {
+ 'id': '1175236v',
+ 'ext': 'mp4',
+ 'title': 'Choosing Spouse by Lottery - Episode 1',
+ 'timestamp': 1606463239,
+ 'age_limit': 13,
+ 'uploader': 'FCC',
+ 'upload_date': '20201127',
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ }, {
'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
'info_dict': {
'id': '1023585v',
@@ -147,7 +155,6 @@ class VikiIE(VikiBaseIE):
'format': 'bestvideo',
},
'skip': 'Blocked in the US',
- 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, {
# clip
'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
@@ -199,7 +206,6 @@ class VikiIE(VikiBaseIE):
'params': {
'format': 'bestvideo',
},
- 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, {
# youtube external
'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
@@ -238,23 +244,14 @@ class VikiIE(VikiBaseIE):
'params': {
'format': 'bestvideo',
},
- 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}]
def _real_extract(self, url):
video_id = self._match_id(url)
-
- resp = self._download_json(
- 'https://www.viki.com/api/videos/' + video_id,
- video_id, 'Downloading video JSON', headers={
- 'x-client-user-agent': std_headers['User-Agent'],
- 'x-viki-app-ver': '3.0.0',
- })
- video = resp['video']
-
+ video = self._call_api(f'videos/{video_id}.json', video_id, 'Downloading video JSON', query={})
self._check_errors(video)
- title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False)
+ title = try_get(video, lambda x: x['titles']['en'], str)
episode_number = int_or_none(video.get('number'))
if not title:
title = 'Episode %d' % episode_number if video.get('type') == 'episode' else video.get('id') or video_id
@@ -262,113 +259,46 @@ class VikiIE(VikiBaseIE):
container_title = self.dict_selection(container_titles, 'en')
title = '%s - %s' % (container_title, title)
- description = self.dict_selection(video.get('descriptions', {}), 'en')
-
- like_count = int_or_none(try_get(video, lambda x: x['likes']['count']))
-
- thumbnails = []
- for thumbnail_id, thumbnail in (video.get('images') or {}).items():
- thumbnails.append({
- 'id': thumbnail_id,
- 'url': thumbnail.get('url'),
- })
-
- subtitles = {}
- for subtitle_lang, _ in (video.get('subtitle_completions') or {}).items():
- subtitles[subtitle_lang] = [{
- 'ext': subtitles_format,
- 'url': self._prepare_call(
- 'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)),
- } for subtitles_format in ('srt', 'vtt')]
+ thumbnails = [{
+ 'id': thumbnail_id,
+ 'url': thumbnail['url'],
+ } for thumbnail_id, thumbnail in (video.get('images') or {}).items() if thumbnail.get('url')]
+
+ resp = self._call_api(
+ 'playback_streams/%s.json?drms=dt1,dt2&device_id=%s' % (video_id, self._DEVICE_ID),
+ video_id, 'Downloading video streams JSON')['main'][0]
+
+ stream_id = try_get(resp, lambda x: x['properties']['track']['stream_id'])
+ subtitles = dict((lang, [{
+ 'ext': ext,
+ 'url': self._API_URL_TEMPLATE % self._api_query(
+ f'videos/{video_id}/auth_subtitles/{lang}.{ext}', stream_id=stream_id)
+ } for ext in ('srt', 'vtt')]) for lang in (video.get('subtitle_completions') or {}).keys())
+
+ mpd_url = resp['url']
+ # 1080p is hidden in another mpd which can be found in the current manifest content
+ mpd_content = self._download_webpage(mpd_url, video_id, note='Downloading initial MPD manifest')
+ mpd_url = self._search_regex(
+ r'(?mi)<BaseURL>(http.+.mpd)', mpd_content, 'new manifest', default=mpd_url)
+ formats = self._extract_mpd_formats(mpd_url, video_id)
+ self._sort_formats(formats)
- result = {
+ return {
'id': video_id,
+ 'formats': formats,
'title': title,
- 'description': description,
+ 'description': self.dict_selection(video.get('descriptions', {}), 'en'),
'duration': int_or_none(video.get('duration')),
'timestamp': parse_iso8601(video.get('created_at')),
'uploader': video.get('author'),
'uploader_url': video.get('author_url'),
- 'like_count': like_count,
+ 'like_count': int_or_none(try_get(video, lambda x: x['likes']['count'])),
'age_limit': parse_age_limit(video.get('rating')),
'thumbnails': thumbnails,
'subtitles': subtitles,
'episode_number': episode_number,
}
- formats = []
-
- def add_format(format_id, format_dict, protocol='http'):
- # rtmps URLs does not seem to work
- if protocol == 'rtmps':
- return
- format_url = format_dict.get('url')
- if not format_url:
- return
- qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query)
- stream = qs.get('stream', [None])[0]
- if stream:
- format_url = base64.b64decode(stream).decode()
- if format_id in ('m3u8', 'hls'):
- m3u8_formats = self._extract_m3u8_formats(
- format_url, video_id, 'mp4',
- entry_protocol='m3u8_native',
- m3u8_id='m3u8-%s' % protocol, fatal=False)
- # Despite CODECS metadata in m3u8 all video-only formats
- # are actually video+audio
- for f in m3u8_formats:
- if '_drm/index_' in f['url']:
- continue
- if f.get('acodec') == 'none' and f.get('vcodec') != 'none':
- f['acodec'] = None
- formats.append(f)
- elif format_id in ('mpd', 'dash'):
- formats.extend(self._extract_mpd_formats(
- format_url, video_id, 'mpd-%s' % protocol, fatal=False))
- elif format_url.startswith('rtmp'):
- mobj = re.search(
- r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$',
- format_url)
- if not mobj:
- return
- formats.append({
- 'format_id': 'rtmp-%s' % format_id,
- 'ext': 'flv',
- 'url': mobj.group('url'),
- 'play_path': mobj.group('playpath'),
- 'app': mobj.group('app'),
- 'page_url': url,
- })
- else:
- formats.append({
- 'url': format_url,
- 'format_id': '%s-%s' % (format_id, protocol),
- 'height': int_or_none(self._search_regex(
- r'^(\d+)[pP]$', format_id, 'height', default=None)),
- })
-
- for format_id, format_dict in (resp.get('streams') or {}).items():
- add_format(format_id, format_dict)
- if not formats:
- streams = self._call_api(
- 'videos/%s/streams.json' % video_id, video_id,
- 'Downloading video streams JSON')
-
- if 'external' in streams:
- result.update({
- '_type': 'url_transparent',
- 'url': streams['external']['url'],
- })
- return result
-
- for format_id, stream_dict in streams.items():
- for protocol, format_dict in stream_dict.items():
- add_format(format_id, format_dict, protocol)
- self._sort_formats(formats)
-
- result['formats'] = formats
- return result
-
class VikiChannelIE(VikiBaseIE):
IE_NAME = 'viki:channel'
@@ -380,7 +310,7 @@ class VikiChannelIE(VikiBaseIE):
'title': 'Boys Over Flowers',
'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59',
},
- 'playlist_mincount': 71,
+ 'playlist_mincount': 51,
}, {
'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete',
'info_dict': {
@@ -401,33 +331,35 @@ class VikiChannelIE(VikiBaseIE):
'only_matching': True,
}]
- _PER_PAGE = 25
+ _video_types = ('episodes', 'movies', 'clips', 'trailers')
+
+ def _entries(self, channel_id):
+ params = {
+ 'app': self._APP, 'token': self._token, 'only_ids': 'true',
+ 'direction': 'asc', 'sort': 'number', 'per_page': 30
+ }
+ video_types = self._configuration_arg('video_types') or self._video_types
+ for video_type in video_types:
+ if video_type not in self._video_types:
+ self.report_warning(f'Unknown video_type: {video_type}')
+ page_num = 0
+ while True:
+ page_num += 1
+ params['page'] = page_num
+ res = self._call_api(
+ f'containers/{channel_id}/{video_type}.json', channel_id, query=params, fatal=False,
+ note='Downloading %s JSON page %d' % (video_type.title(), page_num))
+
+ for video_id in res.get('response') or []:
+ yield self.url_result(f'https://www.viki.com/videos/{video_id}', VikiIE.ie_key(), video_id)
+ if not res.get('more'):
+ break
def _real_extract(self, url):
channel_id = self._match_id(url)
-
- channel = self._call_api(
- 'containers/%s.json' % channel_id, channel_id,
- 'Downloading channel JSON')
-
+ channel = self._call_api('containers/%s.json' % channel_id, channel_id, 'Downloading channel JSON')
self._check_errors(channel)
-
- title = self.dict_selection(channel['titles'], 'en')
-
- description = self.dict_selection(channel['descriptions'], 'en')
-
- entries = []
- for video_type in ('episodes', 'clips', 'movies'):
- for page_num in itertools.count(1):
- page = self._call_api(
- 'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d'
- % (channel_id, video_type, self._PER_PAGE, page_num), channel_id,
- 'Downloading %s JSON page #%d' % (video_type, page_num))
- for video in page['response']:
- video_id = video['id']
- entries.append(self.url_result(
- 'https://www.viki.com/videos/%s' % video_id, 'Viki'))
- if not page['pagination']['next']:
- break
-
- return self.playlist_result(entries, channel_id, title, description)
+ return self.playlist_result(
+ self._entries(channel_id), channel_id,
+ self.dict_selection(channel['titles'], 'en'),
+ self.dict_selection(channel['descriptions'], 'en'))
diff --git a/hypervideo_dl/extractor/vimeo.py b/hypervideo_dl/extractor/vimeo.py
index 6323219..9fb5475 100644
--- a/hypervideo_dl/extractor/vimeo.py
+++ b/hypervideo_dl/extractor/vimeo.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
import base64
import functools
+import json
import re
import itertools
@@ -16,14 +17,16 @@ from ..compat import (
from ..utils import (
clean_html,
determine_ext,
+ dict_get,
ExtractorError,
- get_element_by_class,
js_to_json,
int_or_none,
merge_dicts,
OnDemandPagedList,
parse_filesize,
parse_iso8601,
+ parse_qs,
+ RegexNotFoundError,
sanitized_Request,
smuggle_url,
std_headers,
@@ -74,7 +77,7 @@ class VimeoBaseInfoExtractor(InfoExtractor):
raise ExtractorError('Unable to log in')
def _get_video_password(self):
- password = self._downloader.params.get('videopassword')
+ password = self.get_param('videopassword')
if password is None:
raise ExtractorError(
'This video is protected by a password, use the --video-password option',
@@ -118,18 +121,18 @@ class VimeoBaseInfoExtractor(InfoExtractor):
def _vimeo_sort_formats(self, formats):
# Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps
# at the same time without actual units specified. This lead to wrong sorting.
- self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'tbr', 'format_id'))
+ # But since hypervideo prefers 'res,fps' anyway, 'field_preference' is not needed
+ self._sort_formats(formats)
def _parse_config(self, config, video_id):
video_data = config['video']
video_title = video_data['title']
live_event = video_data.get('live_event') or {}
is_live = live_event.get('status') == 'started'
- request = config.get('request') or {}
formats = []
- config_files = video_data.get('files') or request.get('files') or {}
- for f in (config_files.get('progressive') or []):
+ config_files = video_data.get('files') or config['request'].get('files', {})
+ for f in config_files.get('progressive', []):
video_url = f.get('url')
if not video_url:
continue
@@ -145,7 +148,7 @@ class VimeoBaseInfoExtractor(InfoExtractor):
# TODO: fix handling of 308 status code returned for live archive manifest requests
sep_pattern = r'/sep/video/'
for files_type in ('hls', 'dash'):
- for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items():
+ for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items():
manifest_url = cdn_data.get('url')
if not manifest_url:
continue
@@ -181,25 +184,21 @@ class VimeoBaseInfoExtractor(InfoExtractor):
formats.append({
'format_id': 'live-archive-source',
'url': live_archive_source_url,
- 'preference': 1,
+ 'quality': 10,
})
- for f in formats:
- if f.get('vcodec') == 'none':
- f['preference'] = -50
- elif f.get('acodec') == 'none':
- f['preference'] = -40
-
subtitles = {}
- for tt in (request.get('text_tracks') or []):
- subtitles[tt['lang']] = [{
- 'ext': 'vtt',
- 'url': urljoin('https://vimeo.com', tt['url']),
- }]
+ text_tracks = config['request'].get('text_tracks')
+ if text_tracks:
+ for tt in text_tracks:
+ subtitles[tt['lang']] = [{
+ 'ext': 'vtt',
+ 'url': urljoin('https://vimeo.com', tt['url']),
+ }]
thumbnails = []
if not is_live:
- for key, thumb in (video_data.get('thumbs') or {}).items():
+ for key, thumb in video_data.get('thumbs', {}).items():
thumbnails.append({
'id': key,
'width': int_or_none(key),
@@ -252,9 +251,33 @@ class VimeoBaseInfoExtractor(InfoExtractor):
'height': int_or_none(source_file.get('height')),
'filesize': parse_filesize(source_file.get('size')),
'format_id': source_name,
- 'preference': 1,
+ 'quality': 1,
}
+ jwt_response = self._download_json(
+ 'https://vimeo.com/_rv/viewer', video_id, note='Downloading jwt token', fatal=False) or {}
+ if not jwt_response.get('jwt'):
+ return
+ headers = {'Authorization': 'jwt %s' % jwt_response['jwt']}
+ original_response = self._download_json(
+ f'https://api.vimeo.com/videos/{video_id}', video_id,
+ headers=headers, fatal=False) or {}
+ for download_data in original_response.get('download') or {}:
+ download_url = download_data.get('link')
+ if not download_url or download_data.get('quality') != 'source':
+ continue
+ query = parse_qs(download_url)
+ return {
+ 'url': download_url,
+ 'ext': determine_ext(query.get('filename', [''])[0].lower()),
+ 'format_id': download_data.get('public_name', 'Original'),
+ 'width': int_or_none(download_data.get('width')),
+ 'height': int_or_none(download_data.get('height')),
+ 'fps': int_or_none(download_data.get('fps')),
+ 'filesize': int_or_none(download_data.get('size')),
+ 'quality': 1,
+ }
+
class VimeoIE(VimeoBaseInfoExtractor):
"""Information extractor for vimeo.com."""
@@ -290,7 +313,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'info_dict': {
'id': '56015672',
'ext': 'mp4',
- 'title': "hypervideo test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
+ 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
'description': 'md5:2d3305bad981a06ff79f027f19865021',
'timestamp': 1355990239,
'upload_date': '20121220',
@@ -319,7 +342,6 @@ class VimeoIE(VimeoBaseInfoExtractor):
'duration': 1595,
'upload_date': '20130610',
'timestamp': 1370893156,
- 'license': 'by',
},
'params': {
'format': 'best[protocol=https]',
@@ -351,7 +373,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'info_dict': {
'id': '68375962',
'ext': 'mp4',
- 'title': 'hypervideo password protected test video',
+ 'title': 'youtube-dl password protected test video',
'timestamp': 1371200155,
'upload_date': '20130614',
'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128',
@@ -362,7 +384,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
},
'params': {
'format': 'best[protocol=https]',
- 'videopassword': 'hypervideo',
+ 'videopassword': 'youtube-dl',
},
},
{
@@ -398,12 +420,6 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader_id': 'staff',
'uploader': 'Vimeo Staff',
'duration': 62,
- 'subtitles': {
- 'de': [{'ext': 'vtt'}],
- 'en': [{'ext': 'vtt'}],
- 'es': [{'ext': 'vtt'}],
- 'fr': [{'ext': 'vtt'}],
- },
}
},
{
@@ -436,6 +452,22 @@ class VimeoIE(VimeoBaseInfoExtractor):
},
},
{
+ 'note': 'Contains original format not accessible in webpage',
+ 'url': 'https://vimeo.com/393756517',
+ 'md5': 'c464af248b592190a5ffbb5d33f382b0',
+ 'info_dict': {
+ 'id': '393756517',
+ 'ext': 'mov',
+ 'timestamp': 1582642091,
+ 'uploader_id': 'frameworkla',
+ 'title': 'Straight To Hell - Sabrina: Netflix',
+ 'uploader': 'Framework Studio',
+ 'description': 'md5:f2edc61af3ea7a5592681ddbb683db73',
+ 'upload_date': '20200225',
+ },
+ 'expected_warnings': ['Unable to download JSON metadata'],
+ },
+ {
# only available via https://vimeo.com/channels/tributes/6213729 and
# not via https://vimeo.com/6213729
'url': 'https://vimeo.com/channels/tributes/6213729',
@@ -484,7 +516,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'info_dict': {
'id': '68375962',
'ext': 'mp4',
- 'title': 'hypervideo password protected test video',
+ 'title': 'youtube-dl password protected test video',
'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128',
'uploader_id': 'user18948128',
'uploader': 'Jaime Marquínez Ferrándiz',
@@ -492,7 +524,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
},
'params': {
'format': 'best[protocol=https]',
- 'videopassword': 'hypervideo',
+ 'videopassword': 'youtube-dl',
},
},
{
@@ -513,6 +545,24 @@ class VimeoIE(VimeoBaseInfoExtractor):
'only_matching': True,
},
{
+ 'url': 'https://vimeo.com/showcase/3253534/video/119195465',
+ 'note': 'A video in a password protected album (showcase)',
+ 'info_dict': {
+ 'id': '119195465',
+ 'ext': 'mp4',
+ 'title': 'youtube-dl test video \'ä"BaW_jenozKc',
+ 'uploader': 'Philipp Hagemeister',
+ 'uploader_id': 'user20132939',
+ 'description': 'md5:fa7b6c6d8db0bdc353893df2f111855b',
+ 'upload_date': '20150209',
+ 'timestamp': 1423518307,
+ },
+ 'params': {
+ 'format': 'best[protocol=https]',
+ 'videopassword': 'youtube-dl',
+ },
+ },
+ {
# source file returns 403: Forbidden
'url': 'https://vimeo.com/7809605',
'only_matching': True,
@@ -576,36 +626,43 @@ class VimeoIE(VimeoBaseInfoExtractor):
def _real_initialize(self):
self._login()
- def _extract_from_api(self, video_id, unlisted_hash=None):
- token = self._download_json(
- 'https://vimeo.com/_rv/jwt', video_id, headers={
- 'X-Requested-With': 'XMLHttpRequest'
- })['token']
- api_url = 'https://api.vimeo.com/videos/' + video_id
- if unlisted_hash:
- api_url += ':' + unlisted_hash
- video = self._download_json(
- api_url, video_id, headers={
- 'Authorization': 'jwt ' + token,
- }, query={
- 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays',
- })
- info = self._parse_config(self._download_json(
- video['config_url'], video_id), video_id)
- self._vimeo_sort_formats(info['formats'])
- get_timestamp = lambda x: parse_iso8601(video.get(x + '_time'))
- info.update({
- 'description': video.get('description'),
- 'license': video.get('license'),
- 'release_timestamp': get_timestamp('release'),
- 'timestamp': get_timestamp('created'),
- 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])),
- })
- connections = try_get(
- video, lambda x: x['metadata']['connections'], dict) or {}
- for k in ('comment', 'like'):
- info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total']))
- return info
+ def _try_album_password(self, url):
+ album_id = self._search_regex(
+ r'vimeo\.com/(?:album|showcase)/([^/]+)', url, 'album id', default=None)
+ if not album_id:
+ return
+ viewer = self._download_json(
+ 'https://vimeo.com/_rv/viewer', album_id, fatal=False)
+ if not viewer:
+ webpage = self._download_webpage(url, album_id)
+ viewer = self._parse_json(self._search_regex(
+ r'bootstrap_data\s*=\s*({.+?})</script>',
+ webpage, 'bootstrap data'), album_id)['viewer']
+ jwt = viewer['jwt']
+ album = self._download_json(
+ 'https://api.vimeo.com/albums/' + album_id,
+ album_id, headers={'Authorization': 'jwt ' + jwt},
+ query={'fields': 'description,name,privacy'})
+ if try_get(album, lambda x: x['privacy']['view']) == 'password':
+ password = self.get_param('videopassword')
+ if not password:
+ raise ExtractorError(
+ 'This album is protected by a password, use the --video-password option',
+ expected=True)
+ self._set_vimeo_cookie('vuid', viewer['vuid'])
+ try:
+ self._download_json(
+ 'https://vimeo.com/showcase/%s/auth' % album_id,
+ album_id, 'Verifying the password', data=urlencode_postdata({
+ 'password': password,
+ 'token': viewer['xsrft'],
+ }), headers={
+ 'X-Requested-With': 'XMLHttpRequest',
+ })
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ raise ExtractorError('Wrong password', expected=True)
+ raise
def _real_extract(self, url):
url, data = unsmuggle_url(url, {})
@@ -615,22 +672,52 @@ class VimeoIE(VimeoBaseInfoExtractor):
if 'Referer' not in headers:
headers['Referer'] = url
- mobj = re.match(self._VALID_URL, url).groupdict()
- video_id, unlisted_hash = mobj['id'], mobj.get('unlisted_hash')
+ # Extract ID from URL
+ video_id, unlisted_hash = self._match_valid_url(url).groups()
if unlisted_hash:
- return self._extract_from_api(video_id, unlisted_hash)
+ token = self._download_json(
+ 'https://vimeo.com/_rv/jwt', video_id, headers={
+ 'X-Requested-With': 'XMLHttpRequest'
+ })['token']
+ video = self._download_json(
+ 'https://api.vimeo.com/videos/%s:%s' % (video_id, unlisted_hash),
+ video_id, headers={
+ 'Authorization': 'jwt ' + token,
+ }, query={
+ 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays',
+ })
+ info = self._parse_config(self._download_json(
+ video['config_url'], video_id), video_id)
+ self._vimeo_sort_formats(info['formats'])
+ get_timestamp = lambda x: parse_iso8601(video.get(x + '_time'))
+ info.update({
+ 'description': video.get('description'),
+ 'license': video.get('license'),
+ 'release_timestamp': get_timestamp('release'),
+ 'timestamp': get_timestamp('created'),
+ 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])),
+ })
+ connections = try_get(
+ video, lambda x: x['metadata']['connections'], dict) or {}
+ for k in ('comment', 'like'):
+ info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total']))
+ return info
orig_url = url
is_pro = 'vimeopro.com/' in url
+ is_player = '://player.vimeo.com/video/' in url
if is_pro:
# some videos require portfolio_id to be present in player url
# https://github.com/ytdl-org/youtube-dl/issues/20070
url = self._extract_url(url, self._download_webpage(url, video_id))
if not url:
url = 'https://vimeo.com/' + video_id
+ elif is_player:
+ url = 'https://player.vimeo.com/video/' + video_id
elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')):
url = 'https://vimeo.com/' + video_id
+ self._try_album_password(url)
try:
# Retrieve video webpage to extract further information
webpage, urlh = self._download_webpage_handle(
@@ -647,25 +734,14 @@ class VimeoIE(VimeoBaseInfoExtractor):
expected=True)
raise
- if '//player.vimeo.com/video/' in url:
- config = self._parse_json(self._search_regex(
- r'\bconfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id)
- if config.get('view') == 4:
- config = self._verify_player_video_password(
- redirect_url, video_id, headers)
- info = self._parse_config(config, video_id)
- self._vimeo_sort_formats(info['formats'])
- return info
-
- if re.search(r'<form[^>]+?id="pw_form"', webpage):
- video_password = self._get_video_password()
- token, vuid = self._extract_xsrft_and_vuid(webpage)
- webpage = self._verify_video_password(
- redirect_url, video_id, video_password, token, vuid)
+ # Now we begin extracting as much information as we can from what we
+ # retrieved. First we extract the information common to all extractors,
+ # and latter we extract those that are Vimeo specific.
+ self.report_extraction(video_id)
vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None)
if vimeo_config:
- seed_status = vimeo_config.get('seed_status') or {}
+ seed_status = vimeo_config.get('seed_status', {})
if seed_status.get('state') == 'failed':
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, seed_status['title']),
@@ -674,40 +750,70 @@ class VimeoIE(VimeoBaseInfoExtractor):
cc_license = None
timestamp = None
video_description = None
- info_dict = {}
- channel_id = self._search_regex(
- r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None)
- if channel_id:
- config_url = self._html_search_regex(
- r'\bdata-config-url="([^"]+)"', webpage, 'config URL')
- video_description = clean_html(get_element_by_class('description', webpage))
- info_dict.update({
- 'channel_id': channel_id,
- 'channel_url': 'https://vimeo.com/channels/' + channel_id,
- })
+ # Extract the config JSON
+ try:
+ try:
+ config_url = self._html_search_regex(
+ r' data-config-url="(.+?)"', webpage,
+ 'config URL', default=None)
+ if not config_url:
+ # Sometimes new react-based page is served instead of old one that require
+ # different config URL extraction approach (see
+ # https://github.com/ytdl-org/youtube-dl/pull/7209)
+ page_config = self._parse_json(self._search_regex(
+ r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});',
+ webpage, 'page config'), video_id)
+ config_url = page_config['player']['config_url']
+ cc_license = page_config.get('cc_license')
+ timestamp = try_get(
+ page_config, lambda x: x['clip']['uploaded_on'],
+ compat_str)
+ video_description = clean_html(dict_get(
+ page_config, ('description', 'description_html_escaped')))
+ config = self._download_json(config_url, video_id)
+ except RegexNotFoundError:
+ # For pro videos or player.vimeo.com urls
+ # We try to find out to which variable is assigned the config dic
+ m_variable_name = re.search(r'(\w)\.video\.id', webpage)
+ if m_variable_name is not None:
+ config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))]
+ else:
+ config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']
+ config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;')
+ config_re.append(r'\bconfig\s*=\s*({.+?})\s*;')
+ config = self._search_regex(config_re, webpage, 'info section',
+ flags=re.DOTALL)
+ config = json.loads(config)
+ except Exception as e:
+ if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
+ raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option')
+
+ if re.search(r'<form[^>]+?id="pw_form"', webpage) is not None:
+ if '_video_password_verified' in data:
+ raise ExtractorError('video password verification failed!')
+ video_password = self._get_video_password()
+ token, vuid = self._extract_xsrft_and_vuid(webpage)
+ self._verify_video_password(
+ redirect_url, video_id, video_password, token, vuid)
+ return self._real_extract(
+ smuggle_url(redirect_url, {'_video_password_verified': 'verified'}))
+ else:
+ raise ExtractorError('Unable to extract info section',
+ cause=e)
else:
- page_config = self._parse_json(self._search_regex(
- r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});',
- webpage, 'page config', default='{}'), video_id, fatal=False)
- if not page_config:
- return self._extract_from_api(video_id)
- config_url = page_config['player']['config_url']
- cc_license = page_config.get('cc_license')
- clip = page_config.get('clip') or {}
- timestamp = clip.get('uploaded_on')
- video_description = clean_html(
- clip.get('description') or page_config.get('description_html_escaped'))
- config = self._download_json(config_url, video_id)
+ if config.get('view') == 4:
+ config = self._verify_player_video_password(redirect_url, video_id, headers)
+
video = config.get('video') or {}
vod = video.get('vod') or {}
def is_rented():
if '>You rented this title.<' in webpage:
return True
- if try_get(config, lambda x: x['user']['purchased']):
+ if config.get('user', {}).get('purchased'):
return True
- for purchase_option in (vod.get('purchase_options') or []):
+ for purchase_option in vod.get('purchase_options', []):
if purchase_option.get('purchased'):
return True
label = purchase_option.get('label_string')
@@ -722,10 +828,14 @@ class VimeoIE(VimeoBaseInfoExtractor):
'https://player.vimeo.com/player/%s' % feature_id,
{'force_feature_id': True}), 'Vimeo')
+ # Extract video description
+ if not video_description:
+ video_description = self._html_search_regex(
+ r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>',
+ webpage, 'description', default=None)
if not video_description:
video_description = self._html_search_meta(
- ['description', 'og:description', 'twitter:description'],
- webpage, default=None)
+ 'description', webpage, default=None)
if not video_description and is_pro:
orig_webpage = self._download_webpage(
orig_url, video_id,
@@ -734,14 +844,25 @@ class VimeoIE(VimeoBaseInfoExtractor):
if orig_webpage:
video_description = self._html_search_meta(
'description', orig_webpage, default=None)
- if not video_description:
- self._downloader.report_warning('Cannot find video description')
+ if not video_description and not is_player:
+ self.report_warning('Cannot find video description')
+ # Extract upload date
if not timestamp:
timestamp = self._search_regex(
r'<time[^>]+datetime="([^"]+)"', webpage,
'timestamp', default=None)
+ try:
+ view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count'))
+ like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count'))
+ comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count'))
+ except RegexNotFoundError:
+ # This info is only available in vimeo.com/{id} urls
+ view_count = None
+ like_count = None
+ comment_count = None
+
formats = []
source_format = self._extract_original_format(
@@ -760,20 +881,31 @@ class VimeoIE(VimeoBaseInfoExtractor):
r'<link[^>]+rel=["\']license["\'][^>]+href=(["\'])(?P<license>(?:(?!\1).)+)\1',
webpage, 'license', default=None, group='license')
- info_dict.update({
+ channel_id = self._search_regex(
+ r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None)
+ channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None
+
+ info_dict = {
'formats': formats,
'timestamp': unified_timestamp(timestamp),
'description': video_description,
'webpage_url': url,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'comment_count': comment_count,
'license': cc_license,
- })
+ 'channel_id': channel_id,
+ 'channel_url': channel_url,
+ }
- return merge_dicts(info_dict, info_dict_config, json_ld)
+ info_dict = merge_dicts(info_dict, info_dict_config, json_ld)
+
+ return info_dict
class VimeoOndemandIE(VimeoIE):
IE_NAME = 'vimeo:ondemand'
- _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?:[^/]+/)?(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/([^/]+/)?(?P<id>[^/?#&]+)'
_TESTS = [{
# ondemand video not available via https://vimeo.com/id
'url': 'https://vimeo.com/ondemand/20704',
@@ -920,7 +1052,7 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor):
},
'playlist_count': 1,
'params': {
- 'videopassword': 'hypervideo',
+ 'videopassword': 'youtube-dl',
}
}]
_PAGE_SIZE = 100
@@ -967,7 +1099,7 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor):
query={'fields': 'description,name,privacy'})
hashed_pass = None
if try_get(album, lambda x: x['privacy']['view']) == 'password':
- password = self._downloader.params.get('videopassword')
+ password = self.get_param('videopassword')
if not password:
raise ExtractorError(
'This album is protected by a password, use the --video-password option',
@@ -1056,7 +1188,7 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):
self._login()
def _real_extract(self, url):
- page_url, video_id = re.match(self._VALID_URL, url).groups()
+ page_url, video_id = self._match_valid_url(url).groups()
data = self._download_json(
page_url.replace('/review/', '/review/data/'), video_id)
if data.get('isLocked') is True:
diff --git a/hypervideo_dl/extractor/vine.py b/hypervideo_dl/extractor/vine.py
index 80b896b..07fce0d 100644
--- a/hypervideo_dl/extractor/vine.py
+++ b/hypervideo_dl/extractor/vine.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import compat_str
@@ -88,6 +87,7 @@ class VineIE(InfoExtractor):
'format_id': format_id or 'standard',
'quality': quality,
})
+ self._check_formats(formats, video_id)
self._sort_formats(formats)
username = data.get('username')
@@ -132,7 +132,7 @@ class VineUserIE(InfoExtractor):
return False if VineIE.suitable(url) else super(VineUserIE, cls).suitable(url)
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
user = mobj.group('user')
u = mobj.group('u')
diff --git a/hypervideo_dl/extractor/viu.py b/hypervideo_dl/extractor/viu.py
index 3bd3752..1b34c52 100644
--- a/hypervideo_dl/extractor/viu.py
+++ b/hypervideo_dl/extractor/viu.py
@@ -1,16 +1,22 @@
# coding: utf-8
from __future__ import unicode_literals
+import json
import re
from .common import InfoExtractor
from ..compat import (
compat_kwargs,
compat_str,
+ compat_urlparse,
+ compat_urllib_request,
)
from ..utils import (
ExtractorError,
int_or_none,
+ try_get,
+ smuggle_url,
+ unsmuggle_url,
)
@@ -168,7 +174,8 @@ class ViuPlaylistIE(ViuBaseIE):
class ViuOTTIE(InfoExtractor):
IE_NAME = 'viu:ott'
- _VALID_URL = r'https?://(?:www\.)?viu\.com/ott/(?P<country_code>[a-z]{2})/[a-z]{2}-[a-z]{2}/vod/(?P<id>\d+)'
+ _NETRC_MACHINE = 'viu'
+ _VALID_URL = r'https?://(?:www\.)?viu\.com/ott/(?P<country_code>[a-z]{2})/(?P<lang_code>[a-z]{2}-[a-z]{2})/vod/(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.viu.com/ott/sg/en-us/vod/3421/The%20Prime%20Minister%20and%20I',
'info_dict': {
@@ -179,6 +186,7 @@ class ViuOTTIE(InfoExtractor):
},
'params': {
'skip_download': 'm3u8 download',
+ 'noplaylist': True,
},
'skip': 'Geo-restricted to Singapore',
}, {
@@ -191,6 +199,19 @@ class ViuOTTIE(InfoExtractor):
},
'params': {
'skip_download': 'm3u8 download',
+ 'noplaylist': True,
+ },
+ 'skip': 'Geo-restricted to Hong Kong',
+ }, {
+ 'url': 'https://www.viu.com/ott/hk/zh-hk/vod/68776/%E6%99%82%E5%B0%9A%E5%AA%BD%E5%92%AA',
+ 'playlist_count': 12,
+ 'info_dict': {
+ 'id': '3916',
+ 'title': '時尚媽咪',
+ },
+ 'params': {
+ 'skip_download': 'm3u8 download',
+ 'noplaylist': False,
},
'skip': 'Geo-restricted to Hong Kong',
}]
@@ -201,9 +222,51 @@ class ViuOTTIE(InfoExtractor):
'TH': 4,
'PH': 5,
}
+ _LANGUAGE_FLAG = {
+ 'zh-hk': 1,
+ 'zh-cn': 2,
+ 'en-us': 3,
+ }
+ _user_info = None
+
+ def _detect_error(self, response):
+ code = response.get('status', {}).get('code')
+ if code > 0:
+ message = try_get(response, lambda x: x['status']['message'])
+ raise ExtractorError('%s said: %s (%s)' % (
+ self.IE_NAME, message, code), expected=True)
+ return response['data']
+
+ def _raise_login_required(self):
+ raise ExtractorError(
+ 'This video requires login. '
+ 'Specify --username and --password or --netrc (machine: %s) '
+ 'to provide account credentials.' % self._NETRC_MACHINE,
+ expected=True)
+
+ def _login(self, country_code, video_id):
+ if not self._user_info:
+ username, password = self._get_login_info()
+ if username is None or password is None:
+ return
+
+ data = self._download_json(
+ compat_urllib_request.Request(
+ 'https://www.viu.com/ott/%s/index.php' % country_code, method='POST'),
+ video_id, 'Logging in', errnote=False, fatal=False,
+ query={'r': 'user/login'},
+ data=json.dumps({
+ 'username': username,
+ 'password': password,
+ 'platform_flag_label': 'web',
+ }).encode())
+ self._user_info = self._detect_error(data)['user']
+
+ return self._user_info
def _real_extract(self, url):
- country_code, video_id = re.match(self._VALID_URL, url).groups()
+ url, idata = unsmuggle_url(url, {})
+ country_code, lang_code, video_id = self._match_valid_url(url).groups()
query = {
'r': 'vod/ajax-detail',
@@ -223,20 +286,88 @@ class ViuOTTIE(InfoExtractor):
if not video_data:
raise ExtractorError('This video is not available in your region.', expected=True)
- stream_data = self._download_json(
- 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
- video_id, 'Downloading stream info', query={
- 'ccs_product_id': video_data['ccs_product_id'],
- }, headers={
- 'Referer': url,
- 'Origin': re.search(r'https?://[^/]+', url).group(0),
- })['data']['stream']
+ series_id = video_data.get('series_id')
+ if not self.get_param('noplaylist') and not idata.get('force_noplaylist'):
+ self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % series_id)
+ series = product_data.get('series', {})
+ product = series.get('product')
+ if product:
+ entries = []
+ for entry in sorted(product, key=lambda x: int_or_none(x.get('number', 0))):
+ item_id = entry.get('product_id')
+ if not item_id:
+ continue
+ item_id = compat_str(item_id)
+ entries.append(self.url_result(
+ smuggle_url(
+ 'http://www.viu.com/ott/%s/%s/vod/%s/' % (country_code, lang_code, item_id),
+ {'force_noplaylist': True}), # prevent infinite recursion
+ 'ViuOTT',
+ item_id,
+ entry.get('synopsis', '').strip()))
+
+ return self.playlist_result(entries, series_id, series.get('name'), series.get('description'))
+
+ if self.get_param('noplaylist'):
+ self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
+ duration_limit = False
+ query = {
+ 'ccs_product_id': video_data['ccs_product_id'],
+ 'language_flag_id': self._LANGUAGE_FLAG.get(lang_code.lower()) or '3',
+ }
+ headers = {
+ 'Referer': url,
+ 'Origin': url,
+ }
+ try:
+ stream_data = self._download_json(
+ 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
+ video_id, 'Downloading stream info', query=query, headers=headers)
+ stream_data = self._detect_error(stream_data)['stream']
+ except (ExtractorError, KeyError):
+ stream_data = None
+ if video_data.get('user_level', 0) > 0:
+ user = self._login(country_code, video_id)
+ if user:
+ query['identity'] = user['identity']
+ stream_data = self._download_json(
+ 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
+ video_id, 'Downloading stream info', query=query, headers=headers)
+ stream_data = self._detect_error(stream_data).get('stream')
+ else:
+ # preview is limited to 3min for non-members
+ # try to bypass the duration limit
+ duration_limit = True
+ query['duration'] = '180'
+ stream_data = self._download_json(
+ 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
+ video_id, 'Downloading stream info', query=query, headers=headers)
+ try:
+ stream_data = self._detect_error(stream_data)['stream']
+ except (ExtractorError, KeyError): # if still not working, give up
+ self._raise_login_required()
+
+ if not stream_data:
+ raise ExtractorError('Cannot get stream info', expected=True)
stream_sizes = stream_data.get('size', {})
formats = []
for vid_format, stream_url in stream_data.get('url', {}).items():
height = int_or_none(self._search_regex(
r's(\d+)p', vid_format, 'height', default=None))
+
+ # bypass preview duration limit
+ if duration_limit:
+ stream_url = compat_urlparse.urlparse(stream_url)
+ query = dict(compat_urlparse.parse_qsl(stream_url.query, keep_blank_values=True))
+ time_duration = int_or_none(video_data.get('time_duration'))
+ query.update({
+ 'duration': time_duration if time_duration > 0 else '9999999',
+ 'duration_start': '0',
+ })
+ stream_url = stream_url._replace(query=compat_urlparse.urlencode(query)).geturl()
+
formats.append({
'format_id': vid_format,
'url': stream_url,
diff --git a/hypervideo_dl/extractor/vk.py b/hypervideo_dl/extractor/vk.py
index 6b3513e..d8a9b9a 100644
--- a/hypervideo_dl/extractor/vk.py
+++ b/hypervideo_dl/extractor/vk.py
@@ -308,7 +308,7 @@ class VKIE(VKBaseIE):
webpage)]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('videoid')
mv_data = {}
@@ -538,7 +538,7 @@ class VKUserVideosIE(VKBaseIE):
'http://vk.com/video' + video_id, VKIE.ie_key(), video_id)
def _real_extract(self, url):
- page_id, section = re.match(self._VALID_URL, url).groups()
+ page_id, section = self._match_valid_url(url).groups()
if not section:
section = 'all'
diff --git a/hypervideo_dl/extractor/vlive.py b/hypervideo_dl/extractor/vlive.py
index 42da34d..84f51a5 100644
--- a/hypervideo_dl/extractor/vlive.py
+++ b/hypervideo_dl/extractor/vlive.py
@@ -72,6 +72,13 @@ class VLiveIE(VLiveBaseIE):
# works only with gcc=KR
'url': 'https://www.vlive.tv/video/225019',
'only_matching': True,
+ }, {
+ 'url': 'https://www.vlive.tv/video/223906',
+ 'info_dict': {
+ 'id': '58',
+ 'title': 'RUN BTS!'
+ },
+ 'playlist_mincount': 120
}]
def _real_initialize(self):
@@ -105,10 +112,12 @@ class VLiveIE(VLiveBaseIE):
if not is_logged_in():
raise ExtractorError('Unable to log in', expected=True)
- def _call_api(self, path_template, video_id, fields=None):
+ def _call_api(self, path_template, video_id, fields=None, limit=None):
query = {'appId': self._APP_ID, 'gcc': 'KR', 'platformType': 'PC'}
if fields:
query['fields'] = fields
+ if limit:
+ query['limit'] = limit
try:
return self._download_json(
'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id,
@@ -124,10 +133,34 @@ class VLiveIE(VLiveBaseIE):
post = self._call_api(
'post/v1.0/officialVideoPost-%s', video_id,
- 'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId}')
-
- video = post['officialVideo']
-
+ 'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId},playlist{playlistSeq,totalCount,name}')
+
+ playlist = post.get('playlist')
+ if not playlist or self.get_param('noplaylist'):
+ if playlist:
+ self.to_screen(
+ 'Downloading just video %s because of --no-playlist'
+ % video_id)
+
+ video = post['officialVideo']
+ return self._get_vlive_info(post, video, video_id)
+ else:
+ playlist_name = playlist.get('name')
+ playlist_id = str_or_none(playlist.get('playlistSeq'))
+ playlist_count = str_or_none(playlist.get('totalCount'))
+
+ playlist = self._call_api(
+ 'playlist/v1.0/playlist-%s/posts', playlist_id, 'data', limit=playlist_count)
+
+ entries = []
+ for video_data in playlist['data']:
+ video = video_data.get('officialVideo')
+ video_id = str_or_none(video.get('videoSeq'))
+ entries.append(self._get_vlive_info(video_data, video, video_id))
+
+ return self.playlist_result(entries, playlist_id, playlist_name)
+
+ def _get_vlive_info(self, post, video, video_id):
def get_common_fields():
channel = post.get('channel') or {}
return {
@@ -145,9 +178,15 @@ class VLiveIE(VLiveBaseIE):
if video_type == 'VOD':
inkey = self._call_api('video/v1.0/vod/%s/inkey', video_id)['inkey']
vod_id = video['vodId']
- return merge_dicts(
+ info_dict = merge_dicts(
get_common_fields(),
self._extract_video_info(video_id, vod_id, inkey))
+ thumbnail = video.get('thumb')
+ if thumbnail:
+ if not info_dict.get('thumbnails') and info_dict.get('thumbnail'):
+ info_dict['thumbnails'] = [{'url': info_dict.pop('thumbnail')}]
+ info_dict.setdefault('thumbnails', []).append({'url': thumbnail, 'preference': 1})
+ return info_dict
elif video_type == 'LIVE':
status = video.get('status')
if status == 'ON_AIR':
@@ -316,13 +355,29 @@ class VLiveChannelIE(VLiveBaseIE):
for video in videos:
video_id = video.get('videoSeq')
- if not video_id:
+ video_type = video.get('videoType')
+
+ if not video_id or not video_type:
continue
video_id = compat_str(video_id)
- entries.append(
- self.url_result(
- 'http://www.vlive.tv/video/%s' % video_id,
- ie=VLiveIE.ie_key(), video_id=video_id))
+
+ if video_type in ('PLAYLIST'):
+ first_video_id = try_get(
+ video,
+ lambda x: x['videoPlaylist']['videoList'][0]['videoSeq'], int)
+
+ if not first_video_id:
+ continue
+
+ entries.append(
+ self.url_result(
+ 'http://www.vlive.tv/video/%s' % first_video_id,
+ ie=VLiveIE.ie_key(), video_id=first_video_id))
+ else:
+ entries.append(
+ self.url_result(
+ 'http://www.vlive.tv/video/%s' % video_id,
+ ie=VLiveIE.ie_key(), video_id=video_id))
return self.playlist_result(
entries, channel_code, channel_name)
diff --git a/hypervideo_dl/extractor/voicy.py b/hypervideo_dl/extractor/voicy.py
new file mode 100644
index 0000000..11ebe76
--- /dev/null
+++ b/hypervideo_dl/extractor/voicy.py
@@ -0,0 +1,147 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ ExtractorError,
+ smuggle_url,
+ traverse_obj,
+ unsmuggle_url,
+ unified_strdate,
+)
+
+import itertools
+
+
+class VoicyBaseIE(InfoExtractor):
+ def _extract_from_playlist_data(self, value):
+ voice_id = compat_str(value.get('PlaylistId'))
+ upload_date = unified_strdate(value.get('Published'), False)
+ items = [self._extract_single_article(voice_data) for voice_data in value['VoiceData']]
+ return {
+ '_type': 'multi_video',
+ 'entries': items,
+ 'id': voice_id,
+ 'title': compat_str(value.get('PlaylistName')),
+ 'uploader': value.get('SpeakerName'),
+ 'uploader_id': compat_str(value.get('SpeakerId')),
+ 'channel': value.get('ChannelName'),
+ 'channel_id': compat_str(value.get('ChannelId')),
+ 'upload_date': upload_date,
+ }
+
+ def _extract_single_article(self, entry):
+ formats = [{
+ 'url': entry['VoiceHlsFile'],
+ 'format_id': 'hls',
+ 'ext': 'm4a',
+ 'acodec': 'aac',
+ 'vcodec': 'none',
+ 'protocol': 'm3u8_native',
+ }, {
+ 'url': entry['VoiceFile'],
+ 'format_id': 'mp3',
+ 'ext': 'mp3',
+ 'acodec': 'mp3',
+ 'vcodec': 'none',
+ }]
+ self._sort_formats(formats)
+ return {
+ 'id': compat_str(entry.get('ArticleId')),
+ 'title': entry.get('ArticleTitle'),
+ 'description': entry.get('MediaName'),
+ 'formats': formats,
+ }
+
+ def _call_api(self, url, video_id, **kwargs):
+ response = self._download_json(url, video_id, **kwargs)
+ if response.get('Status') != 0:
+ message = traverse_obj(response, ('Value', 'Error', 'Message'), expected_type=compat_str)
+ if not message:
+ message = 'There was a error in the response: %d' % response.get('Status')
+ raise ExtractorError(message, expected=False)
+ return response.get('Value')
+
+
+class VoicyIE(VoicyBaseIE):
+ IE_NAME = 'voicy'
+ _VALID_URL = r'https?://voicy\.jp/channel/(?P<channel_id>\d+)/(?P<id>\d+)'
+ ARTICLE_LIST_API_URL = 'https://vmw.api.voicy.jp/articles_list?channel_id=%s&pid=%s'
+ _TESTS = [{
+ 'url': 'https://voicy.jp/channel/1253/122754',
+ 'info_dict': {
+ 'id': '122754',
+ 'title': '1/21(木)声日記:ついに原稿終わった!!',
+ 'uploader': 'ちょまど@ ITエンジニアなオタク',
+ 'uploader_id': '7339',
+ },
+ 'playlist_mincount': 9,
+ }]
+
+ def _real_extract(self, url):
+ mobj = self._match_valid_url(url)
+ assert mobj
+ voice_id = mobj.group('id')
+ channel_id = mobj.group('channel_id')
+ url, article_list = unsmuggle_url(url)
+ if not article_list:
+ article_list = self._call_api(self.ARTICLE_LIST_API_URL % (channel_id, voice_id), voice_id)
+ return self._extract_from_playlist_data(article_list)
+
+
+class VoicyChannelIE(VoicyBaseIE):
+ IE_NAME = 'voicy:channel'
+ _VALID_URL = r'https?://voicy\.jp/channel/(?P<id>\d+)'
+ PROGRAM_LIST_API_URL = 'https://vmw.api.voicy.jp/program_list/all?channel_id=%s&limit=20&public_type=3%s'
+ _TESTS = [{
+ 'url': 'https://voicy.jp/channel/1253/',
+ 'info_dict': {
+ 'id': '7339',
+ 'title': 'ゆるふわ日常ラジオ #ちょまラジ',
+ 'uploader': 'ちょまど@ ITエンジニアなオタク',
+ 'uploader_id': '7339',
+ },
+ 'playlist_mincount': 54,
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return not VoicyIE.suitable(url) and super(VoicyChannelIE, cls).suitable(url)
+
+ def _entries(self, channel_id):
+ pager = ''
+ for count in itertools.count(1):
+ article_list = self._call_api(self.PROGRAM_LIST_API_URL % (channel_id, pager), channel_id, note='Paging #%d' % count)
+ playlist_data = article_list.get('PlaylistData')
+ if not playlist_data:
+ break
+ yield from playlist_data
+ last = playlist_data[-1]
+ pager = '&pid=%d&p_date=%s&play_count=%s' % (last['PlaylistId'], last['Published'], last['PlayCount'])
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+ articles = self._entries(channel_id)
+
+ first_article = next(articles, None)
+ title = traverse_obj(first_article, ('ChannelName', ), expected_type=compat_str)
+ speaker_name = traverse_obj(first_article, ('SpeakerName', ), expected_type=compat_str)
+ if not title and speaker_name:
+ title = 'Uploads from %s' % speaker_name
+ if not title:
+ title = 'Uploads from channel ID %s' % channel_id
+
+ articles = itertools.chain([first_article], articles) if first_article else articles
+
+ playlist = (
+ self.url_result(smuggle_url('https://voicy.jp/channel/%s/%d' % (channel_id, value['PlaylistId']), value), VoicyIE.ie_key())
+ for value in articles)
+ return {
+ '_type': 'playlist',
+ 'entries': playlist,
+ 'id': channel_id,
+ 'title': title,
+ 'channel': speaker_name,
+ 'channel_id': channel_id,
+ }
diff --git a/hypervideo_dl/extractor/voot.py b/hypervideo_dl/extractor/voot.py
index 751b21e..e2944ec 100644
--- a/hypervideo_dl/extractor/voot.py
+++ b/hypervideo_dl/extractor/voot.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
ExtractorError,
int_or_none,
@@ -11,7 +12,17 @@ from ..utils import (
class VootIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?voot\.com/(?:[^/]+/)+(?P<id>\d+)'
+ _VALID_URL = r'''(?x)
+ (?:
+ voot:|
+ (?:https?://)(?:www\.)?voot\.com/?
+ (?:
+ movies/[^/]+/|
+ (?:shows|kids)/(?:[^/]+/){4}
+ )
+ )
+ (?P<id>\d{3,})
+ '''
_GEO_COUNTRIES = ['IN']
_TESTS = [{
'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353',
@@ -22,7 +33,6 @@ class VootIE(InfoExtractor):
'description': 'md5:06291fbbbc4dcbe21235c40c262507c1',
'timestamp': 1472162937,
'upload_date': '20160825',
- 'duration': 1146,
'series': 'Ishq Ka Rang Safed',
'season_number': 1,
'episode': 'Is this the end of Kamini?',
@@ -44,7 +54,6 @@ class VootIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
-
media_info = self._download_json(
'https://wapi.voot.com/ws/ott/getMediaInfo.json', video_id,
query={
@@ -82,7 +91,6 @@ class VootIE(InfoExtractor):
episode = value
elif key == 'EpisodeNo':
episode_number = int_or_none(value)
-
return {
'extractor_key': 'Kaltura',
'id': entry_id,
@@ -98,3 +106,45 @@ class VootIE(InfoExtractor):
'like_count': int_or_none(media.get('like_counter')),
'formats': formats,
}
+
+
+class VootSeriesIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?voot\.com/shows/[^/]+/(?P<id>\d{3,})'
+ _TESTS = [{
+ 'url': 'https://www.voot.com/shows/chakravartin-ashoka-samrat/100002',
+ 'playlist_mincount': 442,
+ 'info_dict': {
+ 'id': '100002',
+ },
+ }, {
+ 'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/100003',
+ 'playlist_mincount': 341,
+ 'info_dict': {
+ 'id': '100003',
+ },
+ }]
+ _SHOW_API = 'https://psapi.voot.com/media/voot/v1/voot-web/content/generic/season-by-show?sort=season%3Aasc&id={}&responseType=common'
+ _SEASON_API = 'https://psapi.voot.com/media/voot/v1/voot-web/content/generic/series-wise-episode?sort=episode%3Aasc&id={}&responseType=common&page={:d}'
+
+ def _entries(self, show_id):
+ show_json = self._download_json(self._SHOW_API.format(show_id), video_id=show_id)
+ for season in show_json.get('result', []):
+ page_num = 1
+ season_id = try_get(season, lambda x: x['id'], compat_str)
+ season_json = self._download_json(self._SEASON_API.format(season_id, page_num),
+ video_id=season_id,
+ note='Downloading JSON metadata page %d' % page_num)
+ episodes_json = season_json.get('result', [])
+ while episodes_json:
+ page_num += 1
+ for episode in episodes_json:
+ video_id = episode.get('id')
+ yield self.url_result(
+ 'voot:%s' % video_id, ie=VootIE.ie_key(), video_id=video_id)
+ episodes_json = self._download_json(self._SEASON_API.format(season_id, page_num),
+ video_id=season_id,
+ note='Downloading JSON metadata page %d' % page_num)['result']
+
+ def _real_extract(self, url):
+ show_id = self._match_id(url)
+ return self.playlist_result(self._entries(show_id), playlist_id=show_id)
diff --git a/hypervideo_dl/extractor/vrt.py b/hypervideo_dl/extractor/vrt.py
index 4220252..10dc94a 100644
--- a/hypervideo_dl/extractor/vrt.py
+++ b/hypervideo_dl/extractor/vrt.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -52,16 +51,16 @@ class VRTIE(InfoExtractor):
}
def _real_extract(self, url):
- site, display_id = re.match(self._VALID_URL, url).groups()
+ site, display_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, display_id)
attrs = extract_attributes(self._search_regex(
- r'(<[^>]+class="vrtvideo"[^>]*>)', webpage, 'vrt video'))
+ r'(<[^>]+class="vrtvideo( [^"]*)?"[^>]*>)', webpage, 'vrt video'))
- asset_id = attrs['data-videoid']
- publication_id = attrs.get('data-publicationid')
+ asset_id = attrs['data-video-id']
+ publication_id = attrs.get('data-publication-id')
if publication_id:
asset_id = publication_id + '$' + asset_id
- client = attrs.get('data-client') or self._CLIENT_MAP[site]
+ client = attrs.get('data-client-code') or self._CLIENT_MAP[site]
title = strip_or_none(get_element_by_class(
'vrt-title', webpage) or self._html_search_meta(
diff --git a/hypervideo_dl/extractor/vrv.py b/hypervideo_dl/extractor/vrv.py
index 6e51469..4196021 100644
--- a/hypervideo_dl/extractor/vrv.py
+++ b/hypervideo_dl/extractor/vrv.py
@@ -19,6 +19,7 @@ from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
+ traverse_obj,
)
@@ -217,7 +218,7 @@ class VRVIE(VRVBaseIE):
})
thumbnails = []
- for thumbnail in video_data.get('images', {}).get('thumbnails', []):
+ for thumbnail in traverse_obj(video_data, ('images', 'thumbnail', ..., ...)) or []:
thumbnail_url = thumbnail.get('source')
if not thumbnail_url:
continue
diff --git a/hypervideo_dl/extractor/vube.py b/hypervideo_dl/extractor/vube.py
index 8ce3a6b..1c8f80a 100644
--- a/hypervideo_dl/extractor/vube.py
+++ b/hypervideo_dl/extractor/vube.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import (
@@ -8,7 +7,6 @@ from ..compat import (
)
from ..utils import (
int_or_none,
- ExtractorError,
)
@@ -99,7 +97,7 @@ class VubeIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
video = self._download_json(
@@ -125,13 +123,13 @@ class VubeIE(InfoExtractor):
})
formats.append(fmt)
- self._sort_formats(formats)
-
if not formats and video.get('vst') == 'dmca':
- raise ExtractorError(
+ self.raise_no_formats(
'This video has been removed in response to a complaint received under the US Digital Millennium Copyright Act.',
expected=True)
+ self._sort_formats(formats)
+
title = video['title']
description = video.get('description')
thumbnail = self._proto_relative_url(video.get('thumbnail_src'), scheme='http:')
diff --git a/hypervideo_dl/extractor/vupload.py b/hypervideo_dl/extractor/vupload.py
new file mode 100644
index 0000000..9846aba
--- /dev/null
+++ b/hypervideo_dl/extractor/vupload.py
@@ -0,0 +1,51 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ parse_filesize,
+ extract_attributes,
+ int_or_none,
+)
+
+
+class VuploadIE(InfoExtractor):
+ _VALID_URL = r'https://vupload\.com/v/(?P<id>[a-z0-9]+)'
+ _TESTS = [{
+ 'url': 'https://vupload.com/v/u28d0pl2tphy',
+ 'md5': '9b42a4a193cca64d80248e58527d83c8',
+ 'info_dict': {
+ 'id': 'u28d0pl2tphy',
+ 'ext': 'mp4',
+ 'description': 'md5:e9e6c0045c78cbf0d5bb19a55ce199fb',
+ 'title': 'md5:e9e6c0045c78cbf0d5bb19a55ce199fb',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
+ video_e = self._html_search_regex(r'\|([a-z0-9]{60})\|', webpage, 'video')
+ video_url = f'https://wurize.megaupload.to/{video_e}/v.mp4'
+ duration = parse_duration(self._html_search_regex(
+ r'<i\s*class=["\']fad\s*fa-clock["\']></i>\s*([\d:]+)\s*</div>', webpage, 'duration', fatal=False))
+ filesize_approx = parse_filesize(self._html_search_regex(
+ r'<i\s*class=["\']fad\s*fa-save["\']></i>\s*([^<]+)\s*</div>', webpage, 'filesize', fatal=False))
+ extra_video_info = extract_attributes(self._html_search_regex(
+ r'(<video[^>]+>)', webpage, 'video_info', fatal=False))
+ description = self._html_search_meta('description', webpage)
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'duration': duration,
+ 'filesize_approx': filesize_approx,
+ 'width': int_or_none(extra_video_info.get('width')),
+ 'height': int_or_none(extra_video_info.get('height')),
+ 'format_id': extra_video_info.get('height', '') + 'p',
+ 'title': title,
+ 'description': description,
+ }
diff --git a/hypervideo_dl/extractor/vvvvid.py b/hypervideo_dl/extractor/vvvvid.py
index bc196f8..3faa90f 100644
--- a/hypervideo_dl/extractor/vvvvid.py
+++ b/hypervideo_dl/extractor/vvvvid.py
@@ -98,7 +98,7 @@ class VVVVIDIE(InfoExtractor):
}
def _real_extract(self, url):
- show_id, season_id, video_id = re.match(self._VALID_URL, url).groups()
+ show_id, season_id, video_id = self._match_valid_url(url).groups()
response = self._download_info(
show_id, 'season/%s' % season_id,
@@ -246,7 +246,7 @@ class VVVVIDShowIE(VVVVIDIE):
}]
def _real_extract(self, url):
- base_url, show_id, show_title = re.match(self._VALID_URL, url).groups()
+ base_url, show_id, show_title = self._match_valid_url(url).groups()
seasons = self._download_info(
show_id, 'seasons/', show_title)
diff --git a/hypervideo_dl/extractor/vzaar.py b/hypervideo_dl/extractor/vzaar.py
index b7d02fc..54f88bb 100644
--- a/hypervideo_dl/extractor/vzaar.py
+++ b/hypervideo_dl/extractor/vzaar.py
@@ -70,7 +70,7 @@ class VzaarIE(InfoExtractor):
f = {
'url': source_url,
'format_id': 'http',
- 'preference': 1,
+ 'quality': 1,
}
if 'audio' in source_url:
f.update({
diff --git a/hypervideo_dl/extractor/wakanim.py b/hypervideo_dl/extractor/wakanim.py
index f9a2395..c956d61 100644
--- a/hypervideo_dl/extractor/wakanim.py
+++ b/hypervideo_dl/extractor/wakanim.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
merge_dicts,
urljoin,
)
@@ -41,12 +40,13 @@ class WakanimIE(InfoExtractor):
m3u8_url = urljoin(url, self._search_regex(
r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'm3u8 url',
group='url'))
- # https://docs.microsoft.com/en-us/azure/media-services/previous/media-services-content-protection-overview#streaming-urls
- encryption = self._search_regex(
- r'encryption%3D(c(?:enc|bc(?:s-aapl)?))',
- m3u8_url, 'encryption', default=None)
- if encryption and encryption in ('cenc', 'cbcs-aapl'):
- raise ExtractorError('This video is DRM protected.', expected=True)
+ if not self.get_param('allow_unplayable_formats'):
+ # https://docs.microsoft.com/en-us/azure/media-services/previous/media-services-content-protection-overview#streaming-urls
+ encryption = self._search_regex(
+ r'encryption%3D(c(?:enc|bc(?:s-aapl)?))',
+ m3u8_url, 'encryption', default=None)
+ if encryption in ('cenc', 'cbcs-aapl'):
+ self.report_drm(video_id)
formats = self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
diff --git a/hypervideo_dl/extractor/walla.py b/hypervideo_dl/extractor/walla.py
index cbb5486..00f081b 100644
--- a/hypervideo_dl/extractor/walla.py
+++ b/hypervideo_dl/extractor/walla.py
@@ -34,7 +34,7 @@ class WallaIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id')
diff --git a/hypervideo_dl/extractor/wat.py b/hypervideo_dl/extractor/wat.py
index f1bccc2..9ff4523 100644
--- a/hypervideo_dl/extractor/wat.py
+++ b/hypervideo_dl/extractor/wat.py
@@ -69,25 +69,30 @@ class WatIE(InfoExtractor):
title = video_info['title']
formats = []
+ subtitles = {}
def extract_formats(manifest_urls):
for f, f_url in manifest_urls.items():
if not f_url:
continue
if f in ('dash', 'mpd'):
- formats.extend(self._extract_mpd_formats(
+ fmts, subs = self._extract_mpd_formats_and_subtitles(
f_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'),
- video_id, mpd_id='dash', fatal=False))
+ video_id, mpd_id='dash', fatal=False)
elif f == 'hls':
- formats.extend(self._extract_m3u8_formats(
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
f_url, video_id, 'mp4',
- 'm3u8_native', m3u8_id='hls', fatal=False))
+ 'm3u8_native', m3u8_id='hls', fatal=False)
+ else:
+ continue
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
delivery = video_data.get('delivery') or {}
extract_formats({delivery.get('format'): delivery.get('url')})
if not formats:
if delivery.get('drm'):
- raise ExtractorError('This video is DRM protected.', expected=True)
+ self.report_drm(video_id)
manifest_urls = self._download_json(
'http://www.wat.tv/get/webhtml/' + video_id, video_id, fatal=False)
if manifest_urls:
@@ -103,4 +108,5 @@ class WatIE(InfoExtractor):
video_data, lambda x: x['mediametrie']['chapters'][0]['estatS4'])),
'duration': int_or_none(video_info.get('duration')),
'formats': formats,
+ 'subtitles': subtitles,
}
diff --git a/hypervideo_dl/extractor/watchbox.py b/hypervideo_dl/extractor/watchbox.py
index 5a4e46e..7469fe9 100644
--- a/hypervideo_dl/extractor/watchbox.py
+++ b/hypervideo_dl/extractor/watchbox.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..compat import compat_str
@@ -63,7 +62,7 @@ class WatchBoxIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
kind, video_id = mobj.group('kind', 'id')
webpage = self._download_webpage(url, video_id)
diff --git a/hypervideo_dl/extractor/watchindianporn.py b/hypervideo_dl/extractor/watchindianporn.py
index fadc539..a868191 100644
--- a/hypervideo_dl/extractor/watchindianporn.py
+++ b/hypervideo_dl/extractor/watchindianporn.py
@@ -27,7 +27,7 @@ class WatchIndianPornIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id')
diff --git a/hypervideo_dl/extractor/wdr.py b/hypervideo_dl/extractor/wdr.py
index 2903d18..f54aa6f 100644
--- a/hypervideo_dl/extractor/wdr.py
+++ b/hypervideo_dl/extractor/wdr.py
@@ -44,17 +44,25 @@ class WDRIE(InfoExtractor):
tracker_data = metadata['trackerData']
title = tracker_data['trackerClipTitle']
-
media_resource = metadata['mediaResource']
formats = []
+ subtitles = {}
# check if the metadata contains a direct URL to a file
for kind, media in media_resource.items():
- if not isinstance(media, dict):
+ if kind == 'captionsHash':
+ for ext, url in media.items():
+ subtitles.setdefault('de', []).append({
+ 'url': url,
+ 'ext': ext,
+ })
continue
+
if kind not in ('dflt', 'alt'):
continue
+ if not isinstance(media, dict):
+ continue
for tag_name, medium_url in media.items():
if tag_name not in ('videoURL', 'audioURL'):
@@ -86,7 +94,6 @@ class WDRIE(InfoExtractor):
self._sort_formats(formats)
- subtitles = {}
caption_url = media_resource.get('captionURL')
if caption_url:
subtitles['de'] = [{
@@ -233,7 +240,7 @@ class WDRPageIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
display_id = mobj.group('display_id')
webpage = self._download_webpage(url, display_id)
@@ -335,7 +342,7 @@ class WDRMobileIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
return {
'id': mobj.group('id'),
'title': mobj.group('title'),
diff --git a/hypervideo_dl/extractor/whowatch.py b/hypervideo_dl/extractor/whowatch.py
new file mode 100644
index 0000000..f8bc2e7
--- /dev/null
+++ b/hypervideo_dl/extractor/whowatch.py
@@ -0,0 +1,99 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ qualities,
+ try_get,
+ ExtractorError,
+)
+from ..compat import compat_str
+
+
+class WhoWatchIE(InfoExtractor):
+ IE_NAME = 'whowatch'
+ _VALID_URL = r'https?://whowatch\.tv/viewer/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://whowatch.tv/viewer/21450171',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ self._download_webpage(url, video_id)
+ metadata = self._download_json('https://api.whowatch.tv/lives/%s' % video_id, video_id)
+ live_data = self._download_json('https://api.whowatch.tv/lives/%s/play' % video_id, video_id)
+
+ title = try_get(None, (
+ lambda x: live_data['share_info']['live_title'][1:-1],
+ lambda x: metadata['live']['title'],
+ ), compat_str)
+
+ hls_url = live_data.get('hls_url')
+ if not hls_url:
+ raise ExtractorError(live_data.get('error_message') or 'The user is offline.', expected=True)
+
+ QUALITIES = qualities(['low', 'medium', 'high', 'veryhigh'])
+ formats = []
+
+ for i, fmt in enumerate(live_data.get('streams') or []):
+ name = fmt.get('quality') or fmt.get('name') or compat_str(i)
+ hls_url = fmt.get('hls_url')
+ rtmp_url = fmt.get('rtmp_url')
+ audio_only = fmt.get('audio_only')
+ quality = QUALITIES(fmt.get('quality'))
+
+ if hls_url:
+ hls_fmts = self._extract_m3u8_formats(
+ hls_url, video_id, ext='mp4', m3u8_id='hls-%s' % name, quality=quality)
+ formats.extend(hls_fmts)
+ else:
+ hls_fmts = []
+
+ # RTMP url for audio_only is same as high format, so skip it
+ if rtmp_url and not audio_only:
+ formats.append({
+ 'url': rtmp_url,
+ 'format_id': 'rtmp-%s' % name,
+ 'ext': 'mp4',
+ 'protocol': 'rtmp_ffmpeg', # ffmpeg can, while rtmpdump can't
+ 'vcodec': 'h264',
+ 'acodec': 'aac',
+ 'quality': quality,
+ 'format_note': fmt.get('label'),
+ # note: HLS and RTMP have same resolution for now, so it's acceptable
+ 'width': try_get(hls_fmts, lambda x: x[0]['width'], int),
+ 'height': try_get(hls_fmts, lambda x: x[0]['height'], int),
+ })
+
+ # This contains the same formats as the above manifests and is used only as a fallback
+ formats.extend(self._extract_m3u8_formats(
+ hls_url, video_id, ext='mp4', m3u8_id='hls'))
+ self._remove_duplicate_formats(formats)
+ self._sort_formats(formats)
+
+ uploader_url = try_get(metadata, lambda x: x['live']['user']['user_path'], compat_str)
+ if uploader_url:
+ uploader_url = 'https://whowatch.tv/profile/%s' % uploader_url
+ uploader_id = compat_str(try_get(metadata, lambda x: x['live']['user']['id'], int))
+ uploader = try_get(metadata, lambda x: x['live']['user']['name'], compat_str)
+ thumbnail = try_get(metadata, lambda x: x['live']['latest_thumbnail_url'], compat_str)
+ timestamp = int_or_none(try_get(metadata, lambda x: x['live']['started_at'], int), scale=1000)
+ view_count = try_get(metadata, lambda x: x['live']['total_view_count'], int)
+ comment_count = try_get(metadata, lambda x: x['live']['comment_count'], int)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'uploader_id': uploader_id,
+ 'uploader_url': uploader_url,
+ 'uploader': uploader,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'is_live': True,
+ }
diff --git a/hypervideo_dl/extractor/wimtv.py b/hypervideo_dl/extractor/wimtv.py
new file mode 100644
index 0000000..ea953bf
--- /dev/null
+++ b/hypervideo_dl/extractor/wimtv.py
@@ -0,0 +1,163 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ parse_duration,
+ urlencode_postdata,
+ ExtractorError,
+)
+
+
+class WimTVIE(InfoExtractor):
+ _player = None
+ _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
+ _VALID_URL = r'''(?x)
+ https?://platform.wim.tv/
+ (?:
+ (?:embed/)?\?
+ |\#/webtv/.+?/
+ )
+ (?P<type>vod|live|cast)[=/]
+ (?P<id>%s).*?''' % _UUID_RE
+ _TESTS = [{
+ # vod stream
+ 'url': 'https://platform.wim.tv/embed/?vod=db29fb32-bade-47b6-a3a6-cb69fe80267a',
+ 'md5': 'db29fb32-bade-47b6-a3a6-cb69fe80267a',
+ 'info_dict': {
+ 'id': 'db29fb32-bade-47b6-a3a6-cb69fe80267a',
+ 'ext': 'mp4',
+ 'title': 'AMA SUPERCROSS 2020 - R2 ST. LOUIS',
+ 'duration': 6481,
+ 'thumbnail': r're:https?://.+?/thumbnail/.+?/720$'
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # live stream
+ 'url': 'https://platform.wim.tv/embed/?live=28e22c22-49db-40f3-8c37-8cbb0ff44556&autostart=true',
+ 'info_dict': {
+ 'id': '28e22c22-49db-40f3-8c37-8cbb0ff44556',
+ 'ext': 'mp4',
+ 'title': 'Streaming MSmotorTV',
+ 'is_live': True,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://platform.wim.tv/#/webtv/automotornews/vod/422492b6-539e-474d-9c6b-68c9d5893365',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://platform.wim.tv/#/webtv/renzoarborechannel/cast/f47e0d15-5b45-455e-bf0d-dba8ffa96365',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return [
+ mobj.group('url')
+ for mobj in re.finditer(
+ r'<iframe[^>]+src=["\'](?P<url>%s)' % WimTVIE._VALID_URL,
+ webpage)]
+
+ def _real_initialize(self):
+ if not self._player:
+ self._get_player_data()
+
+ def _get_player_data(self):
+ msg_id = 'Player data'
+ self._player = {}
+
+ datas = [{
+ 'url': 'https://platform.wim.tv/common/libs/player/wimtv/wim-rest.js',
+ 'vars': [{
+ 'regex': r'appAuth = "(.+?)"',
+ 'variable': 'app_auth',
+ }]
+ }, {
+ 'url': 'https://platform.wim.tv/common/config/endpointconfig.js',
+ 'vars': [{
+ 'regex': r'PRODUCTION_HOSTNAME_THUMB = "(.+?)"',
+ 'variable': 'thumb_server',
+ }, {
+ 'regex': r'PRODUCTION_HOSTNAME_THUMB\s*\+\s*"(.+?)"',
+ 'variable': 'thumb_server_path',
+ }]
+ }]
+
+ for data in datas:
+ temp = self._download_webpage(data['url'], msg_id)
+ for var in data['vars']:
+ val = self._search_regex(var['regex'], temp, msg_id)
+ if not val:
+ raise ExtractorError('%s not found' % var['variable'])
+ self._player[var['variable']] = val
+
+ def _generate_token(self):
+ json = self._download_json(
+ 'https://platform.wim.tv/wimtv-server/oauth/token', 'Token generation',
+ headers={'Authorization': 'Basic %s' % self._player['app_auth']},
+ data=urlencode_postdata({'grant_type': 'client_credentials'}))
+ token = json.get('access_token')
+ if not token:
+ raise ExtractorError('access token not generated')
+ return token
+
+ def _generate_thumbnail(self, thumb_id, width='720'):
+ if not thumb_id or not self._player.get('thumb_server'):
+ return None
+ if not self._player.get('thumb_server_path'):
+ self._player['thumb_server_path'] = ''
+ return '%s%s/asset/thumbnail/%s/%s' % (
+ self._player['thumb_server'],
+ self._player['thumb_server_path'],
+ thumb_id, width)
+
+ def _real_extract(self, url):
+ urlc = self._match_valid_url(url).groupdict()
+ video_id = urlc['id']
+ stream_type = is_live = None
+ if urlc['type'] in {'live', 'cast'}:
+ stream_type = urlc['type'] + '/channel'
+ is_live = True
+ else:
+ stream_type = 'vod'
+ is_live = False
+ token = self._generate_token()
+ json = self._download_json(
+ 'https://platform.wim.tv/wimtv-server/api/public/%s/%s/play' % (
+ stream_type, video_id), video_id,
+ headers={'Authorization': 'Bearer %s' % token,
+ 'Content-Type': 'application/json'},
+ data=bytes('{}', 'utf-8'))
+
+ formats = []
+ for src in json.get('srcs') or []:
+ if src.get('mimeType') == 'application/x-mpegurl':
+ formats.extend(
+ self._extract_m3u8_formats(
+ src.get('uniqueStreamer'), video_id, 'mp4'))
+ if src.get('mimeType') == 'video/flash':
+ formats.append({
+ 'format_id': 'rtmp',
+ 'url': src.get('uniqueStreamer'),
+ 'ext': determine_ext(src.get('uniqueStreamer'), 'flv'),
+ 'rtmp_live': is_live,
+ })
+ json = json.get('resource')
+ thumb = self._generate_thumbnail(json.get('thumbnailId'))
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': json.get('title') or json.get('name'),
+ 'duration': parse_duration(json.get('duration')),
+ 'formats': formats,
+ 'thumbnail': thumb,
+ 'is_live': is_live,
+ }
diff --git a/hypervideo_dl/extractor/wistia.py b/hypervideo_dl/extractor/wistia.py
index ae32a0a..a170966 100644
--- a/hypervideo_dl/extractor/wistia.py
+++ b/hypervideo_dl/extractor/wistia.py
@@ -62,7 +62,7 @@ class WistiaBaseIE(InfoExtractor):
'format_id': format_id,
'url': aurl,
'tbr': int_or_none(a.get('bitrate')) or None,
- 'preference': 1 if atype == 'original' else None,
+ 'quality': 1 if atype == 'original' else None,
}
if display_name == 'Audio':
f.update({
diff --git a/hypervideo_dl/extractor/xboxclips.py b/hypervideo_dl/extractor/xboxclips.py
index 25f487e..9bac982 100644
--- a/hypervideo_dl/extractor/xboxclips.py
+++ b/hypervideo_dl/extractor/xboxclips.py
@@ -4,14 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_parse_qs,
- compat_urllib_parse_urlparse,
-)
from ..utils import (
int_or_none,
month_by_abbreviation,
parse_filesize,
+ parse_qs,
)
@@ -37,7 +34,7 @@ class XboxClipsIE(InfoExtractor):
video_id = self._match_id(url)
if '/video.php' in url:
- qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ qs = parse_qs(url)
url = 'https://gameclips.io/%s/%s' % (qs['gamertag'][0], qs['vid'][0])
webpage = self._download_webpage(url, video_id)
diff --git a/hypervideo_dl/extractor/xfileshare.py b/hypervideo_dl/extractor/xfileshare.py
index df9efa9..cd97c77 100644
--- a/hypervideo_dl/extractor/xfileshare.py
+++ b/hypervideo_dl/extractor/xfileshare.py
@@ -98,7 +98,7 @@ class XFileShareIE(InfoExtractor):
webpage)]
def _real_extract(self, url):
- host, video_id = re.match(self._VALID_URL, url).groups()
+ host, video_id = self._match_valid_url(url).groups()
url = 'https://%s/' % host + ('embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id)
webpage = self._download_webpage(url, video_id)
diff --git a/hypervideo_dl/extractor/xhamster.py b/hypervideo_dl/extractor/xhamster.py
index f73b977..9d4ed47 100644
--- a/hypervideo_dl/extractor/xhamster.py
+++ b/hypervideo_dl/extractor/xhamster.py
@@ -120,7 +120,7 @@ class XHamsterIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id') or mobj.group('id_2')
display_id = mobj.group('display_id') or mobj.group('display_id_2')
@@ -231,7 +231,7 @@ class XHamsterIE(InfoExtractor):
'Referer': standard_url,
},
})
- self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))
+ self._sort_formats(formats)
categories_list = video.get('categories')
if isinstance(categories_list, list):
@@ -245,6 +245,8 @@ class XHamsterIE(InfoExtractor):
else:
categories = None
+ uploader_url = url_or_none(try_get(video, lambda x: x['author']['pageURL']))
+
return {
'id': video_id,
'display_id': display_id,
@@ -253,6 +255,8 @@ class XHamsterIE(InfoExtractor):
'timestamp': int_or_none(video.get('created')),
'uploader': try_get(
video, lambda x: x['author']['name'], compat_str),
+ 'uploader_url': uploader_url,
+ 'uploader_id': uploader_url.split('/')[-1] if uploader_url else None,
'thumbnail': video.get('thumbURL'),
'duration': int_or_none(video.get('duration')),
'view_count': int_or_none(video.get('views')),
@@ -352,6 +356,7 @@ class XHamsterIE(InfoExtractor):
'description': description,
'upload_date': upload_date,
'uploader': uploader,
+ 'uploader_id': uploader.lower() if uploader else None,
'thumbnail': thumbnail,
'duration': duration,
'view_count': view_count,
diff --git a/hypervideo_dl/extractor/ximalaya.py b/hypervideo_dl/extractor/ximalaya.py
index a912e54..802d1bb 100644
--- a/hypervideo_dl/extractor/ximalaya.py
+++ b/hypervideo_dl/extractor/ximalaya.py
@@ -198,7 +198,7 @@ class XimalayaAlbumIE(XimalayaBaseIE):
def _real_extract(self, url):
self.scheme = scheme = 'https' if url.startswith('https') else 'http'
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
uid, playlist_id = mobj.group('uid'), mobj.group('id')
webpage = self._download_webpage(self._TEMPLATE_URL % (scheme, uid, playlist_id), playlist_id,
diff --git a/hypervideo_dl/extractor/xnxx.py b/hypervideo_dl/extractor/xnxx.py
index ac1ccc4..dd4fb54 100644
--- a/hypervideo_dl/extractor/xnxx.py
+++ b/hypervideo_dl/extractor/xnxx.py
@@ -54,7 +54,7 @@ class XNXXIE(InfoExtractor):
if determine_ext(format_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats(
format_url, video_id, 'mp4', entry_protocol='m3u8_native',
- preference=1, m3u8_id='hls', fatal=False))
+ quality=1, m3u8_id='hls', fatal=False))
else:
format_id = mobj.group('id')
if format_id:
diff --git a/hypervideo_dl/extractor/xstream.py b/hypervideo_dl/extractor/xstream.py
index 76c91bd..792843d 100644
--- a/hypervideo_dl/extractor/xstream.py
+++ b/hypervideo_dl/extractor/xstream.py
@@ -93,7 +93,7 @@ class XstreamIE(InfoExtractor):
formats.append({
'url': link.get('href'),
'format_id': link.get('rel'),
- 'preference': 1,
+ 'quality': 1,
})
thumbnails = [{
@@ -112,7 +112,7 @@ class XstreamIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
partner_id = mobj.group('partner_id')
video_id = mobj.group('id')
diff --git a/hypervideo_dl/extractor/xtube.py b/hypervideo_dl/extractor/xtube.py
index 7246409..abd3191 100644
--- a/hypervideo_dl/extractor/xtube.py
+++ b/hypervideo_dl/extractor/xtube.py
@@ -40,22 +40,6 @@ class XTubeIE(InfoExtractor):
'age_limit': 18,
}
}, {
- # FLV videos with duplicated formats
- 'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752',
- 'md5': 'a406963eb349dd43692ec54631efd88b',
- 'info_dict': {
- 'id': '9299752',
- 'display_id': 'A-Super-Run-Part-1-YT',
- 'ext': 'flv',
- 'title': 'A Super Run - Part 1 (YT)',
- 'description': 'md5:4cc3af1aa1b0413289babc88f0d4f616',
- 'uploader': 'tshirtguy59',
- 'duration': 579,
- 'view_count': int,
- 'comment_count': int,
- 'age_limit': 18,
- },
- }, {
# new URL schema
'url': 'http://www.xtube.com/video-watch/strange-erotica-625837',
'only_matching': True,
@@ -71,7 +55,7 @@ class XTubeIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id')
diff --git a/hypervideo_dl/extractor/xxxymovies.py b/hypervideo_dl/extractor/xxxymovies.py
index e34ebe3..0d53601 100644
--- a/hypervideo_dl/extractor/xxxymovies.py
+++ b/hypervideo_dl/extractor/xxxymovies.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -29,7 +28,7 @@ class XXXYMoviesIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id')
diff --git a/hypervideo_dl/extractor/yahoo.py b/hypervideo_dl/extractor/yahoo.py
index a17b10d..53556de 100644
--- a/hypervideo_dl/extractor/yahoo.py
+++ b/hypervideo_dl/extractor/yahoo.py
@@ -22,6 +22,7 @@ from ..utils import (
)
from .brightcove import BrightcoveNewIE
+from .youtube import YoutubeIE
class YahooIE(InfoExtractor):
@@ -38,6 +39,7 @@ class YahooIE(InfoExtractor):
'timestamp': 1369812016,
'upload_date': '20130529',
},
+ 'skip': 'No longer exists',
}, {
'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed',
'md5': '7993e572fac98e044588d0b5260f4352',
@@ -50,6 +52,7 @@ class YahooIE(InfoExtractor):
'timestamp': 1406838636,
'upload_date': '20140731',
},
+ 'skip': 'Unfortunately, this video is not available in your region',
}, {
'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html',
'md5': '71298482f7c64cbb7fa064e4553ff1c1',
@@ -61,7 +64,8 @@ class YahooIE(InfoExtractor):
'duration': 97,
'timestamp': 1414489862,
'upload_date': '20141028',
- }
+ },
+ 'skip': 'No longer exists',
}, {
'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
'md5': '88e209b417f173d86186bef6e4d1f160',
@@ -120,6 +124,7 @@ class YahooIE(InfoExtractor):
'season_number': 6,
'episode_number': 1,
},
+ 'skip': 'No longer exists',
}, {
# ytwnews://cavideo/
'url': 'https://tw.video.yahoo.com/movie-tw/單車天使-中文版預-092316541.html',
@@ -156,7 +161,7 @@ class YahooIE(InfoExtractor):
'id': '352CFDOQrKg',
'ext': 'mp4',
'title': 'Kyndal Inskeep "Performs the Hell Out of" Sia\'s "Elastic Heart" - The Voice Knockouts 2019',
- 'description': 'md5:35b61e94c2ae214bc965ff4245f80d11',
+ 'description': 'md5:7fe8e3d5806f96002e55f190d1d94479',
'uploader': 'The Voice',
'uploader_id': 'NBCTheVoice',
'upload_date': '20191029',
@@ -165,7 +170,7 @@ class YahooIE(InfoExtractor):
'params': {
'playlistend': 2,
},
- 'expected_warnings': ['HTTP Error 404'],
+ 'expected_warnings': ['HTTP Error 404', 'Ignoring subtitle tracks'],
}, {
'url': 'https://malaysia.news.yahoo.com/video/bystanders-help-ontario-policeman-bust-190932818.html',
'only_matching': True,
@@ -239,7 +244,7 @@ class YahooIE(InfoExtractor):
'm3u8_native', m3u8_id='hls', fatal=False))
if not formats and msg == 'geo restricted':
- self.raise_geo_restricted()
+ self.raise_geo_restricted(metadata_available=True)
self._sort_formats(formats)
@@ -274,18 +279,19 @@ class YahooIE(InfoExtractor):
}
def _real_extract(self, url):
- url, country, display_id = re.match(self._VALID_URL, url).groups()
+ url, country, display_id = self._match_valid_url(url).groups()
if not country:
country = 'us'
else:
country = country.split('-')[0]
- item = self._download_json(
+ items = self._download_json(
'https://%s.yahoo.com/caas/content/article' % country, display_id,
'Downloading content JSON metadata', query={
'url': url
- })['items'][0]['data']['partnerData']
+ })['items'][0]
+ item = items['data']['partnerData']
if item.get('type') != 'video':
entries = []
@@ -299,9 +305,19 @@ class YahooIE(InfoExtractor):
for e in (item.get('body') or []):
if e.get('type') == 'videoIframe':
iframe_url = e.get('url')
- if not iframe_url:
- continue
+ if iframe_url:
+ entries.append(self.url_result(iframe_url))
+
+ if item.get('type') == 'storywithleadvideo':
+ iframe_url = try_get(item, lambda x: x['meta']['player']['url'])
+ if iframe_url:
entries.append(self.url_result(iframe_url))
+ else:
+ self.report_warning("Yahoo didn't provide an iframe url for this storywithleadvideo")
+
+ if items.get('markup'):
+ entries.extend(
+ self.url_result(yt_url) for yt_url in YoutubeIE._extract_urls(items['markup']))
return self.playlist_result(
entries, item.get('uuid'),
@@ -318,35 +334,19 @@ class YahooSearchIE(SearchInfoExtractor):
IE_NAME = 'screen.yahoo:search'
_SEARCH_KEY = 'yvsearch'
- def _get_n_results(self, query, n):
- """Get a specified number of results for a query"""
- entries = []
+ def _search_results(self, query):
for pagenum in itertools.count(0):
result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
info = self._download_json(result_url, query,
note='Downloading results page ' + str(pagenum + 1))
- m = info['m']
- results = info['results']
-
- for (i, r) in enumerate(results):
- if (pagenum * 30) + i >= n:
- break
- mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
- e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
- entries.append(e)
- if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)):
+ yield from (self.url_result(result['rurl']) for result in info['results'])
+ if info['m']['last'] >= info['m']['total'] - 1:
break
- return {
- '_type': 'playlist',
- 'id': query,
- 'entries': entries,
- }
-
class YahooGyaOPlayerIE(InfoExtractor):
IE_NAME = 'yahoo:gyao:player'
- _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:player|episode/[^/]+)|streaming\.yahoo\.co\.jp/c/y)/(?P<id>\d+/v\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+ _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:player|episode(?:/[^/]+)?)|streaming\.yahoo\.co\.jp/c/y)/(?P<id>\d+/v\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
_TESTS = [{
'url': 'https://gyao.yahoo.co.jp/player/00998/v00818/v0000000000000008564/',
'info_dict': {
@@ -368,6 +368,9 @@ class YahooGyaOPlayerIE(InfoExtractor):
}, {
'url': 'https://gyao.yahoo.co.jp/episode/%E3%81%8D%E3%81%AE%E3%81%86%E4%BD%95%E9%A3%9F%E3%81%B9%E3%81%9F%EF%BC%9F%20%E7%AC%AC2%E8%A9%B1%202019%2F4%2F12%E6%94%BE%E9%80%81%E5%88%86/5cb02352-b725-409e-9f8d-88f947a9f682',
'only_matching': True,
+ }, {
+ 'url': 'https://gyao.yahoo.co.jp/episode/5fa1226c-ef8d-4e93-af7a-fd92f4e30597',
+ 'only_matching': True,
}]
_GEO_BYPASS = False
@@ -508,7 +511,7 @@ class YahooJapanNewsIE(InfoExtractor):
return formats
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
host = mobj.group('host')
display_id = mobj.group('id') or host
diff --git a/hypervideo_dl/extractor/yandexdisk.py b/hypervideo_dl/extractor/yandexdisk.py
index 6fcd8ee..c15f3a4 100644
--- a/hypervideo_dl/extractor/yandexdisk.py
+++ b/hypervideo_dl/extractor/yandexdisk.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..utils import (
@@ -57,7 +56,7 @@ class YandexDiskIE(InfoExtractor):
}]
def _real_extract(self, url):
- domain, video_id = re.match(self._VALID_URL, url).groups()
+ domain, video_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, video_id)
store = self._parse_json(self._search_regex(
diff --git a/hypervideo_dl/extractor/yandexmusic.py b/hypervideo_dl/extractor/yandexmusic.py
index 0b86c71..8e94f1f 100644
--- a/hypervideo_dl/extractor/yandexmusic.py
+++ b/hypervideo_dl/extractor/yandexmusic.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
import hashlib
import itertools
-import re
from .common import InfoExtractor
from ..compat import compat_str
@@ -109,7 +108,7 @@ class YandexMusicTrackIE(YandexMusicBaseIE):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
tld, album_id, track_id = mobj.group('tld'), mobj.group('album_id'), mobj.group('id')
track = self._call_api(
@@ -291,7 +290,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE):
return False if YandexMusicTrackIE.suitable(url) else super(YandexMusicAlbumIE, cls).suitable(url)
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
tld = mobj.group('tld')
album_id = mobj.group('id')
@@ -342,7 +341,7 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
tld = mobj.group('tld')
user = mobj.group('user')
playlist_id = mobj.group('id')
@@ -381,7 +380,7 @@ class YandexMusicArtistBaseIE(YandexMusicPlaylistBaseIE):
})
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
tld = mobj.group('tld')
artist_id = mobj.group('id')
data = self._call_artist(tld, url, artist_id)
@@ -410,7 +409,7 @@ class YandexMusicArtistTracksIE(YandexMusicArtistBaseIE):
_ARTIST_WHAT = 'tracks'
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
tld = mobj.group('tld')
artist_id = mobj.group('id')
data = self._call_artist(tld, url, artist_id)
@@ -440,7 +439,7 @@ class YandexMusicArtistAlbumsIE(YandexMusicArtistBaseIE):
_ARTIST_WHAT = 'albums'
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
tld = mobj.group('tld')
artist_id = mobj.group('id')
data = self._call_artist(tld, url, artist_id)
diff --git a/hypervideo_dl/extractor/yandexvideo.py b/hypervideo_dl/extractor/yandexvideo.py
index 6a166ec..9974d65 100644
--- a/hypervideo_dl/extractor/yandexvideo.py
+++ b/hypervideo_dl/extractor/yandexvideo.py
@@ -1,6 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
+import itertools
+import re
+
from .common import InfoExtractor
from ..utils import (
determine_ext,
@@ -142,3 +145,88 @@ class YandexVideoIE(InfoExtractor):
'release_year': int_or_none(content.get('release_year')),
'formats': formats,
}
+
+
+class ZenYandexIE(InfoExtractor):
+ _VALID_URL = r'https?://zen\.yandex\.ru/media/(?:id/[^/]+/|[^/]+/)(?:[a-z0-9-]+)-(?P<id>[a-z0-9-]+)'
+ _TESTS = [{
+ 'url': 'https://zen.yandex.ru/media/popmech/izverjenie-vulkana-iz-spichek-zreliscnyi-opyt-6002240ff8b1af50bb2da5e3',
+ 'info_dict': {
+ 'id': '6002240ff8b1af50bb2da5e3',
+ 'ext': 'mp4',
+ 'title': 'Извержение вулкана из спичек: зрелищный опыт',
+ 'description': 'md5:053ad3c61b5596d510c9a199dc8ee633',
+ 'thumbnail': 'https://avatars.mds.yandex.net/get-zen-pub-og/3558619/pub_6002240ff8b1af50bb2da5e3_600bad814d953e4132a30b5e/orig',
+ 'uploader': 'Популярная механика',
+ },
+ }, {
+ 'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/vot-eto-focus-dedy-morozy-na-gidrociklah-60c7c443da18892ebfe85ed7',
+ 'info_dict': {
+ 'id': '60c7c443da18892ebfe85ed7',
+ 'ext': 'mp4',
+ 'title': 'ВОТ ЭТО Focus. Деды Морозы на гидроциклах',
+ 'description': 'md5:8684912f6086f298f8078d4af0e8a600',
+ 'thumbnail': 'https://avatars.mds.yandex.net/get-zen-pub-og/4410519/pub_60c7c443da18892ebfe85ed7_60c7c48e060a163121f42cc3/orig',
+ 'uploader': 'AcademeG DailyStream'
+ },
+ }, {
+ 'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/novyi-samsung-fold-3-moskvich-barahlit-612f93b7f8d48e7e945792a2?from=channel&rid=2286618386.482.1630817595976.42360',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ data_json = self._parse_json(self._search_regex(r'w\._data\s?=\s?({.+?});', webpage, 'metadata'), id)
+ stream_json = try_get(data_json, lambda x: x['publication']['content']['gifContent'], dict)
+ stream_url = stream_json.get('stream') or try_get(stream_json, lambda x: x['streams']['url'])
+ formats = self._extract_m3u8_formats(stream_url, id)
+ self._sort_formats(formats)
+ return {
+ 'id': id,
+ 'title': try_get(data_json, (lambda x: x['og']['title'], lambda x: x['publication']['content']['preview']['title'])),
+ 'uploader': data_json.get('authorName') or try_get(data_json, lambda x: x['publisher']['name']),
+ 'description': try_get(data_json, lambda x: x['og']['description']),
+ 'thumbnail': try_get(data_json, lambda x: x['og']['imageUrl']),
+ 'formats': formats,
+ }
+
+
+class ZenYandexChannelIE(InfoExtractor):
+ _VALID_URL = r'https?://zen\.yandex\.ru/(?!media)(?:id/)?(?P<id>[a-z0-9-_]+)'
+ _TESTS = [{
+ 'url': 'https://zen.yandex.ru/tok_media',
+ 'info_dict': {
+ 'id': 'tok_media',
+ },
+ 'playlist_mincount': 169,
+ }, {
+ 'url': 'https://zen.yandex.ru/id/606fd806cc13cb3c58c05cf5',
+ 'info_dict': {
+ 'id': '606fd806cc13cb3c58c05cf5',
+ },
+ 'playlist_mincount': 657,
+ }]
+
+ def _entries(self, id, url):
+ webpage = self._download_webpage(url, id)
+ data_json = self._parse_json(re.findall(r'var\s?data\s?=\s?({.+?})\s?;', webpage)[-1], id)
+ for key in data_json.keys():
+ if key.startswith('__serverState__'):
+ data_json = data_json[key]
+ items = list(try_get(data_json, lambda x: x['feed']['items'], dict).values())
+ more = try_get(data_json, lambda x: x['links']['more']) or None
+ for page in itertools.count(1):
+ for item in items:
+ video_id = item.get('publication_id') or item.get('publicationId')
+ video_url = item.get('link')
+ yield self.url_result(video_url, ie=ZenYandexIE.ie_key(), video_id=video_id.split(':')[-1])
+ if not more:
+ break
+ data_json = self._download_json(more, id, note='Downloading Page %d' % page)
+ items = data_json.get('items', [])
+ more = try_get(data_json, lambda x: x['more']['link']) or None
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ return self.playlist_result(self._entries(id, url), playlist_id=id)
diff --git a/hypervideo_dl/extractor/youjizz.py b/hypervideo_dl/extractor/youjizz.py
index 88aabd2..5f5fbf2 100644
--- a/hypervideo_dl/extractor/youjizz.py
+++ b/hypervideo_dl/extractor/youjizz.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -32,7 +31,7 @@ class YouJizzIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id') or mobj.group('embed_id')
webpage = self._download_webpage(url, video_id)
diff --git a/hypervideo_dl/extractor/youku.py b/hypervideo_dl/extractor/youku.py
index 880c896..b505799 100644
--- a/hypervideo_dl/extractor/youku.py
+++ b/hypervideo_dl/extractor/youku.py
@@ -160,7 +160,7 @@ class YoukuIE(InfoExtractor):
'client_ts': time.time() / 1000,
}
- video_password = self._downloader.params.get('videopassword')
+ video_password = self.get_param('videopassword')
if video_password:
basic_data_params['password'] = video_password
diff --git a/hypervideo_dl/extractor/youporn.py b/hypervideo_dl/extractor/youporn.py
index 7084d3d..5feb568 100644
--- a/hypervideo_dl/extractor/youporn.py
+++ b/hypervideo_dl/extractor/youporn.py
@@ -74,7 +74,7 @@ class YouPornIE(InfoExtractor):
webpage)
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
diff --git a/hypervideo_dl/extractor/youtube.py b/hypervideo_dl/extractor/youtube.py
index 2272a02..dc5ee63 100644
--- a/hypervideo_dl/extractor/youtube.py
+++ b/hypervideo_dl/extractor/youtube.py
@@ -2,11 +2,17 @@
from __future__ import unicode_literals
+import base64
+import calendar
+import copy
+import datetime
+import hashlib
import itertools
import json
import os.path
import random
import re
+import time
import traceback
from .common import InfoExtractor, SearchInfoExtractor
@@ -22,231 +28,250 @@ from ..compat import (
)
from ..jsinterp import JSInterpreter
from ..utils import (
- ExtractorError,
+ bytes_to_intlist,
clean_html,
+ datetime_from_str,
dict_get,
+ error_to_compat_str,
+ ExtractorError,
float_or_none,
+ format_field,
int_or_none,
+ intlist_to_bytes,
+ is_html,
mimetype2ext,
+ network_exceptions,
+ orderedSet,
parse_codecs,
+ parse_count,
parse_duration,
+ parse_iso8601,
+ parse_qs,
qualities,
+ remove_end,
remove_start,
smuggle_url,
str_or_none,
str_to_int,
+ traverse_obj,
try_get,
unescapeHTML,
unified_strdate,
unsmuggle_url,
update_url_query,
url_or_none,
- urlencode_postdata,
urljoin,
+ variadic,
)
-def parse_qs(url):
- return compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+# any clients starting with _ cannot be explicity requested by the user
+INNERTUBE_CLIENTS = {
+ 'web': {
+ 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'WEB',
+ 'clientVersion': '2.20210622.10.00',
+ }
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 1
+ },
+ 'web_embedded': {
+ 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'WEB_EMBEDDED_PLAYER',
+ 'clientVersion': '1.20210620.0.1',
+ },
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 56
+ },
+ 'web_music': {
+ 'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
+ 'INNERTUBE_HOST': 'music.youtube.com',
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'WEB_REMIX',
+ 'clientVersion': '1.20210621.00.00',
+ }
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 67,
+ },
+ 'web_creator': {
+ 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'WEB_CREATOR',
+ 'clientVersion': '1.20210621.00.00',
+ }
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 62,
+ },
+ 'android': {
+ 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'ANDROID',
+ 'clientVersion': '16.20',
+ }
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
+ 'REQUIRE_JS_PLAYER': False
+ },
+ 'android_embedded': {
+ 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'ANDROID_EMBEDDED_PLAYER',
+ 'clientVersion': '16.20',
+ },
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 55,
+ 'REQUIRE_JS_PLAYER': False
+ },
+ 'android_music': {
+ 'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
+ 'INNERTUBE_HOST': 'music.youtube.com',
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'ANDROID_MUSIC',
+ 'clientVersion': '4.32',
+ }
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 21,
+ 'REQUIRE_JS_PLAYER': False
+ },
+ 'android_creator': {
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'ANDROID_CREATOR',
+ 'clientVersion': '21.24.100',
+ },
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 14,
+ 'REQUIRE_JS_PLAYER': False
+ },
+ # ios has HLS live streams
+ # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680
+ 'ios': {
+ 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'IOS',
+ 'clientVersion': '16.20',
+ }
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 5,
+ 'REQUIRE_JS_PLAYER': False
+ },
+ 'ios_embedded': {
+ 'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'IOS_MESSAGES_EXTENSION',
+ 'clientVersion': '16.20',
+ },
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 66,
+ 'REQUIRE_JS_PLAYER': False
+ },
+ 'ios_music': {
+ 'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og',
+ 'INNERTUBE_HOST': 'music.youtube.com',
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'IOS_MUSIC',
+ 'clientVersion': '4.32',
+ },
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 26,
+ 'REQUIRE_JS_PLAYER': False
+ },
+ 'ios_creator': {
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'IOS_CREATOR',
+ 'clientVersion': '21.24.100',
+ },
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 15,
+ 'REQUIRE_JS_PLAYER': False
+ },
+ # mweb has 'ultralow' formats
+ # See: https://github.com/hypervideo/hypervideo/pull/557
+ 'mweb': {
+ 'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'MWEB',
+ 'clientVersion': '2.20210721.07.00',
+ }
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 2
+ },
+}
+
+
+def build_innertube_clients():
+ third_party = {
+ 'embedUrl': 'https://google.com', # Can be any valid URL
+ }
+ base_clients = ('android', 'web', 'ios', 'mweb')
+ priority = qualities(base_clients[::-1])
+
+ for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()):
+ ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8')
+ ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com')
+ ytcfg.setdefault('REQUIRE_JS_PLAYER', True)
+ ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')
+ ytcfg['priority'] = 10 * priority(client.split('_', 1)[0])
+
+ if client in base_clients:
+ INNERTUBE_CLIENTS[f'{client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg)
+ agegate_ytcfg['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED'
+ agegate_ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party
+ agegate_ytcfg['priority'] -= 1
+ elif client.endswith('_embedded'):
+ ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party
+ ytcfg['priority'] -= 2
+ else:
+ ytcfg['priority'] -= 3
+
+
+build_innertube_clients()
class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors"""
- _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
- _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
- _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
- _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
- _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
+ _RESERVED_NAMES = (
+ r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|clip|'
+ r'shorts|movies|results|shared|hashtag|trending|feed|feeds|'
+ r'browse|oembed|get_video_info|iframe_api|s/player|'
+ r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')
+
+ _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
_NETRC_MACHINE = 'youtube'
+
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
- _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)'
-
def _login(self):
"""
Attempt to log in to YouTube.
- True is returned if successful or skipped.
- False is returned if login failed.
-
If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
"""
- username, password = self._get_login_info()
- # No authentication to be performed
- if username is None:
- if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
- raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
- return True
-
- login_page = self._download_webpage(
- self._LOGIN_URL, None,
- note='Downloading login page',
- errnote='unable to fetch login page', fatal=False)
- if login_page is False:
- return
-
- login_form = self._hidden_inputs(login_page)
-
- def req(url, f_req, note, errnote):
- data = login_form.copy()
- data.update({
- 'pstMsg': 1,
- 'checkConnection': 'youtube',
- 'checkedDomains': 'youtube',
- 'hl': 'en',
- 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
- 'f.req': json.dumps(f_req),
- 'flowName': 'GlifWebSignIn',
- 'flowEntry': 'ServiceLogin',
- # TODO: reverse actual botguard identifier generation algo
- 'bgRequest': '["identifier",""]',
- })
- return self._download_json(
- url, None, note=note, errnote=errnote,
- transform_source=lambda s: re.sub(r'^[^[]*', '', s),
- fatal=False,
- data=urlencode_postdata(data), headers={
- 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
- 'Google-Accounts-XSRF': 1,
- })
-
- def warn(message):
- self._downloader.report_warning(message)
-
- lookup_req = [
- username,
- None, [], None, 'US', None, None, 2, False, True,
- [
- None, None,
- [2, 1, None, 1,
- 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
- None, [], 4],
- 1, [None, None, []], None, None, None, True
- ],
- username,
- ]
-
- lookup_results = req(
- self._LOOKUP_URL, lookup_req,
- 'Looking up account info', 'Unable to look up account info')
-
- if lookup_results is False:
- return False
-
- user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
- if not user_hash:
- warn('Unable to extract user hash')
- return False
-
- challenge_req = [
- user_hash,
- None, 1, None, [1, None, None, None, [password, None, True]],
- [
- None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
- 1, [None, None, []], None, None, None, True
- ]]
-
- challenge_results = req(
- self._CHALLENGE_URL, challenge_req,
- 'Logging in', 'Unable to log in')
- if challenge_results is False:
- return
-
- login_res = try_get(challenge_results, lambda x: x[0][5], list)
- if login_res:
- login_msg = try_get(login_res, lambda x: x[5], compat_str)
- warn(
- 'Unable to login: %s' % 'Invalid password'
- if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
- return False
-
- res = try_get(challenge_results, lambda x: x[0][-1], list)
- if not res:
- warn('Unable to extract result entry')
- return False
-
- login_challenge = try_get(res, lambda x: x[0][0], list)
- if login_challenge:
- challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
- if challenge_str == 'TWO_STEP_VERIFICATION':
- # SEND_SUCCESS - TFA code has been successfully sent to phone
- # QUOTA_EXCEEDED - reached the limit of TFA codes
- status = try_get(login_challenge, lambda x: x[5], compat_str)
- if status == 'QUOTA_EXCEEDED':
- warn('Exceeded the limit of TFA codes, try later')
- return False
-
- tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
- if not tl:
- warn('Unable to extract TL')
- return False
-
- tfa_code = self._get_tfa_info('2-step verification code')
-
- if not tfa_code:
- warn(
- 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
- '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
- return False
-
- tfa_code = remove_start(tfa_code, 'G-')
-
- tfa_req = [
- user_hash, None, 2, None,
- [
- 9, None, None, None, None, None, None, None,
- [None, tfa_code, True, 2]
- ]]
-
- tfa_results = req(
- self._TFA_URL.format(tl), tfa_req,
- 'Submitting TFA code', 'Unable to submit TFA code')
-
- if tfa_results is False:
- return False
-
- tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
- if tfa_res:
- tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
- warn(
- 'Unable to finish TFA: %s' % 'Invalid TFA code'
- if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
- return False
-
- check_cookie_url = try_get(
- tfa_results, lambda x: x[0][-1][2], compat_str)
- else:
- CHALLENGES = {
- 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
- 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
- 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
- }
- challenge = CHALLENGES.get(
- challenge_str,
- '%s returned error %s.' % (self.IE_NAME, challenge_str))
- warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
- return False
- else:
- check_cookie_url = try_get(res, lambda x: x[2], compat_str)
-
- if not check_cookie_url:
- warn('Unable to extract CheckCookie URL')
- return False
-
- check_cookie_results = self._download_webpage(
- check_cookie_url, None, 'Checking cookie', fatal=False)
-
- if check_cookie_results is False:
- return False
-
- if 'https://myaccount.google.com/' not in check_cookie_results:
- warn('Unable to log in')
- return False
-
- return True
+ if (self._LOGIN_REQUIRED
+ and self.get_param('cookiefile') is None
+ and self.get_param('cookiesfrombrowser') is None):
+ self.raise_login_required(
+ 'Login details are needed to download this content', method='cookies')
+ username, password = self._get_login_info()
+ if username:
+ self.report_warning(f'Cannot login to YouTube using username and password. {self._LOGIN_HINTS["cookies"]}')
def _initialize_consent(self):
cookies = self._get_cookies('https://www.youtube.com/')
@@ -265,73 +290,402 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
def _real_initialize(self):
self._initialize_consent()
- if self._downloader is None:
- return
- if not self._login():
- return
-
- _DEFAULT_API_DATA = {
- 'context': {
- 'client': {
- 'clientName': 'WEB',
- 'clientVersion': '2.20201021.03.00',
- }
- },
- }
+ self._login()
_YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
_YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
_YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
- def _call_api(self, ep, query, video_id, fatal=True):
- data = self._DEFAULT_API_DATA.copy()
- data.update(query)
+ def _get_default_ytcfg(self, client='web'):
+ return copy.deepcopy(INNERTUBE_CLIENTS[client])
+
+ def _get_innertube_host(self, client='web'):
+ return INNERTUBE_CLIENTS[client]['INNERTUBE_HOST']
+
+ def _ytcfg_get_safe(self, ytcfg, getter, expected_type=None, default_client='web'):
+ # try_get but with fallback to default ytcfg client values when present
+ _func = lambda y: try_get(y, getter, expected_type)
+ return _func(ytcfg) or _func(self._get_default_ytcfg(default_client))
+
+ def _extract_client_name(self, ytcfg, default_client='web'):
+ return self._ytcfg_get_safe(
+ ytcfg, (lambda x: x['INNERTUBE_CLIENT_NAME'],
+ lambda x: x['INNERTUBE_CONTEXT']['client']['clientName']), compat_str, default_client)
+ def _extract_client_version(self, ytcfg, default_client='web'):
+ return self._ytcfg_get_safe(
+ ytcfg, (lambda x: x['INNERTUBE_CLIENT_VERSION'],
+ lambda x: x['INNERTUBE_CONTEXT']['client']['clientVersion']), compat_str, default_client)
+
+ def _extract_api_key(self, ytcfg=None, default_client='web'):
+ return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client)
+
+ def _extract_context(self, ytcfg=None, default_client='web'):
+ _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict)
+ context = _get_context(ytcfg)
+ if context:
+ return context
+
+ context = _get_context(self._get_default_ytcfg(default_client))
+ if not ytcfg:
+ return context
+
+ # Recreate the client context (required)
+ context['client'].update({
+ 'clientVersion': self._extract_client_version(ytcfg, default_client),
+ 'clientName': self._extract_client_name(ytcfg, default_client),
+ })
+ visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str)
+ if visitor_data:
+ context['client']['visitorData'] = visitor_data
+ return context
+
+ _SAPISID = None
+
+ def _generate_sapisidhash_header(self, origin='https://www.youtube.com'):
+ time_now = round(time.time())
+ if self._SAPISID is None:
+ yt_cookies = self._get_cookies('https://www.youtube.com')
+ # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is.
+ # See: https://github.com/hypervideo/hypervideo/issues/393
+ sapisid_cookie = dict_get(
+ yt_cookies, ('__Secure-3PAPISID', 'SAPISID'))
+ if sapisid_cookie and sapisid_cookie.value:
+ self._SAPISID = sapisid_cookie.value
+ self.write_debug('Extracted SAPISID cookie')
+ # SAPISID cookie is required if not already present
+ if not yt_cookies.get('SAPISID'):
+ self.write_debug('Copying __Secure-3PAPISID cookie to SAPISID cookie')
+ self._set_cookie(
+ '.youtube.com', 'SAPISID', self._SAPISID, secure=True, expire_time=time_now + 3600)
+ else:
+ self._SAPISID = False
+ if not self._SAPISID:
+ return None
+ # SAPISIDHASH algorithm from https://stackoverflow.com/a/32065323
+ sapisidhash = hashlib.sha1(
+ f'{time_now} {self._SAPISID} {origin}'.encode('utf-8')).hexdigest()
+ return f'SAPISIDHASH {time_now}_{sapisidhash}'
+
+ def _call_api(self, ep, query, video_id, fatal=True, headers=None,
+ note='Downloading API JSON', errnote='Unable to download API page',
+ context=None, api_key=None, api_hostname=None, default_client='web'):
+
+ data = {'context': context} if context else {'context': self._extract_context(default_client=default_client)}
+ data.update(query)
+ real_headers = self.generate_api_headers(default_client=default_client)
+ real_headers.update({'content-type': 'application/json'})
+ if headers:
+ real_headers.update(headers)
return self._download_json(
- 'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
- note='Downloading API JSON', errnote='Unable to download API page',
- data=json.dumps(data).encode('utf8'), fatal=fatal,
- headers={'content-type': 'application/json'},
- query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})
+ 'https://%s/youtubei/v1/%s' % (api_hostname or self._get_innertube_host(default_client), ep),
+ video_id=video_id, fatal=fatal, note=note, errnote=errnote,
+ data=json.dumps(data).encode('utf8'), headers=real_headers,
+ query={'key': api_key or self._extract_api_key()})
+
+ def extract_yt_initial_data(self, item_id, webpage, fatal=True):
+ data = self._search_regex(
+ (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
+ self._YT_INITIAL_DATA_RE), webpage, 'yt initial data', fatal=fatal)
+ if data:
+ return self._parse_json(data, item_id, fatal=fatal)
- def _extract_yt_initial_data(self, video_id, webpage):
- return self._parse_json(
- self._search_regex(
- (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
- self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
- video_id)
+ @staticmethod
+ def _extract_session_index(*data):
+ """
+ Index of current account in account list.
+ See: https://github.com/hypervideo/hypervideo/pull/519
+ """
+ for ytcfg in data:
+ session_index = int_or_none(try_get(ytcfg, lambda x: x['SESSION_INDEX']))
+ if session_index is not None:
+ return session_index
+
+ # Deprecated?
+ def _extract_identity_token(self, ytcfg=None, webpage=None):
+ if ytcfg:
+ token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
+ if token:
+ return token
+ if webpage:
+ return self._search_regex(
+ r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
+ 'identity token', default=None, fatal=False)
+
+ @staticmethod
+ def _extract_account_syncid(*args):
+ """
+ Extract syncId required to download private playlists of secondary channels
+ @params response and/or ytcfg
+ """
+ for data in args:
+ # ytcfg includes channel_syncid if on secondary channel
+ delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], compat_str)
+ if delegated_sid:
+ return delegated_sid
+ sync_ids = (try_get(
+ data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'],
+ lambda x: x['DATASYNC_ID']), compat_str) or '').split('||')
+ if len(sync_ids) >= 2 and sync_ids[1]:
+ # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
+ # and just "user_syncid||" for primary channel. We only want the channel_syncid
+ return sync_ids[0]
- def _extract_ytcfg(self, video_id, webpage):
+ @staticmethod
+ def _extract_visitor_data(*args):
+ """
+ Extracts visitorData from an API response or ytcfg
+ Appears to be used to track session state
+ """
+ return traverse_obj(
+ args, (..., ('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))),
+ expected_type=compat_str, get_all=False)
+
+ @property
+ def is_authenticated(self):
+ return bool(self._generate_sapisidhash_header())
+
+ def extract_ytcfg(self, video_id, webpage):
+ if not webpage:
+ return {}
return self._parse_json(
self._search_regex(
r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
default='{}'), video_id, fatal=False) or {}
+ def generate_api_headers(
+ self, *, ytcfg=None, account_syncid=None, session_index=None,
+ visitor_data=None, identity_token=None, api_hostname=None, default_client='web'):
+
+ origin = 'https://' + (api_hostname if api_hostname else self._get_innertube_host(default_client))
+ headers = {
+ 'X-YouTube-Client-Name': compat_str(
+ self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)),
+ 'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client),
+ 'Origin': origin,
+ 'X-Youtube-Identity-Token': identity_token or self._extract_identity_token(ytcfg),
+ 'X-Goog-PageId': account_syncid or self._extract_account_syncid(ytcfg),
+ 'X-Goog-Visitor-Id': visitor_data or self._extract_visitor_data(ytcfg)
+ }
+ if session_index is None:
+ session_index = self._extract_session_index(ytcfg)
+ if account_syncid or session_index is not None:
+ headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0
+
+ auth = self._generate_sapisidhash_header(origin)
+ if auth is not None:
+ headers['Authorization'] = auth
+ headers['X-Origin'] = origin
+ return {h: v for h, v in headers.items() if v is not None}
+
+ @staticmethod
+ def _build_api_continuation_query(continuation, ctp=None):
+ query = {
+ 'continuation': continuation
+ }
+ # TODO: Inconsistency with clickTrackingParams.
+ # Currently we have a fixed ctp contained within context (from ytcfg)
+ # and a ctp in root query for continuation.
+ if ctp:
+ query['clickTracking'] = {'clickTrackingParams': ctp}
+ return query
+
+ @classmethod
+ def _extract_next_continuation_data(cls, renderer):
+ next_continuation = try_get(
+ renderer, (lambda x: x['continuations'][0]['nextContinuationData'],
+ lambda x: x['continuation']['reloadContinuationData']), dict)
+ if not next_continuation:
+ return
+ continuation = next_continuation.get('continuation')
+ if not continuation:
+ return
+ ctp = next_continuation.get('clickTrackingParams')
+ return cls._build_api_continuation_query(continuation, ctp)
+
+ @classmethod
+ def _extract_continuation_ep_data(cls, continuation_ep: dict):
+ if isinstance(continuation_ep, dict):
+ continuation = try_get(
+ continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
+ if not continuation:
+ return
+ ctp = continuation_ep.get('clickTrackingParams')
+ return cls._build_api_continuation_query(continuation, ctp)
+
+ @classmethod
+ def _extract_continuation(cls, renderer):
+ next_continuation = cls._extract_next_continuation_data(renderer)
+ if next_continuation:
+ return next_continuation
+
+ contents = []
+ for key in ('contents', 'items'):
+ contents.extend(try_get(renderer, lambda x: x[key], list) or [])
+
+ for content in contents:
+ if not isinstance(content, dict):
+ continue
+ continuation_ep = try_get(
+ content, (lambda x: x['continuationItemRenderer']['continuationEndpoint'],
+ lambda x: x['continuationItemRenderer']['button']['buttonRenderer']['command']),
+ dict)
+ continuation = cls._extract_continuation_ep_data(continuation_ep)
+ if continuation:
+ return continuation
+
+ @classmethod
+ def _extract_alerts(cls, data):
+ for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
+ if not isinstance(alert_dict, dict):
+ continue
+ for alert in alert_dict.values():
+ alert_type = alert.get('type')
+ if not alert_type:
+ continue
+ message = cls._get_text(alert, 'text')
+ if message:
+ yield alert_type, message
+
+ def _report_alerts(self, alerts, expected=True, fatal=True, only_once=False):
+ errors = []
+ warnings = []
+ for alert_type, alert_message in alerts:
+ if alert_type.lower() == 'error' and fatal:
+ errors.append([alert_type, alert_message])
+ else:
+ warnings.append([alert_type, alert_message])
+
+ for alert_type, alert_message in (warnings + errors[:-1]):
+ self.report_warning('YouTube said: %s - %s' % (alert_type, alert_message), only_once=only_once)
+ if errors:
+ raise ExtractorError('YouTube said: %s' % errors[-1][1], expected=expected)
+
+ def _extract_and_report_alerts(self, data, *args, **kwargs):
+ return self._report_alerts(self._extract_alerts(data), *args, **kwargs)
+
+ def _extract_badges(self, renderer: dict):
+ badges = set()
+ for badge in try_get(renderer, lambda x: x['badges'], list) or []:
+ label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label'], compat_str)
+ if label:
+ badges.add(label.lower())
+ return badges
+
+ @staticmethod
+ def _get_text(data, *path_list, max_runs=None):
+ for path in path_list or [None]:
+ if path is None:
+ obj = [data]
+ else:
+ obj = traverse_obj(data, path, default=[])
+ if not any(key is ... or isinstance(key, (list, tuple)) for key in variadic(path)):
+ obj = [obj]
+ for item in obj:
+ text = try_get(item, lambda x: x['simpleText'], compat_str)
+ if text:
+ return text
+ runs = try_get(item, lambda x: x['runs'], list) or []
+ if not runs and isinstance(item, list):
+ runs = item
+
+ runs = runs[:min(len(runs), max_runs or len(runs))]
+ text = ''.join(traverse_obj(runs, (..., 'text'), expected_type=str, default=[]))
+ if text:
+ return text
+
+ def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None,
+ ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None,
+ default_client='web'):
+ response = None
+ last_error = None
+ count = -1
+ retries = self.get_param('extractor_retries', 3)
+ if check_get_keys is None:
+ check_get_keys = []
+ while count < retries:
+ count += 1
+ if last_error:
+ self.report_warning('%s. Retrying ...' % remove_end(last_error, '.'))
+ try:
+ response = self._call_api(
+ ep=ep, fatal=True, headers=headers,
+ video_id=item_id, query=query,
+ context=self._extract_context(ytcfg, default_client),
+ api_key=self._extract_api_key(ytcfg, default_client),
+ api_hostname=api_hostname, default_client=default_client,
+ note='%s%s' % (note, ' (retry #%d)' % count if count else ''))
+ except ExtractorError as e:
+ if isinstance(e.cause, network_exceptions):
+ if isinstance(e.cause, compat_HTTPError) and not is_html(e.cause.read(512)):
+ e.cause.seek(0)
+ yt_error = try_get(
+ self._parse_json(e.cause.read().decode(), item_id, fatal=False),
+ lambda x: x['error']['message'], compat_str)
+ if yt_error:
+ self._report_alerts([('ERROR', yt_error)], fatal=False)
+ # Downloading page may result in intermittent 5xx HTTP error
+ # Sometimes a 404 is also recieved. See: https://github.com/ytdl-org/youtube-dl/issues/28289
+ # We also want to catch all other network exceptions since errors in later pages can be troublesome
+ # See https://github.com/hypervideo/hypervideo/issues/507#issuecomment-880188210
+ if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (403, 429):
+ last_error = error_to_compat_str(e.cause or e.msg)
+ if count < retries:
+ continue
+ if fatal:
+ raise
+ else:
+ self.report_warning(error_to_compat_str(e))
+ return
+
+ else:
+ try:
+ self._extract_and_report_alerts(response, only_once=True)
+ except ExtractorError as e:
+ # YouTube servers may return errors we want to retry on in a 200 OK response
+ # See: https://github.com/hypervideo/hypervideo/issues/839
+ if 'unknown error' in e.msg.lower():
+ last_error = e.msg
+ continue
+ if fatal:
+ raise
+ self.report_warning(error_to_compat_str(e))
+ return
+ if not check_get_keys or dict_get(response, check_get_keys):
+ break
+ # Youtube sometimes sends incomplete data
+ # See: https://github.com/ytdl-org/youtube-dl/issues/28194
+ last_error = 'Incomplete data received'
+ if count >= retries:
+ if fatal:
+ raise ExtractorError(last_error)
+ else:
+ self.report_warning(last_error)
+ return
+ return response
+
+ @staticmethod
+ def is_music_url(url):
+ return re.match(r'https?://music\.youtube\.com/', url) is not None
+
def _extract_video(self, renderer):
- video_id = renderer['videoId']
- title = try_get(
- renderer,
- (lambda x: x['title']['runs'][0]['text'],
- lambda x: x['title']['simpleText']), compat_str)
- description = try_get(
- renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
- compat_str)
- duration = parse_duration(try_get(
- renderer, lambda x: x['lengthText']['simpleText'], compat_str))
- view_count_text = try_get(
- renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
+ video_id = renderer.get('videoId')
+ title = self._get_text(renderer, 'title')
+ description = self._get_text(renderer, 'descriptionSnippet')
+ duration = parse_duration(self._get_text(
+ renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
+ view_count_text = self._get_text(renderer, 'viewCountText') or ''
view_count = str_to_int(self._search_regex(
r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
'view count', default=None))
- uploader = try_get(
- renderer,
- (lambda x: x['ownerText']['runs'][0]['text'],
- lambda x: x['shortBylineText']['runs'][0]['text']), compat_str)
+
+ uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
+
return {
'_type': 'url',
'ie_key': YoutubeIE.ie_key(),
'id': video_id,
- 'url': video_id,
+ 'url': f'https://www.youtube.com/watch?v={video_id}',
'title': title,
'description': description,
'duration': duration,
@@ -347,13 +701,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
r'(?:www\.)?redirect\.invidious\.io',
r'(?:(?:www|dev)\.)?invidio\.us',
# Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
+ r'(?:www\.)?invidious\.pussthecat\.org',
+ r'(?:www\.)?invidious\.zee\.li',
+ r'(?:www\.)?invidious\.ethibox\.fr',
+ r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
+ # youtube-dl invidious instances list
r'(?:(?:www|no)\.)?invidiou\.sh',
r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
r'(?:www\.)?invidious\.kabi\.tk',
- r'(?:www\.)?invidious\.13ad\.de',
r'(?:www\.)?invidious\.mastodon\.host',
r'(?:www\.)?invidious\.zapashcanon\.fr',
- r'(?:www\.)?invidious\.kavin\.rocks',
+ r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks',
r'(?:www\.)?invidious\.tinfoil-hat\.net',
r'(?:www\.)?invidious\.himiko\.cloud',
r'(?:www\.)?invidious\.reallyancient\.tech',
@@ -380,6 +738,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
r'(?:www\.)?invidious\.toot\.koeln',
r'(?:www\.)?invidious\.fdn\.fr',
r'(?:www\.)?watch\.nettohikari\.com',
+ r'(?:www\.)?invidious\.namazso\.eu',
+ r'(?:www\.)?invidious\.silkky\.cloud',
+ r'(?:www\.)?invidious\.exonip\.de',
+ r'(?:www\.)?invidious\.riverside\.rocks',
+ r'(?:www\.)?invidious\.blamefran\.net',
+ r'(?:www\.)?invidious\.moomoo\.de',
+ r'(?:www\.)?ytb\.trom\.tf',
+ r'(?:www\.)?yt\.cyberhost\.uk',
r'(?:www\.)?kgg2m7yk5aybusll\.onion',
r'(?:www\.)?qklhadlycap4cnod\.onion',
r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
@@ -388,6 +754,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
+ r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
+ r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
+ r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
+ r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
)
_VALID_URL = r"""(?x)^
(
@@ -402,7 +772,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
- (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
+ (?:(?:v|embed|e|shorts)/(?!videoseries)) # v/ or embed/ or e/ or shorts/
|(?: # or the v= param in all its forms
(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
@@ -421,7 +791,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
)? # all until now is optional -> you can pass the naked ID
(?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
(?(1).+)? # if we found the ID, everything can follow
- $""" % {
+ (?:\#|$)""" % {
'invidious': '|'.join(_INVIDIOUS_SITES),
}
_PLAYER_INFO_RE = (
@@ -429,7 +799,116 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
)
- _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
+ _formats = {
+ '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
+ '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
+ '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
+ '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
+ '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
+ '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+ '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+ '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+ # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
+ '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
+ '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+ '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+ '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
+ '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
+ '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
+ '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
+ '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+ '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+
+
+ # 3D videos
+ '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
+ '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
+ '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
+ '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
+ '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
+ '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
+ '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
+
+ # Apple HTTP Live Streaming
+ '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
+ '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
+ '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
+ '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
+ '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
+ '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
+ '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
+ '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
+
+ # DASH mp4 video
+ '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
+ '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
+ '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
+ '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
+
+ # Dash mp4 audio
+ '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
+ '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
+ '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
+ '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
+ '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
+ '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
+ '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
+
+ # Dash webm
+ '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
+ '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
+ '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+ '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+ '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+ '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+
+ # Dash webm audio
+ '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
+ '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
+
+ # Dash webm audio with opus inside
+ '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
+ '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
+ '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
+
+ # RTMP (unnamed)
+ '_rtmp': {'protocol': 'rtmp'},
+
+ # av01 video only formats sometimes served with "unknown" codecs
+ '394': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'av01.0.00M.08'},
+ '395': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'av01.0.00M.08'},
+ '396': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'av01.0.01M.08'},
+ '397': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'av01.0.04M.08'},
+ '398': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'av01.0.05M.08'},
+ '399': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'av01.0.08M.08'},
+ '400': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'},
+ '401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'},
+ }
+ _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
_GEO_BYPASS = False
@@ -440,16 +919,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'info_dict': {
'id': 'BaW_jenozKc',
'ext': 'mp4',
- 'title': 'hypervideo test video "\'/\\ä↭𝕐',
+ 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
'uploader': 'Philipp Hagemeister',
'uploader_id': 'phihag',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
'upload_date': '20121002',
- 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for hypervideo.\n\nFor more information, contact phihag@phihag.de .',
+ 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'],
- 'tags': ['hypervideo'],
+ 'tags': ['youtube-dl'],
'duration': 10,
'view_count': int,
'like_count': int,
@@ -480,14 +959,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'info_dict': {
'id': 'BaW_jenozKc',
'ext': 'mp4',
- 'title': 'hypervideo test video "\'/\\ä↭𝕐',
+ 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
'uploader': 'Philipp Hagemeister',
'uploader_id': 'phihag',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
'upload_date': '20121002',
- 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for hypervideo.\n\nFor more information, contact phihag@phihag.de .',
+ 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'],
- 'tags': ['hypervideo'],
+ 'tags': ['youtube-dl'],
'duration': 10,
'view_count': int,
'like_count': int,
@@ -535,23 +1014,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'format': '141/bestaudio[ext=m4a]',
},
},
- # Controversy video
- {
- 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
- 'info_dict': {
- 'id': 'T4XJQO3qol8',
- 'ext': 'mp4',
- 'duration': 219,
- 'upload_date': '20100909',
- 'uploader': 'Amazing Atheist',
- 'uploader_id': 'TheAmazingAtheist',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
- 'title': 'Burning Everyone\'s Koran',
- 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
- }
- },
- # Normal age-gate video (No vevo, embed allowed), available via embed page
+ # Age-gate videos. See https://github.com/hypervideo/hypervideo/pull/575#issuecomment-888837000
{
+ 'note': 'Embed allowed age-gate video',
'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
'info_dict': {
'id': 'HtVdAasjOgU',
@@ -567,9 +1032,49 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
},
},
{
- # Age-gated video only available with authentication (unavailable
- # via embed page workaround)
- 'url': 'XgnwCQzjau8',
+ 'note': 'Age-gate video with embed allowed in public site',
+ 'url': 'https://youtube.com/watch?v=HsUATh_Nc2U',
+ 'info_dict': {
+ 'id': 'HsUATh_Nc2U',
+ 'ext': 'mp4',
+ 'title': 'Godzilla 2 (Official Video)',
+ 'description': 'md5:bf77e03fcae5529475e500129b05668a',
+ 'upload_date': '20200408',
+ 'uploader_id': 'FlyingKitty900',
+ 'uploader': 'FlyingKitty',
+ 'age_limit': 18,
+ },
+ },
+ {
+ 'note': 'Age-gate video embedable only with clientScreen=EMBED',
+ 'url': 'https://youtube.com/watch?v=Tq92D6wQ1mg',
+ 'info_dict': {
+ 'id': 'Tq92D6wQ1mg',
+ 'title': '[MMD] Adios - EVERGLOW [+Motion DL]',
+ 'ext': 'mp4',
+ 'upload_date': '20191227',
+ 'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
+ 'uploader': 'Projekt Melody',
+ 'description': 'md5:17eccca93a786d51bc67646756894066',
+ 'age_limit': 18,
+ },
+ },
+ {
+ 'note': 'Non-Agegated non-embeddable video',
+ 'url': 'https://youtube.com/watch?v=MeJVWBSsPAY',
+ 'info_dict': {
+ 'id': 'MeJVWBSsPAY',
+ 'ext': 'mp4',
+ 'title': 'OOMPH! - Such Mich Find Mich (Lyrics)',
+ 'uploader': 'Herr Lurik',
+ 'uploader_id': 'st3in234',
+ 'description': 'Fan Video. Music & Lyrics by OOMPH!.',
+ 'upload_date': '20130730',
+ },
+ },
+ {
+ 'note': 'Non-bypassable age-gated video',
+ 'url': 'https://youtube.com/watch?v=Cr381pDsSsA',
'only_matching': True,
},
# video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
@@ -604,7 +1109,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': 'olympic',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
- 'uploader': 'Olympic',
+ 'uploader': 'Olympics',
'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
},
'params': {
@@ -740,6 +1245,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'params': {
'skip_download': True,
},
+ 'skip': 'Not multifeed anymore',
},
{
# Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
@@ -769,16 +1275,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'lsguqyKfVQg',
'ext': 'mp4',
'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
- 'alt_title': 'Dark Walk - Position Music',
+ 'alt_title': 'Dark Walk',
'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
'duration': 133,
'upload_date': '20151119',
'uploader_id': 'IronSoulElf',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
'uploader': 'IronSoulElf',
- 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
- 'track': 'Dark Walk - Position Music',
- 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
+ 'creator': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan',
+ 'track': 'Dark Walk',
+ 'artist': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan',
'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
},
'params': {
@@ -1089,6 +1595,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'only_matching': True,
},
{
+ # controversial video, requires bpctr/contentCheckOk
+ 'url': 'https://www.youtube.com/watch?v=SZJvDhaSDnc',
+ 'info_dict': {
+ 'id': 'SZJvDhaSDnc',
+ 'ext': 'mp4',
+ 'title': 'San Diego teen commits suicide after bullying over embarrassing video',
+ 'channel_id': 'UC-SJ6nODDmufqBzPBwCvYvQ',
+ 'uploader': 'CBS This Morning',
+ 'uploader_id': 'CBSThisMorning',
+ 'upload_date': '20140716',
+ 'description': 'md5:acde3a73d3f133fc97e837a9f76b53b7'
+ }
+ },
+ {
# restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685
'url': 'cBvYw8_A0vQ',
'info_dict': {
@@ -1104,119 +1624,55 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'params': {
'skip_download': True,
},
+ }, {
+ # Has multiple audio streams
+ 'url': 'WaOKSUlf4TM',
+ 'only_matching': True
+ }, {
+ # Requires Premium: has format 141 when requested using YTM url
+ 'url': 'https://music.youtube.com/watch?v=XclachpHxis',
+ 'only_matching': True
+ }, {
+ # multiple subtitles with same lang_code
+ 'url': 'https://www.youtube.com/watch?v=wsQiKKfKxug',
+ 'only_matching': True,
+ }, {
+ # Force use android client fallback
+ 'url': 'https://www.youtube.com/watch?v=YOelRv7fMxY',
+ 'info_dict': {
+ 'id': 'YOelRv7fMxY',
+ 'title': 'DIGGING A SECRET TUNNEL Part 1',
+ 'ext': '3gp',
+ 'upload_date': '20210624',
+ 'channel_id': 'UCp68_FLety0O-n9QU6phsgw',
+ 'uploader': 'colinfurze',
+ 'uploader_id': 'colinfurze',
+ 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCp68_FLety0O-n9QU6phsgw',
+ 'description': 'md5:b5096f56af7ccd7a555c84db81738b22'
+ },
+ 'params': {
+ 'format': '17', # 3gp format available on android
+ 'extractor_args': {'youtube': {'player_client': ['android']}},
+ },
+ },
+ {
+ # Skip download of additional client configs (remix client config in this case)
+ 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
+ 'only_matching': True,
+ 'params': {
+ 'extractor_args': {'youtube': {'player_skip': ['configs']}},
+ },
+ }, {
+ # shorts
+ 'url': 'https://www.youtube.com/shorts/BGQWPY4IigY',
+ 'only_matching': True,
},
]
- _formats = {
- '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
- '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
- '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
- '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
- '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
- '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
- '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
- '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
- # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
- '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
- '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
- '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
- '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
- '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
- '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
- '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
- '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
- '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
-
-
- # 3D videos
- '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
- '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
- '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
- '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
- '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
- '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
- '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
-
- # Apple HTTP Live Streaming
- '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
- '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
- '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
- '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
- '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
- '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
- '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
- '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
-
- # DASH mp4 video
- '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
- '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
- '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
- '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
- '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
- '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
- '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
- '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
- '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
- '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
- '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
- '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
-
- # Dash mp4 audio
- '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
- '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
- '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
- '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
- '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
- '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
- '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
-
- # Dash webm
- '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
- '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
- '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
- '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
- '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
- '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
- '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
- '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
- '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
- '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
- '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
- '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
-
- # Dash webm audio
- '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
- '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
-
- # Dash webm audio with opus inside
- '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
- '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
- '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
-
- # RTMP (unnamed)
- '_rtmp': {'protocol': 'rtmp'},
-
- # av01 video only formats sometimes served with "unknown" codecs
- '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
- '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
- '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
- '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
- }
@classmethod
def suitable(cls, url):
- # Hack for lazy extractors until more generic solution is implemented
- # (see #28780)
- from .youtube import parse_qs
+ from ..utils import parse_qs
+
qs = parse_qs(url)
if qs.get('list', [None])[0]:
return False
@@ -1227,6 +1683,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self._code_cache = {}
self._player_cache = {}
+ def _extract_player_url(self, *ytcfgs, webpage=None):
+ player_url = traverse_obj(
+ ytcfgs, (..., 'PLAYER_JS_URL'), (..., 'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'jsUrl'),
+ get_all=False, expected_type=compat_str)
+ if not player_url:
+ return
+ if player_url.startswith('//'):
+ player_url = 'https:' + player_url
+ elif not re.match(r'https?://', player_url):
+ player_url = compat_urlparse.urljoin(
+ 'https://www.youtube.com', player_url)
+ return player_url
+
+ def _download_player_url(self, video_id, fatal=False):
+ res = self._download_webpage(
+ 'https://www.youtube.com/iframe_api',
+ note='Downloading iframe API JS', video_id=video_id, fatal=fatal)
+ if res:
+ player_version = self._search_regex(
+ r'player\\?/([0-9a-fA-F]{8})\\?/', res, 'player version', fatal=fatal)
+ if player_version:
+ return f'https://www.youtube.com/s/player/{player_version}/player_ias.vflset/en_US/base.js'
+
def _signature_cache_id(self, example_sig):
""" Return a string representation of a signature """
return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
@@ -1241,6 +1720,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
raise ExtractorError('Cannot identify player %r' % player_url)
return id_m.group('id')
+ def _load_player(self, video_id, player_url, fatal=True) -> bool:
+ player_id = self._extract_player_info(player_url)
+ if player_id not in self._code_cache:
+ code = self._download_webpage(
+ player_url, video_id, fatal=fatal,
+ note='Downloading player ' + player_id,
+ errnote='Download of %s failed' % player_url)
+ if code:
+ self._code_cache[player_id] = code
+ return player_id in self._code_cache
+
def _extract_signature_function(self, video_id, player_url, example_sig):
player_id = self._extract_player_info(player_url)
@@ -1253,20 +1743,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if cache_spec is not None:
return lambda s: ''.join(s[i] for i in cache_spec)
- if player_id not in self._code_cache:
- self._code_cache[player_id] = self._download_webpage(
- player_url, video_id,
- note='Downloading player ' + player_id,
- errnote='Download of %s failed' % player_url)
- code = self._code_cache[player_id]
- res = self._parse_sig_js(code)
+ if self._load_player(video_id, player_url):
+ code = self._code_cache[player_id]
+ res = self._parse_sig_js(code)
- test_string = ''.join(map(compat_chr, range(len(example_sig))))
- cache_res = res(test_string)
- cache_spec = [ord(c) for c in cache_res]
+ test_string = ''.join(map(compat_chr, range(len(example_sig))))
+ cache_res = res(test_string)
+ cache_spec = [ord(c) for c in cache_res]
- self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
- return res
+ self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
+ return res
def _print_sig_code(self, func, example_sig):
def gen_sig_code(idxs):
@@ -1311,10 +1797,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
funcname = self._search_regex(
(r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
- r'\bm=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)',
- r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)',
- r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
- r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
+ r'\bm=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)',
+ r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)',
+ r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
+ r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
# Obsolete patterns
r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
@@ -1337,11 +1823,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if player_url is None:
raise ExtractorError('Cannot decrypt signature without player_url')
- if player_url.startswith('//'):
- player_url = 'https:' + player_url
- elif not re.match(r'https?://', player_url):
- player_url = compat_urlparse.urljoin(
- 'https://www.youtube.com', player_url)
try:
player_id = (player_url, self._signature_cache_id(s))
if player_id not in self._player_cache:
@@ -1350,7 +1831,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
)
self._player_cache[player_id] = func
func = self._player_cache[player_id]
- if self._downloader.params.get('youtube_print_sig_code'):
+ if self.get_param('youtube_print_sig_code'):
self._print_sig_code(func, s)
return func(s)
except Exception as e:
@@ -1358,11 +1839,37 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
raise ExtractorError(
'Signature extraction failed: ' + tb, cause=e)
- def _mark_watched(self, video_id, player_response):
- playback_url = url_or_none(try_get(
- player_response,
- lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']))
+ def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False):
+ """
+ Extract signatureTimestamp (sts)
+ Required to tell API what sig/player version is in use.
+ """
+ sts = None
+ if isinstance(ytcfg, dict):
+ sts = int_or_none(ytcfg.get('STS'))
+
+ if not sts:
+ # Attempt to extract from player
+ if player_url is None:
+ error_msg = 'Cannot extract signature timestamp without player_url.'
+ if fatal:
+ raise ExtractorError(error_msg)
+ self.report_warning(error_msg)
+ return
+ if self._load_player(video_id, player_url, fatal=fatal):
+ player_id = self._extract_player_info(player_url)
+ code = self._code_cache[player_id]
+ sts = int_or_none(self._search_regex(
+ r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code,
+ 'JS player signature timestamp', group='sts', fatal=fatal))
+ return sts
+
+ def _mark_watched(self, video_id, player_responses):
+ playback_url = traverse_obj(
+ player_responses, (..., 'playbackTracking', 'videostatsPlaybackUrl', 'baseUrl'),
+ expected_type=url_or_none, get_all=False)
if not playback_url:
+ self.report_warning('Unable to mark watched')
return
parsed_playback_url = compat_urlparse.urlparse(playback_url)
qs = compat_urlparse.parse_qs(parsed_playback_url.query)
@@ -1425,47 +1932,58 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
if mobj is None:
raise ExtractorError('Invalid URL: %s' % url)
- video_id = mobj.group(2)
- return video_id
-
- def _extract_chapters_from_json(self, data, video_id, duration):
- chapters_list = try_get(
+ return mobj.group('id')
+
+ def _extract_chapters_from_json(self, data, duration):
+ chapter_list = traverse_obj(
+ data, (
+ 'playerOverlays', 'playerOverlayRenderer', 'decoratedPlayerBarRenderer',
+ 'decoratedPlayerBarRenderer', 'playerBar', 'chapteredPlayerBarRenderer', 'chapters'
+ ), expected_type=list)
+
+ return self._extract_chapters(
+ chapter_list,
+ chapter_time=lambda chapter: float_or_none(
+ traverse_obj(chapter, ('chapterRenderer', 'timeRangeStartMillis')), scale=1000),
+ chapter_title=lambda chapter: traverse_obj(
+ chapter, ('chapterRenderer', 'title', 'simpleText'), expected_type=str),
+ duration=duration)
+
+ def _extract_chapters_from_engagement_panel(self, data, duration):
+ content_list = traverse_obj(
data,
- lambda x: x['playerOverlays']
- ['playerOverlayRenderer']
- ['decoratedPlayerBarRenderer']
- ['decoratedPlayerBarRenderer']
- ['playerBar']
- ['chapteredPlayerBarRenderer']
- ['chapters'],
- list)
- if not chapters_list:
- return
-
- def chapter_time(chapter):
- return float_or_none(
- try_get(
- chapter,
- lambda x: x['chapterRenderer']['timeRangeStartMillis'],
- int),
- scale=1000)
+ ('engagementPanels', ..., 'engagementPanelSectionListRenderer', 'content', 'macroMarkersListRenderer', 'contents'),
+ expected_type=list, default=[])
+ chapter_time = lambda chapter: parse_duration(self._get_text(chapter, 'timeDescription'))
+ chapter_title = lambda chapter: self._get_text(chapter, 'title')
+
+ return next((
+ filter(None, (
+ self._extract_chapters(
+ traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
+ chapter_time, chapter_title, duration)
+ for contents in content_list
+ ))), [])
+
+ def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration):
chapters = []
- for next_num, chapter in enumerate(chapters_list, start=1):
+ last_chapter = {'start_time': 0}
+ for idx, chapter in enumerate(chapter_list or []):
+ title = chapter_title(chapter)
start_time = chapter_time(chapter)
if start_time is None:
continue
- end_time = (chapter_time(chapters_list[next_num])
- if next_num < len(chapters_list) else duration)
- if end_time is None:
- continue
- title = try_get(
- chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
- compat_str)
- chapters.append({
- 'start_time': start_time,
- 'end_time': end_time,
- 'title': title,
- })
+ last_chapter['end_time'] = start_time
+ if start_time < last_chapter['start_time']:
+ if idx == 1:
+ chapters.pop()
+ self.report_warning('Invalid start time for chapter "%s"' % last_chapter['title'])
+ else:
+ self.report_warning(f'Invalid start time for chapter "{title}"')
+ continue
+ last_chapter = {'start_time': start_time, 'title': title}
+ chapters.append(last_chapter)
+ last_chapter['end_time'] = duration
return chapters
def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
@@ -1473,132 +1991,436 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
regex), webpage, name, default='{}'), video_id, fatal=False)
- def _real_extract(self, url):
- url, smuggled_data = unsmuggle_url(url, {})
- video_id = self._match_id(url)
- base_url = self.http_scheme() + '//www.youtube.com/'
- webpage_url = base_url + 'watch?v=' + video_id
- webpage = self._download_webpage(
- webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
+ @staticmethod
+ def parse_time_text(time_text):
+ """
+ Parse the comment time text
+ time_text is in the format 'X units ago (edited)'
+ """
+ time_text_split = time_text.split(' ')
+ if len(time_text_split) >= 3:
+ try:
+ return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto')
+ except ValueError:
+ return None
+
+ def _extract_comment(self, comment_renderer, parent=None):
+ comment_id = comment_renderer.get('commentId')
+ if not comment_id:
+ return
- player_response = None
- if webpage:
- player_response = self._extract_yt_initial_variable(
- webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
- video_id, 'initial player response')
- if not player_response:
- player_response = self._call_api(
- 'player', {'videoId': video_id}, video_id)
-
- playability_status = player_response.get('playabilityStatus') or {}
- if playability_status.get('reason') == 'Sign in to confirm your age':
- pr = self._parse_json(try_get(compat_parse_qs(
- self._download_webpage(
- base_url + 'get_video_info', video_id,
- 'Refetching age-gated info webpage',
- 'unable to download video info webpage', query={
- 'video_id': video_id,
- 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
- 'html5': 1,
- }, fatal=False)),
- lambda x: x['player_response'][0],
- compat_str) or '{}', video_id)
- if pr:
- player_response = pr
+ text = self._get_text(comment_renderer, 'contentText')
+
+ # note: timestamp is an estimate calculated from the current time and time_text
+ time_text = self._get_text(comment_renderer, 'publishedTimeText') or ''
+ time_text_dt = self.parse_time_text(time_text)
+ if isinstance(time_text_dt, datetime.datetime):
+ timestamp = calendar.timegm(time_text_dt.timetuple())
+ author = self._get_text(comment_renderer, 'authorText')
+ author_id = try_get(comment_renderer,
+ lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
+
+ votes = parse_count(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'],
+ lambda x: x['likeCount']), compat_str)) or 0
+ author_thumbnail = try_get(comment_renderer,
+ lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], compat_str)
+
+ author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
+ is_favorited = 'creatorHeart' in (try_get(
+ comment_renderer, lambda x: x['actionButtons']['commentActionButtonsRenderer'], dict) or {})
+ return {
+ 'id': comment_id,
+ 'text': text,
+ 'timestamp': timestamp,
+ 'time_text': time_text,
+ 'like_count': votes,
+ 'is_favorited': is_favorited,
+ 'author': author,
+ 'author_id': author_id,
+ 'author_thumbnail': author_thumbnail,
+ 'author_is_uploader': author_is_uploader,
+ 'parent': parent or 'root'
+ }
- trailer_video_id = try_get(
- playability_status,
- lambda x: x['errorScreen']['playerLegacyDesktopYpcTrailerRenderer']['trailerVideoId'],
- compat_str)
- if trailer_video_id:
- return self.url_result(
- trailer_video_id, self.ie_key(), trailer_video_id)
+ def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, comment_counts=None):
- def get_text(x):
- if not x:
- return
- text = x.get('simpleText')
- if text and isinstance(text, compat_str):
- return text
- runs = x.get('runs')
- if not isinstance(runs, list):
- return
- return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)])
-
- search_meta = (
- lambda x: self._html_search_meta(x, webpage, default=None)) \
- if webpage else lambda x: None
-
- video_details = player_response.get('videoDetails') or {}
- microformat = try_get(
- player_response,
- lambda x: x['microformat']['playerMicroformatRenderer'],
- dict) or {}
- video_title = video_details.get('title') \
- or get_text(microformat.get('title')) \
- or search_meta(['og:title', 'twitter:title', 'title'])
- video_description = video_details.get('shortDescription')
+ def extract_header(contents):
+ _continuation = None
+ for content in contents:
+ comments_header_renderer = try_get(content, lambda x: x['commentsHeaderRenderer'])
+ expected_comment_count = parse_count(self._get_text(
+ comments_header_renderer, 'countText', 'commentsCount', max_runs=1))
+
+ if expected_comment_count:
+ comment_counts[1] = expected_comment_count
+ self.to_screen('Downloading ~%d comments' % expected_comment_count)
+ sort_mode_str = self._configuration_arg('comment_sort', [''])[0]
+ comment_sort_index = int(sort_mode_str != 'top') # 1 = new, 0 = top
+
+ sort_menu_item = try_get(
+ comments_header_renderer,
+ lambda x: x['sortMenu']['sortFilterSubMenuRenderer']['subMenuItems'][comment_sort_index], dict) or {}
+ sort_continuation_ep = sort_menu_item.get('serviceEndpoint') or {}
+
+ _continuation = self._extract_continuation_ep_data(sort_continuation_ep) or self._extract_continuation(sort_menu_item)
+ if not _continuation:
+ continue
- if not smuggled_data.get('force_singlefeed', False):
- if not self._downloader.params.get('noplaylist'):
- multifeed_metadata_list = try_get(
- player_response,
- lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
- compat_str)
- if multifeed_metadata_list:
- entries = []
- feed_ids = []
- for feed in multifeed_metadata_list.split(','):
- # Unquote should take place before split on comma (,) since textual
- # fields may contain comma as well (see
- # https://github.com/ytdl-org/youtube-dl/issues/8536)
- feed_data = compat_parse_qs(
- compat_urllib_parse_unquote_plus(feed))
+ sort_text = sort_menu_item.get('title')
+ if isinstance(sort_text, compat_str):
+ sort_text = sort_text.lower()
+ else:
+ sort_text = 'top comments' if comment_sort_index == 0 else 'newest first'
+ self.to_screen('Sorting comments by %s' % sort_text)
+ break
+ return _continuation
- def feed_entry(name):
- return try_get(
- feed_data, lambda x: x[name][0], compat_str)
+ def extract_thread(contents):
+ if not parent:
+ comment_counts[2] = 0
+ for content in contents:
+ comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
+ comment_renderer = try_get(
+ comment_thread_renderer, (lambda x: x['comment']['commentRenderer'], dict)) or try_get(
+ content, (lambda x: x['commentRenderer'], dict))
- feed_id = feed_entry('id')
- if not feed_id:
- continue
- feed_title = feed_entry('title')
- title = video_title
- if feed_title:
- title += ' (%s)' % feed_title
- entries.append({
- '_type': 'url_transparent',
- 'ie_key': 'Youtube',
- 'url': smuggle_url(
- base_url + 'watch?v=' + feed_data['id'][0],
- {'force_singlefeed': True}),
- 'title': title,
- })
- feed_ids.append(feed_id)
- self.to_screen(
- 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
- % (', '.join(feed_ids), video_id))
- return self.playlist_result(
- entries, video_id, video_title, video_description)
+ if not comment_renderer:
+ continue
+ comment = self._extract_comment(comment_renderer, parent)
+ if not comment:
+ continue
+ comment_counts[0] += 1
+ yield comment
+ # Attempt to get the replies
+ comment_replies_renderer = try_get(
+ comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)
+
+ if comment_replies_renderer:
+ comment_counts[2] += 1
+ comment_entries_iter = self._comment_entries(
+ comment_replies_renderer, ytcfg, video_id,
+ parent=comment.get('id'), comment_counts=comment_counts)
+
+ for reply_comment in comment_entries_iter:
+ yield reply_comment
+
+ # YouTube comments have a max depth of 2
+ max_depth = int_or_none(self._configuration_arg('max_comment_depth', [''])[0]) or float('inf')
+ if max_depth == 1 and parent:
+ return
+ if not comment_counts:
+ # comment so far, est. total comments, current comment thread #
+ comment_counts = [0, 0, 0]
+
+ continuation = self._extract_continuation(root_continuation_data)
+ if continuation and len(continuation['continuation']) < 27:
+ self.write_debug('Detected old API continuation token. Generating new API compatible token.')
+ continuation_token = self._generate_comment_continuation(video_id)
+ continuation = self._build_api_continuation_query(continuation_token, None)
+
+ message = self._get_text(root_continuation_data, ('contents', ..., 'messageRenderer', 'text'), max_runs=1)
+ if message and not parent:
+ self.report_warning(message, video_id=video_id)
+
+ visitor_data = None
+ is_first_continuation = parent is None
+
+ for page_num in itertools.count(0):
+ if not continuation:
+ break
+ headers = self.generate_api_headers(ytcfg=ytcfg, visitor_data=visitor_data)
+ comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
+ if page_num == 0:
+ if is_first_continuation:
+ note_prefix = 'Downloading comment section API JSON'
+ else:
+ note_prefix = ' Downloading comment API JSON reply thread %d %s' % (
+ comment_counts[2], comment_prog_str)
else:
- self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+ note_prefix = '%sDownloading comment%s API JSON page %d %s' % (
+ ' ' if parent else '', ' replies' if parent else '',
+ page_num, comment_prog_str)
+
+ response = self._extract_response(
+ item_id=None, query=continuation,
+ ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
+ check_get_keys=('onResponseReceivedEndpoints', 'continuationContents'))
+ if not response:
+ break
+ visitor_data = try_get(
+ response,
+ lambda x: x['responseContext']['webResponseContextExtensionData']['ytConfigData']['visitorData'],
+ compat_str) or visitor_data
+
+ continuation_contents = dict_get(response, ('onResponseReceivedEndpoints', 'continuationContents'))
+
+ continuation = None
+ if isinstance(continuation_contents, list):
+ for continuation_section in continuation_contents:
+ if not isinstance(continuation_section, dict):
+ continue
+ continuation_items = try_get(
+ continuation_section,
+ (lambda x: x['reloadContinuationItemsCommand']['continuationItems'],
+ lambda x: x['appendContinuationItemsAction']['continuationItems']),
+ list) or []
+ if is_first_continuation:
+ continuation = extract_header(continuation_items)
+ is_first_continuation = False
+ if continuation:
+ break
+ continue
+ count = 0
+ for count, entry in enumerate(extract_thread(continuation_items)):
+ yield entry
+ continuation = self._extract_continuation({'contents': continuation_items})
+ if continuation:
+ # Sometimes YouTube provides a continuation without any comments
+ # In most cases we end up just downloading these with very little comments to come.
+ if count == 0:
+ if not parent:
+ self.report_warning('No comments received - assuming end of comments')
+ continuation = None
+ break
+
+ # Deprecated response structure
+ elif isinstance(continuation_contents, dict):
+ known_continuation_renderers = ('itemSectionContinuation', 'commentRepliesContinuation')
+ for key, continuation_renderer in continuation_contents.items():
+ if key not in known_continuation_renderers:
+ continue
+ if not isinstance(continuation_renderer, dict):
+ continue
+ if is_first_continuation:
+ header_continuation_items = [continuation_renderer.get('header') or {}]
+ continuation = extract_header(header_continuation_items)
+ is_first_continuation = False
+ if continuation:
+ break
+
+ # Sometimes YouTube provides a continuation without any comments
+ # In most cases we end up just downloading these with very little comments to come.
+ count = 0
+ for count, entry in enumerate(extract_thread(continuation_renderer.get('contents') or {})):
+ yield entry
+ continuation = self._extract_continuation(continuation_renderer)
+ if count == 0:
+ if not parent:
+ self.report_warning('No comments received - assuming end of comments')
+ continuation = None
+ break
+
+ @staticmethod
+ def _generate_comment_continuation(video_id):
+ """
+ Generates initial comment section continuation token from given video id
+ """
+ b64_vid_id = base64.b64encode(bytes(video_id.encode('utf-8')))
+ parts = ('Eg0SCw==', b64_vid_id, 'GAYyJyIRIgs=', b64_vid_id, 'MAB4AjAAQhBjb21tZW50cy1zZWN0aW9u')
+ new_continuation_intlist = list(itertools.chain.from_iterable(
+ [bytes_to_intlist(base64.b64decode(part)) for part in parts]))
+ return base64.b64encode(intlist_to_bytes(new_continuation_intlist)).decode('utf-8')
+
+ def _get_comments(self, ytcfg, video_id, contents, webpage):
+ """Entry for comment extraction"""
+ def _real_comment_extract(contents):
+ renderer = next((
+ item for item in traverse_obj(contents, (..., 'itemSectionRenderer'), default={})
+ if item.get('sectionIdentifier') == 'comment-item-section'), None)
+ yield from self._comment_entries(renderer, ytcfg, video_id)
+
+ max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0])
+ # Force English regardless of account setting to prevent parsing issues
+ # See: https://github.com/hypervideo/hypervideo/issues/532
+ ytcfg = copy.deepcopy(ytcfg)
+ traverse_obj(
+ ytcfg, ('INNERTUBE_CONTEXT', 'client'), expected_type=dict, default={})['hl'] = 'en'
+ return itertools.islice(_real_comment_extract(contents), 0, max_comments)
+
+ @staticmethod
+ def _get_checkok_params():
+ return {'contentCheckOk': True, 'racyCheckOk': True}
+
+ @classmethod
+ def _generate_player_context(cls, sts=None):
+ context = {
+ 'html5Preference': 'HTML5_PREF_WANTS',
+ }
+ if sts is not None:
+ context['signatureTimestamp'] = sts
+ return {
+ 'playbackContext': {
+ 'contentPlaybackContext': context
+ },
+ **cls._get_checkok_params()
+ }
- formats = []
- itags = []
- itag_qualities = {}
+ @staticmethod
+ def _is_agegated(player_response):
+ if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')):
+ return True
+
+ reasons = traverse_obj(player_response, ('playabilityStatus', ('status', 'reason')), default=[])
+ AGE_GATE_REASONS = (
+ 'confirm your age', 'age-restricted', 'inappropriate', # reason
+ 'age_verification_required', 'age_check_required', # status
+ )
+ return any(expected in reason for expected in AGE_GATE_REASONS for reason in reasons)
+
+ @staticmethod
+ def _is_unplayable(player_response):
+ return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE'
+
+ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr):
+
+ session_index = self._extract_session_index(player_ytcfg, master_ytcfg)
+ syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr)
+ sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None
+ headers = self.generate_api_headers(
+ ytcfg=player_ytcfg, account_syncid=syncid, session_index=session_index, default_client=client)
+
+ yt_query = {'videoId': video_id}
+ yt_query.update(self._generate_player_context(sts))
+ return self._extract_response(
+ item_id=video_id, ep='player', query=yt_query,
+ ytcfg=player_ytcfg, headers=headers, fatal=True,
+ default_client=client,
+ note='Downloading %s player API JSON' % client.replace('_', ' ').strip()
+ ) or None
+
+ def _get_requested_clients(self, url, smuggled_data):
+ requested_clients = []
+ allowed_clients = sorted(
+ [client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'],
+ key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True)
+ for client in self._configuration_arg('player_client'):
+ if client in allowed_clients:
+ requested_clients.append(client)
+ elif client == 'all':
+ requested_clients.extend(allowed_clients)
+ else:
+ self.report_warning(f'Skipping unsupported client {client}')
+ if not requested_clients:
+ requested_clients = ['android', 'web']
+
+ if smuggled_data.get('is_music_url') or self.is_music_url(url):
+ requested_clients.extend(
+ f'{client}_music' for client in requested_clients if f'{client}_music' in INNERTUBE_CLIENTS)
+
+ return orderedSet(requested_clients)
+
+ def _extract_player_ytcfg(self, client, video_id):
+ url = {
+ 'web_music': 'https://music.youtube.com',
+ 'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1'
+ }.get(client)
+ if not url:
+ return {}
+ webpage = self._download_webpage(url, video_id, fatal=False, note=f'Downloading {client} config')
+ return self.extract_ytcfg(video_id, webpage) or {}
+
+ def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg):
+ initial_pr = None
+ if webpage:
+ initial_pr = self._extract_yt_initial_variable(
+ webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
+ video_id, 'initial player response')
+
+ original_clients = clients
+ clients = clients[::-1]
+ prs = []
+
+ def append_client(client_name):
+ if client_name in INNERTUBE_CLIENTS and client_name not in original_clients:
+ clients.append(client_name)
+
+ # Android player_response does not have microFormats which are needed for
+ # extraction of some data. So we return the initial_pr with formats
+ # stripped out even if not requested by the user
+ # See: https://github.com/hypervideo/hypervideo/issues/501
+ if initial_pr:
+ pr = dict(initial_pr)
+ pr['streamingData'] = None
+ prs.append(pr)
+
+ last_error = None
+ tried_iframe_fallback = False
player_url = None
- q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'])
- streaming_data = player_response.get('streamingData') or {}
- streaming_formats = streaming_data.get('formats') or []
- streaming_formats.extend(streaming_data.get('adaptiveFormats') or [])
+ while clients:
+ client = clients.pop()
+ player_ytcfg = master_ytcfg if client == 'web' else {}
+ if 'configs' not in self._configuration_arg('player_skip'):
+ player_ytcfg = self._extract_player_ytcfg(client, video_id) or player_ytcfg
+
+ player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage)
+ require_js_player = self._get_default_ytcfg(client).get('REQUIRE_JS_PLAYER')
+ if 'js' in self._configuration_arg('player_skip'):
+ require_js_player = False
+ player_url = None
+
+ if not player_url and not tried_iframe_fallback and require_js_player:
+ player_url = self._download_player_url(video_id)
+ tried_iframe_fallback = True
+
+ try:
+ pr = initial_pr if client == 'web' and initial_pr else self._extract_player_response(
+ client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, player_url if require_js_player else None, initial_pr)
+ except ExtractorError as e:
+ if last_error:
+ self.report_warning(last_error)
+ last_error = e
+ continue
+
+ if pr:
+ prs.append(pr)
+
+ # creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in
+ if client.endswith('_agegate') and self._is_unplayable(pr) and self.is_authenticated:
+ append_client(client.replace('_agegate', '_creator'))
+ elif self._is_agegated(pr):
+ append_client(f'{client}_agegate')
+
+ if last_error:
+ if not len(prs):
+ raise last_error
+ self.report_warning(last_error)
+ return prs, player_url
+
+ def _extract_formats(self, streaming_data, video_id, player_url, is_live):
+ itags, stream_ids = [], []
+ itag_qualities, res_qualities = {}, {}
+ q = qualities([
+ # Normally tiny is the smallest video-only formats. But
+ # audio-only formats with unknown quality may get tagged as tiny
+ 'tiny',
+ 'audio_quality_ultralow', 'audio_quality_low', 'audio_quality_medium', 'audio_quality_high', # Audio only formats
+ 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
+ ])
+ streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[])
+
for fmt in streaming_formats:
if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
continue
itag = str_or_none(fmt.get('itag'))
+ audio_track = fmt.get('audioTrack') or {}
+ stream_id = '%s.%s' % (itag or '', audio_track.get('id', ''))
+ if stream_id in stream_ids:
+ continue
+
quality = fmt.get('quality')
- if itag and quality:
- itag_qualities[itag] = quality
+ height = int_or_none(fmt.get('height'))
+ if quality == 'tiny' or not quality:
+ quality = fmt.get('audioQuality', '').lower() or quality
+ # The 3gp format (17) in android client has a quality of "small",
+ # but is actually worse than other formats
+ if itag == '17':
+ quality = 'tiny'
+ if quality:
+ if itag:
+ itag_qualities[itag] = quality
+ if height:
+ res_qualities[height] = quality
# FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
# (adding `&sq=0` to the URL) and parsing emsg box to determine the
# number of fragment that would subsequently requested with (`&sq=N`)
@@ -1613,12 +2435,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if not (sc and fmt_url and encrypted_sig):
continue
if not player_url:
- if not webpage:
- continue
- player_url = self._search_regex(
- r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
- webpage, 'player URL', fatal=False)
- if not player_url:
continue
signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
@@ -1626,27 +2442,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if itag:
itags.append(itag)
+ stream_ids.append(stream_id)
+
tbr = float_or_none(
fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
dct = {
'asr': int_or_none(fmt.get('audioSampleRate')),
'filesize': int_or_none(fmt.get('contentLength')),
'format_id': itag,
- 'format_note': fmt.get('qualityLabel') or quality,
+ 'format_note': ', '.join(filter(None, (
+ '%s%s' % (audio_track.get('displayName') or '',
+ ' (default)' if audio_track.get('audioIsDefault') else ''),
+ fmt.get('qualityLabel') or quality.replace('audio_quality_', '')))),
'fps': int_or_none(fmt.get('fps')),
- 'height': int_or_none(fmt.get('height')),
+ 'height': height,
'quality': q(quality),
'tbr': tbr,
'url': fmt_url,
- 'width': fmt.get('width'),
+ 'width': int_or_none(fmt.get('width')),
+ 'language': audio_track.get('id', '').split('.')[0],
+ 'language_preference': 1 if audio_track.get('audioIsDefault') else -1,
}
- mimetype = fmt.get('mimeType')
- if mimetype:
- mobj = re.match(
- r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', mimetype)
- if mobj:
- dct['ext'] = mimetype2ext(mobj.group(1))
- dct.update(parse_codecs(mobj.group(2)))
+ mime_mobj = re.match(
+ r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')
+ if mime_mobj:
+ dct['ext'] = mimetype2ext(mime_mobj.group(1))
+ dct.update(parse_codecs(mime_mobj.group(2)))
no_audio = dct.get('acodec') == 'none'
no_video = dct.get('vcodec') == 'none'
if no_audio:
@@ -1660,61 +2481,181 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
}
if dct.get('ext'):
dct['container'] = dct['ext'] + '_dash'
- formats.append(dct)
-
- hls_manifest_url = streaming_data.get('hlsManifestUrl')
- if hls_manifest_url:
- for f in self._extract_m3u8_formats(
- hls_manifest_url, video_id, 'mp4', fatal=False):
- itag = self._search_regex(
- r'/itag/(\d+)', f['url'], 'itag', default=None)
- if itag:
- f['format_id'] = itag
- formats.append(f)
+ yield dct
+
+ skip_manifests = self._configuration_arg('skip')
+ get_dash = (
+ (not is_live or self._configuration_arg('include_live_dash'))
+ and 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True))
+ get_hls = 'hls' not in skip_manifests and self.get_param('youtube_include_hls_manifest', True)
+
+ def guess_quality(f):
+ for val, qdict in ((f.get('format_id'), itag_qualities), (f.get('height'), res_qualities)):
+ if val in qdict:
+ return q(qdict[val])
+ return -1
+
+ for sd in streaming_data:
+ hls_manifest_url = get_hls and sd.get('hlsManifestUrl')
+ if hls_manifest_url:
+ for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False):
+ itag = self._search_regex(
+ r'/itag/(\d+)', f['url'], 'itag', default=None)
+ if itag in itags:
+ itag += '-hls'
+ if itag in itags:
+ continue
+ if itag:
+ f['format_id'] = itag
+ itags.append(itag)
+ f['quality'] = guess_quality(f)
+ yield f
- if self._downloader.params.get('youtube_include_dash_manifest', True):
- dash_manifest_url = streaming_data.get('dashManifestUrl')
+ dash_manifest_url = get_dash and sd.get('dashManifestUrl')
if dash_manifest_url:
- for f in self._extract_mpd_formats(
- dash_manifest_url, video_id, fatal=False):
+ for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False):
itag = f['format_id']
if itag in itags:
- continue
- if itag in itag_qualities:
- f['quality'] = q(itag_qualities[itag])
+ itag += '-dash'
+ if itag in itags:
+ continue
+ if itag:
+ f['format_id'] = itag
+ itags.append(itag)
+ f['quality'] = guess_quality(f)
filesize = int_or_none(self._search_regex(
r'/clen/(\d+)', f.get('fragment_base_url')
or f['url'], 'file size', default=None))
if filesize:
f['filesize'] = filesize
- formats.append(f)
+ yield f
+
+ def _real_extract(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
+ video_id = self._match_id(url)
+
+ base_url = self.http_scheme() + '//www.youtube.com/'
+ webpage_url = base_url + 'watch?v=' + video_id
+ webpage = None
+ if 'webpage' not in self._configuration_arg('player_skip'):
+ webpage = self._download_webpage(
+ webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
+
+ master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg()
+
+ player_responses, player_url = self._extract_player_responses(
+ self._get_requested_clients(url, smuggled_data),
+ video_id, webpage, master_ytcfg)
+
+ get_first = lambda obj, keys, **kwargs: traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
+
+ playability_statuses = traverse_obj(
+ player_responses, (..., 'playabilityStatus'), expected_type=dict, default=[])
+
+ trailer_video_id = get_first(
+ playability_statuses,
+ ('errorScreen', 'playerLegacyDesktopYpcTrailerRenderer', 'trailerVideoId'),
+ expected_type=str)
+ if trailer_video_id:
+ return self.url_result(
+ trailer_video_id, self.ie_key(), trailer_video_id)
+
+ search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None))
+ if webpage else (lambda x: None))
+
+ video_details = traverse_obj(
+ player_responses, (..., 'videoDetails'), expected_type=dict, default=[])
+ microformats = traverse_obj(
+ player_responses, (..., 'microformat', 'playerMicroformatRenderer'),
+ expected_type=dict, default=[])
+ video_title = (
+ get_first(video_details, 'title')
+ or self._get_text(microformats, (..., 'title'))
+ or search_meta(['og:title', 'twitter:title', 'title']))
+ video_description = get_first(video_details, 'shortDescription')
+
+ if not smuggled_data.get('force_singlefeed', False):
+ if not self.get_param('noplaylist'):
+ multifeed_metadata_list = get_first(
+ player_responses,
+ ('multicamera', 'playerLegacyMulticameraRenderer', 'metadataList'),
+ expected_type=str)
+ if multifeed_metadata_list:
+ entries = []
+ feed_ids = []
+ for feed in multifeed_metadata_list.split(','):
+ # Unquote should take place before split on comma (,) since textual
+ # fields may contain comma as well (see
+ # https://github.com/ytdl-org/youtube-dl/issues/8536)
+ feed_data = compat_parse_qs(
+ compat_urllib_parse_unquote_plus(feed))
+
+ def feed_entry(name):
+ return try_get(
+ feed_data, lambda x: x[name][0], compat_str)
+
+ feed_id = feed_entry('id')
+ if not feed_id:
+ continue
+ feed_title = feed_entry('title')
+ title = video_title
+ if feed_title:
+ title += ' (%s)' % feed_title
+ entries.append({
+ '_type': 'url_transparent',
+ 'ie_key': 'Youtube',
+ 'url': smuggle_url(
+ '%swatch?v=%s' % (base_url, feed_data['id'][0]),
+ {'force_singlefeed': True}),
+ 'title': title,
+ })
+ feed_ids.append(feed_id)
+ self.to_screen(
+ 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
+ % (', '.join(feed_ids), video_id))
+ return self.playlist_result(
+ entries, video_id, video_title, video_description)
+ else:
+ self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
+ live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails'))
+ is_live = get_first(video_details, 'isLive')
+ if is_live is None:
+ is_live = get_first(live_broadcast_details, 'isLiveNow')
+
+ streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
+ formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live))
if not formats:
- if streaming_data.get('licenseInfos'):
- raise ExtractorError(
- 'This video is DRM protected.', expected=True)
- pemr = try_get(
- playability_status,
- lambda x: x['errorScreen']['playerErrorMessageRenderer'],
- dict) or {}
- reason = get_text(pemr.get('reason')) or playability_status.get('reason')
- subreason = pemr.get('subreason')
+ if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
+ self.report_drm(video_id)
+ pemr = get_first(
+ playability_statuses,
+ ('errorScreen', 'playerErrorMessageRenderer'), expected_type=dict) or {}
+ reason = self._get_text(pemr, 'reason') or get_first(playability_statuses, 'reason')
+ subreason = clean_html(self._get_text(pemr, 'subreason') or '')
if subreason:
- subreason = clean_html(get_text(subreason))
if subreason == 'The uploader has not made this video available in your country.':
- countries = microformat.get('availableCountries')
+ countries = get_first(microformats, 'availableCountries')
if not countries:
regions_allowed = search_meta('regionsAllowed')
countries = regions_allowed.split(',') if regions_allowed else None
- self.raise_geo_restricted(
- subreason, countries)
- reason += '\n' + subreason
+ self.raise_geo_restricted(subreason, countries, metadata_available=True)
+ reason += f'. {subreason}'
if reason:
- raise ExtractorError(reason, expected=True)
+ self.raise_no_formats(reason, expected=True)
- self._sort_formats(formats)
+ for f in formats:
+ if '&c=WEB&' in f['url'] and '&ratebypass=yes&' not in f['url']: # throttled
+ f['source_preference'] = -10
+ # TODO: this method is not reliable
+ f['format_note'] = format_field(f, 'format_note', '%s ') + '(maybe throttled)'
- keywords = video_details.get('keywords') or []
+ # Source is given priority since formats that throttle are given lower source_preference
+ # When throttling issue is fully fixed, remove this
+ self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'source', 'codec:vp9.2', 'lang'))
+
+ keywords = get_first(video_details, 'keywords', expected_type=list) or []
if not keywords and webpage:
keywords = [
unescapeHTML(m.group('content'))
@@ -1733,35 +2674,71 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
break
thumbnails = []
- for container in (video_details, microformat):
- for thumbnail in (try_get(
- container,
- lambda x: x['thumbnail']['thumbnails'], list) or []):
- thumbnail_url = thumbnail.get('url')
- if not thumbnail_url:
- continue
- thumbnails.append({
- 'height': int_or_none(thumbnail.get('height')),
- 'url': thumbnail_url,
- 'width': int_or_none(thumbnail.get('width')),
- })
- if thumbnails:
- break
- else:
- thumbnail = search_meta(['og:image', 'twitter:image'])
- if thumbnail:
- thumbnails = [{'url': thumbnail}]
-
- category = microformat.get('category') or search_meta('genre')
- channel_id = video_details.get('channelId') \
- or microformat.get('externalChannelId') \
- or search_meta('channelId')
+ thumbnail_dicts = traverse_obj(
+ (video_details, microformats), (..., ..., 'thumbnail', 'thumbnails', ...),
+ expected_type=dict, default=[])
+ for thumbnail in thumbnail_dicts:
+ thumbnail_url = thumbnail.get('url')
+ if not thumbnail_url:
+ continue
+ # Sometimes youtube gives a wrong thumbnail URL. See:
+ # https://github.com/hypervideo/hypervideo/issues/233
+ # https://github.com/ytdl-org/youtube-dl/issues/28023
+ if 'maxresdefault' in thumbnail_url:
+ thumbnail_url = thumbnail_url.split('?')[0]
+ thumbnails.append({
+ 'url': thumbnail_url,
+ 'height': int_or_none(thumbnail.get('height')),
+ 'width': int_or_none(thumbnail.get('width')),
+ })
+ thumbnail_url = search_meta(['og:image', 'twitter:image'])
+ if thumbnail_url:
+ thumbnails.append({
+ 'url': thumbnail_url,
+ })
+ # The best resolution thumbnails sometimes does not appear in the webpage
+ # See: https://github.com/ytdl-org/youtube-dl/issues/29049, https://github.com/hypervideo/hypervideo/issues/340
+ # List of possible thumbnails - Ref: <https://stackoverflow.com/a/20542029>
+ thumbnail_names = [
+ 'maxresdefault', 'hq720', 'sddefault', 'sd1', 'sd2', 'sd3',
+ 'hqdefault', 'hq1', 'hq2', 'hq3', '0',
+ 'mqdefault', 'mq1', 'mq2', 'mq3',
+ 'default', '1', '2', '3'
+ ]
+ n_thumbnail_names = len(thumbnail_names)
+
+ thumbnails.extend({
+ 'url': 'https://i.ytimg.com/vi{webp}/{video_id}/{name}{live}.{ext}'.format(
+ video_id=video_id, name=name, ext=ext,
+ webp='_webp' if ext == 'webp' else '', live='_live' if is_live else ''),
+ } for name in thumbnail_names for ext in ('webp', 'jpg'))
+ for thumb in thumbnails:
+ i = next((i for i, t in enumerate(thumbnail_names) if f'/{video_id}/{t}' in thumb['url']), n_thumbnail_names)
+ thumb['preference'] = (0 if '.webp' in thumb['url'] else -1) - (2 * i)
+ self._remove_duplicate_formats(thumbnails)
+
+ category = get_first(microformats, 'category') or search_meta('genre')
+ channel_id = str_or_none(
+ get_first(video_details, 'channelId')
+ or get_first(microformats, 'externalChannelId')
+ or search_meta('channelId'))
duration = int_or_none(
- video_details.get('lengthSeconds')
- or microformat.get('lengthSeconds')) \
- or parse_duration(search_meta('duration'))
- is_live = video_details.get('isLive')
- owner_profile_url = microformat.get('ownerProfileUrl')
+ get_first(video_details, 'lengthSeconds')
+ or get_first(microformats, 'lengthSeconds')
+ or parse_duration(search_meta('duration'))) or None
+ owner_profile_url = get_first(microformats, 'ownerProfileUrl')
+
+ live_content = get_first(video_details, 'isLiveContent')
+ is_upcoming = get_first(video_details, 'isUpcoming')
+ if is_live is None:
+ if is_upcoming or live_content is False:
+ is_live = False
+ if is_upcoming is None and (live_content or is_live):
+ is_upcoming = False
+ live_starttime = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp'))
+ live_endtime = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp'))
+ if not duration and live_endtime and live_starttime:
+ duration = live_endtime - live_starttime
info = {
'id': video_id,
@@ -1770,35 +2747,50 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'thumbnails': thumbnails,
'description': video_description,
'upload_date': unified_strdate(
- microformat.get('uploadDate')
+ get_first(microformats, 'uploadDate')
or search_meta('uploadDate')),
- 'uploader': video_details['author'],
+ 'uploader': get_first(video_details, 'author'),
'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
'uploader_url': owner_profile_url,
'channel_id': channel_id,
- 'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None,
+ 'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None,
'duration': duration,
'view_count': int_or_none(
- video_details.get('viewCount')
- or microformat.get('viewCount')
+ get_first((video_details, microformats), (..., 'viewCount'))
or search_meta('interactionCount')),
- 'average_rating': float_or_none(video_details.get('averageRating')),
+ 'average_rating': float_or_none(get_first(video_details, 'averageRating')),
'age_limit': 18 if (
- microformat.get('isFamilySafe') is False
+ get_first(microformats, 'isFamilySafe') is False
or search_meta('isFamilyFriendly') == 'false'
or search_meta('og:restrictions:age') == '18+') else 0,
'webpage_url': webpage_url,
'categories': [category] if category else None,
'tags': keywords,
+ 'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'),
'is_live': is_live,
+ 'was_live': (False if is_live or is_upcoming or live_content is False
+ else None if is_live is None or is_upcoming is None
+ else live_content),
+ 'live_status': 'is_upcoming' if is_upcoming else None, # rest will be set by YoutubeDL
+ 'release_timestamp': live_starttime,
}
- pctr = try_get(
- player_response,
- lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict)
+ pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict)
if pctr:
- def process_language(container, base_url, lang_code, query):
- lang_subs = []
+ def get_lang_code(track):
+ return (remove_start(track.get('vssId') or '', '.').replace('.', '-')
+ or track.get('languageCode'))
+
+ # Converted into dicts to remove duplicates
+ captions = {
+ get_lang_code(sub): sub
+ for sub in traverse_obj(pctr, (..., 'captionTracks', ...), default=[])}
+ translation_languages = {
+ lang.get('languageCode'): self._get_text(lang.get('languageName'), max_runs=1)
+ for lang in traverse_obj(pctr, (..., 'translationLanguages', ...), default=[])}
+
+ def process_language(container, base_url, lang_code, sub_name, query):
+ lang_subs = container.setdefault(lang_code, [])
for fmt in self._SUBTITLE_FORMATS:
query.update({
'fmt': fmt,
@@ -1806,30 +2798,31 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
lang_subs.append({
'ext': fmt,
'url': update_url_query(base_url, query),
+ 'name': sub_name,
})
- container[lang_code] = lang_subs
- subtitles = {}
- for caption_track in (pctr.get('captionTracks') or []):
+ subtitles, automatic_captions = {}, {}
+ for lang_code, caption_track in captions.items():
base_url = caption_track.get('baseUrl')
if not base_url:
continue
+ lang_name = self._get_text(caption_track, 'name', max_runs=1)
if caption_track.get('kind') != 'asr':
- lang_code = caption_track.get('languageCode')
if not lang_code:
continue
process_language(
- subtitles, base_url, lang_code, {})
- continue
- automatic_captions = {}
- for translation_language in (pctr.get('translationLanguages') or []):
- translation_language_code = translation_language.get('languageCode')
- if not translation_language_code:
+ subtitles, base_url, lang_code, lang_name, {})
+ if not caption_track.get('isTranslatable'):
+ continue
+ for trans_code, trans_name in translation_languages.items():
+ if not trans_code:
continue
+ if caption_track.get('kind') != 'asr':
+ trans_code += f'-{lang_code}'
+ trans_name += format_field(lang_name, template=' from %s')
process_language(
- automatic_captions, base_url, translation_language_code,
- {'tlang': translation_language_code})
- info['automatic_captions'] = automatic_captions
+ automatic_captions, base_url, trans_code, trans_name, {'tlang': trans_code})
+ info['automatic_captions'] = automatic_captions
info['subtitles'] = subtitles
parsed_url = compat_urllib_parse_urlparse(url)
@@ -1841,6 +2834,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if d_k not in info and k in s_ks:
info[d_k] = parse_duration(query[k][0])
+ # Youtube Music Auto-generated description
if video_description:
mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
if mobj:
@@ -1864,42 +2858,31 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
webpage, self._YT_INITIAL_DATA_RE, video_id,
'yt initial data')
if not initial_data:
- initial_data = self._call_api(
- 'next', {'videoId': video_id}, video_id, fatal=False)
+ query = {'videoId': video_id}
+ query.update(self._get_checkok_params())
+ initial_data = self._extract_response(
+ item_id=video_id, ep='next', fatal=False,
+ ytcfg=master_ytcfg, query=query,
+ headers=self.generate_api_headers(ytcfg=master_ytcfg),
+ note='Downloading initial data API JSON')
- if initial_data:
- chapters = self._extract_chapters_from_json(
- initial_data, video_id, duration)
- if not chapters:
- for engagment_pannel in (initial_data.get('engagementPanels') or []):
- contents = try_get(
- engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'],
- list)
- if not contents:
- continue
+ try:
+ # This will error if there is no livechat
+ initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
+ info.setdefault('subtitles', {})['live_chat'] = [{
+ 'url': 'https://www.youtube.com/watch?v=%s' % video_id, # url is needed to set cookies
+ 'video_id': video_id,
+ 'ext': 'json',
+ 'protocol': 'youtube_live_chat' if is_live or is_upcoming else 'youtube_live_chat_replay',
+ }]
+ except (KeyError, IndexError, TypeError):
+ pass
- def chapter_time(mmlir):
- return parse_duration(
- get_text(mmlir.get('timeDescription')))
-
- chapters = []
- for next_num, content in enumerate(contents, start=1):
- mmlir = content.get('macroMarkersListItemRenderer') or {}
- start_time = chapter_time(mmlir)
- end_time = chapter_time(try_get(
- contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \
- if next_num < len(contents) else duration
- if start_time is None or end_time is None:
- continue
- chapters.append({
- 'start_time': start_time,
- 'end_time': end_time,
- 'title': get_text(mmlir.get('title')),
- })
- if chapters:
- break
- if chapters:
- info['chapters'] = chapters
+ if initial_data:
+ info['chapters'] = (
+ self._extract_chapters_from_json(initial_data, duration)
+ or self._extract_chapters_from_engagement_panel(initial_data, duration)
+ or None)
contents = try_get(
initial_data,
@@ -1910,7 +2893,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if vpir:
stl = vpir.get('superTitleLink')
if stl:
- stl = get_text(stl)
+ stl = self._get_text(stl)
if try_get(
vpir,
lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
@@ -1950,10 +2933,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
})
vsir = content.get('videoSecondaryInfoRenderer')
if vsir:
- info['channel'] = get_text(try_get(
- vsir,
- lambda x: x['owner']['videoOwnerRenderer']['title'],
- dict))
+ info['channel'] = self._get_text(vsir, ('owner', 'videoOwnerRenderer', 'title'))
rows = try_get(
vsir,
lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
@@ -1968,8 +2948,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
mrr_title = mrr.get('title')
if not mrr_title:
continue
- mrr_title = get_text(mrr['title'])
- mrr_contents_text = get_text(mrr['contents'][0])
+ mrr_title = self._get_text(mrr, 'title')
+ mrr_contents_text = self._get_text(mrr, ('contents', 0))
if mrr_title == 'License':
info['license'] = mrr_contents_text
elif not multiple_songs:
@@ -1980,12 +2960,51 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
elif mrr_title == 'Song':
info['track'] = mrr_contents_text
+ fallbacks = {
+ 'channel': 'uploader',
+ 'channel_id': 'uploader_id',
+ 'channel_url': 'uploader_url',
+ }
+ for to, frm in fallbacks.items():
+ if not info.get(to):
+ info[to] = info.get(frm)
+
for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
v = info.get(s_k)
if v:
info[d_k] = v
- self.mark_watched(video_id, player_response)
+ is_private = get_first(video_details, 'isPrivate', expected_type=bool)
+ is_unlisted = get_first(microformats, 'isUnlisted', expected_type=bool)
+ is_membersonly = None
+ is_premium = None
+ if initial_data and is_private is not None:
+ is_membersonly = False
+ is_premium = False
+ contents = try_get(initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or []
+ badge_labels = set()
+ for content in contents:
+ if not isinstance(content, dict):
+ continue
+ badge_labels.update(self._extract_badges(content.get('videoPrimaryInfoRenderer')))
+ for badge_label in badge_labels:
+ if badge_label.lower() == 'members only':
+ is_membersonly = True
+ elif badge_label.lower() == 'premium':
+ is_premium = True
+ elif badge_label.lower() == 'unlisted':
+ is_unlisted = True
+
+ info['availability'] = self._availability(
+ is_private=is_private,
+ needs_premium=is_premium,
+ needs_subscription=is_membersonly,
+ needs_auth=info['age_limit'] >= 18,
+ is_unlisted=None if is_private is None else is_unlisted)
+
+ info['__post_extractor'] = self.extract_comments(master_ytcfg, video_id, contents, webpage)
+
+ self.mark_watched(video_id, player_responses)
return info
@@ -2000,127 +3019,161 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
invidio\.us
)/
(?:
- (?:channel|c|user|feed|hashtag)/|
- (?:playlist|watch)\?.*?\blist=|
- (?!(?:watch|embed|v|e)\b)
+ (?P<channel_type>channel|c|user|browse)/|
+ (?P<not_channel>
+ feed/|hashtag/|
+ (?:playlist|watch)\?.*?\blist=
+ )|
+ (?!(?:%s)\b) # Direct URLs
)
(?P<id>[^/?\#&]+)
- '''
+ ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
IE_NAME = 'youtube:tab'
_TESTS = [{
- # playlists, multipage
+ 'note': 'playlists, multipage',
'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
'playlist_mincount': 94,
'info_dict': {
'id': 'UCqj7Cz7revf5maW9g5pgNcg',
'title': 'Игорь Клейнер - Playlists',
'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
+ 'uploader': 'Игорь Клейнер',
+ 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
},
}, {
- # playlists, multipage, different order
+ 'note': 'playlists, multipage, different order',
'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
'playlist_mincount': 94,
'info_dict': {
'id': 'UCqj7Cz7revf5maW9g5pgNcg',
'title': 'Игорь Клейнер - Playlists',
'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
+ 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
+ 'uploader': 'Игорь Клейнер',
},
}, {
- # playlists, series
+ 'note': 'playlists, series',
'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3',
'playlist_mincount': 5,
'info_dict': {
'id': 'UCYO_jab_esuFRV4b17AJtAw',
'title': '3Blue1Brown - Playlists',
'description': 'md5:e1384e8a133307dd10edee76e875d62f',
+ 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
+ 'uploader': '3Blue1Brown',
},
}, {
- # playlists, singlepage
+ 'note': 'playlists, singlepage',
'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
'playlist_mincount': 4,
'info_dict': {
'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
'title': 'ThirstForScience - Playlists',
'description': 'md5:609399d937ea957b0f53cbffb747a14c',
+ 'uploader': 'ThirstForScience',
+ 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
}
}, {
'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
'only_matching': True,
}, {
- # basic, single video playlist
+ 'note': 'basic, single video playlist',
'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
'info_dict': {
'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
'uploader': 'Sergey M.',
'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
- 'title': 'hypervideo public playlist',
+ 'title': 'youtube-dl public playlist',
},
'playlist_count': 1,
}, {
- # empty playlist
+ 'note': 'empty playlist',
'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
'info_dict': {
'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
'uploader': 'Sergey M.',
'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
- 'title': 'hypervideo empty playlist',
+ 'title': 'youtube-dl empty playlist',
},
'playlist_count': 0,
}, {
- # Home tab
+ 'note': 'Home tab',
'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
'info_dict': {
'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
'title': 'lex will - Home',
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
+ 'uploader': 'lex will',
+ 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
},
'playlist_mincount': 2,
}, {
- # Videos tab
+ 'note': 'Videos tab',
'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
'info_dict': {
'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
'title': 'lex will - Videos',
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
+ 'uploader': 'lex will',
+ 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
},
'playlist_mincount': 975,
}, {
- # Videos tab, sorted by popular
+ 'note': 'Videos tab, sorted by popular',
'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
'info_dict': {
'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
'title': 'lex will - Videos',
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
+ 'uploader': 'lex will',
+ 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
},
'playlist_mincount': 199,
}, {
- # Playlists tab
+ 'note': 'Playlists tab',
'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
'info_dict': {
'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
'title': 'lex will - Playlists',
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
+ 'uploader': 'lex will',
+ 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
},
'playlist_mincount': 17,
}, {
- # Community tab
+ 'note': 'Community tab',
'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
'info_dict': {
'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
'title': 'lex will - Community',
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
+ 'uploader': 'lex will',
+ 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
},
'playlist_mincount': 18,
}, {
- # Channels tab
+ 'note': 'Channels tab',
'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
'info_dict': {
'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
'title': 'lex will - Channels',
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
+ 'uploader': 'lex will',
+ 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ },
+ 'playlist_mincount': 12,
+ }, {
+ 'note': 'Search tab',
+ 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra',
+ 'playlist_mincount': 40,
+ 'info_dict': {
+ 'id': 'UCYO_jab_esuFRV4b17AJtAw',
+ 'title': '3Blue1Brown - Search - linear algebra',
+ 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
+ 'uploader': '3Blue1Brown',
+ 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
},
- 'playlist_mincount': 138,
}, {
'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
'only_matching': True,
@@ -2138,6 +3191,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
'uploader': 'Christiaan008',
'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
+ 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268',
},
'playlist_count': 96,
}, {
@@ -2151,7 +3205,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
},
'playlist_mincount': 1123,
}, {
- # even larger playlist, 8832 videos
+ 'note': 'even larger playlist, 8832 videos',
'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
'only_matching': True,
}, {
@@ -2165,20 +3219,41 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
},
'playlist_mincount': 21,
}, {
- # https://github.com/ytdl-org/youtube-dl/issues/21844
+ 'note': 'Playlist with "show unavailable videos" button',
+ 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q',
+ 'info_dict': {
+ 'title': 'Uploads from Phim Siêu Nhân Nhật Bản',
+ 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q',
+ 'uploader': 'Phim Siêu Nhân Nhật Bản',
+ 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q',
+ },
+ 'playlist_mincount': 200,
+ }, {
+ 'note': 'Playlist with unavailable videos in page 7',
+ 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w',
+ 'info_dict': {
+ 'title': 'Uploads from BlankTV',
+ 'id': 'UU8l9frL61Yl5KFOl87nIm2w',
+ 'uploader': 'BlankTV',
+ 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w',
+ },
+ 'playlist_mincount': 1000,
+ }, {
+ 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844',
'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
'info_dict': {
'title': 'Data Analysis with Dr Mike Pound',
'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
'uploader': 'Computerphile',
+ 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487',
},
'playlist_mincount': 11,
}, {
'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
'only_matching': True,
}, {
- # Playlist URL that does not actually serve a playlist
+ 'note': 'Playlist URL that does not actually serve a playlist',
'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
'info_dict': {
'id': 'FqZTN594JQw',
@@ -2210,14 +3285,14 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
}, {
'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
'info_dict': {
- 'id': '9Auq9mYxFEE',
+ 'id': '3yImotZU3tw', # This will keep changing
'ext': 'mp4',
- 'title': 'Watch Sky News live',
+ 'title': compat_str,
'uploader': 'Sky News',
'uploader_id': 'skynews',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
- 'upload_date': '20191102',
- 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662',
+ 'upload_date': r're:\d{8}',
+ 'description': compat_str,
'categories': ['News & Politics'],
'tags': list,
'like_count': int,
@@ -2226,6 +3301,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'params': {
'skip_download': True,
},
+ 'expected_warnings': ['Downloading just video ', 'Ignoring subtitle tracks found in '],
}, {
'url': 'https://www.youtube.com/user/TheYoungTurks/live',
'info_dict': {
@@ -2254,30 +3330,30 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
'only_matching': True,
}, {
+ 'note': 'A channel that is not live. Should raise error',
+ 'url': 'https://www.youtube.com/user/numberphile/live',
+ 'only_matching': True,
+ }, {
'url': 'https://www.youtube.com/feed/trending',
'only_matching': True,
}, {
- # needs auth
'url': 'https://www.youtube.com/feed/library',
'only_matching': True,
}, {
- # needs auth
'url': 'https://www.youtube.com/feed/history',
'only_matching': True,
}, {
- # needs auth
'url': 'https://www.youtube.com/feed/subscriptions',
'only_matching': True,
}, {
- # needs auth
'url': 'https://www.youtube.com/feed/watch_later',
'only_matching': True,
}, {
- # no longer available?
+ 'note': 'Recommended - redirects to home page.',
'url': 'https://www.youtube.com/feed/recommended',
'only_matching': True,
}, {
- # inline playlist with not always working continuations
+ 'note': 'inline playlist with not always working continuations',
'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
'only_matching': True,
}, {
@@ -2305,6 +3381,116 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
}, {
'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU',
'only_matching': True,
+ }, {
+ 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist',
+ 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
+ 'only_matching': True
+ }, {
+ 'note': '/browse/ should redirect to /channel/',
+ 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng',
+ 'only_matching': True
+ }, {
+ 'note': 'VLPL, should redirect to playlist?list=PL...',
+ 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
+ 'info_dict': {
+ 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
+ 'uploader': 'NoCopyrightSounds',
+ 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!',
+ 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
+ 'title': 'NCS Releases',
+ },
+ 'playlist_mincount': 166,
+ }, {
+ 'note': 'Topic, should redirect to playlist?list=UU...',
+ 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
+ 'info_dict': {
+ 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
+ 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
+ 'title': 'Uploads from Royalty Free Music - Topic',
+ 'uploader': 'Royalty Free Music - Topic',
+ },
+ 'expected_warnings': [
+ 'A channel/user page was given',
+ 'The URL does not have a videos tab',
+ ],
+ 'playlist_mincount': 101,
+ }, {
+ 'note': 'Topic without a UU playlist',
+ 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg',
+ 'info_dict': {
+ 'id': 'UCtFRv9O2AHqOZjjynzrv-xg',
+ 'title': 'UCtFRv9O2AHqOZjjynzrv-xg',
+ },
+ 'expected_warnings': [
+ 'A channel/user page was given',
+ 'The URL does not have a videos tab',
+ 'Falling back to channel URL',
+ ],
+ 'playlist_mincount': 9,
+ }, {
+ 'note': 'Youtube music Album',
+ 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE',
+ 'info_dict': {
+ 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0',
+ 'title': 'Album - Royalty Free Music Library V2 (50 Songs)',
+ },
+ 'playlist_count': 50,
+ }, {
+ 'note': 'unlisted single video playlist',
+ 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
+ 'info_dict': {
+ 'uploader_id': 'UC9zHu_mHU96r19o-wV5Qs1Q',
+ 'uploader': 'colethedj',
+ 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
+ 'title': 'hypervideo unlisted playlist test',
+ 'availability': 'unlisted'
+ },
+ 'playlist_count': 1,
+ }, {
+ 'note': 'API Fallback: Recommended - redirects to home page. Requires visitorData',
+ 'url': 'https://www.youtube.com/feed/recommended',
+ 'info_dict': {
+ 'id': 'recommended',
+ 'title': 'recommended',
+ },
+ 'playlist_mincount': 50,
+ 'params': {
+ 'skip_download': True,
+ 'extractor_args': {'youtubetab': {'skip': ['webpage']}}
+ },
+ }, {
+ 'note': 'API Fallback: /videos tab, sorted by oldest first',
+ 'url': 'https://www.youtube.com/user/theCodyReeder/videos?view=0&sort=da&flow=grid',
+ 'info_dict': {
+ 'id': 'UCu6mSoMNzHQiBIOCkHUa2Aw',
+ 'title': 'Cody\'sLab - Videos',
+ 'description': 'md5:d083b7c2f0c67ee7a6c74c3e9b4243fa',
+ 'uploader': 'Cody\'sLab',
+ 'uploader_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw',
+ },
+ 'playlist_mincount': 650,
+ 'params': {
+ 'skip_download': True,
+ 'extractor_args': {'youtubetab': {'skip': ['webpage']}}
+ },
+ }, {
+ 'note': 'API Fallback: Topic, should redirect to playlist?list=UU...',
+ 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
+ 'info_dict': {
+ 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
+ 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
+ 'title': 'Uploads from Royalty Free Music - Topic',
+ 'uploader': 'Royalty Free Music - Topic',
+ },
+ 'expected_warnings': [
+ 'A channel/user page was given',
+ 'The URL does not have a videos tab',
+ ],
+ 'playlist_mincount': 101,
+ 'params': {
+ 'skip_download': True,
+ 'extractor_args': {'youtubetab': {'skip': ['webpage']}}
+ },
}]
@classmethod
@@ -2326,25 +3512,28 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
channel_url, 'channel id')
@staticmethod
- def _extract_grid_item_renderer(item):
- assert isinstance(item, dict)
+ def _extract_basic_item_renderer(item):
+ # Modified from _extract_grid_item_renderer
+ known_basic_renderers = (
+ 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer'
+ )
for key, renderer in item.items():
- if not key.startswith('grid') or not key.endswith('Renderer'):
- continue
if not isinstance(renderer, dict):
continue
- return renderer
+ elif key in known_basic_renderers:
+ return renderer
+ elif key.startswith('grid') and key.endswith('Renderer'):
+ return renderer
def _grid_entries(self, grid_renderer):
for item in grid_renderer['items']:
if not isinstance(item, dict):
continue
- renderer = self._extract_grid_item_renderer(item)
+ renderer = self._extract_basic_item_renderer(item)
if not isinstance(renderer, dict):
continue
- title = try_get(
- renderer, (lambda x: x['title']['runs'][0]['text'],
- lambda x: x['title']['simpleText']), compat_str)
+ title = self._get_text(renderer, 'title')
+
# playlist
playlist_id = renderer.get('playlistId')
if playlist_id:
@@ -2361,8 +3550,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
# channel
channel_id = renderer.get('channelId')
if channel_id:
- title = try_get(
- renderer, lambda x: x['title']['simpleText'], compat_str)
yield self.url_result(
'https://www.youtube.com/channel/%s' % channel_id,
ie=YoutubeTabIE.ie_key(), video_title=title)
@@ -2382,7 +3569,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
content = shelf_renderer.get('content')
if not isinstance(content, dict):
return
- renderer = content.get('gridRenderer')
+ renderer = content.get('gridRenderer') or content.get('expandedShelfContentsRenderer')
if renderer:
# TODO: add support for nested playlists so each shelf is processed
# as separate playlist
@@ -2405,8 +3592,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
# will not work
if skip_channels and '/channels?' in shelf_url:
return
- title = try_get(
- shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
+ title = self._get_text(shelf_renderer, 'title')
yield self.url_result(shelf_url, video_title=title)
# Shelf may not contain shelf URL, fallback to extraction from content
for entry in self._shelf_entries_from_content(shelf_renderer):
@@ -2424,6 +3610,14 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
continue
yield self._extract_video(renderer)
+ def _rich_entries(self, rich_grid_renderer):
+ renderer = try_get(
+ rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
+ video_id = renderer.get('videoId')
+ if not video_id:
+ return
+ yield self._extract_video(renderer)
+
def _video_entry(self, video_renderer):
video_id = video_renderer.get('videoId')
if video_id:
@@ -2436,12 +3630,19 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
return
# video attachment
video_renderer = try_get(
- post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict)
- video_id = None
- if video_renderer:
- entry = self._video_entry(video_renderer)
+ post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {}
+ video_id = video_renderer.get('videoId')
+ if video_id:
+ entry = self._extract_video(video_renderer)
if entry:
yield entry
+ # playlist attachment
+ playlist_id = try_get(
+ post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], compat_str)
+ if playlist_id:
+ yield self.url_result(
+ 'https://www.youtube.com/playlist?list=%s' % playlist_id,
+ ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
# inline video links
runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
for run in runs:
@@ -2456,7 +3657,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
ep_video_id = YoutubeIE._match_id(ep_url)
if video_id == ep_video_id:
continue
- yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=video_id)
+ yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
def _post_thread_continuation_entries(self, post_thread_continuation):
contents = post_thread_continuation.get('contents')
@@ -2469,6 +3670,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
for entry in self._post_thread_entries(renderer):
yield entry
+ r''' # unused
def _rich_grid_entries(self, contents):
for content in contents:
video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict)
@@ -2476,316 +3678,264 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
entry = self._video_entry(video_renderer)
if entry:
yield entry
+ '''
+ def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data):
- @staticmethod
- def _build_continuation_query(continuation, ctp=None):
- query = {
- 'ctoken': continuation,
- 'continuation': continuation,
- }
- if ctp:
- query['itct'] = ctp
- return query
-
- @staticmethod
- def _extract_next_continuation_data(renderer):
- next_continuation = try_get(
- renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict)
- if not next_continuation:
- return
- continuation = next_continuation.get('continuation')
- if not continuation:
- return
- ctp = next_continuation.get('clickTrackingParams')
- return YoutubeTabIE._build_continuation_query(continuation, ctp)
-
- @classmethod
- def _extract_continuation(cls, renderer):
- next_continuation = cls._extract_next_continuation_data(renderer)
- if next_continuation:
- return next_continuation
- contents = []
- for key in ('contents', 'items'):
- contents.extend(try_get(renderer, lambda x: x[key], list) or [])
- for content in contents:
- if not isinstance(content, dict):
- continue
- continuation_ep = try_get(
- content, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
- dict)
- if not continuation_ep:
- continue
- continuation = try_get(
- continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
- if not continuation:
- continue
- ctp = continuation_ep.get('clickTrackingParams')
- return YoutubeTabIE._build_continuation_query(continuation, ctp)
-
- def _entries(self, tab, item_id, webpage):
- tab_content = try_get(tab, lambda x: x['content'], dict)
- if not tab_content:
- return
- slr_renderer = try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
- if slr_renderer:
- is_channels_tab = tab.get('title') == 'Channels'
- continuation = None
- slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or []
- for slr_content in slr_contents:
- if not isinstance(slr_content, dict):
+ def extract_entries(parent_renderer): # this needs to called again for continuation to work with feeds
+ contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
+ for content in contents:
+ if not isinstance(content, dict):
continue
- is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict)
+ is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
if not is_renderer:
+ renderer = content.get('richItemRenderer')
+ if renderer:
+ for entry in self._rich_entries(renderer):
+ yield entry
+ continuation_list[0] = self._extract_continuation(parent_renderer)
continue
isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
for isr_content in isr_contents:
if not isinstance(isr_content, dict):
continue
- renderer = isr_content.get('playlistVideoListRenderer')
- if renderer:
- for entry in self._playlist_entries(renderer):
- yield entry
- continuation = self._extract_continuation(renderer)
- continue
- renderer = isr_content.get('gridRenderer')
- if renderer:
- for entry in self._grid_entries(renderer):
- yield entry
- continuation = self._extract_continuation(renderer)
- continue
- renderer = isr_content.get('shelfRenderer')
- if renderer:
- for entry in self._shelf_entries(renderer, not is_channels_tab):
- yield entry
- continue
- renderer = isr_content.get('backstagePostThreadRenderer')
- if renderer:
- for entry in self._post_thread_entries(renderer):
- yield entry
- continuation = self._extract_continuation(renderer)
- continue
- renderer = isr_content.get('videoRenderer')
- if renderer:
- entry = self._video_entry(renderer)
- if entry:
- yield entry
- if not continuation:
- continuation = self._extract_continuation(is_renderer)
- if not continuation:
- continuation = self._extract_continuation(slr_renderer)
- else:
- rich_grid_renderer = tab_content.get('richGridRenderer')
- if not rich_grid_renderer:
- return
- for entry in self._rich_grid_entries(rich_grid_renderer.get('contents') or []):
- yield entry
- continuation = self._extract_continuation(rich_grid_renderer)
+ known_renderers = {
+ 'playlistVideoListRenderer': self._playlist_entries,
+ 'gridRenderer': self._grid_entries,
+ 'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'),
+ 'backstagePostThreadRenderer': self._post_thread_entries,
+ 'videoRenderer': lambda x: [self._video_entry(x)],
+ }
+ for key, renderer in isr_content.items():
+ if key not in known_renderers:
+ continue
+ for entry in known_renderers[key](renderer):
+ if entry:
+ yield entry
+ continuation_list[0] = self._extract_continuation(renderer)
+ break
- ytcfg = self._extract_ytcfg(item_id, webpage)
- client_version = try_get(
- ytcfg, lambda x: x['INNERTUBE_CLIENT_VERSION'], compat_str) or '2.20210407.08.00'
+ if not continuation_list[0]:
+ continuation_list[0] = self._extract_continuation(is_renderer)
- headers = {
- 'x-youtube-client-name': '1',
- 'x-youtube-client-version': client_version,
- 'content-type': 'application/json',
- }
+ if not continuation_list[0]:
+ continuation_list[0] = self._extract_continuation(parent_renderer)
- context = try_get(ytcfg, lambda x: x['INNERTUBE_CONTEXT'], dict) or {
- 'client': {
- 'clientName': 'WEB',
- 'clientVersion': client_version,
- }
- }
- visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str)
-
- identity_token = self._extract_identity_token(ytcfg, webpage)
- if identity_token:
- headers['x-youtube-identity-token'] = identity_token
-
- data = {
- 'context': context,
- }
+ continuation_list = [None] # Python 2 does not support nonlocal
+ tab_content = try_get(tab, lambda x: x['content'], dict)
+ if not tab_content:
+ return
+ parent_renderer = (
+ try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
+ or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
+ for entry in extract_entries(parent_renderer):
+ yield entry
+ continuation = continuation_list[0]
for page_num in itertools.count(1):
if not continuation:
break
- if visitor_data:
- headers['x-goog-visitor-id'] = visitor_data
- data['continuation'] = continuation['continuation']
- data['clickTracking'] = {
- 'clickTrackingParams': continuation['itct']
- }
- count = 0
- retries = 3
- while count <= retries:
- try:
- # Downloading page may result in intermittent 5xx HTTP error
- # that is usually worked around with a retry
- response = self._download_json(
- 'https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
- None, 'Downloading page %d%s' % (page_num, ' (retry #%d)' % count if count else ''),
- headers=headers, data=json.dumps(data).encode('utf8'))
- break
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
- count += 1
- if count <= retries:
- continue
- raise
+ headers = self.generate_api_headers(
+ ytcfg=ytcfg, account_syncid=account_syncid, visitor_data=visitor_data)
+ response = self._extract_response(
+ item_id='%s page %s' % (item_id, page_num),
+ query=continuation, headers=headers, ytcfg=ytcfg,
+ check_get_keys=('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints'))
+
if not response:
break
-
- visitor_data = try_get(
- response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data
-
+ # Extracting updated visitor data is required to prevent an infinite extraction loop in some cases
+ # See: https://github.com/ytdl-org/youtube-dl/issues/28702
+ visitor_data = self._extract_visitor_data(response) or visitor_data
+
+ known_continuation_renderers = {
+ 'playlistVideoListContinuation': self._playlist_entries,
+ 'gridContinuation': self._grid_entries,
+ 'itemSectionContinuation': self._post_thread_continuation_entries,
+ 'sectionListContinuation': extract_entries, # for feeds
+ }
continuation_contents = try_get(
- response, lambda x: x['continuationContents'], dict)
- if continuation_contents:
- continuation_renderer = continuation_contents.get('playlistVideoListContinuation')
- if continuation_renderer:
- for entry in self._playlist_entries(continuation_renderer):
- yield entry
- continuation = self._extract_continuation(continuation_renderer)
- continue
- continuation_renderer = continuation_contents.get('gridContinuation')
- if continuation_renderer:
- for entry in self._grid_entries(continuation_renderer):
- yield entry
- continuation = self._extract_continuation(continuation_renderer)
- continue
- continuation_renderer = continuation_contents.get('itemSectionContinuation')
- if continuation_renderer:
- for entry in self._post_thread_continuation_entries(continuation_renderer):
- yield entry
- continuation = self._extract_continuation(continuation_renderer)
+ response, lambda x: x['continuationContents'], dict) or {}
+ continuation_renderer = None
+ for key, value in continuation_contents.items():
+ if key not in known_continuation_renderers:
continue
+ continuation_renderer = value
+ continuation_list = [None]
+ for entry in known_continuation_renderers[key](continuation_renderer):
+ yield entry
+ continuation = continuation_list[0] or self._extract_continuation(continuation_renderer)
+ break
+ if continuation_renderer:
+ continue
+ known_renderers = {
+ 'gridPlaylistRenderer': (self._grid_entries, 'items'),
+ 'gridVideoRenderer': (self._grid_entries, 'items'),
+ 'gridChannelRenderer': (self._grid_entries, 'items'),
+ 'playlistVideoRenderer': (self._playlist_entries, 'contents'),
+ 'itemSectionRenderer': (extract_entries, 'contents'), # for feeds
+ 'richItemRenderer': (extract_entries, 'contents'), # for hashtag
+ 'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents')
+ }
on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints'))
continuation_items = try_get(
on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list)
- if continuation_items:
- continuation_item = continuation_items[0]
- if not isinstance(continuation_item, dict):
- continue
- renderer = self._extract_grid_item_renderer(continuation_item)
- if renderer:
- grid_renderer = {'items': continuation_items}
- for entry in self._grid_entries(grid_renderer):
- yield entry
- continuation = self._extract_continuation(grid_renderer)
- continue
- renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer')
- if renderer:
- video_list_renderer = {'contents': continuation_items}
- for entry in self._playlist_entries(video_list_renderer):
- yield entry
- continuation = self._extract_continuation(video_list_renderer)
- continue
- renderer = continuation_item.get('backstagePostThreadRenderer')
- if renderer:
- continuation_renderer = {'contents': continuation_items}
- for entry in self._post_thread_continuation_entries(continuation_renderer):
- yield entry
- continuation = self._extract_continuation(continuation_renderer)
- continue
- renderer = continuation_item.get('richItemRenderer')
- if renderer:
- for entry in self._rich_grid_entries(continuation_items):
- yield entry
- continuation = self._extract_continuation({'contents': continuation_items})
+ continuation_item = try_get(continuation_items, lambda x: x[0], dict) or {}
+ video_items_renderer = None
+ for key, value in continuation_item.items():
+ if key not in known_renderers:
continue
-
+ video_items_renderer = {known_renderers[key][1]: continuation_items}
+ continuation_list = [None]
+ for entry in known_renderers[key][0](video_items_renderer):
+ yield entry
+ continuation = continuation_list[0] or self._extract_continuation(video_items_renderer)
+ break
+ if video_items_renderer:
+ continue
break
@staticmethod
def _extract_selected_tab(tabs):
for tab in tabs:
- if try_get(tab, lambda x: x['tabRenderer']['selected'], bool):
- return tab['tabRenderer']
+ renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {}
+ if renderer.get('selected') is True:
+ return renderer
else:
raise ExtractorError('Unable to find selected tab')
- @staticmethod
- def _extract_uploader(data):
+ @classmethod
+ def _extract_uploader(cls, data):
uploader = {}
- sidebar_renderer = try_get(
- data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
- if sidebar_renderer:
- for item in sidebar_renderer:
- if not isinstance(item, dict):
- continue
- renderer = item.get('playlistSidebarSecondaryInfoRenderer')
- if not isinstance(renderer, dict):
- continue
- owner = try_get(
- renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
- if owner:
- uploader['uploader'] = owner.get('text')
- uploader['uploader_id'] = try_get(
- owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
- uploader['uploader_url'] = urljoin(
- 'https://www.youtube.com/',
- try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
- return uploader
+ renderer = cls._extract_sidebar_info_renderer(data, 'playlistSidebarSecondaryInfoRenderer') or {}
+ owner = try_get(
+ renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
+ if owner:
+ uploader['uploader'] = owner.get('text')
+ uploader['uploader_id'] = try_get(
+ owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
+ uploader['uploader_url'] = urljoin(
+ 'https://www.youtube.com/',
+ try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
+ return {k: v for k, v in uploader.items() if v is not None}
+
+ def _extract_from_tabs(self, item_id, ytcfg, data, tabs):
+ playlist_id = title = description = channel_url = channel_name = channel_id = None
+ thumbnails_list = []
+ tags = []
- @staticmethod
- def _extract_alert(data):
- alerts = []
- for alert in try_get(data, lambda x: x['alerts'], list) or []:
- if not isinstance(alert, dict):
- continue
- alert_text = try_get(
- alert, lambda x: x['alertRenderer']['text'], dict)
- if not alert_text:
- continue
- text = try_get(
- alert_text,
- (lambda x: x['simpleText'], lambda x: x['runs'][0]['text']),
- compat_str)
- if text:
- alerts.append(text)
- return '\n'.join(alerts)
-
- def _extract_from_tabs(self, item_id, webpage, data, tabs):
selected_tab = self._extract_selected_tab(tabs)
renderer = try_get(
data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
- playlist_id = item_id
- title = description = None
if renderer:
- channel_title = renderer.get('title') or item_id
- tab_title = selected_tab.get('title')
- title = channel_title or item_id
- if tab_title:
- title += ' - %s' % tab_title
- description = renderer.get('description')
- playlist_id = renderer.get('externalId')
+ channel_name = renderer.get('title')
+ channel_url = renderer.get('channelUrl')
+ channel_id = renderer.get('externalId')
else:
renderer = try_get(
data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
- if renderer:
- title = renderer.get('title')
- else:
- renderer = try_get(
- data, lambda x: x['header']['hashtagHeaderRenderer'], dict)
- if renderer:
- title = try_get(renderer, lambda x: x['hashtag']['simpleText'])
- playlist = self.playlist_result(
- self._entries(selected_tab, item_id, webpage),
- playlist_id=playlist_id, playlist_title=title,
- playlist_description=description)
- playlist.update(self._extract_uploader(data))
- return playlist
-
- def _extract_from_playlist(self, item_id, url, data, playlist):
+
+ if renderer:
+ title = renderer.get('title')
+ description = renderer.get('description', '')
+ playlist_id = channel_id
+ tags = renderer.get('keywords', '').split()
+ thumbnails_list = (
+ try_get(renderer, lambda x: x['avatar']['thumbnails'], list)
+ or try_get(
+ self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer'),
+ lambda x: x['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'],
+ list)
+ or [])
+
+ thumbnails = []
+ for t in thumbnails_list:
+ if not isinstance(t, dict):
+ continue
+ thumbnail_url = url_or_none(t.get('url'))
+ if not thumbnail_url:
+ continue
+ thumbnails.append({
+ 'url': thumbnail_url,
+ 'width': int_or_none(t.get('width')),
+ 'height': int_or_none(t.get('height')),
+ })
+ if playlist_id is None:
+ playlist_id = item_id
+ if title is None:
+ title = (
+ try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText'])
+ or playlist_id)
+ title += format_field(selected_tab, 'title', ' - %s')
+ title += format_field(selected_tab, 'expandedText', ' - %s')
+ metadata = {
+ 'playlist_id': playlist_id,
+ 'playlist_title': title,
+ 'playlist_description': description,
+ 'uploader': channel_name,
+ 'uploader_id': channel_id,
+ 'uploader_url': channel_url,
+ 'thumbnails': thumbnails,
+ 'tags': tags,
+ }
+ availability = self._extract_availability(data)
+ if availability:
+ metadata['availability'] = availability
+ if not channel_id:
+ metadata.update(self._extract_uploader(data))
+ metadata.update({
+ 'channel': metadata['uploader'],
+ 'channel_id': metadata['uploader_id'],
+ 'channel_url': metadata['uploader_url']})
+ return self.playlist_result(
+ self._entries(
+ selected_tab, playlist_id, ytcfg,
+ self._extract_account_syncid(ytcfg, data),
+ self._extract_visitor_data(data, ytcfg)),
+ **metadata)
+
+ def _extract_mix_playlist(self, playlist, playlist_id, data, ytcfg):
+ first_id = last_id = response = None
+ for page_num in itertools.count(1):
+ videos = list(self._playlist_entries(playlist))
+ if not videos:
+ return
+ start = next((i for i, v in enumerate(videos) if v['id'] == last_id), -1) + 1
+ if start >= len(videos):
+ return
+ for video in videos[start:]:
+ if video['id'] == first_id:
+ self.to_screen('First video %s found again; Assuming end of Mix' % first_id)
+ return
+ yield video
+ first_id = first_id or videos[0]['id']
+ last_id = videos[-1]['id']
+ watch_endpoint = try_get(
+ playlist, lambda x: x['contents'][-1]['playlistPanelVideoRenderer']['navigationEndpoint']['watchEndpoint'])
+ headers = self.generate_api_headers(
+ ytcfg=ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data),
+ visitor_data=self._extract_visitor_data(response, data, ytcfg))
+ query = {
+ 'playlistId': playlist_id,
+ 'videoId': watch_endpoint.get('videoId') or last_id,
+ 'index': watch_endpoint.get('index') or len(videos),
+ 'params': watch_endpoint.get('params') or 'OAE%3D'
+ }
+ response = self._extract_response(
+ item_id='%s page %d' % (playlist_id, page_num),
+ query=query, ep='next', headers=headers, ytcfg=ytcfg,
+ check_get_keys='contents'
+ )
+ playlist = try_get(
+ response, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
+
+ def _extract_from_playlist(self, item_id, url, data, playlist, ytcfg):
title = playlist.get('title') or try_get(
data, lambda x: x['titleText']['simpleText'], compat_str)
playlist_id = playlist.get('playlistId') or item_id
- # Inline playlist rendition continuation does not always work
- # at Youtube side, so delegating regular tab-based playlist URL
- # processing whenever possible.
+
+ # Delegating everything except mix playlists to regular tab-based playlist URL
playlist_url = urljoin(url, try_get(
playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
compat_str))
@@ -2793,54 +3943,297 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
return self.url_result(
playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
video_title=title)
+
return self.playlist_result(
- self._playlist_entries(playlist), playlist_id=playlist_id,
- playlist_title=title)
+ self._extract_mix_playlist(playlist, playlist_id, data, ytcfg),
+ playlist_id=playlist_id, playlist_title=title)
- def _extract_identity_token(self, ytcfg, webpage):
- if ytcfg:
- token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
- if token:
- return token
- return self._search_regex(
- r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
- 'identity token', default=None)
+ def _extract_availability(self, data):
+ """
+ Gets the availability of a given playlist/tab.
+ Note: Unless YouTube tells us explicitly, we do not assume it is public
+ @param data: response
+ """
+ is_private = is_unlisted = None
+ renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {}
+ badge_labels = self._extract_badges(renderer)
+
+ # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge
+ privacy_dropdown_entries = try_get(
+ renderer, lambda x: x['privacyForm']['dropdownFormFieldRenderer']['dropdown']['dropdownRenderer']['entries'], list) or []
+ for renderer_dict in privacy_dropdown_entries:
+ is_selected = try_get(
+ renderer_dict, lambda x: x['privacyDropdownItemRenderer']['isSelected'], bool) or False
+ if not is_selected:
+ continue
+ label = self._get_text(renderer_dict, ('privacyDropdownItemRenderer', 'label'))
+ if label:
+ badge_labels.add(label.lower())
+ break
+
+ for badge_label in badge_labels:
+ if badge_label == 'unlisted':
+ is_unlisted = True
+ elif badge_label == 'private':
+ is_private = True
+ elif badge_label == 'public':
+ is_unlisted = is_private = False
+ return self._availability(is_private, False, False, False, is_unlisted)
+
+ @staticmethod
+ def _extract_sidebar_info_renderer(data, info_renderer, expected_type=dict):
+ sidebar_renderer = try_get(
+ data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) or []
+ for item in sidebar_renderer:
+ renderer = try_get(item, lambda x: x[info_renderer], expected_type)
+ if renderer:
+ return renderer
+
+ def _reload_with_unavailable_videos(self, item_id, data, ytcfg):
+ """
+ Get playlist with unavailable videos if the 'show unavailable videos' button exists.
+ """
+ browse_id = params = None
+ renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer')
+ if not renderer:
+ return
+ menu_renderer = try_get(
+ renderer, lambda x: x['menu']['menuRenderer']['items'], list) or []
+ for menu_item in menu_renderer:
+ if not isinstance(menu_item, dict):
+ continue
+ nav_item_renderer = menu_item.get('menuNavigationItemRenderer')
+ text = try_get(
+ nav_item_renderer, lambda x: x['text']['simpleText'], compat_str)
+ if not text or text.lower() != 'show unavailable videos':
+ continue
+ browse_endpoint = try_get(
+ nav_item_renderer, lambda x: x['navigationEndpoint']['browseEndpoint'], dict) or {}
+ browse_id = browse_endpoint.get('browseId')
+ params = browse_endpoint.get('params')
+ break
+
+ headers = self.generate_api_headers(
+ ytcfg=ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data),
+ visitor_data=self._extract_visitor_data(data, ytcfg))
+ query = {
+ 'params': params or 'wgYCCAA=',
+ 'browseId': browse_id or 'VL%s' % item_id
+ }
+ return self._extract_response(
+ item_id=item_id, headers=headers, query=query,
+ check_get_keys='contents', fatal=False, ytcfg=ytcfg,
+ note='Downloading API JSON with unavailable videos')
+
+ def _extract_webpage(self, url, item_id, fatal=True):
+ retries = self.get_param('extractor_retries', 3)
+ count = -1
+ webpage = data = last_error = None
+ while count < retries:
+ count += 1
+ # Sometimes youtube returns a webpage with incomplete ytInitialData
+ # See: https://github.com/hypervideo/hypervideo/issues/116
+ if last_error:
+ self.report_warning('%s. Retrying ...' % last_error)
+ try:
+ webpage = self._download_webpage(
+ url, item_id,
+ note='Downloading webpage%s' % (' (retry #%d)' % count if count else '',))
+ data = self.extract_yt_initial_data(item_id, webpage or '', fatal=fatal) or {}
+ except ExtractorError as e:
+ if isinstance(e.cause, network_exceptions):
+ if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (403, 429):
+ last_error = error_to_compat_str(e.cause or e.msg)
+ if count < retries:
+ continue
+ if fatal:
+ raise
+ self.report_warning(error_to_compat_str(e))
+ break
+ else:
+ try:
+ self._extract_and_report_alerts(data)
+ except ExtractorError as e:
+ if fatal:
+ raise
+ self.report_warning(error_to_compat_str(e))
+ break
+
+ if dict_get(data, ('contents', 'currentVideoEndpoint')):
+ break
+
+ last_error = 'Incomplete yt initial data received'
+ if count >= retries:
+ if fatal:
+ raise ExtractorError(last_error)
+ self.report_warning(last_error)
+ break
+
+ return webpage, data
+
+ def _extract_data(self, url, item_id, ytcfg=None, fatal=True, webpage_fatal=False, default_client='web'):
+ data = None
+ if 'webpage' not in self._configuration_arg('skip'):
+ webpage, data = self._extract_webpage(url, item_id, fatal=webpage_fatal)
+ ytcfg = ytcfg or self.extract_ytcfg(item_id, webpage)
+ if not data:
+ if not ytcfg and self.is_authenticated:
+ msg = 'Playlists that require authentication may not extract correctly without a successful webpage download.'
+ if 'authcheck' not in self._configuration_arg('skip') and fatal:
+ raise ExtractorError(
+ msg + ' If you are not downloading private content, or your cookies are only for the first account and channel,'
+ ' pass "--extractor-args youtubetab:skip=authcheck" to skip this check',
+ expected=True)
+ self.report_warning(msg, only_once=True)
+ data = self._extract_tab_endpoint(url, item_id, ytcfg, fatal=fatal, default_client=default_client)
+ return data, ytcfg
+
+ def _extract_tab_endpoint(self, url, item_id, ytcfg=None, fatal=True, default_client='web'):
+ headers = self.generate_api_headers(ytcfg=ytcfg, default_client=default_client)
+ resolve_response = self._extract_response(
+ item_id=item_id, query={'url': url}, check_get_keys='endpoint', headers=headers, ytcfg=ytcfg, fatal=fatal,
+ ep='navigation/resolve_url', note='Downloading API parameters API JSON', default_client=default_client)
+ endpoints = {'browseEndpoint': 'browse', 'watchEndpoint': 'next'}
+ for ep_key, ep in endpoints.items():
+ params = try_get(resolve_response, lambda x: x['endpoint'][ep_key], dict)
+ if params:
+ return self._extract_response(
+ item_id=item_id, query=params, ep=ep, headers=headers,
+ ytcfg=ytcfg, fatal=fatal, default_client=default_client,
+ check_get_keys=('contents', 'currentVideoEndpoint'))
+ err_note = 'Failed to resolve url (does the playlist exist?)'
+ if fatal:
+ raise ExtractorError(err_note, expected=True)
+ self.report_warning(err_note, item_id)
+
+ @staticmethod
+ def _smuggle_data(entries, data):
+ for entry in entries:
+ if data:
+ entry['url'] = smuggle_url(entry['url'], data)
+ yield entry
def _real_extract(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
+ if self.is_music_url(url):
+ smuggled_data['is_music_url'] = True
+ info_dict = self.__real_extract(url, smuggled_data)
+ if info_dict.get('entries'):
+ info_dict['entries'] = self._smuggle_data(info_dict['entries'], smuggled_data)
+ return info_dict
+
+ _url_re = re.compile(r'(?P<pre>%s)(?(channel_type)(?P<tab>/\w+))?(?P<post>.*)$' % _VALID_URL)
+
+ def __real_extract(self, url, smuggled_data):
item_id = self._match_id(url)
url = compat_urlparse.urlunparse(
compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
+ compat_opts = self.get_param('compat_opts', [])
+
+ def get_mobj(url):
+ mobj = self._url_re.match(url).groupdict()
+ mobj.update((k, '') for k, v in mobj.items() if v is None)
+ return mobj
+
+ mobj = get_mobj(url)
+ # Youtube returns incomplete data if tabname is not lower case
+ pre, tab, post, is_channel = mobj['pre'], mobj['tab'].lower(), mobj['post'], not mobj['not_channel']
+ if is_channel:
+ if smuggled_data.get('is_music_url'):
+ if item_id[:2] == 'VL':
+ # Youtube music VL channels have an equivalent playlist
+ item_id = item_id[2:]
+ pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
+ elif item_id[:2] == 'MP':
+ # Resolve albums (/[channel/browse]/MP...) to their equivalent playlist
+ mdata = self._extract_tab_endpoint(
+ 'https://music.youtube.com/channel/%s' % item_id, item_id, default_client='web_music')
+ murl = traverse_obj(
+ mdata, ('microformat', 'microformatDataRenderer', 'urlCanonical'), get_all=False, expected_type=compat_str)
+ if not murl:
+ raise ExtractorError('Failed to resolve album to playlist.')
+ return self.url_result(murl, ie=YoutubeTabIE.ie_key())
+ elif mobj['channel_type'] == 'browse':
+ # Youtube music /browse/ should be changed to /channel/
+ pre = 'https://www.youtube.com/channel/%s' % item_id
+ if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts:
+ # Home URLs should redirect to /videos/
+ self.report_warning(
+ 'A channel/user page was given. All the channel\'s videos will be downloaded. '
+ 'To download only the videos in the home page, add a "/featured" to the URL')
+ tab = '/videos'
+
+ url = ''.join((pre, tab, post))
+ mobj = get_mobj(url)
+
# Handle both video/playlist URLs
qs = parse_qs(url)
video_id = qs.get('v', [None])[0]
playlist_id = qs.get('list', [None])[0]
+
+ if not video_id and mobj['not_channel'].startswith('watch'):
+ if not playlist_id:
+ # If there is neither video or playlist ids, youtube redirects to home page, which is undesirable
+ raise ExtractorError('Unable to recognize tab page')
+ # Common mistake: https://www.youtube.com/watch?list=playlist_id
+ self.report_warning('A video URL was given without video ID. Trying to download playlist %s' % playlist_id)
+ url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
+ mobj = get_mobj(url)
+
if video_id and playlist_id:
- if self._downloader.params.get('noplaylist'):
+ if self.get_param('noplaylist'):
self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
- return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
- webpage = self._download_webpage(url, item_id)
- data = self._extract_yt_initial_data(item_id, webpage)
+ return self.url_result(f'https://www.youtube.com/watch?v={video_id}', ie=YoutubeIE.ie_key(), video_id=video_id)
+ self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id))
+
+ data, ytcfg = self._extract_data(url, item_id)
+
+ tabs = try_get(
+ data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
+ if tabs:
+ selected_tab = self._extract_selected_tab(tabs)
+ tab_name = selected_tab.get('title', '')
+ if 'no-youtube-channel-redirect' not in compat_opts:
+ if mobj['tab'] == '/live':
+ # Live tab should have redirected to the video
+ raise ExtractorError('The channel is not currently live', expected=True)
+ if mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]:
+ if not mobj['not_channel'] and item_id[:2] == 'UC':
+ # Topic channels don't have /videos. Use the equivalent playlist instead
+ self.report_warning('The URL does not have a %s tab. Trying to redirect to playlist UU%s instead' % (mobj['tab'][1:], item_id[2:]))
+ pl_id = 'UU%s' % item_id[2:]
+ pl_url = 'https://www.youtube.com/playlist?list=%s%s' % (pl_id, mobj['post'])
+ try:
+ data, ytcfg, item_id, url = *self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True), pl_id, pl_url
+ except ExtractorError:
+ self.report_warning('The playlist gave error. Falling back to channel URL')
+ else:
+ self.report_warning('The URL does not have a %s tab. %s is being downloaded instead' % (mobj['tab'][1:], tab_name))
+
+ self.write_debug('Final URL: %s' % url)
+
+ # YouTube sometimes provides a button to reload playlist with unavailable videos.
+ if 'no-youtube-unavailable-videos' not in compat_opts:
+ data = self._reload_with_unavailable_videos(item_id, data, ytcfg) or data
+ self._extract_and_report_alerts(data, only_once=True)
tabs = try_get(
data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
if tabs:
- return self._extract_from_tabs(item_id, webpage, data, tabs)
+ return self._extract_from_tabs(item_id, ytcfg, data, tabs)
+
playlist = try_get(
data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
if playlist:
- return self._extract_from_playlist(item_id, url, data, playlist)
- # Fallback to video extraction if no playlist alike page is recognized.
- # First check for the current video then try the v attribute of URL query.
+ return self._extract_from_playlist(item_id, url, data, playlist, ytcfg)
+
video_id = try_get(
data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
compat_str) or video_id
if video_id:
- return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
- # Capture and output alerts
- alert = self._extract_alert(data)
- if alert:
- raise ExtractorError(alert, expected=True)
- # Failed to recognize
+ if mobj['tab'] != '/live': # live tab is expected to redirect to video
+ self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
+ return self.url_result(f'https://www.youtube.com/watch?v={video_id}', ie=YoutubeIE.ie_key(), video_id=video_id)
+
raise ExtractorError('Unable to recognize tab page')
@@ -2867,6 +4260,7 @@ class YoutubePlaylistIE(InfoExtractor):
'id': 'PLBB231211A4F62143',
'uploader': 'Wickydoo',
'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
+ 'description': 'md5:8fa6f52abb47a9552002fa3ddfc57fc2',
},
'playlist_mincount': 29,
}, {
@@ -2889,12 +4283,13 @@ class YoutubePlaylistIE(InfoExtractor):
}
}, {
'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
- 'playlist_mincount': 982,
+ 'playlist_mincount': 654,
'info_dict': {
'title': '2018 Chinese New Singles (11/6 updated)',
'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
'uploader': 'LBK',
'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
+ 'description': 'md5:da521864744d60a198e3a88af4db0d9d',
}
}, {
'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
@@ -2919,15 +4314,17 @@ class YoutubePlaylistIE(InfoExtractor):
def _real_extract(self, url):
playlist_id = self._match_id(url)
- qs = parse_qs(url)
- if not qs:
- qs = {'list': playlist_id}
- return self.url_result(
- update_url_query('https://www.youtube.com/playlist', qs),
- ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
+ is_music_url = YoutubeBaseInfoExtractor.is_music_url(url)
+ url = update_url_query(
+ 'https://www.youtube.com/playlist',
+ parse_qs(url) or {'list': playlist_id})
+ if is_music_url:
+ url = smuggle_url(url, {'is_music_url': True})
+ return self.url_result(url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
class YoutubeYtBeIE(InfoExtractor):
+ IE_DESC = 'youtu.be'
_VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
_TESTS = [{
'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
@@ -2955,7 +4352,7 @@ class YoutubeYtBeIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
playlist_id = mobj.group('playlist_id')
return self.url_result(
@@ -2967,6 +4364,7 @@ class YoutubeYtBeIE(InfoExtractor):
class YoutubeYtUserIE(InfoExtractor):
+ IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
_VALID_URL = r'ytuser:(?P<id>.+)'
_TESTS = [{
'url': 'ytuser:phihag',
@@ -2982,8 +4380,8 @@ class YoutubeYtUserIE(InfoExtractor):
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
IE_NAME = 'youtube:favorites'
- IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
- _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
+ IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
+ _VALID_URL = r':ytfav(?:ou?rite)?s?'
_LOGIN_REQUIRED = True
_TESTS = [{
'url': ':ytfav',
@@ -2999,8 +4397,8 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
ie=YoutubeTabIE.ie_key())
-class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
- IE_DESC = 'YouTube.com searches'
+class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE):
+ IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
# there doesn't appear to be a real limit, for example if you search for
# 'python' you get more than 8.000.000 results
_MAX_RESULTS = float('inf')
@@ -3009,27 +4407,17 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
_SEARCH_PARAMS = None
_TESTS = []
- def _entries(self, query, n):
- data = {
- 'context': {
- 'client': {
- 'clientName': 'WEB',
- 'clientVersion': '2.20201021.03.00',
- }
- },
- 'query': query,
- }
+ def _search_results(self, query):
+ data = {'query': query}
if self._SEARCH_PARAMS:
data['params'] = self._SEARCH_PARAMS
- total = 0
+ continuation = {}
for page_num in itertools.count(1):
- search = self._download_json(
- 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
- video_id='query "%s"' % query,
- note='Downloading page %s' % page_num,
- errnote='Unable to download API page', fatal=False,
- data=json.dumps(data).encode('utf8'),
- headers={'content-type': 'application/json'})
+ data.update(continuation)
+ search = self._extract_response(
+ item_id='query "%s" page %s' % (query, page_num), ep='search', query=data,
+ check_get_keys=('contents', 'onResponseReceivedCommands')
+ )
if not search:
break
slr_contents = try_get(
@@ -3039,7 +4427,15 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
list)
if not slr_contents:
break
+
+ # Youtube sometimes adds promoted content to searches,
+ # changing the index location of videos and token.
+ # So we search through all entries till we find them.
+ continuation = None
for slr_content in slr_contents:
+ if not continuation:
+ continuation = self._extract_continuation({'contents': [slr_content]})
+
isr_contents = try_get(
slr_content,
lambda x: x['itemSectionRenderer']['contents'],
@@ -3055,52 +4451,46 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
video_id = video.get('videoId')
if not video_id:
continue
+
yield self._extract_video(video)
- total += 1
- if total == n:
- return
- token = try_get(
- slr_contents,
- lambda x: x[-1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
- compat_str)
- if not token:
- break
- data['continuation'] = token
- def _get_n_results(self, query, n):
- """Get a specified number of results for a query"""
- return self.playlist_result(self._entries(query, n), query)
+ if not continuation:
+ break
class YoutubeSearchDateIE(YoutubeSearchIE):
IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
_SEARCH_KEY = 'ytsearchdate'
- IE_DESC = 'YouTube.com searches, newest videos first'
+ IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
_SEARCH_PARAMS = 'CAI%3D'
-r"""
class YoutubeSearchURLIE(YoutubeSearchIE):
IE_DESC = 'YouTube.com search URLs'
- IE_NAME = 'youtube:search_url'
- _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
+ IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
+ # _MAX_RESULTS = 100
_TESTS = [{
'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
'playlist_mincount': 5,
'info_dict': {
- 'title': 'hypervideo test video',
+ 'id': 'youtube-dl test video',
+ 'title': 'youtube-dl test video',
}
}, {
'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
'only_matching': True,
}]
+ @classmethod
+ def _make_valid_url(cls):
+ return cls._VALID_URL
+
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- query = compat_urllib_parse_unquote_plus(mobj.group('query'))
- webpage = self._download_webpage(url, query)
- return self.playlist_result(self._process_page(webpage), playlist_title=query)
-"""
+ qs = parse_qs(url)
+ query = (qs.get('search_query') or qs.get('q'))[0]
+ self._SEARCH_PARAMS = qs.get('sp', ('',))[0]
+ return self._get_n_results(query, self._MAX_RESULTS)
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
@@ -3109,14 +4499,12 @@ class YoutubeFeedsInfoExtractor(YoutubeTabIE):
Subclasses must define the _FEED_NAME property.
"""
_LOGIN_REQUIRED = True
+ _TESTS = []
@property
def IE_NAME(self):
return 'youtube:%s' % self._FEED_NAME
- def _real_initialize(self):
- self._login()
-
def _real_extract(self, url):
return self.url_result(
'https://www.youtube.com/feed/%s' % self._FEED_NAME,
@@ -3139,20 +4527,24 @@ class YoutubeWatchLaterIE(InfoExtractor):
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
- _VALID_URL = r':ytrec(?:ommended)?'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
_FEED_NAME = 'recommended'
+ _LOGIN_REQUIRED = False
_TESTS = [{
'url': ':ytrec',
'only_matching': True,
}, {
'url': ':ytrecommended',
'only_matching': True,
+ }, {
+ 'url': 'https://youtube.com',
+ 'only_matching': True,
}]
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
- IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
- _VALID_URL = r':ytsubs(?:criptions)?'
+ IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
+ _VALID_URL = r':ytsub(?:scription)?s?'
_FEED_NAME = 'subscriptions'
_TESTS = [{
'url': ':ytsubs',
@@ -3164,8 +4556,8 @@ class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
- IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
- _VALID_URL = r':ythistory'
+ IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
+ _VALID_URL = r':ythis(?:tory)?'
_FEED_NAME = 'history'
_TESTS = [{
'url': ':ythistory',
@@ -3216,12 +4608,22 @@ class YoutubeTruncatedURLIE(InfoExtractor):
raise ExtractorError(
'Did you forget to quote the URL? Remember that & is a meta '
'character in most shells, so you want to put the URL in quotes, '
- 'like hypervideo '
+ 'like youtube-dl '
'"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
- ' or simply hypervideo BaW_jenozKc .',
+ ' or simply youtube-dl BaW_jenozKc .',
expected=True)
+class YoutubeClipIE(InfoExtractor):
+ IE_NAME = 'youtube:clip'
+ IE_DESC = False # Do not list
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/clip/'
+
+ def _real_extract(self, url):
+ self.report_warning('YouTube clips are not currently supported. The entire video will be downloaded instead')
+ return self.url_result(url, 'Generic')
+
+
class YoutubeTruncatedIDIE(InfoExtractor):
IE_NAME = 'youtube:truncated_id'
IE_DESC = False # Do not list
diff --git a/hypervideo_dl/extractor/zapiks.py b/hypervideo_dl/extractor/zapiks.py
index f6496f5..161b011 100644
--- a/hypervideo_dl/extractor/zapiks.py
+++ b/hypervideo_dl/extractor/zapiks.py
@@ -46,7 +46,7 @@ class ZapiksIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
diff --git a/hypervideo_dl/extractor/zaq1.py b/hypervideo_dl/extractor/zaq1.py
new file mode 100644
index 0000000..889aff5
--- /dev/null
+++ b/hypervideo_dl/extractor/zaq1.py
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ unified_timestamp,
+)
+
+
+class Zaq1IE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'http://zaq1.pl/video/xev0e',
+ 'md5': '24a5eb3f052e604ae597c4d0d19b351e',
+ 'info_dict': {
+ 'id': 'xev0e',
+ 'title': 'DJ NA WESELE. TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa',
+ 'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147',
+ 'ext': 'mp4',
+ 'duration': 511,
+ 'timestamp': 1490896361,
+ 'uploader': 'Anonim',
+ 'upload_date': '20170330',
+ 'view_count': int,
+ }
+ }, {
+ # malformed JSON-LD
+ 'url': 'http://zaq1.pl/video/x81vn',
+ 'info_dict': {
+ 'id': 'x81vn',
+ 'title': 'SEKRETNE ŻYCIE WALTERA MITTY',
+ 'ext': 'mp4',
+ 'duration': 6234,
+ 'timestamp': 1493494860,
+ 'uploader': 'Anonim',
+ 'upload_date': '20170429',
+ 'view_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Failed to parse JSON'],
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._search_regex(
+ r'data-video-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+ 'video url', group='url')
+
+ info = self._search_json_ld(webpage, video_id, fatal=False)
+
+ def extract_data(field, name, fatal=False):
+ return self._search_regex(
+ r'data-%s=(["\'])(?P<field>(?:(?!\1).)+)\1' % field,
+ webpage, field, fatal=fatal, group='field')
+
+ if not info.get('title'):
+ info['title'] = extract_data('file-name', 'title', fatal=True)
+
+ if not info.get('duration'):
+ info['duration'] = int_or_none(extract_data('duration', 'duration'))
+
+ if not info.get('thumbnail'):
+ info['thumbnail'] = extract_data('photo-url', 'thumbnail')
+
+ if not info.get('timestamp'):
+ info['timestamp'] = unified_timestamp(self._html_search_meta(
+ 'uploadDate', webpage, 'timestamp'))
+
+ if not info.get('interactionCount'):
+ info['view_count'] = int_or_none(self._html_search_meta(
+ 'interactionCount', webpage, 'view count'))
+
+ uploader = self._html_search_regex(
+ r'Wideo dodał:\s*<a[^>]*>([^<]+)</a>', webpage, 'uploader',
+ fatal=False)
+
+ width = int_or_none(self._html_search_meta(
+ 'width', webpage, fatal=False))
+ height = int_or_none(self._html_search_meta(
+ 'height', webpage, fatal=False))
+
+ info.update({
+ 'id': video_id,
+ 'formats': [{
+ 'url': video_url,
+ 'width': width,
+ 'height': height,
+ 'http_headers': {
+ 'Referer': url,
+ },
+ }],
+ 'uploader': uploader,
+ })
+
+ return info
diff --git a/hypervideo_dl/extractor/zattoo.py b/hypervideo_dl/extractor/zattoo.py
index 6bac302..a13d124 100644
--- a/hypervideo_dl/extractor/zattoo.py
+++ b/hypervideo_dl/extractor/zattoo.py
@@ -182,7 +182,7 @@ class ZattooPlatformBaseIE(InfoExtractor):
else:
assert False
for this_format in this_formats:
- this_format['preference'] = preference
+ this_format['quality'] = preference
formats.extend(this_formats)
self._sort_formats(formats)
return formats
@@ -217,7 +217,7 @@ class QuicklineIE(QuicklineBaseIE):
}
def _real_extract(self, url):
- channel_name, video_id = re.match(self._VALID_URL, url).groups()
+ channel_name, video_id = self._match_valid_url(url).groups()
return self._extract_video(channel_name, video_id)
@@ -262,7 +262,7 @@ class ZattooIE(ZattooBaseIE):
}]
def _real_extract(self, url):
- channel_name, video_id, record_id = re.match(self._VALID_URL, url).groups()
+ channel_name, video_id, record_id = self._match_valid_url(url).groups()
return self._extract_video(channel_name, video_id, record_id)
diff --git a/hypervideo_dl/extractor/zdf.py b/hypervideo_dl/extractor/zdf.py
index 4dd56f6..8c279c5 100644
--- a/hypervideo_dl/extractor/zdf.py
+++ b/hypervideo_dl/extractor/zdf.py
@@ -14,6 +14,7 @@ from ..utils import (
orderedSet,
parse_codecs,
qualities,
+ str_or_none,
try_get,
unified_timestamp,
update_url_query,
@@ -49,35 +50,35 @@ class ZDFBaseIE(InfoExtractor):
def _extract_format(self, video_id, formats, format_urls, meta):
format_url = url_or_none(meta.get('url'))
- if not format_url:
- return
- if format_url in format_urls:
+ if not format_url or format_url in format_urls:
return
format_urls.add(format_url)
- mime_type = meta.get('mimeType')
- ext = determine_ext(format_url)
+
+ mime_type, ext = meta.get('mimeType'), determine_ext(format_url)
if mime_type == 'application/x-mpegURL' or ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
+ new_formats = self._extract_m3u8_formats(
format_url, video_id, 'mp4', m3u8_id='hls',
- entry_protocol='m3u8_native', fatal=False))
+ entry_protocol='m3u8_native', fatal=False)
elif mime_type == 'application/f4m+xml' or ext == 'f4m':
- formats.extend(self._extract_f4m_formats(
- update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False))
+ new_formats = self._extract_f4m_formats(
+ update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False)
else:
f = parse_codecs(meta.get('mimeCodec'))
- format_id = ['http']
- for p in (meta.get('type'), meta.get('quality')):
- if p and isinstance(p, compat_str):
- format_id.append(p)
+ if not f and meta.get('type'):
+ data = meta['type'].split('_')
+ if try_get(data, lambda x: x[2]) == ext:
+ f = {'vcodec': data[0], 'acodec': data[1]}
f.update({
'url': format_url,
- 'format_id': '-'.join(format_id),
- 'format_note': meta.get('quality'),
- 'language': meta.get('language'),
- 'quality': qualities(self._QUALITIES)(meta.get('quality')),
- 'preference': -10,
+ 'format_id': '-'.join(filter(str_or_none, ('http', meta.get('type'), meta.get('quality')))),
})
- formats.append(f)
+ new_formats = [f]
+ formats.extend(merge_dicts(f, {
+ 'format_note': ', '.join(filter(None, (meta.get('quality'), meta.get('class')))),
+ 'language': meta.get('language'),
+ 'language_preference': 10 if meta.get('class') == 'main' else -10 if meta.get('class') == 'ad' else -1,
+ 'quality': qualities(self._QUALITIES)(meta.get('quality')),
+ }) for f in new_formats)
def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer):
ptmd = self._call_api(
@@ -106,9 +107,10 @@ class ZDFBaseIE(InfoExtractor):
'type': f.get('type'),
'mimeType': f.get('mimeType'),
'quality': quality.get('quality'),
+ 'class': track.get('class'),
'language': track.get('language'),
})
- self._sort_formats(formats)
+ self._sort_formats(formats, ('hasaud', 'res', 'quality', 'language_preference'))
duration = float_or_none(try_get(
ptmd, lambda x: x['attributes']['duration']['value']), scale=1000)
diff --git a/hypervideo_dl/extractor/zee5.py b/hypervideo_dl/extractor/zee5.py
new file mode 100644
index 0000000..5366041
--- /dev/null
+++ b/hypervideo_dl/extractor/zee5.py
@@ -0,0 +1,244 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ parse_age_limit,
+ str_or_none,
+ try_get,
+ unified_strdate,
+ unified_timestamp,
+ url_or_none,
+)
+
+
+class Zee5IE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ (?:
+ zee5:|
+ (?:https?://)(?:www\.)?zee5\.com/(?:[^#?]+/)?
+ (?:
+ (?:tvshows|kids|zee5originals)(?:/[^#/?]+){3}
+ |movies/[^#/?]+
+ )/(?P<display_id>[^#/?]+)/
+ )
+ (?P<id>[^#/?]+)/?(?:$|[?#])
+ '''
+ _TESTS = [{
+ 'url': 'https://www.zee5.com/movies/details/krishna-the-birth/0-0-63098',
+ 'info_dict': {
+ 'id': '0-0-63098',
+ 'ext': 'mp4',
+ 'display_id': 'krishna-the-birth',
+ 'title': 'Krishna - The Birth',
+ 'duration': 4368,
+ 'average_rating': 4,
+ 'description': compat_str,
+ 'alt_title': 'Krishna - The Birth',
+ 'uploader': 'Zee Entertainment Enterprises Ltd',
+ 'release_date': '20060101',
+ 'upload_date': '20060101',
+ 'timestamp': 1136073600,
+ 'thumbnail': 'https://akamaividz.zee5.com/resources/0-0-63098/list/270x152/0063098_list_80888170.jpg',
+ 'tags': list
+ },
+ 'params': {
+ 'format': 'bv',
+ },
+ }, {
+ 'url': 'https://zee5.com/tvshows/details/krishna-balram/0-6-1871/episode-1-the-test-of-bramha/0-1-233402',
+ 'info_dict': {
+ 'id': '0-1-233402',
+ 'ext': 'mp4',
+ 'display_id': 'episode-1-the-test-of-bramha',
+ 'title': 'Episode 1 - The Test Of Bramha',
+ 'duration': 1336,
+ 'average_rating': 4,
+ 'description': compat_str,
+ 'alt_title': 'Episode 1 - The Test Of Bramha',
+ 'uploader': 'Zee Entertainment Enterprises Ltd',
+ 'release_date': '20090101',
+ 'upload_date': '20090101',
+ 'timestamp': 1230768000,
+ 'thumbnail': 'https://akamaividz.zee5.com/resources/0-1-233402/list/270x152/01233402_list.jpg',
+ 'series': 'Krishna Balram',
+ 'season_number': 1,
+ 'episode_number': 1,
+ 'tags': list,
+ },
+ 'params': {
+ 'format': 'bv',
+ },
+ }, {
+ 'url': 'https://www.zee5.com/hi/tvshows/details/kundali-bhagya/0-6-366/kundali-bhagya-march-08-2021/0-1-manual_7g9jv1os7730?country=IN',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.zee5.com/global/hi/tvshows/details/kundali-bhagya/0-6-366/kundali-bhagya-march-08-2021/0-1-manual_7g9jv1os7730',
+ 'only_matching': True
+ }]
+ _DETAIL_API_URL = 'https://spapi.zee5.com/singlePlayback/getDetails?content_id={}&device_id={}&platform_name=desktop_web&country=IN&check_parental_control=false'
+ _DEVICE_ID = 'iIxsxYf40cqO3koIkwzKHZhnJzHN13zb'
+ _USER_TOKEN = None
+ _LOGIN_HINT = 'Use "--username <mobile_number>" to login using otp or "--username token" and "--password <user_token>" to login using user token.'
+ _NETRC_MACHINE = 'zee5'
+
+ def _login(self):
+ username, password = self._get_login_info()
+ if username:
+ if len(username) == 10 and username.isdigit() and self._USER_TOKEN is None:
+ self.report_login()
+ otp_request_json = self._download_json('https://b2bapi.zee5.com/device/sendotp_v1.php?phoneno=91{}'.format(username),
+ None, note='Sending OTP')
+ if otp_request_json['code'] == 0:
+ self.to_screen(otp_request_json['message'])
+ else:
+ raise ExtractorError(otp_request_json['message'], expected=True)
+ otp_code = self._get_tfa_info('OTP')
+ otp_verify_json = self._download_json('https://b2bapi.zee5.com/device/verifyotp_v1.php?phoneno=91{}&otp={}&guest_token={}&platform=web'.format(username, otp_code, self._DEVICE_ID),
+ None, note='Verifying OTP', fatal=False)
+ if not otp_verify_json:
+ raise ExtractorError('Unable to verify OTP.', expected=True)
+ self._USER_TOKEN = otp_verify_json.get('token')
+ if not self._USER_TOKEN:
+ raise ExtractorError(otp_request_json['message'], expected=True)
+ elif username.lower() == 'token' and len(password) > 1198:
+ self._USER_TOKEN = password
+
+ def _real_initialize(self):
+ self._login()
+
+ def _real_extract(self, url):
+ video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
+ access_token_request = self._download_json(
+ 'https://useraction.zee5.com/token/platform_tokens.php?platform_name=web_app',
+ video_id, note='Downloading access token')
+ data = {
+ 'x-access-token': access_token_request['token']
+ }
+ if self._USER_TOKEN:
+ data['Authorization'] = 'bearer %s' % self._USER_TOKEN
+ else:
+ data['X-Z5-Guest-Token'] = self._DEVICE_ID
+
+ json_data = self._download_json(
+ self._DETAIL_API_URL.format(video_id, self._DEVICE_ID),
+ video_id, headers={'content-type': 'application/json'}, data=json.dumps(data).encode('utf-8'))
+ asset_data = json_data['assetDetails']
+ show_data = json_data.get('showDetails', {})
+ if 'premium' in asset_data['business_type']:
+ raise ExtractorError('Premium content is DRM protected.', expected=True)
+ if not asset_data.get('hls_url'):
+ self.raise_login_required(self._LOGIN_HINT, metadata_available=True, method=None)
+ formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(asset_data['hls_url'], video_id, 'mp4', fatal=False)
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for sub in asset_data.get('subtitle_url', []):
+ sub_url = sub.get('url')
+ if not sub_url:
+ continue
+ subtitles.setdefault(sub.get('language', 'en'), []).append({
+ 'url': self._proto_relative_url(sub_url),
+ })
+ subtitles = self._merge_subtitles(subtitles, m3u8_subs)
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': asset_data['title'],
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'duration': int_or_none(asset_data.get('duration')),
+ 'average_rating': int_or_none(asset_data.get('rating')),
+ 'description': str_or_none(asset_data.get('description')),
+ 'alt_title': str_or_none(asset_data.get('original_title')),
+ 'uploader': str_or_none(asset_data.get('content_owner')),
+ 'age_limit': parse_age_limit(asset_data.get('age_rating')),
+ 'release_date': unified_strdate(asset_data.get('release_date')),
+ 'timestamp': unified_timestamp(asset_data.get('release_date')),
+ 'thumbnail': url_or_none(asset_data.get('image_url')),
+ 'series': str_or_none(asset_data.get('tvshow_name')),
+ 'season': try_get(show_data, lambda x: x['seasons']['title'], str),
+ 'season_number': int_or_none(try_get(show_data, lambda x: x['seasons'][0]['orderid'])),
+ 'episode_number': int_or_none(try_get(asset_data, lambda x: x['orderid'])),
+ 'tags': try_get(asset_data, lambda x: x['tags'], list)
+ }
+
+
+class Zee5SeriesIE(InfoExtractor):
+ IE_NAME = 'zee5:series'
+ _VALID_URL = r'''(?x)
+ (?:
+ zee5:series:|
+ (?:https?://)(?:www\.)?zee5\.com/(?:[^#?]+/)?
+ (?:tvshows|kids|zee5originals)(?:/[^#/?]+){2}/
+ )
+ (?P<id>[^#/?]+)/?(?:$|[?#])
+ '''
+ _TESTS = [{
+ 'url': 'https://www.zee5.com/kids/kids-shows/krishna-balram/0-6-1871',
+ 'playlist_mincount': 43,
+ 'info_dict': {
+ 'id': '0-6-1871',
+ },
+ }, {
+ 'url': 'https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199',
+ 'playlist_mincount': 1500,
+ 'info_dict': {
+ 'id': '0-6-199',
+ },
+ }, {
+ 'url': 'https://www.zee5.com/tvshows/details/agent-raghav-crime-branch/0-6-965',
+ 'playlist_mincount': 24,
+ 'info_dict': {
+ 'id': '0-6-965',
+ },
+ }, {
+ 'url': 'https://www.zee5.com/ta/tvshows/details/nagabhairavi/0-6-3201',
+ 'playlist_mincount': 3,
+ 'info_dict': {
+ 'id': '0-6-3201',
+ },
+ }, {
+ 'url': 'https://www.zee5.com/global/hi/tvshows/details/khwaabon-ki-zamin-par/0-6-270',
+ 'playlist_mincount': 150,
+ 'info_dict': {
+ 'id': '0-6-270',
+ },
+ }
+ ]
+
+ def _entries(self, show_id):
+ access_token_request = self._download_json(
+ 'https://useraction.zee5.com/token/platform_tokens.php?platform_name=web_app',
+ show_id, note='Downloading access token')
+ headers = {
+ 'X-Access-Token': access_token_request['token'],
+ 'Referer': 'https://www.zee5.com/',
+ }
+ show_url = 'https://gwapi.zee5.com/content/tvshow/{}?translation=en&country=IN'.format(show_id)
+
+ page_num = 0
+ show_json = self._download_json(show_url, video_id=show_id, headers=headers)
+ for season in show_json.get('seasons') or []:
+ season_id = try_get(season, lambda x: x['id'], compat_str)
+ next_url = 'https://gwapi.zee5.com/content/tvshow/?season_id={}&type=episode&translation=en&country=IN&on_air=false&asset_subtype=tvshow&page=1&limit=100'.format(season_id)
+ while next_url:
+ page_num += 1
+ episodes_json = self._download_json(
+ next_url, video_id=show_id, headers=headers,
+ note='Downloading JSON metadata page %d' % page_num)
+ for episode in try_get(episodes_json, lambda x: x['episode'], list) or []:
+ video_id = episode.get('id')
+ yield self.url_result(
+ 'zee5:%s' % video_id,
+ ie=Zee5IE.ie_key(), video_id=video_id)
+ next_url = url_or_none(episodes_json.get('next_episode_api'))
+
+ def _real_extract(self, url):
+ show_id = self._match_id(url)
+ return self.playlist_result(self._entries(show_id), playlist_id=show_id)
diff --git a/hypervideo_dl/extractor/zingmp3.py b/hypervideo_dl/extractor/zingmp3.py
index 207c04f..a3edc15 100644
--- a/hypervideo_dl/extractor/zingmp3.py
+++ b/hypervideo_dl/extractor/zingmp3.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
int_or_none,
)
@@ -48,8 +47,8 @@ class ZingMp3BaseIE(InfoExtractor):
return
msg = item['msg']
if msg == 'Sorry, this content is not available in your country.':
- self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
- raise ExtractorError(msg, expected=True)
+ self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
+ self.raise_no_formats(msg, expected=True)
self._sort_formats(formats)
subtitles = None
diff --git a/hypervideo_dl/extractor/zoom.py b/hypervideo_dl/extractor/zoom.py
index db073d9..25a0902 100644
--- a/hypervideo_dl/extractor/zoom.py
+++ b/hypervideo_dl/extractor/zoom.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from ..utils import (
@@ -10,6 +9,7 @@ from ..utils import (
js_to_json,
parse_filesize,
urlencode_postdata,
+ urljoin,
)
@@ -27,7 +27,7 @@ class ZoomIE(InfoExtractor):
}
def _real_extract(self, url):
- base_url, play_id = re.match(self._VALID_URL, url).groups()
+ base_url, play_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, play_id)
try:
@@ -35,7 +35,7 @@ class ZoomIE(InfoExtractor):
except ExtractorError:
form = None
if form:
- password = self._downloader.params.get('videopassword')
+ password = self.get_param('videopassword')
if not password:
raise ExtractorError(
'This video is protected by a passcode, use the --video-password option', expected=True)
@@ -55,10 +55,19 @@ class ZoomIE(InfoExtractor):
r'(?s)window\.__data__\s*=\s*({.+?});',
webpage, 'data'), play_id, js_to_json)
+ subtitles = {}
+ for _type in ('transcript', 'cc'):
+ if data.get('%sUrl' % _type):
+ subtitles[_type] = [{
+ 'url': urljoin(base_url, data['%sUrl' % _type]),
+ 'ext': 'vtt',
+ }]
+
return {
'id': play_id,
'title': data['topic'],
'url': data['viewMp4Url'],
+ 'subtitles': subtitles,
'width': int_or_none(data.get('viewResolvtionsWidth')),
'height': int_or_none(data.get('viewResolvtionsHeight')),
'http_headers': {
diff --git a/hypervideo_dl/extractor/zype.py b/hypervideo_dl/extractor/zype.py
index f20f953..7663cb3 100644
--- a/hypervideo_dl/extractor/zype.py
+++ b/hypervideo_dl/extractor/zype.py
@@ -56,6 +56,8 @@ class ZypeIE(InfoExtractor):
video = response['video']
title = video['title']
+ subtitles = {}
+
if isinstance(body, dict):
formats = []
for output in body.get('outputs', []):
@@ -64,7 +66,7 @@ class ZypeIE(InfoExtractor):
continue
name = output.get('name')
if name == 'm3u8':
- formats = self._extract_m3u8_formats(
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
output_url, video_id, 'mp4',
'm3u8_native', m3u8_id='hls', fatal=False)
else:
@@ -97,7 +99,7 @@ class ZypeIE(InfoExtractor):
if get_attr('integration') == 'verizon-media':
m3u8_url = 'https://content.uplynk.com/%s.m3u8' % get_attr('id')
- formats = self._extract_m3u8_formats(
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
text_tracks = self._search_regex(
r'textTracks\s*:\s*(\[[^]]+\])',
@@ -107,7 +109,6 @@ class ZypeIE(InfoExtractor):
text_tracks, video_id, js_to_json, False)
self._sort_formats(formats)
- subtitles = {}
if text_tracks:
for text_track in text_tracks:
tt_url = dict_get(text_track, ('file', 'src'))