From eaeeef9c1d1bedb76fea953c332ef84d53bffe2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs?= Date: Fri, 2 Dec 2022 05:21:10 +0800 Subject: update from upstream --- hypervideo_dl/extractor/__init__.py | 52 +- hypervideo_dl/extractor/_extractors.py | 2354 +++++++++++++++ hypervideo_dl/extractor/abc.py | 5 - hypervideo_dl/extractor/abcnews.py | 4 - hypervideo_dl/extractor/abcotvs.py | 6 - hypervideo_dl/extractor/abematv.py | 254 +- hypervideo_dl/extractor/academicearth.py | 2 - hypervideo_dl/extractor/acast.py | 4 - hypervideo_dl/extractor/acfun.py | 199 ++ hypervideo_dl/extractor/adn.py | 50 +- hypervideo_dl/extractor/adobeconnect.py | 3 - hypervideo_dl/extractor/adobepass.py | 98 +- hypervideo_dl/extractor/adobetv.py | 5 +- hypervideo_dl/extractor/adultswim.py | 4 - hypervideo_dl/extractor/aenetworks.py | 17 +- hypervideo_dl/extractor/aeonco.py | 40 + hypervideo_dl/extractor/afreecatv.py | 71 +- hypervideo_dl/extractor/agora.py | 251 ++ hypervideo_dl/extractor/airmozilla.py | 3 - hypervideo_dl/extractor/aliexpress.py | 3 - hypervideo_dl/extractor/aljazeera.py | 3 - hypervideo_dl/extractor/allocine.py | 5 - hypervideo_dl/extractor/alphaporno.py | 2 - hypervideo_dl/extractor/alsace20tv.py | 4 - hypervideo_dl/extractor/alura.py | 7 +- hypervideo_dl/extractor/amara.py | 3 - hypervideo_dl/extractor/amazon.py | 31 +- hypervideo_dl/extractor/amazonminitv.py | 290 ++ hypervideo_dl/extractor/amcnetworks.py | 6 +- hypervideo_dl/extractor/americastestkitchen.py | 57 +- hypervideo_dl/extractor/amp.py | 7 +- hypervideo_dl/extractor/angel.py | 56 + hypervideo_dl/extractor/animelab.py | 278 -- hypervideo_dl/extractor/animeondemand.py | 284 -- hypervideo_dl/extractor/ant1newsgr.py | 19 +- hypervideo_dl/extractor/anvato.py | 233 +- .../extractor/anvato_token_generator/__init__.py | 7 - .../extractor/anvato_token_generator/common.py | 6 - .../extractor/anvato_token_generator/nfl.py | 30 - hypervideo_dl/extractor/aol.py | 6 +- hypervideo_dl/extractor/apa.py | 15 +- 
hypervideo_dl/extractor/aparat.py | 5 +- hypervideo_dl/extractor/appleconnect.py | 3 - hypervideo_dl/extractor/applepodcasts.py | 3 - hypervideo_dl/extractor/appletrailers.py | 5 - hypervideo_dl/extractor/archiveorg.py | 437 ++- hypervideo_dl/extractor/arcpublishing.py | 8 +- hypervideo_dl/extractor/ard.py | 7 - hypervideo_dl/extractor/arkena.py | 17 +- hypervideo_dl/extractor/arnes.py | 6 +- hypervideo_dl/extractor/arte.py | 373 +-- hypervideo_dl/extractor/asiancrush.py | 3 - hypervideo_dl/extractor/atresplayer.py | 5 - hypervideo_dl/extractor/atscaleconf.py | 34 + hypervideo_dl/extractor/atttechchannel.py | 2 - hypervideo_dl/extractor/atvat.py | 4 - hypervideo_dl/extractor/audimedia.py | 4 - hypervideo_dl/extractor/audioboom.py | 76 +- hypervideo_dl/extractor/audiodraft.py | 93 + hypervideo_dl/extractor/audiomack.py | 3 - hypervideo_dl/extractor/audius.py | 11 +- hypervideo_dl/extractor/awaan.py | 5 +- hypervideo_dl/extractor/aws.py | 5 +- hypervideo_dl/extractor/azmedien.py | 3 - hypervideo_dl/extractor/baidu.py | 4 - hypervideo_dl/extractor/banbye.py | 5 - hypervideo_dl/extractor/bandaichannel.py | 7 +- hypervideo_dl/extractor/bandcamp.py | 23 +- hypervideo_dl/extractor/bannedvideo.py | 3 - hypervideo_dl/extractor/bbc.py | 56 +- hypervideo_dl/extractor/beatport.py | 4 - hypervideo_dl/extractor/beeg.py | 4 - hypervideo_dl/extractor/behindkink.py | 4 - hypervideo_dl/extractor/bellmedia.py | 14 +- hypervideo_dl/extractor/berufetv.py | 70 + hypervideo_dl/extractor/bet.py | 2 - hypervideo_dl/extractor/bfi.py | 3 - hypervideo_dl/extractor/bfmtv.py | 5 +- hypervideo_dl/extractor/bibeltv.py | 3 - hypervideo_dl/extractor/bigflix.py | 5 - hypervideo_dl/extractor/bigo.py | 15 +- hypervideo_dl/extractor/bild.py | 3 - hypervideo_dl/extractor/bilibili.py | 1007 ++++--- hypervideo_dl/extractor/biobiochiletv.py | 3 - hypervideo_dl/extractor/biqle.py | 4 - hypervideo_dl/extractor/bitchute.py | 279 +- hypervideo_dl/extractor/bitwave.py | 3 - 
hypervideo_dl/extractor/blackboardcollaborate.py | 4 - hypervideo_dl/extractor/bleacherreport.py | 3 - hypervideo_dl/extractor/blinkx.py | 86 - hypervideo_dl/extractor/blogger.py | 11 +- hypervideo_dl/extractor/bloomberg.py | 14 +- hypervideo_dl/extractor/bokecc.py | 6 - hypervideo_dl/extractor/bongacams.py | 21 +- hypervideo_dl/extractor/booyah.py | 86 + hypervideo_dl/extractor/bostonglobe.py | 3 - hypervideo_dl/extractor/box.py | 5 - hypervideo_dl/extractor/bpb.py | 6 - hypervideo_dl/extractor/br.py | 5 - hypervideo_dl/extractor/bravotv.py | 3 - hypervideo_dl/extractor/breakcom.py | 4 - hypervideo_dl/extractor/breitbart.py | 6 +- hypervideo_dl/extractor/brightcove.py | 530 +++- hypervideo_dl/extractor/bundesliga.py | 34 + hypervideo_dl/extractor/businessinsider.py | 3 - hypervideo_dl/extractor/buzzfeed.py | 5 +- hypervideo_dl/extractor/byutv.py | 4 - hypervideo_dl/extractor/c56.py | 5 - hypervideo_dl/extractor/cableav.py | 2 - hypervideo_dl/extractor/callin.py | 6 +- hypervideo_dl/extractor/caltrans.py | 4 - hypervideo_dl/extractor/cam4.py | 4 - hypervideo_dl/extractor/camdemy.py | 3 - hypervideo_dl/extractor/cammodels.py | 4 - hypervideo_dl/extractor/camsoda.py | 57 + hypervideo_dl/extractor/camtasia.py | 71 + hypervideo_dl/extractor/camtube.py | 71 - hypervideo_dl/extractor/camwithher.py | 2 - hypervideo_dl/extractor/canalalpha.py | 4 - hypervideo_dl/extractor/canalc2.py | 5 - hypervideo_dl/extractor/canalplus.py | 5 - hypervideo_dl/extractor/canvas.py | 2 - hypervideo_dl/extractor/carambatv.py | 4 - hypervideo_dl/extractor/cartoonnetwork.py | 3 - hypervideo_dl/extractor/cbc.py | 9 +- hypervideo_dl/extractor/cbs.py | 5 +- hypervideo_dl/extractor/cbsinteractive.py | 6 +- hypervideo_dl/extractor/cbslocal.py | 7 +- hypervideo_dl/extractor/cbsnews.py | 8 +- hypervideo_dl/extractor/cbssports.py | 4 - hypervideo_dl/extractor/ccc.py | 5 +- hypervideo_dl/extractor/ccma.py | 4 - hypervideo_dl/extractor/cctv.py | 5 - hypervideo_dl/extractor/cda.py | 97 +- 
hypervideo_dl/extractor/cellebrite.py | 63 + hypervideo_dl/extractor/ceskatelevize.py | 77 +- hypervideo_dl/extractor/cgtn.py | 3 - hypervideo_dl/extractor/channel9.py | 10 +- hypervideo_dl/extractor/charlierose.py | 4 - hypervideo_dl/extractor/chaturbate.py | 3 - hypervideo_dl/extractor/chilloutzone.py | 2 - hypervideo_dl/extractor/chingari.py | 18 +- hypervideo_dl/extractor/chirbit.py | 3 - hypervideo_dl/extractor/cinchcast.py | 6 +- hypervideo_dl/extractor/cinemax.py | 4 - hypervideo_dl/extractor/cinetecamilano.py | 61 + hypervideo_dl/extractor/ciscolive.py | 3 - hypervideo_dl/extractor/ciscowebex.py | 4 - hypervideo_dl/extractor/cjsw.py | 4 - hypervideo_dl/extractor/cliphunter.py | 3 - hypervideo_dl/extractor/clippit.py | 4 - hypervideo_dl/extractor/cliprs.py | 3 - hypervideo_dl/extractor/clipsyndicate.py | 2 - hypervideo_dl/extractor/closertotruth.py | 3 - hypervideo_dl/extractor/cloudflarestream.py | 16 +- hypervideo_dl/extractor/cloudy.py | 3 - hypervideo_dl/extractor/clubic.py | 4 - hypervideo_dl/extractor/clyp.py | 3 - hypervideo_dl/extractor/cmt.py | 4 +- hypervideo_dl/extractor/cnbc.py | 4 - hypervideo_dl/extractor/cnn.py | 60 +- hypervideo_dl/extractor/comedycentral.py | 2 - hypervideo_dl/extractor/common.py | 1459 +++++----- hypervideo_dl/extractor/commonmistakes.py | 12 +- hypervideo_dl/extractor/commonprotocols.py | 8 +- hypervideo_dl/extractor/condenast.py | 9 +- hypervideo_dl/extractor/contv.py | 5 - hypervideo_dl/extractor/corus.py | 7 +- hypervideo_dl/extractor/coub.py | 5 - hypervideo_dl/extractor/cozytv.py | 3 - hypervideo_dl/extractor/cpac.py | 12 - hypervideo_dl/extractor/cracked.py | 2 - hypervideo_dl/extractor/crackle.py | 4 - hypervideo_dl/extractor/craftsy.py | 3 - hypervideo_dl/extractor/crooksandliars.py | 5 +- hypervideo_dl/extractor/crowdbunker.py | 4 - hypervideo_dl/extractor/crunchyroll.py | 921 +----- hypervideo_dl/extractor/cspan.py | 8 +- hypervideo_dl/extractor/ctsnews.py | 3 - hypervideo_dl/extractor/ctv.py | 3 - 
hypervideo_dl/extractor/ctvnews.py | 3 - hypervideo_dl/extractor/cultureunplugged.py | 2 - hypervideo_dl/extractor/curiositystream.py | 19 +- hypervideo_dl/extractor/cwtv.py | 4 +- hypervideo_dl/extractor/cybrary.py | 4 +- hypervideo_dl/extractor/daftsex.py | 5 - hypervideo_dl/extractor/dailymail.py | 13 +- hypervideo_dl/extractor/dailymotion.py | 36 +- hypervideo_dl/extractor/dailywire.py | 113 + hypervideo_dl/extractor/damtomo.py | 4 - hypervideo_dl/extractor/daum.py | 6 +- hypervideo_dl/extractor/daystar.py | 1 - hypervideo_dl/extractor/dbtv.py | 12 +- hypervideo_dl/extractor/dctp.py | 3 - hypervideo_dl/extractor/deezer.py | 4 - hypervideo_dl/extractor/defense.py | 2 - hypervideo_dl/extractor/democracynow.py | 5 - hypervideo_dl/extractor/detik.py | 159 ++ hypervideo_dl/extractor/deuxm.py | 76 + hypervideo_dl/extractor/dfb.py | 4 - hypervideo_dl/extractor/dhm.py | 2 - hypervideo_dl/extractor/digg.py | 2 - hypervideo_dl/extractor/digitalconcerthall.py | 6 +- hypervideo_dl/extractor/digiteka.py | 16 +- hypervideo_dl/extractor/discovery.py | 2 - hypervideo_dl/extractor/discoverygo.py | 3 - hypervideo_dl/extractor/discoverynetworks.py | 42 - hypervideo_dl/extractor/discoveryplusindia.py | 98 - hypervideo_dl/extractor/discoveryvr.py | 59 - hypervideo_dl/extractor/disney.py | 4 - hypervideo_dl/extractor/dispeak.py | 3 - hypervideo_dl/extractor/dlive.py | 4 - hypervideo_dl/extractor/doodstream.py | 76 - hypervideo_dl/extractor/dotsub.py | 2 - hypervideo_dl/extractor/douyutv.py | 3 - hypervideo_dl/extractor/dplay.py | 83 +- hypervideo_dl/extractor/drbonanza.py | 4 - hypervideo_dl/extractor/dreisat.py | 4 +- hypervideo_dl/extractor/drooble.py | 3 - hypervideo_dl/extractor/dropbox.py | 8 +- hypervideo_dl/extractor/dropout.py | 34 +- hypervideo_dl/extractor/drtuber.py | 10 +- hypervideo_dl/extractor/drtv.py | 51 +- hypervideo_dl/extractor/dtube.py | 3 - hypervideo_dl/extractor/duboku.py | 53 +- hypervideo_dl/extractor/dumpert.py | 4 - hypervideo_dl/extractor/dvtv.py | 4 - 
hypervideo_dl/extractor/dw.py | 4 - hypervideo_dl/extractor/eagleplatform.py | 39 +- hypervideo_dl/extractor/ebaumsworld.py | 2 - hypervideo_dl/extractor/echomsk.py | 3 - hypervideo_dl/extractor/egghead.py | 4 - hypervideo_dl/extractor/ehow.py | 2 - hypervideo_dl/extractor/eighttracks.py | 3 - hypervideo_dl/extractor/einthusan.py | 5 - hypervideo_dl/extractor/eitb.py | 5 - hypervideo_dl/extractor/ellentube.py | 4 - hypervideo_dl/extractor/elonet.py | 4 - hypervideo_dl/extractor/elpais.py | 3 - hypervideo_dl/extractor/embedly.py | 14 +- hypervideo_dl/extractor/engadget.py | 2 - hypervideo_dl/extractor/epicon.py | 4 - hypervideo_dl/extractor/epoch.py | 55 + hypervideo_dl/extractor/eporner.py | 5 - hypervideo_dl/extractor/eroprofile.py | 2 - hypervideo_dl/extractor/ertgr.py | 26 +- hypervideo_dl/extractor/escapist.py | 3 - hypervideo_dl/extractor/espn.py | 165 +- hypervideo_dl/extractor/esri.py | 4 - hypervideo_dl/extractor/europa.py | 4 - hypervideo_dl/extractor/europeantour.py | 3 - hypervideo_dl/extractor/eurosport.py | 97 + hypervideo_dl/extractor/euscreen.py | 4 - hypervideo_dl/extractor/everyonesmixtape.py | 76 - hypervideo_dl/extractor/expotv.py | 3 - hypervideo_dl/extractor/expressen.py | 21 +- hypervideo_dl/extractor/extractors.py | 2162 +------------- hypervideo_dl/extractor/extremetube.py | 4 +- hypervideo_dl/extractor/eyedotv.py | 3 - hypervideo_dl/extractor/facebook.py | 79 +- hypervideo_dl/extractor/fancode.py | 5 +- hypervideo_dl/extractor/faz.py | 4 - hypervideo_dl/extractor/fc2.py | 26 +- hypervideo_dl/extractor/fczenit.py | 5 - hypervideo_dl/extractor/fifa.py | 94 + hypervideo_dl/extractor/filmmodu.py | 5 - hypervideo_dl/extractor/filmon.py | 5 - hypervideo_dl/extractor/filmweb.py | 3 - hypervideo_dl/extractor/firsttv.py | 4 - hypervideo_dl/extractor/fivemin.py | 54 - hypervideo_dl/extractor/fivetv.py | 6 +- hypervideo_dl/extractor/flickr.py | 5 +- hypervideo_dl/extractor/folketinget.py | 4 - hypervideo_dl/extractor/footyroom.py | 3 - 
hypervideo_dl/extractor/formula1.py | 3 - hypervideo_dl/extractor/fourtube.py | 3 - hypervideo_dl/extractor/fourzerostudio.py | 106 + hypervideo_dl/extractor/fox.py | 10 +- hypervideo_dl/extractor/fox9.py | 3 - hypervideo_dl/extractor/foxgay.py | 6 +- hypervideo_dl/extractor/foxnews.py | 43 +- hypervideo_dl/extractor/foxsports.py | 2 - hypervideo_dl/extractor/fptplay.py | 41 +- hypervideo_dl/extractor/franceculture.py | 128 - hypervideo_dl/extractor/franceinter.py | 3 - hypervideo_dl/extractor/francetv.py | 10 +- hypervideo_dl/extractor/freesound.py | 3 - hypervideo_dl/extractor/freespeech.py | 2 - hypervideo_dl/extractor/freetv.py | 139 + hypervideo_dl/extractor/freshlive.py | 83 - hypervideo_dl/extractor/frontendmasters.py | 4 - hypervideo_dl/extractor/fujitv.py | 12 +- hypervideo_dl/extractor/funimation.py | 14 +- hypervideo_dl/extractor/funk.py | 4 - hypervideo_dl/extractor/fusion.py | 3 - hypervideo_dl/extractor/fuyintv.py | 30 + hypervideo_dl/extractor/fxnetworks.py | 77 - hypervideo_dl/extractor/gab.py | 6 - hypervideo_dl/extractor/gaia.py | 5 - hypervideo_dl/extractor/gameinformer.py | 3 - hypervideo_dl/extractor/gamejolt.py | 1 - hypervideo_dl/extractor/gamespot.py | 4 - hypervideo_dl/extractor/gamestar.py | 4 - hypervideo_dl/extractor/gaskrank.py | 4 - hypervideo_dl/extractor/gazeta.py | 4 - hypervideo_dl/extractor/gdcvault.py | 2 - hypervideo_dl/extractor/gedidigital.py | 36 +- hypervideo_dl/extractor/generic.py | 1922 ++----------- hypervideo_dl/extractor/genericembeds.py | 114 + hypervideo_dl/extractor/genius.py | 127 + hypervideo_dl/extractor/gettr.py | 7 - hypervideo_dl/extractor/gfycat.py | 17 +- hypervideo_dl/extractor/giantbomb.py | 4 - hypervideo_dl/extractor/giga.py | 13 +- hypervideo_dl/extractor/gigya.py | 2 - hypervideo_dl/extractor/glide.py | 5 +- hypervideo_dl/extractor/globo.py | 24 +- hypervideo_dl/extractor/glomex.py | 16 +- hypervideo_dl/extractor/go.py | 59 +- hypervideo_dl/extractor/godtube.py | 3 - hypervideo_dl/extractor/gofile.py | 
53 +- hypervideo_dl/extractor/golem.py | 4 - hypervideo_dl/extractor/goodgame.py | 57 + hypervideo_dl/extractor/googledrive.py | 68 +- hypervideo_dl/extractor/googlepodcasts.py | 3 - hypervideo_dl/extractor/googlesearch.py | 2 - hypervideo_dl/extractor/goplay.py | 394 +++ hypervideo_dl/extractor/gopro.py | 5 - hypervideo_dl/extractor/goshgay.py | 3 - hypervideo_dl/extractor/gotostage.py | 3 - hypervideo_dl/extractor/gputechconf.py | 3 - hypervideo_dl/extractor/gronkh.py | 76 +- hypervideo_dl/extractor/groupon.py | 2 - hypervideo_dl/extractor/harpodeon.py | 70 + hypervideo_dl/extractor/hbo.py | 4 - hypervideo_dl/extractor/hearthisat.py | 5 - hypervideo_dl/extractor/heise.py | 73 +- hypervideo_dl/extractor/hellporno.py | 3 - hypervideo_dl/extractor/helsinki.py | 5 - hypervideo_dl/extractor/hentaistigma.py | 2 - hypervideo_dl/extractor/hgtv.py | 3 - hypervideo_dl/extractor/hidive.py | 6 +- hypervideo_dl/extractor/historicfilms.py | 2 - hypervideo_dl/extractor/hitbox.py | 13 +- hypervideo_dl/extractor/hitrecord.py | 2 - hypervideo_dl/extractor/hketv.py | 4 - hypervideo_dl/extractor/holodex.py | 100 + hypervideo_dl/extractor/hornbunny.py | 49 - hypervideo_dl/extractor/hotnewhiphop.py | 2 - hypervideo_dl/extractor/hotstar.py | 291 +- hypervideo_dl/extractor/howcast.py | 2 - hypervideo_dl/extractor/howstuffworks.py | 4 - hypervideo_dl/extractor/hrfensehen.py | 58 +- hypervideo_dl/extractor/hrti.py | 4 - hypervideo_dl/extractor/hse.py | 2 - hypervideo_dl/extractor/huajiao.py | 3 - hypervideo_dl/extractor/huffpost.py | 5 +- hypervideo_dl/extractor/hungama.py | 48 +- hypervideo_dl/extractor/huya.py | 14 +- hypervideo_dl/extractor/hypem.py | 2 - hypervideo_dl/extractor/hytale.py | 58 + hypervideo_dl/extractor/icareus.py | 179 ++ hypervideo_dl/extractor/ichinanalive.py | 7 - hypervideo_dl/extractor/ign.py | 4 - hypervideo_dl/extractor/iheart.py | 3 - hypervideo_dl/extractor/iltalehti.py | 51 + hypervideo_dl/extractor/imdb.py | 3 - hypervideo_dl/extractor/imggaming.py | 4 - 
hypervideo_dl/extractor/imgur.py | 6 +- hypervideo_dl/extractor/ina.py | 110 +- hypervideo_dl/extractor/inc.py | 2 - hypervideo_dl/extractor/indavideo.py | 28 +- hypervideo_dl/extractor/infoq.py | 15 +- hypervideo_dl/extractor/instagram.py | 359 ++- hypervideo_dl/extractor/internazionale.py | 4 - hypervideo_dl/extractor/internetvideoarchive.py | 3 - hypervideo_dl/extractor/iprima.py | 8 +- hypervideo_dl/extractor/iqiyi.py | 18 +- hypervideo_dl/extractor/ir90tv.py | 42 - hypervideo_dl/extractor/islamchannel.py | 81 + hypervideo_dl/extractor/israelnationalnews.py | 50 + hypervideo_dl/extractor/itprotv.py | 2 - hypervideo_dl/extractor/itv.py | 4 - hypervideo_dl/extractor/ivi.py | 5 +- hypervideo_dl/extractor/ivideon.py | 5 - hypervideo_dl/extractor/iwara.py | 137 +- hypervideo_dl/extractor/ixigua.py | 83 + hypervideo_dl/extractor/izlesene.py | 4 - hypervideo_dl/extractor/jable.py | 103 + hypervideo_dl/extractor/jamendo.py | 41 +- hypervideo_dl/extractor/japandiet.py | 274 ++ hypervideo_dl/extractor/jeuxvideo.py | 5 - hypervideo_dl/extractor/jixie.py | 47 + hypervideo_dl/extractor/joj.py | 17 +- hypervideo_dl/extractor/jove.py | 3 - hypervideo_dl/extractor/jwplatform.py | 46 +- hypervideo_dl/extractor/kakao.py | 6 +- hypervideo_dl/extractor/kaltura.py | 265 +- hypervideo_dl/extractor/kanal2.py | 66 + hypervideo_dl/extractor/kanalplay.py | 96 - hypervideo_dl/extractor/kankan.py | 48 - hypervideo_dl/extractor/karaoketv.py | 3 - hypervideo_dl/extractor/karrierevideos.py | 3 - hypervideo_dl/extractor/keezmovies.py | 11 +- hypervideo_dl/extractor/kelbyone.py | 4 - hypervideo_dl/extractor/ketnet.py | 2 - hypervideo_dl/extractor/khanacademy.py | 19 +- hypervideo_dl/extractor/kicker.py | 55 + hypervideo_dl/extractor/kickstarter.py | 3 - hypervideo_dl/extractor/kinja.py | 17 +- hypervideo_dl/extractor/kinopoisk.py | 4 - hypervideo_dl/extractor/kompas.py | 26 + hypervideo_dl/extractor/konserthusetplay.py | 5 - hypervideo_dl/extractor/koo.py | 3 - 
hypervideo_dl/extractor/krasview.py | 3 - hypervideo_dl/extractor/kth.py | 28 + hypervideo_dl/extractor/ku6.py | 2 - hypervideo_dl/extractor/kusi.py | 10 +- hypervideo_dl/extractor/kuwo.py | 6 - hypervideo_dl/extractor/la7.py | 8 +- hypervideo_dl/extractor/laola1tv.py | 6 +- hypervideo_dl/extractor/lastfm.py | 5 +- hypervideo_dl/extractor/lbry.py | 86 +- hypervideo_dl/extractor/lci.py | 32 +- hypervideo_dl/extractor/lcp.py | 5 +- hypervideo_dl/extractor/lecture2go.py | 5 - hypervideo_dl/extractor/lecturio.py | 4 - hypervideo_dl/extractor/leeco.py | 6 +- hypervideo_dl/extractor/lego.py | 4 - hypervideo_dl/extractor/lemonde.py | 2 - hypervideo_dl/extractor/lenta.py | 3 - hypervideo_dl/extractor/libraryofcongress.py | 5 - hypervideo_dl/extractor/libsyn.py | 5 +- hypervideo_dl/extractor/lifenews.py | 5 - hypervideo_dl/extractor/likee.py | 192 ++ hypervideo_dl/extractor/limelight.py | 9 +- hypervideo_dl/extractor/line.py | 7 +- hypervideo_dl/extractor/linkedin.py | 13 +- hypervideo_dl/extractor/linuxacademy.py | 3 - hypervideo_dl/extractor/liputan6.py | 64 + hypervideo_dl/extractor/listennotes.py | 86 + hypervideo_dl/extractor/litv.py | 3 - hypervideo_dl/extractor/livejournal.py | 3 - hypervideo_dl/extractor/liveleak.py | 191 -- hypervideo_dl/extractor/livestream.py | 7 +- hypervideo_dl/extractor/livestreamfails.py | 37 + hypervideo_dl/extractor/lnkgo.py | 8 +- hypervideo_dl/extractor/localnews8.py | 4 - hypervideo_dl/extractor/lovehomeporn.py | 3 - hypervideo_dl/extractor/lrt.py | 58 +- hypervideo_dl/extractor/lynda.py | 4 - hypervideo_dl/extractor/m6.py | 3 - hypervideo_dl/extractor/magentamusik360.py | 3 - hypervideo_dl/extractor/mailru.py | 4 - hypervideo_dl/extractor/mainstreaming.py | 11 +- hypervideo_dl/extractor/malltv.py | 37 +- hypervideo_dl/extractor/mangomolo.py | 31 +- hypervideo_dl/extractor/manoto.py | 5 - hypervideo_dl/extractor/manyvids.py | 122 +- hypervideo_dl/extractor/maoritv.py | 3 - hypervideo_dl/extractor/markiza.py | 3 - 
hypervideo_dl/extractor/massengeschmacktv.py | 4 - hypervideo_dl/extractor/masters.py | 38 + hypervideo_dl/extractor/matchtv.py | 4 - hypervideo_dl/extractor/mdr.py | 5 - hypervideo_dl/extractor/medaltv.py | 77 +- hypervideo_dl/extractor/mediaite.py | 3 - hypervideo_dl/extractor/mediaklikk.py | 4 - hypervideo_dl/extractor/medialaan.py | 7 +- hypervideo_dl/extractor/mediaset.py | 43 +- hypervideo_dl/extractor/mediasite.py | 19 +- hypervideo_dl/extractor/mediaworksnz.py | 103 + hypervideo_dl/extractor/medici.py | 3 - hypervideo_dl/extractor/megaphone.py | 11 +- hypervideo_dl/extractor/megatvcom.py | 11 +- hypervideo_dl/extractor/meipai.py | 7 +- hypervideo_dl/extractor/melonvod.py | 4 - hypervideo_dl/extractor/meta.py | 3 - hypervideo_dl/extractor/metacafe.py | 16 +- hypervideo_dl/extractor/metacritic.py | 3 - hypervideo_dl/extractor/mgoon.py | 5 - hypervideo_dl/extractor/mgtv.py | 11 +- hypervideo_dl/extractor/miaopai.py | 3 - hypervideo_dl/extractor/microsoftembed.py | 65 + hypervideo_dl/extractor/microsoftstream.py | 4 - hypervideo_dl/extractor/microsoftvirtualacademy.py | 12 +- hypervideo_dl/extractor/mildom.py | 11 +- hypervideo_dl/extractor/minds.py | 8 +- hypervideo_dl/extractor/ministrygrid.py | 2 - hypervideo_dl/extractor/minoto.py | 5 - hypervideo_dl/extractor/miomio.py | 3 - hypervideo_dl/extractor/mirrativ.py | 3 - hypervideo_dl/extractor/mirrorcouk.py | 98 + hypervideo_dl/extractor/mit.py | 2 - hypervideo_dl/extractor/mitele.py | 5 +- hypervideo_dl/extractor/mixch.py | 2 - hypervideo_dl/extractor/mixcloud.py | 11 +- hypervideo_dl/extractor/mlb.py | 120 +- hypervideo_dl/extractor/mlssoccer.py | 3 - hypervideo_dl/extractor/mnet.py | 4 - hypervideo_dl/extractor/mocha.py | 64 + hypervideo_dl/extractor/moevideo.py | 4 - hypervideo_dl/extractor/mofosex.py | 13 +- hypervideo_dl/extractor/mojvideo.py | 4 - hypervideo_dl/extractor/morningstar.py | 4 - hypervideo_dl/extractor/motherless.py | 31 +- hypervideo_dl/extractor/motorsport.py | 12 +- 
hypervideo_dl/extractor/movieclips.py | 3 - hypervideo_dl/extractor/moviepilot.py | 112 + hypervideo_dl/extractor/moview.py | 43 + hypervideo_dl/extractor/moviezine.py | 6 - hypervideo_dl/extractor/movingimage.py | 2 - hypervideo_dl/extractor/msn.py | 4 - hypervideo_dl/extractor/mtv.py | 26 +- hypervideo_dl/extractor/muenchentv.py | 4 - hypervideo_dl/extractor/murrtube.py | 5 +- hypervideo_dl/extractor/musescore.py | 3 - hypervideo_dl/extractor/musicdex.py | 5 +- hypervideo_dl/extractor/mwave.py | 3 - hypervideo_dl/extractor/mxplayer.py | 151 +- hypervideo_dl/extractor/mychannels.py | 4 - hypervideo_dl/extractor/myspace.py | 5 - hypervideo_dl/extractor/myspass.py | 3 - hypervideo_dl/extractor/myvi.py | 13 +- hypervideo_dl/extractor/myvideoge.py | 3 - hypervideo_dl/extractor/myvidster.py | 2 - hypervideo_dl/extractor/n1.py | 5 - hypervideo_dl/extractor/nate.py | 4 - hypervideo_dl/extractor/nationalgeographic.py | 4 +- hypervideo_dl/extractor/naver.py | 172 +- hypervideo_dl/extractor/nba.py | 4 - hypervideo_dl/extractor/nbc.py | 191 +- hypervideo_dl/extractor/ndr.py | 252 +- hypervideo_dl/extractor/ndtv.py | 15 +- hypervideo_dl/extractor/nebula.py | 111 +- hypervideo_dl/extractor/nerdcubed.py | 3 - hypervideo_dl/extractor/neteasemusic.py | 176 +- hypervideo_dl/extractor/netverse.py | 176 ++ hypervideo_dl/extractor/netzkino.py | 5 - hypervideo_dl/extractor/newgrounds.py | 4 - hypervideo_dl/extractor/newspicks.py | 53 + hypervideo_dl/extractor/newstube.py | 4 - hypervideo_dl/extractor/newsy.py | 4 - hypervideo_dl/extractor/nextmedia.py | 7 +- hypervideo_dl/extractor/nexx.py | 25 +- hypervideo_dl/extractor/nfb.py | 4 - hypervideo_dl/extractor/nfhsnetwork.py | 7 +- hypervideo_dl/extractor/nfl.py | 15 +- hypervideo_dl/extractor/nhk.py | 27 +- hypervideo_dl/extractor/nhl.py | 4 - hypervideo_dl/extractor/nick.py | 6 +- hypervideo_dl/extractor/niconico.py | 56 +- hypervideo_dl/extractor/ninecninemedia.py | 4 - hypervideo_dl/extractor/ninegag.py | 48 +- 
hypervideo_dl/extractor/ninenow.py | 3 - hypervideo_dl/extractor/nintendo.py | 3 - hypervideo_dl/extractor/nitter.py | 3 - hypervideo_dl/extractor/njpwworld.py | 5 - hypervideo_dl/extractor/nobelprize.py | 4 - hypervideo_dl/extractor/noco.py | 228 -- hypervideo_dl/extractor/nonktube.py | 2 - hypervideo_dl/extractor/noodlemagazine.py | 5 - hypervideo_dl/extractor/noovo.py | 3 - hypervideo_dl/extractor/normalboots.py | 3 - hypervideo_dl/extractor/nosnl.py | 95 + hypervideo_dl/extractor/nosvideo.py | 3 - hypervideo_dl/extractor/nova.py | 5 - hypervideo_dl/extractor/novaplay.py | 54 +- hypervideo_dl/extractor/nowness.py | 3 - hypervideo_dl/extractor/noz.py | 11 +- hypervideo_dl/extractor/npo.py | 10 +- hypervideo_dl/extractor/npr.py | 23 +- hypervideo_dl/extractor/nrk.py | 20 +- hypervideo_dl/extractor/nrl.py | 3 - hypervideo_dl/extractor/ntvcojp.py | 3 - hypervideo_dl/extractor/ntvde.py | 4 - hypervideo_dl/extractor/ntvru.py | 4 - hypervideo_dl/extractor/nuevo.py | 3 - hypervideo_dl/extractor/nuvid.py | 3 - hypervideo_dl/extractor/nytimes.py | 5 +- hypervideo_dl/extractor/nzherald.py | 49 +- hypervideo_dl/extractor/nzz.py | 3 - hypervideo_dl/extractor/odatv.py | 3 - hypervideo_dl/extractor/odnoklassniki.py | 107 +- hypervideo_dl/extractor/oftv.py | 54 + hypervideo_dl/extractor/oktoberfesttv.py | 3 - hypervideo_dl/extractor/olympics.py | 6 +- hypervideo_dl/extractor/on24.py | 4 - hypervideo_dl/extractor/once.py | 5 +- hypervideo_dl/extractor/ondemandkorea.py | 25 +- hypervideo_dl/extractor/onefootball.py | 4 - hypervideo_dl/extractor/onenewsnz.py | 111 + hypervideo_dl/extractor/onet.py | 4 - hypervideo_dl/extractor/onionstudios.py | 13 +- hypervideo_dl/extractor/ooyala.py | 27 +- hypervideo_dl/extractor/opencast.py | 5 - hypervideo_dl/extractor/openload.py | 110 +- hypervideo_dl/extractor/openrec.py | 10 +- hypervideo_dl/extractor/ora.py | 4 - hypervideo_dl/extractor/orf.py | 287 +- hypervideo_dl/extractor/outsidetv.py | 3 - hypervideo_dl/extractor/packtpub.py | 2 - 
hypervideo_dl/extractor/palcomp3.py | 4 - hypervideo_dl/extractor/pandoratv.py | 5 - hypervideo_dl/extractor/panopto.py | 9 +- hypervideo_dl/extractor/paramountplus.py | 69 +- hypervideo_dl/extractor/parler.py | 111 + hypervideo_dl/extractor/parliamentliveuk.py | 80 - hypervideo_dl/extractor/parlview.py | 4 - hypervideo_dl/extractor/patreon.py | 368 ++- hypervideo_dl/extractor/pbs.py | 4 - hypervideo_dl/extractor/pearvideo.py | 13 +- hypervideo_dl/extractor/peekvids.py | 6 +- hypervideo_dl/extractor/peertube.py | 24 +- hypervideo_dl/extractor/peertv.py | 5 - hypervideo_dl/extractor/peloton.py | 16 +- hypervideo_dl/extractor/people.py | 3 - hypervideo_dl/extractor/performgroup.py | 5 - hypervideo_dl/extractor/periscope.py | 14 +- hypervideo_dl/extractor/philharmoniedeparis.py | 43 +- hypervideo_dl/extractor/phoenix.py | 3 - hypervideo_dl/extractor/photobucket.py | 2 - hypervideo_dl/extractor/piapro.py | 17 +- hypervideo_dl/extractor/picarto.py | 5 - hypervideo_dl/extractor/piksel.py | 15 +- hypervideo_dl/extractor/pinkbike.py | 4 - hypervideo_dl/extractor/pinterest.py | 4 - hypervideo_dl/extractor/pixivsketch.py | 4 - hypervideo_dl/extractor/pladform.py | 15 +- hypervideo_dl/extractor/planetmarathi.py | 4 - hypervideo_dl/extractor/platzi.py | 4 - hypervideo_dl/extractor/playfm.py | 4 - hypervideo_dl/extractor/playplustv.py | 4 - hypervideo_dl/extractor/plays.py | 4 - hypervideo_dl/extractor/playstuff.py | 2 - hypervideo_dl/extractor/playsuisse.py | 147 + hypervideo_dl/extractor/playtvak.py | 4 - hypervideo_dl/extractor/playvid.py | 16 +- hypervideo_dl/extractor/playwire.py | 6 +- hypervideo_dl/extractor/pluralsight.py | 4 - hypervideo_dl/extractor/plutotv.py | 4 - hypervideo_dl/extractor/podbayfm.py | 75 + hypervideo_dl/extractor/podchaser.py | 97 + hypervideo_dl/extractor/podomatic.py | 2 - hypervideo_dl/extractor/pokemon.py | 44 - hypervideo_dl/extractor/pokergo.py | 3 - hypervideo_dl/extractor/polsatgo.py | 4 - hypervideo_dl/extractor/polskieradio.py | 5 - 
hypervideo_dl/extractor/popcorntimes.py | 11 +- hypervideo_dl/extractor/popcorntv.py | 3 - hypervideo_dl/extractor/porn91.py | 3 - hypervideo_dl/extractor/porncom.py | 4 - hypervideo_dl/extractor/pornez.py | 2 - hypervideo_dl/extractor/pornflip.py | 4 - hypervideo_dl/extractor/pornhd.py | 4 - hypervideo_dl/extractor/pornhub.py | 47 +- hypervideo_dl/extractor/pornotube.py | 2 - hypervideo_dl/extractor/pornovoisines.py | 5 - hypervideo_dl/extractor/pornoxo.py | 3 - hypervideo_dl/extractor/prankcast.py | 66 + hypervideo_dl/extractor/premiershiprugby.py | 39 + hypervideo_dl/extractor/presstv.py | 4 - hypervideo_dl/extractor/projectveritas.py | 4 - hypervideo_dl/extractor/prosiebensat1.py | 4 - hypervideo_dl/extractor/prx.py | 3 - hypervideo_dl/extractor/puhutv.py | 4 - hypervideo_dl/extractor/puls4.py | 10 +- hypervideo_dl/extractor/pyvideo.py | 2 - hypervideo_dl/extractor/qingting.py | 47 + hypervideo_dl/extractor/qqmusic.py | 4 - hypervideo_dl/extractor/r7.py | 4 - hypervideo_dl/extractor/radiko.py | 76 +- hypervideo_dl/extractor/radiobremen.py | 4 - hypervideo_dl/extractor/radiocanada.py | 5 - hypervideo_dl/extractor/radiode.py | 3 - hypervideo_dl/extractor/radiofrance.py | 53 +- hypervideo_dl/extractor/radiojavan.py | 3 - hypervideo_dl/extractor/radiokapital.py | 2 - hypervideo_dl/extractor/radiozet.py | 1 - hypervideo_dl/extractor/radlive.py | 7 +- hypervideo_dl/extractor/rai.py | 247 +- hypervideo_dl/extractor/raywenderlich.py | 2 - hypervideo_dl/extractor/rbmaradio.py | 3 - hypervideo_dl/extractor/rcs.py | 49 +- hypervideo_dl/extractor/rcti.py | 5 - hypervideo_dl/extractor/rds.py | 3 - hypervideo_dl/extractor/redbee.py | 379 +++ hypervideo_dl/extractor/redbulltv.py | 7 +- hypervideo_dl/extractor/reddit.py | 89 +- hypervideo_dl/extractor/redgifs.py | 40 +- hypervideo_dl/extractor/redtube.py | 12 +- hypervideo_dl/extractor/regiotv.py | 3 - hypervideo_dl/extractor/rentv.py | 4 - hypervideo_dl/extractor/restudy.py | 4 - hypervideo_dl/extractor/reuters.py | 4 - 
hypervideo_dl/extractor/reverbnation.py | 2 - hypervideo_dl/extractor/rice.py | 4 - hypervideo_dl/extractor/rmcdecouverte.py | 4 - hypervideo_dl/extractor/ro220.py | 43 - hypervideo_dl/extractor/rockstargames.py | 5 - hypervideo_dl/extractor/rokfin.py | 173 +- hypervideo_dl/extractor/roosterteeth.py | 2 - hypervideo_dl/extractor/rottentomatoes.py | 2 - hypervideo_dl/extractor/roxwel.py | 52 - hypervideo_dl/extractor/rozhlas.py | 3 - hypervideo_dl/extractor/rtbf.py | 159 -- hypervideo_dl/extractor/rte.py | 5 - hypervideo_dl/extractor/rtl2.py | 6 - hypervideo_dl/extractor/rtlnl.py | 156 +- hypervideo_dl/extractor/rtnews.py | 3 - hypervideo_dl/extractor/rtp.py | 3 - hypervideo_dl/extractor/rtrfm.py | 2 - hypervideo_dl/extractor/rts.py | 6 +- hypervideo_dl/extractor/rtve.py | 28 +- hypervideo_dl/extractor/rtvnh.py | 4 - hypervideo_dl/extractor/rtvs.py | 4 - hypervideo_dl/extractor/rtvslo.py | 150 + hypervideo_dl/extractor/ruhd.py | 3 - hypervideo_dl/extractor/rule34video.py | 4 - hypervideo_dl/extractor/rumble.py | 213 +- hypervideo_dl/extractor/rutube.py | 13 +- hypervideo_dl/extractor/rutv.py | 33 +- hypervideo_dl/extractor/ruutu.py | 50 +- hypervideo_dl/extractor/ruv.py | 3 - hypervideo_dl/extractor/safari.py | 3 - hypervideo_dl/extractor/saitosan.py | 4 - hypervideo_dl/extractor/samplefocus.py | 3 - hypervideo_dl/extractor/sapo.py | 5 - hypervideo_dl/extractor/savefrom.py | 3 - hypervideo_dl/extractor/sbs.py | 16 +- hypervideo_dl/extractor/screen9.py | 62 + hypervideo_dl/extractor/screencast.py | 14 +- hypervideo_dl/extractor/screencastify.py | 52 + hypervideo_dl/extractor/screencastomatic.py | 27 +- hypervideo_dl/extractor/scrippsnetworks.py | 3 - hypervideo_dl/extractor/scrolller.py | 102 + hypervideo_dl/extractor/scte.py | 2 - hypervideo_dl/extractor/seeker.py | 3 - hypervideo_dl/extractor/senategov.py | 15 +- hypervideo_dl/extractor/senateisvp.py | 153 - hypervideo_dl/extractor/sendtonews.py | 13 +- hypervideo_dl/extractor/servus.py | 4 - 
hypervideo_dl/extractor/sevenplus.py | 7 +- hypervideo_dl/extractor/sexu.py | 3 - hypervideo_dl/extractor/seznamzpravy.py | 16 +- hypervideo_dl/extractor/shahid.py | 4 - hypervideo_dl/extractor/shared.py | 13 +- hypervideo_dl/extractor/sharevideos.py | 6 + hypervideo_dl/extractor/shemaroome.py | 4 - hypervideo_dl/extractor/showroomlive.py | 4 - hypervideo_dl/extractor/simplecast.py | 19 +- hypervideo_dl/extractor/sina.py | 5 - hypervideo_dl/extractor/sixplay.py | 5 - hypervideo_dl/extractor/skeb.py | 3 - hypervideo_dl/extractor/sky.py | 3 - hypervideo_dl/extractor/skyit.py | 99 +- hypervideo_dl/extractor/skylinewebcams.py | 3 - hypervideo_dl/extractor/skynewsarabia.py | 3 - hypervideo_dl/extractor/skynewsau.py | 3 - hypervideo_dl/extractor/slideshare.py | 2 - hypervideo_dl/extractor/slideslive.py | 5 +- hypervideo_dl/extractor/slutload.py | 2 - hypervideo_dl/extractor/smotrim.py | 65 + hypervideo_dl/extractor/snotr.py | 4 - hypervideo_dl/extractor/sohu.py | 4 - hypervideo_dl/extractor/sonyliv.py | 34 +- hypervideo_dl/extractor/soundcloud.py | 370 ++- hypervideo_dl/extractor/soundgasm.py | 3 - hypervideo_dl/extractor/southpark.py | 54 +- hypervideo_dl/extractor/sovietscloset.py | 18 +- hypervideo_dl/extractor/spankbang.py | 4 - hypervideo_dl/extractor/spankwire.py | 10 +- hypervideo_dl/extractor/spiegel.py | 3 - hypervideo_dl/extractor/spiegeltv.py | 17 - hypervideo_dl/extractor/spike.py | 2 - hypervideo_dl/extractor/sport5.py | 5 - hypervideo_dl/extractor/sportbox.py | 13 +- hypervideo_dl/extractor/sportdeutschland.py | 3 - hypervideo_dl/extractor/spotify.py | 55 +- hypervideo_dl/extractor/spreaker.py | 3 - hypervideo_dl/extractor/springboardplatform.py | 14 +- hypervideo_dl/extractor/sprout.py | 3 - hypervideo_dl/extractor/srgssr.py | 5 - hypervideo_dl/extractor/srmediathek.py | 3 - hypervideo_dl/extractor/stanfordoc.py | 2 - hypervideo_dl/extractor/startrek.py | 75 + hypervideo_dl/extractor/startv.py | 3 - hypervideo_dl/extractor/steam.py | 49 +- 
hypervideo_dl/extractor/stitcher.py | 2 - hypervideo_dl/extractor/storyfire.py | 5 +- hypervideo_dl/extractor/streamable.py | 15 +- hypervideo_dl/extractor/streamanity.py | 4 - hypervideo_dl/extractor/streamcloud.py | 3 - hypervideo_dl/extractor/streamcz.py | 6 +- hypervideo_dl/extractor/streamff.py | 1 - hypervideo_dl/extractor/streetvoice.py | 3 - hypervideo_dl/extractor/stretchinternet.py | 2 - hypervideo_dl/extractor/stripchat.py | 52 +- hypervideo_dl/extractor/stv.py | 6 +- hypervideo_dl/extractor/substack.py | 100 + hypervideo_dl/extractor/sunporno.py | 3 - hypervideo_dl/extractor/sverigesradio.py | 4 - hypervideo_dl/extractor/svt.py | 12 +- hypervideo_dl/extractor/swearnet.py | 73 + hypervideo_dl/extractor/swrmediathek.py | 4 - hypervideo_dl/extractor/syfy.py | 2 - hypervideo_dl/extractor/syvdk.py | 33 + hypervideo_dl/extractor/sztvhu.py | 3 - hypervideo_dl/extractor/tagesschau.py | 5 - hypervideo_dl/extractor/tass.py | 4 - hypervideo_dl/extractor/tastytrade.py | 43 - hypervideo_dl/extractor/tbs.py | 3 - hypervideo_dl/extractor/tdslifeway.py | 2 - hypervideo_dl/extractor/teachable.py | 16 +- hypervideo_dl/extractor/teachertube.py | 5 - hypervideo_dl/extractor/teachingchannel.py | 2 - hypervideo_dl/extractor/teamcoco.py | 4 - hypervideo_dl/extractor/teamtreehouse.py | 3 - hypervideo_dl/extractor/techtalks.py | 2 - hypervideo_dl/extractor/ted.py | 8 +- hypervideo_dl/extractor/tele13.py | 4 - hypervideo_dl/extractor/tele5.py | 5 +- hypervideo_dl/extractor/telebruxelles.py | 4 - hypervideo_dl/extractor/telecinco.py | 4 - hypervideo_dl/extractor/telegraaf.py | 9 +- hypervideo_dl/extractor/telegram.py | 141 +- hypervideo_dl/extractor/telemb.py | 4 - hypervideo_dl/extractor/telemundo.py | 4 - hypervideo_dl/extractor/telequebec.py | 3 - hypervideo_dl/extractor/teletask.py | 2 - hypervideo_dl/extractor/telewebion.py | 3 - hypervideo_dl/extractor/tempo.py | 53 + hypervideo_dl/extractor/tencent.py | 452 +++ hypervideo_dl/extractor/tennistv.py | 186 +- 
hypervideo_dl/extractor/tenplay.py | 4 - hypervideo_dl/extractor/testurl.py | 47 +- hypervideo_dl/extractor/tf1.py | 3 - hypervideo_dl/extractor/tfo.py | 3 - hypervideo_dl/extractor/theholetv.py | 35 + hypervideo_dl/extractor/theintercept.py | 3 - hypervideo_dl/extractor/theplatform.py | 30 +- hypervideo_dl/extractor/thescene.py | 44 - hypervideo_dl/extractor/thestar.py | 3 - hypervideo_dl/extractor/thesun.py | 2 - hypervideo_dl/extractor/theta.py | 5 - hypervideo_dl/extractor/theweatherchannel.py | 6 +- hypervideo_dl/extractor/thisamericanlife.py | 2 - hypervideo_dl/extractor/thisav.py | 4 - hypervideo_dl/extractor/thisoldhouse.py | 3 - hypervideo_dl/extractor/threeqsdn.py | 26 +- hypervideo_dl/extractor/threespeak.py | 4 - hypervideo_dl/extractor/tiktok.py | 283 +- hypervideo_dl/extractor/tinypic.py | 2 - hypervideo_dl/extractor/tmz.py | 62 +- hypervideo_dl/extractor/tnaflix.py | 205 +- hypervideo_dl/extractor/toggle.py | 4 - hypervideo_dl/extractor/toggo.py | 11 +- hypervideo_dl/extractor/tokentube.py | 5 - hypervideo_dl/extractor/tonline.py | 3 - hypervideo_dl/extractor/toongoggles.py | 4 - hypervideo_dl/extractor/toutv.py | 5 +- hypervideo_dl/extractor/toypics.py | 3 - hypervideo_dl/extractor/traileraddict.py | 2 - hypervideo_dl/extractor/triller.py | 294 ++ hypervideo_dl/extractor/trilulilu.py | 3 - hypervideo_dl/extractor/trovo.py | 308 +- hypervideo_dl/extractor/trueid.py | 3 - hypervideo_dl/extractor/trunews.py | 2 - hypervideo_dl/extractor/truth.py | 69 + hypervideo_dl/extractor/trutv.py | 4 - hypervideo_dl/extractor/tube8.py | 11 +- hypervideo_dl/extractor/tubetugraz.py | 233 ++ hypervideo_dl/extractor/tubitv.py | 57 +- hypervideo_dl/extractor/tudou.py | 49 - hypervideo_dl/extractor/tumblr.py | 5 - hypervideo_dl/extractor/tunein.py | 11 +- hypervideo_dl/extractor/tunepk.py | 3 - hypervideo_dl/extractor/turbo.py | 4 - hypervideo_dl/extractor/turner.py | 7 +- hypervideo_dl/extractor/tv2.py | 37 +- hypervideo_dl/extractor/tv24ua.py | 78 + 
hypervideo_dl/extractor/tv2dk.py | 4 - hypervideo_dl/extractor/tv2hu.py | 3 - hypervideo_dl/extractor/tv4.py | 5 - hypervideo_dl/extractor/tv5mondeplus.py | 4 - hypervideo_dl/extractor/tv5unis.py | 4 - hypervideo_dl/extractor/tva.py | 3 - hypervideo_dl/extractor/tvanouvelles.py | 3 - hypervideo_dl/extractor/tvc.py | 14 +- hypervideo_dl/extractor/tver.py | 130 +- hypervideo_dl/extractor/tvigle.py | 6 +- hypervideo_dl/extractor/tviplayer.py | 78 + hypervideo_dl/extractor/tvland.py | 3 - hypervideo_dl/extractor/tvn24.py | 4 - hypervideo_dl/extractor/tvnet.py | 4 - hypervideo_dl/extractor/tvnoe.py | 3 - hypervideo_dl/extractor/tvnow.py | 7 +- hypervideo_dl/extractor/tvopengr.py | 14 +- hypervideo_dl/extractor/tvp.py | 236 +- hypervideo_dl/extractor/tvplay.py | 7 - hypervideo_dl/extractor/tvplayer.py | 4 - hypervideo_dl/extractor/tweakers.py | 3 - hypervideo_dl/extractor/twentyfourvideo.py | 4 - hypervideo_dl/extractor/twentymin.py | 13 +- hypervideo_dl/extractor/twentythreevideo.py | 3 - hypervideo_dl/extractor/twitcasting.py | 33 +- hypervideo_dl/extractor/twitch.py | 155 +- hypervideo_dl/extractor/twitter.py | 788 ++++- hypervideo_dl/extractor/udemy.py | 23 +- hypervideo_dl/extractor/udn.py | 6 +- hypervideo_dl/extractor/ufctv.py | 3 - hypervideo_dl/extractor/ukcolumn.py | 2 - hypervideo_dl/extractor/uktvplay.py | 8 +- hypervideo_dl/extractor/umg.py | 4 - hypervideo_dl/extractor/unistra.py | 3 - hypervideo_dl/extractor/unity.py | 2 - hypervideo_dl/extractor/unscripted.py | 53 + hypervideo_dl/extractor/unsupported.py | 143 + hypervideo_dl/extractor/uol.py | 4 - hypervideo_dl/extractor/uplynk.py | 7 +- hypervideo_dl/extractor/urort.py | 13 +- hypervideo_dl/extractor/urplay.py | 4 - hypervideo_dl/extractor/usanetwork.py | 5 +- hypervideo_dl/extractor/usatoday.py | 3 - hypervideo_dl/extractor/ustream.py | 12 +- hypervideo_dl/extractor/ustudio.py | 5 - hypervideo_dl/extractor/utreon.py | 4 - hypervideo_dl/extractor/varzesh3.py | 3 - hypervideo_dl/extractor/vbox7.py | 14 
+- hypervideo_dl/extractor/veehd.py | 2 - hypervideo_dl/extractor/veo.py | 5 - hypervideo_dl/extractor/veoh.py | 67 +- hypervideo_dl/extractor/vesti.py | 3 - hypervideo_dl/extractor/vevo.py | 123 +- hypervideo_dl/extractor/vgtv.py | 12 +- hypervideo_dl/extractor/vh1.py | 3 - hypervideo_dl/extractor/vice.py | 17 +- hypervideo_dl/extractor/vidbit.py | 2 - hypervideo_dl/extractor/viddler.py | 6 +- hypervideo_dl/extractor/videa.py | 21 +- hypervideo_dl/extractor/videocampus_sachsen.py | 237 +- hypervideo_dl/extractor/videodetective.py | 2 - hypervideo_dl/extractor/videofyme.py | 2 - hypervideo_dl/extractor/videomore.py | 25 +- hypervideo_dl/extractor/videopress.py | 13 +- hypervideo_dl/extractor/vidio.py | 42 +- hypervideo_dl/extractor/vidlii.py | 6 +- hypervideo_dl/extractor/vidme.py | 295 -- hypervideo_dl/extractor/vidzi.py | 68 - hypervideo_dl/extractor/vier.py | 264 -- hypervideo_dl/extractor/viewlift.py | 13 +- hypervideo_dl/extractor/viidea.py | 3 - hypervideo_dl/extractor/viki.py | 3 - hypervideo_dl/extractor/vimeo.py | 252 +- hypervideo_dl/extractor/vimm.py | 3 - hypervideo_dl/extractor/vimple.py | 3 - hypervideo_dl/extractor/vine.py | 8 +- hypervideo_dl/extractor/viqeo.py | 15 +- hypervideo_dl/extractor/viu.py | 47 +- hypervideo_dl/extractor/vk.py | 140 +- hypervideo_dl/extractor/vlive.py | 21 +- hypervideo_dl/extractor/vodlocker.py | 3 - hypervideo_dl/extractor/vodpl.py | 3 - hypervideo_dl/extractor/vodplatform.py | 5 +- hypervideo_dl/extractor/voicerepublic.py | 3 - hypervideo_dl/extractor/voicy.py | 8 +- hypervideo_dl/extractor/voot.py | 9 +- hypervideo_dl/extractor/voxmedia.py | 6 +- hypervideo_dl/extractor/vrak.py | 3 - hypervideo_dl/extractor/vrt.py | 4 - hypervideo_dl/extractor/vrv.py | 19 +- hypervideo_dl/extractor/vshare.py | 22 +- hypervideo_dl/extractor/vtm.py | 3 - hypervideo_dl/extractor/vube.py | 170 -- hypervideo_dl/extractor/vuclip.py | 2 - hypervideo_dl/extractor/vupload.py | 3 - hypervideo_dl/extractor/vvvvid.py | 21 +- 
hypervideo_dl/extractor/vyborymos.py | 3 - hypervideo_dl/extractor/vzaar.py | 14 +- hypervideo_dl/extractor/wakanim.py | 3 - hypervideo_dl/extractor/walla.py | 4 - hypervideo_dl/extractor/wasdtv.py | 10 +- hypervideo_dl/extractor/washingtonpost.py | 10 +- hypervideo_dl/extractor/wat.py | 7 +- hypervideo_dl/extractor/watchbox.py | 5 - hypervideo_dl/extractor/watchindianporn.py | 3 - hypervideo_dl/extractor/wdr.py | 7 +- hypervideo_dl/extractor/webcaster.py | 20 +- hypervideo_dl/extractor/webofstories.py | 5 - hypervideo_dl/extractor/weibo.py | 5 - hypervideo_dl/extractor/weiqitv.py | 3 - hypervideo_dl/extractor/whowatch.py | 4 - hypervideo_dl/extractor/wikimedia.py | 55 + hypervideo_dl/extractor/willow.py | 2 - hypervideo_dl/extractor/wimtv.py | 19 +- hypervideo_dl/extractor/wistia.py | 250 +- hypervideo_dl/extractor/wordpress.py | 154 + hypervideo_dl/extractor/worldstarhiphop.py | 2 - hypervideo_dl/extractor/wppilot.py | 10 +- hypervideo_dl/extractor/wsj.py | 7 +- hypervideo_dl/extractor/wwe.py | 2 - hypervideo_dl/extractor/xbef.py | 2 - hypervideo_dl/extractor/xboxclips.py | 3 - hypervideo_dl/extractor/xfileshare.py | 31 +- hypervideo_dl/extractor/xhamster.py | 27 +- hypervideo_dl/extractor/xiami.py | 3 - hypervideo_dl/extractor/ximalaya.py | 161 +- hypervideo_dl/extractor/xinpianchang.py | 5 - hypervideo_dl/extractor/xminus.py | 3 - hypervideo_dl/extractor/xnxx.py | 4 - hypervideo_dl/extractor/xstream.py | 4 - hypervideo_dl/extractor/xtube.py | 3 - hypervideo_dl/extractor/xuite.py | 4 - hypervideo_dl/extractor/xvideos.py | 4 - hypervideo_dl/extractor/xxxymovies.py | 3 - hypervideo_dl/extractor/yahoo.py | 143 +- hypervideo_dl/extractor/yandexdisk.py | 4 - hypervideo_dl/extractor/yandexmusic.py | 6 +- hypervideo_dl/extractor/yandexvideo.py | 182 +- hypervideo_dl/extractor/yapfiles.py | 14 +- hypervideo_dl/extractor/yesjapan.py | 3 - hypervideo_dl/extractor/yinyuetai.py | 4 - hypervideo_dl/extractor/yle_areena.py | 71 + hypervideo_dl/extractor/ynet.py | 10 +- 
hypervideo_dl/extractor/youjizz.py | 3 - hypervideo_dl/extractor/youku.py | 4 - hypervideo_dl/extractor/younow.py | 5 +- hypervideo_dl/extractor/youporn.py | 39 +- hypervideo_dl/extractor/yourporn.py | 2 - hypervideo_dl/extractor/yourupload.py | 3 - hypervideo_dl/extractor/youtube.py | 2997 +++++++++++++------- hypervideo_dl/extractor/zapiks.py | 5 +- hypervideo_dl/extractor/zaq1.py | 101 - hypervideo_dl/extractor/zattoo.py | 708 ++++- hypervideo_dl/extractor/zdf.py | 150 +- hypervideo_dl/extractor/zee5.py | 49 +- hypervideo_dl/extractor/zeenews.py | 57 + hypervideo_dl/extractor/zhihu.py | 6 +- hypervideo_dl/extractor/zingmp3.py | 392 ++- hypervideo_dl/extractor/zoom.py | 6 - hypervideo_dl/extractor/zype.py | 13 +- 1041 files changed, 26768 insertions(+), 19198 deletions(-) create mode 100644 hypervideo_dl/extractor/_extractors.py create mode 100644 hypervideo_dl/extractor/acfun.py create mode 100644 hypervideo_dl/extractor/aeonco.py create mode 100644 hypervideo_dl/extractor/agora.py create mode 100644 hypervideo_dl/extractor/amazonminitv.py create mode 100644 hypervideo_dl/extractor/angel.py delete mode 100644 hypervideo_dl/extractor/animelab.py delete mode 100644 hypervideo_dl/extractor/animeondemand.py delete mode 100644 hypervideo_dl/extractor/anvato_token_generator/__init__.py delete mode 100644 hypervideo_dl/extractor/anvato_token_generator/common.py delete mode 100644 hypervideo_dl/extractor/anvato_token_generator/nfl.py create mode 100644 hypervideo_dl/extractor/atscaleconf.py create mode 100644 hypervideo_dl/extractor/audiodraft.py create mode 100644 hypervideo_dl/extractor/berufetv.py delete mode 100644 hypervideo_dl/extractor/blinkx.py create mode 100644 hypervideo_dl/extractor/booyah.py create mode 100644 hypervideo_dl/extractor/bundesliga.py create mode 100644 hypervideo_dl/extractor/camsoda.py create mode 100644 hypervideo_dl/extractor/camtasia.py delete mode 100644 hypervideo_dl/extractor/camtube.py create mode 100644 
hypervideo_dl/extractor/cellebrite.py create mode 100644 hypervideo_dl/extractor/cinetecamilano.py create mode 100644 hypervideo_dl/extractor/dailywire.py create mode 100644 hypervideo_dl/extractor/detik.py create mode 100644 hypervideo_dl/extractor/deuxm.py delete mode 100644 hypervideo_dl/extractor/discoverynetworks.py delete mode 100644 hypervideo_dl/extractor/discoveryplusindia.py delete mode 100644 hypervideo_dl/extractor/discoveryvr.py delete mode 100644 hypervideo_dl/extractor/doodstream.py create mode 100644 hypervideo_dl/extractor/epoch.py create mode 100644 hypervideo_dl/extractor/eurosport.py delete mode 100644 hypervideo_dl/extractor/everyonesmixtape.py create mode 100644 hypervideo_dl/extractor/fifa.py delete mode 100644 hypervideo_dl/extractor/fivemin.py create mode 100644 hypervideo_dl/extractor/fourzerostudio.py delete mode 100644 hypervideo_dl/extractor/franceculture.py create mode 100644 hypervideo_dl/extractor/freetv.py delete mode 100644 hypervideo_dl/extractor/freshlive.py create mode 100644 hypervideo_dl/extractor/fuyintv.py delete mode 100644 hypervideo_dl/extractor/fxnetworks.py create mode 100644 hypervideo_dl/extractor/genericembeds.py create mode 100644 hypervideo_dl/extractor/genius.py create mode 100644 hypervideo_dl/extractor/goodgame.py create mode 100644 hypervideo_dl/extractor/goplay.py create mode 100644 hypervideo_dl/extractor/harpodeon.py create mode 100644 hypervideo_dl/extractor/holodex.py delete mode 100644 hypervideo_dl/extractor/hornbunny.py create mode 100644 hypervideo_dl/extractor/hytale.py create mode 100644 hypervideo_dl/extractor/icareus.py create mode 100644 hypervideo_dl/extractor/iltalehti.py delete mode 100644 hypervideo_dl/extractor/ir90tv.py create mode 100644 hypervideo_dl/extractor/islamchannel.py create mode 100644 hypervideo_dl/extractor/israelnationalnews.py create mode 100644 hypervideo_dl/extractor/ixigua.py create mode 100644 hypervideo_dl/extractor/jable.py create mode 100644 
hypervideo_dl/extractor/japandiet.py create mode 100644 hypervideo_dl/extractor/jixie.py create mode 100644 hypervideo_dl/extractor/kanal2.py delete mode 100644 hypervideo_dl/extractor/kanalplay.py delete mode 100644 hypervideo_dl/extractor/kankan.py create mode 100644 hypervideo_dl/extractor/kicker.py create mode 100644 hypervideo_dl/extractor/kompas.py create mode 100644 hypervideo_dl/extractor/kth.py create mode 100644 hypervideo_dl/extractor/likee.py create mode 100644 hypervideo_dl/extractor/liputan6.py create mode 100644 hypervideo_dl/extractor/listennotes.py delete mode 100644 hypervideo_dl/extractor/liveleak.py create mode 100644 hypervideo_dl/extractor/livestreamfails.py create mode 100644 hypervideo_dl/extractor/masters.py create mode 100644 hypervideo_dl/extractor/mediaworksnz.py create mode 100644 hypervideo_dl/extractor/microsoftembed.py create mode 100644 hypervideo_dl/extractor/mirrorcouk.py create mode 100644 hypervideo_dl/extractor/mocha.py create mode 100644 hypervideo_dl/extractor/moviepilot.py create mode 100644 hypervideo_dl/extractor/moview.py create mode 100644 hypervideo_dl/extractor/netverse.py create mode 100644 hypervideo_dl/extractor/newspicks.py delete mode 100644 hypervideo_dl/extractor/noco.py create mode 100644 hypervideo_dl/extractor/nosnl.py create mode 100644 hypervideo_dl/extractor/oftv.py create mode 100644 hypervideo_dl/extractor/onenewsnz.py create mode 100644 hypervideo_dl/extractor/parler.py delete mode 100644 hypervideo_dl/extractor/parliamentliveuk.py create mode 100644 hypervideo_dl/extractor/playsuisse.py create mode 100644 hypervideo_dl/extractor/podbayfm.py create mode 100644 hypervideo_dl/extractor/podchaser.py create mode 100644 hypervideo_dl/extractor/prankcast.py create mode 100644 hypervideo_dl/extractor/premiershiprugby.py create mode 100644 hypervideo_dl/extractor/qingting.py create mode 100644 hypervideo_dl/extractor/redbee.py delete mode 100644 hypervideo_dl/extractor/ro220.py delete mode 100644 
hypervideo_dl/extractor/roxwel.py delete mode 100644 hypervideo_dl/extractor/rtbf.py create mode 100644 hypervideo_dl/extractor/rtvslo.py create mode 100644 hypervideo_dl/extractor/screen9.py create mode 100644 hypervideo_dl/extractor/screencastify.py create mode 100644 hypervideo_dl/extractor/scrolller.py delete mode 100644 hypervideo_dl/extractor/senateisvp.py create mode 100644 hypervideo_dl/extractor/sharevideos.py create mode 100644 hypervideo_dl/extractor/smotrim.py delete mode 100644 hypervideo_dl/extractor/spiegeltv.py create mode 100644 hypervideo_dl/extractor/startrek.py create mode 100644 hypervideo_dl/extractor/substack.py create mode 100644 hypervideo_dl/extractor/swearnet.py create mode 100644 hypervideo_dl/extractor/syvdk.py delete mode 100644 hypervideo_dl/extractor/tastytrade.py create mode 100644 hypervideo_dl/extractor/tempo.py create mode 100644 hypervideo_dl/extractor/tencent.py create mode 100644 hypervideo_dl/extractor/theholetv.py delete mode 100644 hypervideo_dl/extractor/thescene.py create mode 100644 hypervideo_dl/extractor/triller.py create mode 100644 hypervideo_dl/extractor/truth.py create mode 100644 hypervideo_dl/extractor/tubetugraz.py delete mode 100644 hypervideo_dl/extractor/tudou.py create mode 100644 hypervideo_dl/extractor/tv24ua.py create mode 100644 hypervideo_dl/extractor/tviplayer.py create mode 100644 hypervideo_dl/extractor/unscripted.py create mode 100644 hypervideo_dl/extractor/unsupported.py delete mode 100644 hypervideo_dl/extractor/vidme.py delete mode 100644 hypervideo_dl/extractor/vidzi.py delete mode 100644 hypervideo_dl/extractor/vier.py delete mode 100644 hypervideo_dl/extractor/vube.py create mode 100644 hypervideo_dl/extractor/wikimedia.py create mode 100644 hypervideo_dl/extractor/wordpress.py create mode 100644 hypervideo_dl/extractor/yle_areena.py delete mode 100644 hypervideo_dl/extractor/zaq1.py create mode 100644 hypervideo_dl/extractor/zeenews.py (limited to 'hypervideo_dl/extractor') diff --git 
a/hypervideo_dl/extractor/__init__.py b/hypervideo_dl/extractor/__init__.py index b354842..6bfa4bd 100644 --- a/hypervideo_dl/extractor/__init__.py +++ b/hypervideo_dl/extractor/__init__.py @@ -1,33 +1,15 @@ -import os +from ..compat.compat_utils import passthrough_module -from ..utils import load_plugins - -_LAZY_LOADER = False -if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): - try: - from .lazy_extractors import * - from .lazy_extractors import _ALL_CLASSES - _LAZY_LOADER = True - except ImportError: - pass - -if not _LAZY_LOADER: - from .extractors import * - _ALL_CLASSES = [ - klass - for name, klass in globals().items() - if name.endswith('IE') and name != 'GenericIE' - ] - _ALL_CLASSES.append(GenericIE) - -_PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals()) -_ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES +passthrough_module(__name__, '.extractors') +del passthrough_module def gen_extractor_classes(): """ Return a list of supported extractors. The order does matter; the first extractor matched is the one handling the URL. """ + from .extractors import _ALL_CLASSES + return _ALL_CLASSES @@ -38,17 +20,23 @@ def gen_extractors(): return [klass() for klass in gen_extractor_classes()] -def list_extractors(age_limit): - """ - Return a list of extractors that are suitable for the given age, - sorted by extractor ID. 
- """ +def list_extractor_classes(age_limit=None): + """Return a list of extractors that are suitable for the given age, sorted by extractor name""" + from .generic import GenericIE + + yield from sorted(filter( + lambda ie: ie.is_suitable(age_limit) and ie != GenericIE, + gen_extractor_classes()), key=lambda ie: ie.IE_NAME.lower()) + yield GenericIE - return sorted( - filter(lambda ie: ie.is_suitable(age_limit), gen_extractors()), - key=lambda ie: ie.IE_NAME.lower()) + +def list_extractors(age_limit=None): + """Return a list of extractor instances that are suitable for the given age, sorted by extractor name""" + return [ie() for ie in list_extractor_classes(age_limit)] def get_info_extractor(ie_name): """Returns the info extractor class with the given ie_name""" - return globals()[ie_name + 'IE'] + from . import extractors + + return getattr(extractors, f'{ie_name}IE') diff --git a/hypervideo_dl/extractor/_extractors.py b/hypervideo_dl/extractor/_extractors.py new file mode 100644 index 0000000..2fe15f6 --- /dev/null +++ b/hypervideo_dl/extractor/_extractors.py @@ -0,0 +1,2354 @@ +# flake8: noqa: F401 + +from .youtube import ( # Youtube is moved to the top to improve performance + YoutubeIE, + YoutubeClipIE, + YoutubeFavouritesIE, + YoutubeNotificationsIE, + YoutubeHistoryIE, + YoutubeTabIE, + YoutubeLivestreamEmbedIE, + YoutubePlaylistIE, + YoutubeRecommendedIE, + YoutubeSearchDateIE, + YoutubeSearchIE, + YoutubeSearchURLIE, + YoutubeMusicSearchURLIE, + YoutubeSubscriptionsIE, + YoutubeStoriesIE, + YoutubeTruncatedIDIE, + YoutubeTruncatedURLIE, + YoutubeYtBeIE, + YoutubeYtUserIE, + YoutubeWatchLaterIE, + YoutubeShortsAudioPivotIE +) + +from .abc import ( + ABCIE, + ABCIViewIE, + ABCIViewShowSeriesIE, +) +from .abcnews import ( + AbcNewsIE, + AbcNewsVideoIE, +) +from .abcotvs import ( + ABCOTVSIE, + ABCOTVSClipsIE, +) +from .abematv import ( + AbemaTVIE, + AbemaTVTitleIE, +) +from .academicearth import AcademicEarthCourseIE +from .acast import ( + ACastIE, + 
ACastChannelIE, +) +from .acfun import AcFunVideoIE, AcFunBangumiIE +from .adn import ADNIE +from .adobeconnect import AdobeConnectIE +from .adobetv import ( + AdobeTVEmbedIE, + AdobeTVIE, + AdobeTVShowIE, + AdobeTVChannelIE, + AdobeTVVideoIE, +) +from .adultswim import AdultSwimIE +from .aenetworks import ( + AENetworksIE, + AENetworksCollectionIE, + AENetworksShowIE, + HistoryTopicIE, + HistoryPlayerIE, + BiographyIE, +) +from .aeonco import AeonCoIE +from .afreecatv import ( + AfreecaTVIE, + AfreecaTVLiveIE, + AfreecaTVUserIE, +) +from .agora import ( + TokFMAuditionIE, + TokFMPodcastIE, + WyborczaPodcastIE, + WyborczaVideoIE, +) +from .airmozilla import AirMozillaIE +from .aljazeera import AlJazeeraIE +from .alphaporno import AlphaPornoIE +from .amara import AmaraIE +from .alura import ( + AluraIE, + AluraCourseIE +) +from .amcnetworks import AMCNetworksIE +from .amazon import AmazonStoreIE +from .amazonminitv import ( + AmazonMiniTVIE, + AmazonMiniTVSeasonIE, + AmazonMiniTVSeriesIE, +) +from .americastestkitchen import ( + AmericasTestKitchenIE, + AmericasTestKitchenSeasonIE, +) +from .angel import AngelIE +from .anvato import AnvatoIE +from .aol import AolIE +from .allocine import AllocineIE +from .aliexpress import AliExpressLiveIE +from .alsace20tv import ( + Alsace20TVIE, + Alsace20TVEmbedIE, +) +from .apa import APAIE +from .aparat import AparatIE +from .appleconnect import AppleConnectIE +from .appletrailers import ( + AppleTrailersIE, + AppleTrailersSectionIE, +) +from .applepodcasts import ApplePodcastsIE +from .archiveorg import ( + ArchiveOrgIE, + YoutubeWebArchiveIE, +) +from .arcpublishing import ArcPublishingIE +from .arkena import ArkenaIE +from .ard import ( + ARDBetaMediathekIE, + ARDIE, + ARDMediathekIE, +) +from .arte import ( + ArteTVIE, + ArteTVEmbedIE, + ArteTVPlaylistIE, + ArteTVCategoryIE, +) +from .arnes import ArnesIE +from .asiancrush import ( + AsianCrushIE, + AsianCrushPlaylistIE, +) +from .atresplayer import AtresPlayerIE +from 
.atscaleconf import AtScaleConfEventIE +from .atttechchannel import ATTTechChannelIE +from .atvat import ATVAtIE +from .audimedia import AudiMediaIE +from .audioboom import AudioBoomIE +from .audiodraft import ( + AudiodraftCustomIE, + AudiodraftGenericIE, +) +from .audiomack import AudiomackIE, AudiomackAlbumIE +from .audius import ( + AudiusIE, + AudiusTrackIE, + AudiusPlaylistIE, + AudiusProfileIE, +) +from .awaan import ( + AWAANIE, + AWAANVideoIE, + AWAANLiveIE, + AWAANSeasonIE, +) +from .azmedien import AZMedienIE +from .baidu import BaiduVideoIE +from .banbye import ( + BanByeIE, + BanByeChannelIE, +) +from .bandaichannel import BandaiChannelIE +from .bandcamp import ( + BandcampIE, + BandcampAlbumIE, + BandcampWeeklyIE, + BandcampUserIE, +) +from .bannedvideo import BannedVideoIE +from .bbc import ( + BBCCoUkIE, + BBCCoUkArticleIE, + BBCCoUkIPlayerEpisodesIE, + BBCCoUkIPlayerGroupIE, + BBCCoUkPlaylistIE, + BBCIE, +) +from .beeg import BeegIE +from .behindkink import BehindKinkIE +from .bellmedia import BellMediaIE +from .beatport import BeatportIE +from .berufetv import BerufeTVIE +from .bet import BetIE +from .bfi import BFIPlayerIE +from .bfmtv import ( + BFMTVIE, + BFMTVLiveIE, + BFMTVArticleIE, +) +from .bibeltv import BibelTVIE +from .bigflix import BigflixIE +from .bigo import BigoIE +from .bild import BildIE +from .bilibili import ( + BiliBiliIE, + BiliBiliBangumiIE, + BiliBiliBangumiMediaIE, + BiliBiliSearchIE, + BilibiliCategoryIE, + BilibiliAudioIE, + BilibiliAudioAlbumIE, + BiliBiliPlayerIE, + BilibiliSpaceVideoIE, + BilibiliSpaceAudioIE, + BilibiliSpacePlaylistIE, + BiliIntlIE, + BiliIntlSeriesIE, + BiliLiveIE, +) +from .biobiochiletv import BioBioChileTVIE +from .bitchute import ( + BitChuteIE, + BitChuteChannelIE, +) +from .bitwave import ( + BitwaveReplayIE, + BitwaveStreamIE, +) +from .biqle import BIQLEIE +from .blackboardcollaborate import BlackboardCollaborateIE +from .bleacherreport import ( + BleacherReportIE, + BleacherReportCMSIE, +) 
+from .blogger import BloggerIE +from .bloomberg import BloombergIE +from .bokecc import BokeCCIE +from .bongacams import BongaCamsIE +from .bostonglobe import BostonGlobeIE +from .box import BoxIE +from .booyah import BooyahClipsIE +from .bpb import BpbIE +from .br import ( + BRIE, + BRMediathekIE, +) +from .bravotv import BravoTVIE +from .breakcom import BreakIE +from .breitbart import BreitBartIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) +from .businessinsider import BusinessInsiderIE +from .bundesliga import BundesligaIE +from .buzzfeed import BuzzFeedIE +from .byutv import BYUtvIE +from .c56 import C56IE +from .cableav import CableAVIE +from .callin import CallinIE +from .caltrans import CaltransIE +from .cam4 import CAM4IE +from .camdemy import ( + CamdemyIE, + CamdemyFolderIE +) +from .cammodels import CamModelsIE +from .camsoda import CamsodaIE +from .camtasia import CamtasiaEmbedIE +from .camwithher import CamWithHerIE +from .canalalpha import CanalAlphaIE +from .canalplus import CanalplusIE +from .canalc2 import Canalc2IE +from .canvas import ( + CanvasIE, + CanvasEenIE, + VrtNUIE, + DagelijkseKostIE, +) +from .carambatv import ( + CarambaTVIE, + CarambaTVPageIE, +) +from .cartoonnetwork import CartoonNetworkIE +from .cbc import ( + CBCIE, + CBCPlayerIE, + CBCGemIE, + CBCGemPlaylistIE, + CBCGemLiveIE, +) +from .cbs import CBSIE +from .cbslocal import ( + CBSLocalIE, + CBSLocalArticleIE, +) +from .cbsinteractive import CBSInteractiveIE +from .cbsnews import ( + CBSNewsEmbedIE, + CBSNewsIE, + CBSNewsLiveVideoIE, +) +from .cbssports import ( + CBSSportsEmbedIE, + CBSSportsIE, + TwentyFourSevenSportsIE, +) +from .ccc import ( + CCCIE, + CCCPlaylistIE, +) +from .ccma import CCMAIE +from .cctv import CCTVIE +from .cda import CDAIE +from .cellebrite import CellebriteIE +from .ceskatelevize import CeskaTelevizeIE +from .cgtn import CGTNIE +from .channel9 import Channel9IE +from .charlierose import CharlieRoseIE +from .chaturbate 
import ChaturbateIE +from .chilloutzone import ChilloutzoneIE +from .chingari import ( + ChingariIE, + ChingariUserIE, +) +from .chirbit import ( + ChirbitIE, + ChirbitProfileIE, +) +from .cinchcast import CinchcastIE +from .cinemax import CinemaxIE +from .cinetecamilano import CinetecaMilanoIE +from .ciscolive import ( + CiscoLiveSessionIE, + CiscoLiveSearchIE, +) +from .ciscowebex import CiscoWebexIE +from .cjsw import CJSWIE +from .cliphunter import CliphunterIE +from .clippit import ClippitIE +from .cliprs import ClipRsIE +from .clipsyndicate import ClipsyndicateIE +from .closertotruth import CloserToTruthIE +from .cloudflarestream import CloudflareStreamIE +from .cloudy import CloudyIE +from .clubic import ClubicIE +from .clyp import ClypIE +from .cmt import CMTIE +from .cnbc import ( + CNBCIE, + CNBCVideoIE, +) +from .cnn import ( + CNNIE, + CNNBlogsIE, + CNNArticleIE, + CNNIndonesiaIE, +) +from .coub import CoubIE +from .comedycentral import ( + ComedyCentralIE, + ComedyCentralTVIE, +) +from .commonmistakes import CommonMistakesIE, UnicodeBOMIE +from .commonprotocols import ( + MmsIE, + RtmpIE, + ViewSourceIE, +) +from .condenast import CondeNastIE +from .contv import CONtvIE +from .corus import CorusIE +from .cpac import ( + CPACIE, + CPACPlaylistIE, +) +from .cozytv import CozyTVIE +from .cracked import CrackedIE +from .crackle import CrackleIE +from .craftsy import CraftsyIE +from .crooksandliars import CrooksAndLiarsIE +from .crowdbunker import ( + CrowdBunkerIE, + CrowdBunkerChannelIE, +) +from .crunchyroll import ( + CrunchyrollBetaIE, + CrunchyrollBetaShowIE, +) +from .cspan import CSpanIE, CSpanCongressIE +from .ctsnews import CtsNewsIE +from .ctv import CTVIE +from .ctvnews import CTVNewsIE +from .cultureunplugged import CultureUnpluggedIE +from .curiositystream import ( + CuriosityStreamIE, + CuriosityStreamCollectionsIE, + CuriosityStreamSeriesIE, +) +from .cwtv import CWTVIE +from .cybrary import ( + CybraryIE, + CybraryCourseIE +) +from .daftsex 
import DaftsexIE +from .dailymail import DailyMailIE +from .dailymotion import ( + DailymotionIE, + DailymotionPlaylistIE, + DailymotionUserIE, +) +from .dailywire import ( + DailyWireIE, + DailyWirePodcastIE, +) +from .damtomo import ( + DamtomoRecordIE, + DamtomoVideoIE, +) +from .daum import ( + DaumIE, + DaumClipIE, + DaumPlaylistIE, + DaumUserIE, +) +from .daystar import DaystarClipIE +from .dbtv import DBTVIE +from .dctp import DctpTvIE +from .deezer import ( + DeezerPlaylistIE, + DeezerAlbumIE, +) +from .democracynow import DemocracynowIE +from .detik import DetikEmbedIE +from .dfb import DFBIE +from .dhm import DHMIE +from .digg import DiggIE +from .dotsub import DotsubIE +from .douyutv import ( + DouyuShowIE, + DouyuTVIE, +) +from .dplay import ( + DPlayIE, + DiscoveryPlusIE, + HGTVDeIE, + GoDiscoveryIE, + TravelChannelIE, + CookingChannelIE, + HGTVUsaIE, + FoodNetworkIE, + InvestigationDiscoveryIE, + DestinationAmericaIE, + AmHistoryChannelIE, + ScienceChannelIE, + DIYNetworkIE, + DiscoveryLifeIE, + AnimalPlanetIE, + TLCIE, + MotorTrendIE, + MotorTrendOnDemandIE, + DiscoveryPlusIndiaIE, + DiscoveryNetworksDeIE, + DiscoveryPlusItalyIE, + DiscoveryPlusItalyShowIE, + DiscoveryPlusIndiaShowIE, +) +from .dreisat import DreiSatIE +from .drbonanza import DRBonanzaIE +from .drtuber import DrTuberIE +from .drtv import ( + DRTVIE, + DRTVLiveIE, +) +from .dtube import DTubeIE +from .dvtv import DVTVIE +from .duboku import ( + DubokuIE, + DubokuPlaylistIE +) +from .dumpert import DumpertIE +from .defense import DefenseGouvFrIE +from .deuxm import ( + DeuxMIE, + DeuxMNewsIE +) +from .digitalconcerthall import DigitalConcertHallIE +from .discovery import DiscoveryIE +from .disney import DisneyIE +from .dispeak import DigitallySpeakingIE +from .dropbox import DropboxIE +from .dropout import ( + DropoutSeasonIE, + DropoutIE +) +from .dw import ( + DWIE, + DWArticleIE, +) +from .eagleplatform import EaglePlatformIE, ClipYouEmbedIE +from .ebaumsworld import EbaumsWorldIE 
+from .echomsk import EchoMskIE +from .egghead import ( + EggheadCourseIE, + EggheadLessonIE, +) +from .ehow import EHowIE +from .eighttracks import EightTracksIE +from .einthusan import EinthusanIE +from .eitb import EitbIE +from .ellentube import ( + EllenTubeIE, + EllenTubeVideoIE, + EllenTubePlaylistIE, +) +from .elonet import ElonetIE +from .elpais import ElPaisIE +from .embedly import EmbedlyIE +from .engadget import EngadgetIE +from .epicon import ( + EpiconIE, + EpiconSeriesIE, +) +from .epoch import EpochIE +from .eporner import EpornerIE +from .eroprofile import ( + EroProfileIE, + EroProfileAlbumIE, +) +from .ertgr import ( + ERTFlixCodenameIE, + ERTFlixIE, + ERTWebtvEmbedIE, +) +from .escapist import EscapistIE +from .espn import ( + ESPNIE, + WatchESPNIE, + ESPNArticleIE, + FiveThirtyEightIE, + ESPNCricInfoIE, +) +from .esri import EsriVideoIE +from .europa import EuropaIE +from .europeantour import EuropeanTourIE +from .eurosport import EurosportIE +from .euscreen import EUScreenIE +from .expotv import ExpoTVIE +from .expressen import ExpressenIE +from .extremetube import ExtremeTubeIE +from .eyedotv import EyedoTVIE +from .facebook import ( + FacebookIE, + FacebookPluginsVideoIE, + FacebookRedirectURLIE, + FacebookReelIE, +) +from .fancode import ( + FancodeVodIE, + FancodeLiveIE +) + +from .faz import FazIE +from .fc2 import ( + FC2IE, + FC2EmbedIE, + FC2LiveIE, +) +from .fczenit import FczenitIE +from .fifa import FifaIE +from .filmmodu import FilmmoduIE +from .filmon import ( + FilmOnIE, + FilmOnChannelIE, +) +from .filmweb import FilmwebIE +from .firsttv import FirstTVIE +from .fivetv import FiveTVIE +from .flickr import FlickrIE +from .folketinget import FolketingetIE +from .footyroom import FootyRoomIE +from .formula1 import Formula1IE +from .fourtube import ( + FourTubeIE, + PornTubeIE, + PornerBrosIE, + FuxIE, +) +from .fourzerostudio import ( + FourZeroStudioArchiveIE, + FourZeroStudioClipIE, +) +from .fox import FOXIE +from .fox9 import ( + 
FOX9IE, + FOX9NewsIE, +) +from .foxgay import FoxgayIE +from .foxnews import ( + FoxNewsIE, + FoxNewsArticleIE, + FoxNewsVideoIE, +) +from .foxsports import FoxSportsIE +from .fptplay import FptplayIE +from .franceinter import FranceInterIE +from .francetv import ( + FranceTVIE, + FranceTVSiteIE, + FranceTVInfoIE, +) +from .freesound import FreesoundIE +from .freespeech import FreespeechIE +from .frontendmasters import ( + FrontendMastersIE, + FrontendMastersLessonIE, + FrontendMastersCourseIE +) +from .freetv import ( + FreeTvIE, + FreeTvMoviesIE, +) +from .fujitv import FujiTVFODPlus7IE +from .funimation import ( + FunimationIE, + FunimationPageIE, + FunimationShowIE, +) +from .funk import FunkIE +from .fusion import FusionIE +from .fuyintv import FuyinTVIE +from .gab import ( + GabTVIE, + GabIE, +) +from .gaia import GaiaIE +from .gameinformer import GameInformerIE +from .gamejolt import ( + GameJoltIE, + GameJoltUserIE, + GameJoltGameIE, + GameJoltGameSoundtrackIE, + GameJoltCommunityIE, + GameJoltSearchIE, +) +from .gamespot import GameSpotIE +from .gamestar import GameStarIE +from .gaskrank import GaskrankIE +from .gazeta import GazetaIE +from .gdcvault import GDCVaultIE +from .gedidigital import GediDigitalIE +from .generic import GenericIE +from .genius import ( + GeniusIE, + GeniusLyricsIE, +) +from .gettr import ( + GettrIE, + GettrStreamingIE, +) +from .gfycat import GfycatIE +from .giantbomb import GiantBombIE +from .giga import GigaIE +from .glide import GlideIE +from .globo import ( + GloboIE, + GloboArticleIE, +) +from .go import GoIE +from .godtube import GodTubeIE +from .gofile import GofileIE +from .golem import GolemIE +from .goodgame import GoodGameIE +from .googledrive import ( + GoogleDriveIE, + GoogleDriveFolderIE, +) +from .googlepodcasts import ( + GooglePodcastsIE, + GooglePodcastsFeedIE, +) +from .googlesearch import GoogleSearchIE +from .gopro import GoProIE +from .goplay import GoPlayIE +from .goshgay import GoshgayIE +from .gotostage 
import GoToStageIE +from .gputechconf import GPUTechConfIE +from .gronkh import ( + GronkhIE, + GronkhFeedIE, + GronkhVodsIE +) +from .groupon import GrouponIE +from .harpodeon import HarpodeonIE +from .hbo import HBOIE +from .hearthisat import HearThisAtIE +from .heise import HeiseIE +from .hellporno import HellPornoIE +from .helsinki import HelsinkiIE +from .hentaistigma import HentaiStigmaIE +from .hgtv import HGTVComShowIE +from .hketv import HKETVIE +from .hidive import HiDiveIE +from .historicfilms import HistoricFilmsIE +from .hitbox import HitboxIE, HitboxLiveIE +from .hitrecord import HitRecordIE +from .holodex import HolodexIE +from .hotnewhiphop import HotNewHipHopIE +from .hotstar import ( + HotStarIE, + HotStarPrefixIE, + HotStarPlaylistIE, + HotStarSeasonIE, + HotStarSeriesIE, +) +from .howcast import HowcastIE +from .howstuffworks import HowStuffWorksIE +from .hrfensehen import HRFernsehenIE +from .hrti import ( + HRTiIE, + HRTiPlaylistIE, +) +from .hse import ( + HSEShowIE, + HSEProductIE, +) +from .genericembeds import ( + HTML5MediaEmbedIE, + QuotedHTMLIE, +) +from .huajiao import HuajiaoIE +from .huya import HuyaLiveIE +from .huffpost import HuffPostIE +from .hungama import ( + HungamaIE, + HungamaSongIE, + HungamaAlbumPlaylistIE, +) +from .hypem import HypemIE +from .hytale import HytaleIE +from .icareus import IcareusIE +from .ichinanalive import ( + IchinanaLiveIE, + IchinanaLiveClipIE, +) +from .ign import ( + IGNIE, + IGNVideoIE, + IGNArticleIE, +) +from .iheart import ( + IHeartRadioIE, + IHeartRadioPodcastIE, +) +from .iltalehti import IltalehtiIE +from .imdb import ( + ImdbIE, + ImdbListIE +) +from .imgur import ( + ImgurIE, + ImgurAlbumIE, + ImgurGalleryIE, +) +from .ina import InaIE +from .inc import IncIE +from .indavideo import IndavideoEmbedIE +from .infoq import InfoQIE +from .instagram import ( + InstagramIE, + InstagramIOSIE, + InstagramUserIE, + InstagramTagIE, + InstagramStoryIE, +) +from .internazionale import InternazionaleIE 
+from .internetvideoarchive import InternetVideoArchiveIE +from .iprima import ( + IPrimaIE, + IPrimaCNNIE +) +from .iqiyi import ( + IqiyiIE, + IqIE, + IqAlbumIE +) +from .islamchannel import ( + IslamChannelIE, + IslamChannelSeriesIE, +) +from .israelnationalnews import IsraelNationalNewsIE +from .itprotv import ( + ITProTVIE, + ITProTVCourseIE +) +from .itv import ( + ITVIE, + ITVBTCCIE, +) +from .ivi import ( + IviIE, + IviCompilationIE +) +from .ivideon import IvideonIE +from .iwara import ( + IwaraIE, + IwaraPlaylistIE, + IwaraUserIE, +) +from .ixigua import IxiguaIE +from .izlesene import IzleseneIE +from .jable import ( + JableIE, + JablePlaylistIE, +) +from .jamendo import ( + JamendoIE, + JamendoAlbumIE, +) +from .japandiet import ( + ShugiinItvLiveIE, + ShugiinItvLiveRoomIE, + ShugiinItvVodIE, + SangiinInstructionIE, + SangiinIE, +) +from .jeuxvideo import JeuxVideoIE +from .jove import JoveIE +from .joj import JojIE +from .jwplatform import JWPlatformIE +from .kakao import KakaoIE +from .kaltura import KalturaIE +from .kanal2 import Kanal2IE +from .karaoketv import KaraoketvIE +from .karrierevideos import KarriereVideosIE +from .keezmovies import KeezMoviesIE +from .kelbyone import KelbyOneIE +from .ketnet import KetnetIE +from .khanacademy import ( + KhanAcademyIE, + KhanAcademyUnitIE, +) +from .kicker import KickerIE +from .kickstarter import KickStarterIE +from .kinja import KinjaEmbedIE +from .kinopoisk import KinoPoiskIE +from .kompas import KompasVideoIE +from .konserthusetplay import KonserthusetPlayIE +from .koo import KooIE +from .kth import KTHIE +from .krasview import KrasViewIE +from .ku6 import Ku6IE +from .kusi import KUSIIE +from .kuwo import ( + KuwoIE, + KuwoAlbumIE, + KuwoChartIE, + KuwoSingerIE, + KuwoCategoryIE, + KuwoMvIE, +) +from .la7 import ( + LA7IE, + LA7PodcastEpisodeIE, + LA7PodcastIE, +) +from .laola1tv import ( + Laola1TvEmbedIE, + Laola1TvIE, + EHFTVIE, + ITTFIE, +) +from .lastfm import ( + LastFMIE, + LastFMPlaylistIE, + 
LastFMUserIE, +) +from .lbry import ( + LBRYIE, + LBRYChannelIE, +) +from .lci import LCIIE +from .lcp import ( + LcpPlayIE, + LcpIE, +) +from .lecture2go import Lecture2GoIE +from .lecturio import ( + LecturioIE, + LecturioCourseIE, + LecturioDeCourseIE, +) +from .leeco import ( + LeIE, + LePlaylistIE, + LetvCloudIE, +) +from .lego import LEGOIE +from .lemonde import LemondeIE +from .lenta import LentaIE +from .libraryofcongress import LibraryOfCongressIE +from .libsyn import LibsynIE +from .lifenews import ( + LifeNewsIE, + LifeEmbedIE, +) +from .likee import ( + LikeeIE, + LikeeUserIE +) +from .limelight import ( + LimelightMediaIE, + LimelightChannelIE, + LimelightChannelListIE, +) +from .line import ( + LineLiveIE, + LineLiveChannelIE, +) +from .linkedin import ( + LinkedInIE, + LinkedInLearningIE, + LinkedInLearningCourseIE, +) +from .linuxacademy import LinuxAcademyIE +from .liputan6 import Liputan6IE +from .listennotes import ListenNotesIE +from .litv import LiTVIE +from .livejournal import LiveJournalIE +from .livestream import ( + LivestreamIE, + LivestreamOriginalIE, + LivestreamShortenerIE, +) +from .livestreamfails import LivestreamfailsIE +from .lnkgo import ( + LnkGoIE, + LnkIE, +) +from .localnews8 import LocalNews8IE +from .lovehomeporn import LoveHomePornIE +from .lrt import ( + LRTVODIE, + LRTStreamIE +) +from .lynda import ( + LyndaIE, + LyndaCourseIE +) +from .m6 import M6IE +from .magentamusik360 import MagentaMusik360IE +from .mailru import ( + MailRuIE, + MailRuMusicIE, + MailRuMusicSearchIE, +) +from .mainstreaming import MainStreamingIE +from .malltv import MallTVIE +from .mangomolo import ( + MangomoloVideoIE, + MangomoloLiveIE, +) +from .manoto import ( + ManotoTVIE, + ManotoTVShowIE, + ManotoTVLiveIE, +) +from .manyvids import ManyVidsIE +from .maoritv import MaoriTVIE +from .markiza import ( + MarkizaIE, + MarkizaPageIE, +) +from .massengeschmacktv import MassengeschmackTVIE +from .masters import MastersIE +from .matchtv import 
MatchTVIE +from .mdr import MDRIE +from .medaltv import MedalTVIE +from .mediaite import MediaiteIE +from .mediaklikk import MediaKlikkIE +from .mediaset import ( + MediasetIE, + MediasetShowIE, +) +from .mediasite import ( + MediasiteIE, + MediasiteCatalogIE, + MediasiteNamedCatalogIE, +) +from .mediaworksnz import MediaWorksNZVODIE +from .medici import MediciIE +from .megaphone import MegaphoneIE +from .meipai import MeipaiIE +from .melonvod import MelonVODIE +from .meta import METAIE +from .metacafe import MetacafeIE +from .metacritic import MetacriticIE +from .mgoon import MgoonIE +from .mgtv import MGTVIE +from .miaopai import MiaoPaiIE +from .microsoftstream import MicrosoftStreamIE +from .microsoftvirtualacademy import ( + MicrosoftVirtualAcademyIE, + MicrosoftVirtualAcademyCourseIE, +) +from .microsoftembed import MicrosoftEmbedIE +from .mildom import ( + MildomIE, + MildomVodIE, + MildomClipIE, + MildomUserVodIE, +) +from .minds import ( + MindsIE, + MindsChannelIE, + MindsGroupIE, +) +from .ministrygrid import MinistryGridIE +from .minoto import MinotoIE +from .miomio import MioMioIE +from .mirrativ import ( + MirrativIE, + MirrativUserIE, +) +from .mirrorcouk import MirrorCoUKIE +from .mit import TechTVMITIE, OCWMITIE +from .mitele import MiTeleIE +from .mixch import ( + MixchIE, + MixchArchiveIE, +) +from .mixcloud import ( + MixcloudIE, + MixcloudUserIE, + MixcloudPlaylistIE, +) +from .mlb import ( + MLBIE, + MLBVideoIE, + MLBTVIE, + MLBArticleIE, +) +from .mlssoccer import MLSSoccerIE +from .mnet import MnetIE +from .mocha import MochaVideoIE +from .moevideo import MoeVideoIE +from .mofosex import ( + MofosexIE, + MofosexEmbedIE, +) +from .mojvideo import MojvideoIE +from .morningstar import MorningstarIE +from .motherless import ( + MotherlessIE, + MotherlessGroupIE +) +from .motorsport import MotorsportIE +from .movieclips import MovieClipsIE +from .moviepilot import MoviepilotIE +from .moview import MoviewPlayIE +from .moviezine import MoviezineIE 
+from .movingimage import MovingImageIE +from .msn import MSNIE +from .mtv import ( + MTVIE, + MTVVideoIE, + MTVServicesEmbeddedIE, + MTVDEIE, + MTVJapanIE, + MTVItaliaIE, + MTVItaliaProgrammaIE, +) +from .muenchentv import MuenchenTVIE +from .murrtube import MurrtubeIE, MurrtubeUserIE +from .musescore import MuseScoreIE +from .musicdex import ( + MusicdexSongIE, + MusicdexAlbumIE, + MusicdexArtistIE, + MusicdexPlaylistIE, +) +from .mwave import MwaveIE, MwaveMeetGreetIE +from .mxplayer import ( + MxplayerIE, + MxplayerShowIE, +) +from .mychannels import MyChannelsIE +from .myspace import MySpaceIE, MySpaceAlbumIE +from .myspass import MySpassIE +from .myvi import ( + MyviIE, + MyviEmbedIE, +) +from .myvideoge import MyVideoGeIE +from .myvidster import MyVidsterIE +from .n1 import ( + N1InfoAssetIE, + N1InfoIIE, +) +from .nate import ( + NateIE, + NateProgramIE, +) +from .nationalgeographic import ( + NationalGeographicVideoIE, + NationalGeographicTVIE, +) +from .naver import ( + NaverIE, + NaverLiveIE, + NaverNowIE, +) +from .nba import ( + NBAWatchEmbedIE, + NBAWatchIE, + NBAWatchCollectionIE, + NBAEmbedIE, + NBAIE, + NBAChannelIE, +) +from .nbc import ( + NBCIE, + NBCNewsIE, + NBCOlympicsIE, + NBCOlympicsStreamIE, + NBCSportsIE, + NBCSportsStreamIE, + NBCSportsVPlayerIE, + NBCStationsIE, +) +from .ndr import ( + NDRIE, + NJoyIE, + NDREmbedBaseIE, + NDREmbedIE, + NJoyEmbedIE, +) +from .ndtv import NDTVIE +from .nebula import ( + NebulaIE, + NebulaSubscriptionsIE, + NebulaChannelIE, +) +from .nerdcubed import NerdCubedFeedIE +from .netzkino import NetzkinoIE +from .neteasemusic import ( + NetEaseMusicIE, + NetEaseMusicAlbumIE, + NetEaseMusicSingerIE, + NetEaseMusicListIE, + NetEaseMusicMvIE, + NetEaseMusicProgramIE, + NetEaseMusicDjRadioIE, +) +from .netverse import ( + NetverseIE, + NetversePlaylistIE, +) +from .newgrounds import ( + NewgroundsIE, + NewgroundsPlaylistIE, + NewgroundsUserIE, +) +from .newspicks import NewsPicksIE +from .newstube import NewstubeIE 
+from .newsy import NewsyIE +from .nextmedia import ( + NextMediaIE, + NextMediaActionNewsIE, + AppleDailyIE, + NextTVIE, +) +from .nexx import ( + NexxIE, + NexxEmbedIE, +) +from .nfb import NFBIE +from .nfhsnetwork import NFHSNetworkIE +from .nfl import ( + NFLIE, + NFLArticleIE, +) +from .nhk import ( + NhkVodIE, + NhkVodProgramIE, + NhkForSchoolBangumiIE, + NhkForSchoolSubjectIE, + NhkForSchoolProgramListIE, +) +from .nhl import NHLIE +from .nick import ( + NickIE, + NickBrIE, + NickDeIE, + NickNightIE, + NickRuIE, +) +from .niconico import ( + NiconicoIE, + NiconicoPlaylistIE, + NiconicoUserIE, + NiconicoSeriesIE, + NiconicoHistoryIE, + NicovideoSearchDateIE, + NicovideoSearchIE, + NicovideoSearchURLIE, + NicovideoTagURLIE, +) +from .ninecninemedia import ( + NineCNineMediaIE, + CPTwentyFourIE, +) +from .ninegag import NineGagIE +from .ninenow import NineNowIE +from .nintendo import NintendoIE +from .nitter import NitterIE +from .njpwworld import NJPWWorldIE +from .nobelprize import NobelPrizeIE +from .nonktube import NonkTubeIE +from .noodlemagazine import NoodleMagazineIE +from .noovo import NoovoIE +from .normalboots import NormalbootsIE +from .nosvideo import NosVideoIE +from .nosnl import NOSNLArticleIE +from .nova import ( + NovaEmbedIE, + NovaIE, +) +from .novaplay import NovaPlayIE +from .nowness import ( + NownessIE, + NownessPlaylistIE, + NownessSeriesIE, +) +from .noz import NozIE +from .npo import ( + AndereTijdenIE, + NPOIE, + NPOLiveIE, + NPORadioIE, + NPORadioFragmentIE, + SchoolTVIE, + HetKlokhuisIE, + VPROIE, + WNLIE, +) +from .npr import NprIE +from .nrk import ( + NRKIE, + NRKPlaylistIE, + NRKSkoleIE, + NRKTVIE, + NRKTVDirekteIE, + NRKRadioPodkastIE, + NRKTVEpisodeIE, + NRKTVEpisodesIE, + NRKTVSeasonIE, + NRKTVSeriesIE, +) +from .nrl import NRLTVIE +from .ntvcojp import NTVCoJpCUIE +from .ntvde import NTVDeIE +from .ntvru import NTVRuIE +from .nytimes import ( + NYTimesIE, + NYTimesArticleIE, + NYTimesCookingIE, +) +from .nuvid import 
NuvidIE +from .nzherald import NZHeraldIE +from .nzz import NZZIE +from .odatv import OdaTVIE +from .odnoklassniki import OdnoklassnikiIE +from .oftv import ( + OfTVIE, + OfTVPlaylistIE +) +from .oktoberfesttv import OktoberfestTVIE +from .olympics import OlympicsReplayIE +from .on24 import On24IE +from .ondemandkorea import OnDemandKoreaIE +from .onefootball import OneFootballIE +from .onenewsnz import OneNewsNZIE +from .onet import ( + OnetIE, + OnetChannelIE, + OnetMVPIE, + OnetPlIE, +) +from .onionstudios import OnionStudiosIE +from .ooyala import ( + OoyalaIE, + OoyalaExternalIE, +) +from .opencast import ( + OpencastIE, + OpencastPlaylistIE, +) +from .openrec import ( + OpenRecIE, + OpenRecCaptureIE, + OpenRecMovieIE, +) +from .ora import OraTVIE +from .orf import ( + ORFTVthekIE, + ORFFM4StoryIE, + ORFRadioIE, + ORFIPTVIE, +) +from .outsidetv import OutsideTVIE +from .packtpub import ( + PacktPubIE, + PacktPubCourseIE, +) +from .palcomp3 import ( + PalcoMP3IE, + PalcoMP3ArtistIE, + PalcoMP3VideoIE, +) +from .pandoratv import PandoraTVIE +from .panopto import ( + PanoptoIE, + PanoptoListIE, + PanoptoPlaylistIE +) +from .paramountplus import ( + ParamountPlusIE, + ParamountPlusSeriesIE, +) +from .parler import ParlerIE +from .parlview import ParlviewIE +from .patreon import ( + PatreonIE, + PatreonCampaignIE +) +from .pbs import PBSIE +from .pearvideo import PearVideoIE +from .peekvids import PeekVidsIE, PlayVidsIE +from .peertube import ( + PeerTubeIE, + PeerTubePlaylistIE, +) +from .peertv import PeerTVIE +from .peloton import ( + PelotonIE, + PelotonLiveIE +) +from .people import PeopleIE +from .performgroup import PerformGroupIE +from .periscope import ( + PeriscopeIE, + PeriscopeUserIE, +) +from .philharmoniedeparis import PhilharmonieDeParisIE +from .phoenix import PhoenixIE +from .photobucket import PhotobucketIE +from .piapro import PiaproIE +from .picarto import ( + PicartoIE, + PicartoVodIE, +) +from .piksel import PikselIE +from .pinkbike import 
PinkbikeIE +from .pinterest import ( + PinterestIE, + PinterestCollectionIE, +) +from .pixivsketch import ( + PixivSketchIE, + PixivSketchUserIE, +) +from .pladform import PladformIE +from .planetmarathi import PlanetMarathiIE +from .platzi import ( + PlatziIE, + PlatziCourseIE, +) +from .playfm import PlayFMIE +from .playplustv import PlayPlusTVIE +from .plays import PlaysTVIE +from .playstuff import PlayStuffIE +from .playsuisse import PlaySuisseIE +from .playtvak import PlaytvakIE +from .playvid import PlayvidIE +from .playwire import PlaywireIE +from .plutotv import PlutoTVIE +from .pluralsight import ( + PluralsightIE, + PluralsightCourseIE, +) +from .podbayfm import PodbayFMIE, PodbayFMChannelIE +from .podchaser import PodchaserIE +from .podomatic import PodomaticIE +from .pokemon import ( + PokemonIE, + PokemonWatchIE, +) +from .pokergo import ( + PokerGoIE, + PokerGoCollectionIE, +) +from .polsatgo import PolsatGoIE +from .polskieradio import ( + PolskieRadioIE, + PolskieRadioCategoryIE, + PolskieRadioPlayerIE, + PolskieRadioPodcastIE, + PolskieRadioPodcastListIE, + PolskieRadioRadioKierowcowIE, +) +from .popcorntimes import PopcorntimesIE +from .popcorntv import PopcornTVIE +from .porn91 import Porn91IE +from .porncom import PornComIE +from .pornflip import PornFlipIE +from .pornhd import PornHdIE +from .pornhub import ( + PornHubIE, + PornHubUserIE, + PornHubPlaylistIE, + PornHubPagedVideoListIE, + PornHubUserVideosUploadIE, +) +from .pornotube import PornotubeIE +from .pornovoisines import PornoVoisinesIE +from .pornoxo import PornoXOIE +from .pornez import PornezIE +from .puhutv import ( + PuhuTVIE, + PuhuTVSerieIE, +) +from .prankcast import PrankCastIE +from .premiershiprugby import PremiershipRugbyIE +from .presstv import PressTVIE +from .projectveritas import ProjectVeritasIE +from .prosiebensat1 import ProSiebenSat1IE +from .prx import ( + PRXStoryIE, + PRXSeriesIE, + PRXAccountIE, + PRXStoriesSearchIE, + PRXSeriesSearchIE +) +from .puls4 import 
Puls4IE +from .pyvideo import PyvideoIE +from .qingting import QingTingIE +from .qqmusic import ( + QQMusicIE, + QQMusicSingerIE, + QQMusicAlbumIE, + QQMusicToplistIE, + QQMusicPlaylistIE, +) +from .r7 import ( + R7IE, + R7ArticleIE, +) +from .radiko import RadikoIE, RadikoRadioIE +from .radiocanada import ( + RadioCanadaIE, + RadioCanadaAudioVideoIE, +) +from .radiode import RadioDeIE +from .radiojavan import RadioJavanIE +from .radiobremen import RadioBremenIE +from .radiofrance import FranceCultureIE, RadioFranceIE +from .radiozet import RadioZetPodcastIE +from .radiokapital import ( + RadioKapitalIE, + RadioKapitalShowIE, +) +from .radlive import ( + RadLiveIE, + RadLiveChannelIE, + RadLiveSeasonIE, +) +from .rai import ( + RaiPlayIE, + RaiPlayLiveIE, + RaiPlayPlaylistIE, + RaiPlaySoundIE, + RaiPlaySoundLiveIE, + RaiPlaySoundPlaylistIE, + RaiNewsIE, + RaiSudtirolIE, + RaiIE, +) +from .raywenderlich import ( + RayWenderlichIE, + RayWenderlichCourseIE, +) +from .rbmaradio import RBMARadioIE +from .rcs import ( + RCSIE, + RCSEmbedsIE, + RCSVariousIE, +) +from .rcti import ( + RCTIPlusIE, + RCTIPlusSeriesIE, + RCTIPlusTVIE, +) +from .rds import RDSIE +from .redbee import ParliamentLiveUKIE, RTBFIE +from .redbulltv import ( + RedBullTVIE, + RedBullEmbedIE, + RedBullTVRrnContentIE, + RedBullIE, +) +from .reddit import RedditIE +from .redgifs import ( + RedGifsIE, + RedGifsSearchIE, + RedGifsUserIE, +) +from .redtube import RedTubeIE +from .regiotv import RegioTVIE +from .rentv import ( + RENTVIE, + RENTVArticleIE, +) +from .restudy import RestudyIE +from .reuters import ReutersIE +from .reverbnation import ReverbNationIE +from .rice import RICEIE +from .rmcdecouverte import RMCDecouverteIE +from .rockstargames import RockstarGamesIE +from .rokfin import ( + RokfinIE, + RokfinStackIE, + RokfinChannelIE, + RokfinSearchIE, +) +from .roosterteeth import RoosterTeethIE, RoosterTeethSeriesIE +from .rottentomatoes import RottenTomatoesIE +from .rozhlas import RozhlasIE 
+from .rte import RteIE, RteRadioIE +from .rtlnl import ( + RtlNlIE, + RTLLuTeleVODIE, + RTLLuArticleIE, + RTLLuLiveIE, + RTLLuRadioIE, +) +from .rtl2 import ( + RTL2IE, + RTL2YouIE, + RTL2YouSeriesIE, +) +from .rtnews import ( + RTNewsIE, + RTDocumentryIE, + RTDocumentryPlaylistIE, + RuptlyIE, +) +from .rtp import RTPIE +from .rtrfm import RTRFMIE +from .rts import RTSIE +from .rtve import ( + RTVEALaCartaIE, + RTVEAudioIE, + RTVELiveIE, + RTVEInfantilIE, + RTVETelevisionIE, +) +from .rtvnh import RTVNHIE +from .rtvs import RTVSIE +from .rtvslo import RTVSLOIE +from .ruhd import RUHDIE +from .rule34video import Rule34VideoIE +from .rumble import ( + RumbleEmbedIE, + RumbleChannelIE, +) +from .rutube import ( + RutubeIE, + RutubeChannelIE, + RutubeEmbedIE, + RutubeMovieIE, + RutubePersonIE, + RutubePlaylistIE, + RutubeTagsIE, +) +from .glomex import ( + GlomexIE, + GlomexEmbedIE, +) +from .megatvcom import ( + MegaTVComIE, + MegaTVComEmbedIE, +) +from .ant1newsgr import ( + Ant1NewsGrWatchIE, + Ant1NewsGrArticleIE, + Ant1NewsGrEmbedIE, +) +from .rutv import RUTVIE +from .ruutu import RuutuIE +from .ruv import ( + RuvIE, + RuvSpilaIE +) +from .safari import ( + SafariIE, + SafariApiIE, + SafariCourseIE, +) +from .saitosan import SaitosanIE +from .samplefocus import SampleFocusIE +from .sapo import SapoIE +from .savefrom import SaveFromIE +from .sbs import SBSIE +from .screen9 import Screen9IE +from .screencast import ScreencastIE +from .screencastify import ScreencastifyIE +from .screencastomatic import ScreencastOMaticIE +from .scrippsnetworks import ( + ScrippsNetworksWatchIE, + ScrippsNetworksIE, +) +from .scte import ( + SCTEIE, + SCTECourseIE, +) +from .scrolller import ScrolllerIE +from .seeker import SeekerIE +from .senategov import SenateISVPIE, SenateGovIE +from .sendtonews import SendtoNewsIE +from .servus import ServusIE +from .sevenplus import SevenPlusIE +from .sexu import SexuIE +from .seznamzpravy import ( + SeznamZpravyIE, + SeznamZpravyArticleIE, +) 
+from .shahid import ( + ShahidIE, + ShahidShowIE, +) +from .shared import ( + SharedIE, + VivoIE, +) +from .sharevideos import ShareVideosEmbedIE +from .shemaroome import ShemarooMeIE +from .showroomlive import ShowRoomLiveIE +from .simplecast import ( + SimplecastIE, + SimplecastEpisodeIE, + SimplecastPodcastIE, +) +from .sina import SinaIE +from .sixplay import SixPlayIE +from .skeb import SkebIE +from .skyit import ( + SkyItPlayerIE, + SkyItVideoIE, + SkyItVideoLiveIE, + SkyItIE, + SkyItArteIE, + CieloTVItIE, + TV8ItIE, +) +from .skylinewebcams import SkylineWebcamsIE +from .skynewsarabia import ( + SkyNewsArabiaIE, + SkyNewsArabiaArticleIE, +) +from .skynewsau import SkyNewsAUIE +from .sky import ( + SkyNewsIE, + SkyNewsStoryIE, + SkySportsIE, + SkySportsNewsIE, +) +from .slideshare import SlideshareIE +from .slideslive import SlidesLiveIE +from .slutload import SlutloadIE +from .smotrim import SmotrimIE +from .snotr import SnotrIE +from .sohu import SohuIE +from .sonyliv import ( + SonyLIVIE, + SonyLIVSeriesIE, +) +from .soundcloud import ( + SoundcloudEmbedIE, + SoundcloudIE, + SoundcloudSetIE, + SoundcloudRelatedIE, + SoundcloudUserIE, + SoundcloudTrackStationIE, + SoundcloudPlaylistIE, + SoundcloudSearchIE, +) +from .soundgasm import ( + SoundgasmIE, + SoundgasmProfileIE +) +from .southpark import ( + SouthParkIE, + SouthParkDeIE, + SouthParkDkIE, + SouthParkEsIE, + SouthParkLatIE, + SouthParkNlIE +) +from .sovietscloset import ( + SovietsClosetIE, + SovietsClosetPlaylistIE +) +from .spankbang import ( + SpankBangIE, + SpankBangPlaylistIE, +) +from .spankwire import SpankwireIE +from .spiegel import SpiegelIE +from .spike import ( + BellatorIE, + ParamountNetworkIE, +) +from .startrek import StarTrekIE +from .stitcher import ( + StitcherIE, + StitcherShowIE, +) +from .sport5 import Sport5IE +from .sportbox import SportBoxIE +from .sportdeutschland import SportDeutschlandIE +from .spotify import ( + SpotifyIE, + SpotifyShowIE, +) +from .spreaker import ( + 
SpreakerIE, + SpreakerPageIE, + SpreakerShowIE, + SpreakerShowPageIE, +) +from .springboardplatform import SpringboardPlatformIE +from .sprout import SproutIE +from .srgssr import ( + SRGSSRIE, + SRGSSRPlayIE, +) +from .srmediathek import SRMediathekIE +from .stanfordoc import StanfordOpenClassroomIE +from .startv import StarTVIE +from .steam import ( + SteamIE, + SteamCommunityBroadcastIE, +) +from .storyfire import ( + StoryFireIE, + StoryFireUserIE, + StoryFireSeriesIE, +) +from .streamable import StreamableIE +from .streamanity import StreamanityIE +from .streamcloud import StreamcloudIE +from .streamcz import StreamCZIE +from .streamff import StreamFFIE +from .streetvoice import StreetVoiceIE +from .stretchinternet import StretchInternetIE +from .stripchat import StripchatIE +from .stv import STVPlayerIE +from .substack import SubstackIE +from .sunporno import SunPornoIE +from .sverigesradio import ( + SverigesRadioEpisodeIE, + SverigesRadioPublicationIE, +) +from .svt import ( + SVTIE, + SVTPageIE, + SVTPlayIE, + SVTSeriesIE, +) +from .swearnet import SwearnetEpisodeIE +from .swrmediathek import SWRMediathekIE +from .syvdk import SYVDKIE +from .syfy import SyfyIE +from .sztvhu import SztvHuIE +from .tagesschau import TagesschauIE +from .tass import TassIE +from .tbs import TBSIE +from .tdslifeway import TDSLifewayIE +from .teachable import ( + TeachableIE, + TeachableCourseIE, +) +from .teachertube import ( + TeacherTubeIE, + TeacherTubeUserIE, +) +from .teachingchannel import TeachingChannelIE +from .teamcoco import TeamcocoIE +from .teamtreehouse import TeamTreeHouseIE +from .techtalks import TechTalksIE +from .ted import ( + TedEmbedIE, + TedPlaylistIE, + TedSeriesIE, + TedTalkIE, +) +from .tele5 import Tele5IE +from .tele13 import Tele13IE +from .telebruxelles import TeleBruxellesIE +from .telecinco import TelecincoIE +from .telegraaf import TelegraafIE +from .telegram import TelegramEmbedIE +from .telemb import TeleMBIE +from .telemundo import 
TelemundoIE +from .telequebec import ( + TeleQuebecIE, + TeleQuebecSquatIE, + TeleQuebecEmissionIE, + TeleQuebecLiveIE, + TeleQuebecVideoIE, +) +from .teletask import TeleTaskIE +from .telewebion import TelewebionIE +from .tempo import TempoIE +from .tencent import ( + IflixEpisodeIE, + IflixSeriesIE, + VQQSeriesIE, + VQQVideoIE, + WeTvEpisodeIE, + WeTvSeriesIE, +) +from .tennistv import TennisTVIE +from .tenplay import TenPlayIE +from .testurl import TestURLIE +from .tf1 import TF1IE +from .tfo import TFOIE +from .theholetv import TheHoleTvIE +from .theintercept import TheInterceptIE +from .theplatform import ( + ThePlatformIE, + ThePlatformFeedIE, +) +from .thestar import TheStarIE +from .thesun import TheSunIE +from .theta import ( + ThetaVideoIE, + ThetaStreamIE, +) +from .theweatherchannel import TheWeatherChannelIE +from .thisamericanlife import ThisAmericanLifeIE +from .thisav import ThisAVIE +from .thisoldhouse import ThisOldHouseIE +from .threespeak import ( + ThreeSpeakIE, + ThreeSpeakUserIE, +) +from .threeqsdn import ThreeQSDNIE +from .tiktok import ( + TikTokIE, + TikTokUserIE, + TikTokSoundIE, + TikTokEffectIE, + TikTokTagIE, + TikTokVMIE, + DouyinIE, +) +from .tinypic import TinyPicIE +from .tmz import TMZIE +from .tnaflix import ( + TNAFlixNetworkEmbedIE, + TNAFlixIE, + EMPFlixIE, + MovieFapIE, +) +from .toggle import ( + ToggleIE, + MeWatchIE, +) +from .toggo import ( + ToggoIE, +) +from .tokentube import ( + TokentubeIE, + TokentubeChannelIE +) +from .tonline import TOnlineIE +from .toongoggles import ToonGogglesIE +from .toutv import TouTvIE +from .toypics import ToypicsUserIE, ToypicsIE +from .traileraddict import TrailerAddictIE +from .triller import ( + TrillerIE, + TrillerUserIE, +) +from .trilulilu import TriluliluIE +from .trovo import ( + TrovoIE, + TrovoVodIE, + TrovoChannelVodIE, + TrovoChannelClipIE, +) +from .trueid import TrueIDIE +from .trunews import TruNewsIE +from .truth import TruthIE +from .trutv import TruTVIE +from .tube8 
import Tube8IE +from .tubetugraz import TubeTuGrazIE, TubeTuGrazSeriesIE +from .tubitv import ( + TubiTvIE, + TubiTvShowIE, +) +from .tumblr import TumblrIE +from .tunein import ( + TuneInClipIE, + TuneInStationIE, + TuneInProgramIE, + TuneInTopicIE, + TuneInShortenerIE, +) +from .tunepk import TunePkIE +from .turbo import TurboIE +from .tv2 import ( + TV2IE, + TV2ArticleIE, + KatsomoIE, + MTVUutisetArticleIE, +) +from .tv24ua import ( + TV24UAVideoIE, +) +from .tv2dk import ( + TV2DKIE, + TV2DKBornholmPlayIE, +) +from .tv2hu import ( + TV2HuIE, + TV2HuSeriesIE, +) +from .tv4 import TV4IE +from .tv5mondeplus import TV5MondePlusIE +from .tv5unis import ( + TV5UnisVideoIE, + TV5UnisIE, +) +from .tva import ( + TVAIE, + QubIE, +) +from .tvanouvelles import ( + TVANouvellesIE, + TVANouvellesArticleIE, +) +from .tvc import ( + TVCIE, + TVCArticleIE, +) +from .tver import TVerIE +from .tvigle import TvigleIE +from .tviplayer import TVIPlayerIE +from .tvland import TVLandIE +from .tvn24 import TVN24IE +from .tvnet import TVNetIE +from .tvnoe import TVNoeIE +from .tvnow import ( + TVNowIE, + TVNowFilmIE, + TVNowNewIE, + TVNowSeasonIE, + TVNowAnnualIE, + TVNowShowIE, +) +from .tvopengr import ( + TVOpenGrWatchIE, + TVOpenGrEmbedIE, +) +from .tvp import ( + TVPEmbedIE, + TVPIE, + TVPStreamIE, + TVPVODSeriesIE, + TVPVODVideoIE, +) +from .tvplay import ( + TVPlayIE, + ViafreeIE, + TVPlayHomeIE, +) +from .tvplayer import TVPlayerIE +from .tweakers import TweakersIE +from .twentyfourvideo import TwentyFourVideoIE +from .twentymin import TwentyMinutenIE +from .twentythreevideo import TwentyThreeVideoIE +from .twitcasting import ( + TwitCastingIE, + TwitCastingLiveIE, + TwitCastingUserIE, +) +from .twitch import ( + TwitchVodIE, + TwitchCollectionIE, + TwitchVideosIE, + TwitchVideosClipsIE, + TwitchVideosCollectionsIE, + TwitchStreamIE, + TwitchClipsIE, +) +from .twitter import ( + TwitterCardIE, + TwitterIE, + TwitterAmplifyIE, + TwitterBroadcastIE, + TwitterSpacesIE, + 
TwitterShortenerIE, +) +from .udemy import ( + UdemyIE, + UdemyCourseIE +) +from .udn import UDNEmbedIE +from .ufctv import ( + UFCTVIE, + UFCArabiaIE, +) +from .ukcolumn import UkColumnIE +from .uktvplay import UKTVPlayIE +from .digiteka import DigitekaIE +from .dlive import ( + DLiveVODIE, + DLiveStreamIE, +) +from .drooble import DroobleIE +from .umg import UMGDeIE +from .unistra import UnistraIE +from .unity import UnityIE +from .unscripted import UnscriptedNewsVideoIE +from .unsupported import KnownDRMIE, KnownPiracyIE +from .uol import UOLIE +from .uplynk import ( + UplynkIE, + UplynkPreplayIE, +) +from .urort import UrortIE +from .urplay import URPlayIE +from .usanetwork import USANetworkIE +from .usatoday import USATodayIE +from .ustream import UstreamIE, UstreamChannelIE +from .ustudio import ( + UstudioIE, + UstudioEmbedIE, +) +from .utreon import UtreonIE +from .varzesh3 import Varzesh3IE +from .vbox7 import Vbox7IE +from .veehd import VeeHDIE +from .veo import VeoIE +from .veoh import ( + VeohIE, + VeohUserIE +) +from .vesti import VestiIE +from .vevo import ( + VevoIE, + VevoPlaylistIE, +) +from .vgtv import ( + BTArticleIE, + BTVestlendingenIE, + VGTVIE, +) +from .vh1 import VH1IE +from .vice import ( + ViceIE, + ViceArticleIE, + ViceShowIE, +) +from .vidbit import VidbitIE +from .viddler import ViddlerIE +from .videa import VideaIE +from .videocampus_sachsen import ( + VideocampusSachsenIE, + ViMPPlaylistIE, +) +from .videodetective import VideoDetectiveIE +from .videofyme import VideofyMeIE +from .videomore import ( + VideomoreIE, + VideomoreVideoIE, + VideomoreSeasonIE, +) +from .videopress import VideoPressIE +from .vidio import ( + VidioIE, + VidioPremierIE, + VidioLiveIE +) +from .vidlii import VidLiiIE +from .viewlift import ( + ViewLiftIE, + ViewLiftEmbedIE, +) +from .viidea import ViideaIE +from .vimeo import ( + VimeoIE, + VimeoAlbumIE, + VimeoChannelIE, + VimeoGroupsIE, + VimeoLikesIE, + VimeoOndemandIE, + VimeoProIE, + VimeoReviewIE, + 
VimeoUserIE, + VimeoWatchLaterIE, + VHXEmbedIE, +) +from .vimm import ( + VimmIE, + VimmRecordingIE, +) +from .vimple import VimpleIE +from .vine import ( + VineIE, + VineUserIE, +) +from .viki import ( + VikiIE, + VikiChannelIE, +) +from .viqeo import ViqeoIE +from .viu import ( + ViuIE, + ViuPlaylistIE, + ViuOTTIE, +) +from .vk import ( + VKIE, + VKUserVideosIE, + VKWallPostIE, +) +from .vlive import ( + VLiveIE, + VLivePostIE, + VLiveChannelIE, +) +from .vodlocker import VodlockerIE +from .vodpl import VODPlIE +from .vodplatform import VODPlatformIE +from .voicerepublic import VoiceRepublicIE +from .voicy import ( + VoicyIE, + VoicyChannelIE, +) +from .voot import ( + VootIE, + VootSeriesIE, +) +from .voxmedia import ( + VoxMediaVolumeIE, + VoxMediaIE, +) +from .vrt import VRTIE +from .vrak import VrakIE +from .vrv import ( + VRVIE, + VRVSeriesIE, +) +from .vshare import VShareIE +from .vtm import VTMIE +from .medialaan import MedialaanIE +from .vuclip import VuClipIE +from .vupload import VuploadIE +from .vvvvid import ( + VVVVIDIE, + VVVVIDShowIE, +) +from .vyborymos import VyboryMosIE +from .vzaar import VzaarIE +from .wakanim import WakanimIE +from .walla import WallaIE +from .washingtonpost import ( + WashingtonPostIE, + WashingtonPostArticleIE, +) +from .wasdtv import ( + WASDTVStreamIE, + WASDTVRecordIE, + WASDTVClipIE, +) +from .wat import WatIE +from .watchbox import WatchBoxIE +from .watchindianporn import WatchIndianPornIE +from .wdr import ( + WDRIE, + WDRPageIE, + WDRElefantIE, + WDRMobileIE, +) +from .webcaster import ( + WebcasterIE, + WebcasterFeedIE, +) +from .webofstories import ( + WebOfStoriesIE, + WebOfStoriesPlaylistIE, +) +from .weibo import ( + WeiboIE, + WeiboMobileIE +) +from .weiqitv import WeiqiTVIE +from .wikimedia import WikimediaIE +from .willow import WillowIE +from .wimtv import WimTVIE +from .whowatch import WhoWatchIE +from .wistia import ( + WistiaIE, + WistiaPlaylistIE, + WistiaChannelIE, +) +from .wordpress import ( + 
WordpressPlaylistEmbedIE, + WordpressMiniAudioPlayerEmbedIE, +) +from .worldstarhiphop import WorldStarHipHopIE +from .wppilot import ( + WPPilotIE, + WPPilotChannelsIE, +) +from .wsj import ( + WSJIE, + WSJArticleIE, +) +from .wwe import WWEIE +from .xbef import XBefIE +from .xboxclips import XboxClipsIE +from .xfileshare import XFileShareIE +from .xhamster import ( + XHamsterIE, + XHamsterEmbedIE, + XHamsterUserIE, +) +from .xiami import ( + XiamiSongIE, + XiamiAlbumIE, + XiamiArtistIE, + XiamiCollectionIE +) +from .ximalaya import ( + XimalayaIE, + XimalayaAlbumIE +) +from .xinpianchang import XinpianchangIE +from .xminus import XMinusIE +from .xnxx import XNXXIE +from .xstream import XstreamIE +from .xtube import XTubeUserIE, XTubeIE +from .xuite import XuiteIE +from .xvideos import XVideosIE +from .xxxymovies import XXXYMoviesIE +from .yahoo import ( + YahooIE, + YahooSearchIE, + YahooGyaOPlayerIE, + YahooGyaOIE, + YahooJapanNewsIE, +) +from .yandexdisk import YandexDiskIE +from .yandexmusic import ( + YandexMusicTrackIE, + YandexMusicAlbumIE, + YandexMusicPlaylistIE, + YandexMusicArtistTracksIE, + YandexMusicArtistAlbumsIE, +) +from .yandexvideo import ( + YandexVideoIE, + YandexVideoPreviewIE, + ZenYandexIE, + ZenYandexChannelIE, +) +from .yapfiles import YapFilesIE +from .yesjapan import YesJapanIE +from .yinyuetai import YinYueTaiIE +from .yle_areena import YleAreenaIE +from .ynet import YnetIE +from .youjizz import YouJizzIE +from .youku import ( + YoukuIE, + YoukuShowIE, +) +from .younow import ( + YouNowLiveIE, + YouNowChannelIE, + YouNowMomentIE, +) +from .youporn import YouPornIE +from .yourporn import YourPornIE +from .yourupload import YourUploadIE +from .zapiks import ZapiksIE +from .zattoo import ( + BBVTVIE, + BBVTVLiveIE, + BBVTVRecordingsIE, + EinsUndEinsTVIE, + EinsUndEinsTVLiveIE, + EinsUndEinsTVRecordingsIE, + EWETVIE, + EWETVLiveIE, + EWETVRecordingsIE, + GlattvisionTVIE, + GlattvisionTVLiveIE, + GlattvisionTVRecordingsIE, + MNetTVIE, + 
MNetTVLiveIE, + MNetTVRecordingsIE, + NetPlusTVIE, + NetPlusTVLiveIE, + NetPlusTVRecordingsIE, + OsnatelTVIE, + OsnatelTVLiveIE, + OsnatelTVRecordingsIE, + QuantumTVIE, + QuantumTVLiveIE, + QuantumTVRecordingsIE, + SaltTVIE, + SaltTVLiveIE, + SaltTVRecordingsIE, + SAKTVIE, + SAKTVLiveIE, + SAKTVRecordingsIE, + VTXTVIE, + VTXTVLiveIE, + VTXTVRecordingsIE, + WalyTVIE, + WalyTVLiveIE, + WalyTVRecordingsIE, + ZattooIE, + ZattooLiveIE, + ZattooMoviesIE, + ZattooRecordingsIE, +) +from .zdf import ZDFIE, ZDFChannelIE +from .zee5 import ( + Zee5IE, + Zee5SeriesIE, +) +from .zeenews import ZeeNewsIE +from .zhihu import ZhihuIE +from .zingmp3 import ( + ZingMp3IE, + ZingMp3AlbumIE, + ZingMp3ChartHomeIE, + ZingMp3WeekChartIE, + ZingMp3ChartMusicVideoIE, + ZingMp3UserIE, +) +from .zoom import ZoomIE +from .zype import ZypeIE diff --git a/hypervideo_dl/extractor/abc.py b/hypervideo_dl/extractor/abc.py index 6fe195e..0ca76b8 100644 --- a/hypervideo_dl/extractor/abc.py +++ b/hypervideo_dl/extractor/abc.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import hashlib import hmac import re @@ -157,8 +155,6 @@ class ABCIE(InfoExtractor): 'format_id': format_id }) - self._sort_formats(formats) - return { 'id': video_id, 'title': self._og_search_title(webpage), @@ -223,7 +219,6 @@ class ABCIViewIE(InfoExtractor): entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) if formats: break - self._sort_formats(formats) subtitles = {} src_vtt = stream.get('captions', {}).get('src-vtt') diff --git a/hypervideo_dl/extractor/abcnews.py b/hypervideo_dl/extractor/abcnews.py index 296b8ce..a57295b 100644 --- a/hypervideo_dl/extractor/abcnews.py +++ b/hypervideo_dl/extractor/abcnews.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .amp import AMPIE from .common import InfoExtractor from ..utils import ( diff --git a/hypervideo_dl/extractor/abcotvs.py b/hypervideo_dl/extractor/abcotvs.py index 5bff466..6dca19d 100644 --- 
a/hypervideo_dl/extractor/abcotvs.py +++ b/hypervideo_dl/extractor/abcotvs.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -82,7 +78,6 @@ class ABCOTVSIE(InfoExtractor): 'url': mp4_url, 'width': 640, }) - self._sort_formats(formats) image = video.get('image') or {} @@ -123,7 +118,6 @@ class ABCOTVSClipsIE(InfoExtractor): title = video_data['title'] formats = self._extract_m3u8_formats( video_data['videoURL'].split('?')[0], video_id, 'mp4') - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/abematv.py b/hypervideo_dl/extractor/abematv.py index 27b7d86..80046af 100644 --- a/hypervideo_dl/extractor/abematv.py +++ b/hypervideo_dl/extractor/abematv.py @@ -1,42 +1,41 @@ -import io -import json -import time +import base64 +import binascii +import functools import hashlib import hmac +import io +import json import re import struct -from base64 import urlsafe_b64encode -from binascii import unhexlify +import time +import urllib.parse +import urllib.request +import urllib.response +import uuid from .common import InfoExtractor from ..aes import aes_ecb_decrypt -from ..compat import ( - compat_urllib_response, - compat_urllib_parse_urlparse, - compat_urllib_request, -) from ..utils import ( ExtractorError, - decode_base, + bytes_to_intlist, + decode_base_n, int_or_none, - random_uuidv4, + intlist_to_bytes, + OnDemandPagedList, request_to_url, time_seconds, - update_url_query, traverse_obj, - intlist_to_bytes, - bytes_to_intlist, - urljoin, + update_url_query, ) - # NOTE: network handler related code is temporary thing until network stack overhaul PRs are merged (#2861/#2862) + def add_opener(ydl, handler): ''' Add a handler for opening URLs, like _download_webpage ''' # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426 # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605 - 
assert isinstance(ydl._opener, compat_urllib_request.OpenerDirector) + assert isinstance(ydl._opener, urllib.request.OpenerDirector) ydl._opener.add_handler(handler) @@ -49,7 +48,7 @@ def remove_opener(ydl, handler): # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426 # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605 opener = ydl._opener - assert isinstance(ydl._opener, compat_urllib_request.OpenerDirector) + assert isinstance(ydl._opener, urllib.request.OpenerDirector) if isinstance(handler, (type, tuple)): find_cp = lambda x: isinstance(x, handler) else: @@ -99,20 +98,20 @@ def remove_opener(ydl, handler): opener.handlers[:] = [x for x in opener.handlers if not find_cp(x)] -class AbemaLicenseHandler(compat_urllib_request.BaseHandler): +class AbemaLicenseHandler(urllib.request.BaseHandler): handler_order = 499 STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz' HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E' def __init__(self, ie: 'AbemaTVIE'): - # the protcol that this should really handle is 'abematv-license://' + # the protocol that this should really handle is 'abematv-license://' # abematv_license_open is just a placeholder for development purposes # ref. 
https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/urllib/request.py#L510 setattr(self, 'abematv-license_open', getattr(self, 'abematv_license_open')) self.ie = ie def _get_videokey_from_ticket(self, ticket): - to_show = self.ie._downloader.params.get('verbose', False) + to_show = self.ie.get_param('verbose', False) media_token = self.ie._get_media_token(to_show=to_show) license_response = self.ie._download_json( @@ -126,11 +125,11 @@ class AbemaLicenseHandler(compat_urllib_request.BaseHandler): 'Content-Type': 'application/json', }) - res = decode_base(license_response['k'], self.STRTABLE) + res = decode_base_n(license_response['k'], table=self.STRTABLE) encvideokey = bytes_to_intlist(struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff)) h = hmac.new( - unhexlify(self.HKEY), + binascii.unhexlify(self.HKEY), (license_response['cid'] + self.ie._DEVICE_ID).encode('utf-8'), digestmod=hashlib.sha256) enckey = bytes_to_intlist(h.digest()) @@ -139,84 +138,22 @@ class AbemaLicenseHandler(compat_urllib_request.BaseHandler): def abematv_license_open(self, url): url = request_to_url(url) - ticket = compat_urllib_parse_urlparse(url).netloc + ticket = urllib.parse.urlparse(url).netloc response_data = self._get_videokey_from_ticket(ticket) - return compat_urllib_response.addinfourl(io.BytesIO(response_data), headers={ + return urllib.response.addinfourl(io.BytesIO(response_data), headers={ 'Content-Length': len(response_data), }, url=url, code=200) class AbemaTVBaseIE(InfoExtractor): - def _extract_breadcrumb_list(self, webpage, video_id): - for jld in re.finditer( - r'(?is)]+type=(["\']?)application/ld\+json\1[^>]*>(?P.+?)', - webpage): - jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False) - if jsonld: - if jsonld.get('@type') != 'BreadcrumbList': - continue - trav = traverse_obj(jsonld, ('itemListElement', ..., 'name')) - if trav: - return trav - return [] - - -class AbemaTVIE(AbemaTVBaseIE): - _VALID_URL = 
r'https?://abema\.tv/(?Pnow-on-air|video/episode|channels/.+?/slots)/(?P[^?/]+)' - _NETRC_MACHINE = 'abematv' - _TESTS = [{ - 'url': 'https://abema.tv/video/episode/194-25_s2_p1', - 'info_dict': { - 'id': '194-25_s2_p1', - 'title': '第1話 「チーズケーキ」 「モーニング再び」', - 'series': '異世界食堂2', - 'series_number': 2, - 'episode': '第1話 「チーズケーキ」 「モーニング再び」', - 'episode_number': 1, - }, - 'skip': 'expired', - }, { - 'url': 'https://abema.tv/channels/anime-live2/slots/E8tvAnMJ7a9a5d', - 'info_dict': { - 'id': 'E8tvAnMJ7a9a5d', - 'title': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】', - 'series': 'ゆるキャン△ SEASON2', - 'episode': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】', - 'series_number': 2, - 'episode_number': 1, - 'description': 'md5:9c5a3172ae763278f9303922f0ea5b17', - }, - 'skip': 'expired', - }, { - 'url': 'https://abema.tv/video/episode/87-877_s1282_p31047', - 'info_dict': { - 'id': 'E8tvAnMJ7a9a5d', - 'title': '第5話『光射す』', - 'description': 'md5:56d4fc1b4f7769ded5f923c55bb4695d', - 'thumbnail': r're:https://hayabusa\.io/.+', - 'series': '相棒', - 'episode': '第5話『光射す』', - }, - 'skip': 'expired', - }, { - 'url': 'https://abema.tv/now-on-air/abema-anime', - 'info_dict': { - 'id': 'abema-anime', - # this varies - # 'title': '女子高生の無駄づかい 全話一挙【無料ビデオ72時間】', - 'description': 'md5:55f2e61f46a17e9230802d7bcc913d5f', - 'is_live': True, - }, - 'skip': 'Not supported until hypervideo implements native live downloader OR AbemaTV can start a local HTTP server', - }] _USERTOKEN = None _DEVICE_ID = None - _TIMETABLE = None _MEDIATOKEN = None _SECRETKEY = b'v+Gjs=25Aw5erR!J8ZuvRrCx*rGswhB&qdHd_SYerEWdU&a?3DzN9BRbp5KwY4hEmcj5#fykMjJ=AuWz5GSMY-d@H7DMEh3M@9n2G552Us$$k9cD=3TxwWe86!x#Zyhe' - def _generate_aks(self, deviceid): + @classmethod + def _generate_aks(cls, deviceid): deviceid = deviceid.encode('utf-8') # add 1 hour and then drop minute and secs ts_1hour = int((time_seconds(hours=9) // 3600 + 1) * 3600) @@ -227,7 +164,7 @@ class AbemaTVIE(AbemaTVBaseIE): def mix_once(nonce): nonlocal tmp - h = hmac.new(self._SECRETKEY, 
digestmod=hashlib.sha256) + h = hmac.new(cls._SECRETKEY, digestmod=hashlib.sha256) h.update(nonce) tmp = h.digest() @@ -238,22 +175,22 @@ class AbemaTVIE(AbemaTVBaseIE): def mix_twist(nonce): nonlocal tmp - mix_once(urlsafe_b64encode(tmp).rstrip(b'=') + nonce) + mix_once(base64.urlsafe_b64encode(tmp).rstrip(b'=') + nonce) - mix_once(self._SECRETKEY) + mix_once(cls._SECRETKEY) mix_tmp(time_struct.tm_mon) mix_twist(deviceid) mix_tmp(time_struct.tm_mday % 5) mix_twist(ts_1hour_str) mix_tmp(time_struct.tm_hour % 5) - return urlsafe_b64encode(tmp).rstrip(b'=').decode('utf-8') + return base64.urlsafe_b64encode(tmp).rstrip(b'=').decode('utf-8') def _get_device_token(self): if self._USERTOKEN: return self._USERTOKEN - self._DEVICE_ID = random_uuidv4() + AbemaTVBaseIE._DEVICE_ID = str(uuid.uuid4()) aks = self._generate_aks(self._DEVICE_ID) user_data = self._download_json( 'https://api.abema.io/v1/users', None, note='Authorizing', @@ -264,7 +201,7 @@ class AbemaTVIE(AbemaTVBaseIE): headers={ 'Content-Type': 'application/json', }) - self._USERTOKEN = user_data['token'] + AbemaTVBaseIE._USERTOKEN = user_data['token'] # don't allow adding it 2 times or more, though it's guarded remove_opener(self._downloader, AbemaLicenseHandler) @@ -276,7 +213,7 @@ class AbemaTVIE(AbemaTVBaseIE): if not invalidate and self._MEDIATOKEN: return self._MEDIATOKEN - self._MEDIATOKEN = self._download_json( + AbemaTVBaseIE._MEDIATOKEN = self._download_json( 'https://api.abema.io/v1/media/token', None, note='Fetching media token' if to_show else False, query={ 'osName': 'android', @@ -286,11 +223,82 @@ class AbemaTVIE(AbemaTVBaseIE): 'appId': 'tv.abema', 'appVersion': '3.27.1' }, headers={ - 'Authorization': 'bearer ' + self._get_device_token() + 'Authorization': f'bearer {self._get_device_token()}', })['token'] return self._MEDIATOKEN + def _call_api(self, endpoint, video_id, query=None, note='Downloading JSON metadata'): + return self._download_json( + f'https://api.abema.io/{endpoint}', video_id, 
query=query or {}, + note=note, + headers={ + 'Authorization': f'bearer {self._get_device_token()}', + }) + + def _extract_breadcrumb_list(self, webpage, video_id): + for jld in re.finditer( + r'(?is)]+type=(["\']?)application/ld\+json\1[^>]*>(?P.+?)', + webpage): + jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False) + if traverse_obj(jsonld, '@type') != 'BreadcrumbList': + continue + items = traverse_obj(jsonld, ('itemListElement', ..., 'name')) + if items: + return items + return [] + + +class AbemaTVIE(AbemaTVBaseIE): + _VALID_URL = r'https?://abema\.tv/(?Pnow-on-air|video/episode|channels/.+?/slots)/(?P[^?/]+)' + _NETRC_MACHINE = 'abematv' + _TESTS = [{ + 'url': 'https://abema.tv/video/episode/194-25_s2_p1', + 'info_dict': { + 'id': '194-25_s2_p1', + 'title': '第1話 「チーズケーキ」 「モーニング再び」', + 'series': '異世界食堂2', + 'series_number': 2, + 'episode': '第1話 「チーズケーキ」 「モーニング再び」', + 'episode_number': 1, + }, + 'skip': 'expired', + }, { + 'url': 'https://abema.tv/channels/anime-live2/slots/E8tvAnMJ7a9a5d', + 'info_dict': { + 'id': 'E8tvAnMJ7a9a5d', + 'title': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】', + 'series': 'ゆるキャン△ SEASON2', + 'episode': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】', + 'series_number': 2, + 'episode_number': 1, + 'description': 'md5:9c5a3172ae763278f9303922f0ea5b17', + }, + 'skip': 'expired', + }, { + 'url': 'https://abema.tv/video/episode/87-877_s1282_p31047', + 'info_dict': { + 'id': 'E8tvAnMJ7a9a5d', + 'title': '第5話『光射す』', + 'description': 'md5:56d4fc1b4f7769ded5f923c55bb4695d', + 'thumbnail': r're:https://hayabusa\.io/.+', + 'series': '相棒', + 'episode': '第5話『光射す』', + }, + 'skip': 'expired', + }, { + 'url': 'https://abema.tv/now-on-air/abema-anime', + 'info_dict': { + 'id': 'abema-anime', + # this varies + # 'title': '女子高生の無駄づかい 全話一挙【無料ビデオ72時間】', + 'description': 'md5:55f2e61f46a17e9230802d7bcc913d5f', + 'is_live': True, + }, + 'skip': 'Not supported until hypervideo implements native live downloader OR AbemaTV can start a local HTTP server', + }] + 
_TIMETABLE = None + def _perform_login(self, username, password): if '@' in username: # don't strictly check if it's email address or not ep, method = 'user/email', 'email' @@ -303,18 +311,18 @@ class AbemaTVIE(AbemaTVBaseIE): method: username, 'password': password }).encode('utf-8'), headers={ - 'Authorization': 'bearer ' + self._get_device_token(), + 'Authorization': f'bearer {self._get_device_token()}', 'Origin': 'https://abema.tv', 'Referer': 'https://abema.tv/', 'Content-Type': 'application/json', }) - self._USERTOKEN = login_response['token'] + AbemaTVBaseIE._USERTOKEN = login_response['token'] self._get_media_token(True) def _real_extract(self, url): # starting download using infojson from this extractor is undefined behavior, - # and never be fixed in the future; you must trigger downloads by directly specifing URL. + # and never be fixed in the future; you must trigger downloads by directly specifying URL. # (unless there's a way to hook before downloading by extractor) video_id, video_type = self._match_valid_url(url).group('id', 'type') headers = { @@ -357,7 +365,7 @@ class AbemaTVIE(AbemaTVBaseIE): # read breadcrumb on top of page breadcrumb = self._extract_breadcrumb_list(webpage, video_id) if breadcrumb: - # breadcrumb list translates to: (example is 1st test for this IE) + # breadcrumb list translates to: (e.g. 
1st test for this IE) # Home > Anime (genre) > Isekai Shokudo 2 (series name) > Episode 1 "Cheese cakes" "Morning again" (episode title) # hence this works info['series'] = breadcrumb[-2] @@ -444,6 +452,7 @@ class AbemaTVIE(AbemaTVBaseIE): class AbemaTVTitleIE(AbemaTVBaseIE): _VALID_URL = r'https?://abema\.tv/video/title/(?P[^?/]+)' + _PAGE_SIZE = 25 _TESTS = [{ 'url': 'https://abema.tv/video/title/90-1597', @@ -459,18 +468,39 @@ class AbemaTVTitleIE(AbemaTVBaseIE): 'title': '真心が届く~僕とスターのオフィス・ラブ!?~', }, 'playlist_mincount': 16, + }, { + 'url': 'https://abema.tv/video/title/25-102', + 'info_dict': { + 'id': '25-102', + 'title': 'ソードアート・オンライン アリシゼーション', + }, + 'playlist_mincount': 24, }] - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + def _fetch_page(self, playlist_id, series_version, page): + programs = self._call_api( + f'v1/video/series/{playlist_id}/programs', playlist_id, + note=f'Downloading page {page + 1}', + query={ + 'seriesVersion': series_version, + 'offset': str(page * self._PAGE_SIZE), + 'order': 'seq', + 'limit': str(self._PAGE_SIZE), + }) + yield from ( + self.url_result(f'https://abema.tv/video/episode/{x}') + for x in traverse_obj(programs, ('programs', ..., 'id'), default=[])) - playlist_title, breadcrumb = None, self._extract_breadcrumb_list(webpage, video_id) - if breadcrumb: - playlist_title = breadcrumb[-1] + def _entries(self, playlist_id, series_version): + return OnDemandPagedList( + functools.partial(self._fetch_page, playlist_id, series_version), + self._PAGE_SIZE) - playlist = [ - self.url_result(urljoin('https://abema.tv/', mobj.group(1))) - for mobj in re.finditer(r'[_\d]+)' + + _TESTS = [{ + 'url': 'https://www.acfun.cn/v/ac35457073', + 'info_dict': { + 'id': '35457073', + 'ext': 'mp4', + 'duration': 174.208, + 'timestamp': 1656403967, + 'title': '1 8 岁 现 状', + 'description': '“赶紧回去!班主任查班了!”', + 'uploader': '锤子game', + 'uploader_id': '51246077', + 'thumbnail': 
r're:^https?://.*\.(jpg|jpeg)', + 'upload_date': '20220628', + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'tags': list, + }, + }, { + # example for len(video_list) > 1 + 'url': 'https://www.acfun.cn/v/ac35468952_2', + 'info_dict': { + 'id': '35468952_2', + 'ext': 'mp4', + 'title': '【动画剧集】Rocket & Groot Season 1(2022)/火箭浣熊与格鲁特第1季 P02 S01E02 十拿九穩', + 'duration': 90.459, + 'uploader': '比令', + 'uploader_id': '37259967', + 'upload_date': '20220629', + 'timestamp': 1656479962, + 'tags': list, + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg)', + 'description': 'md5:67583aaf3a0f933bd606bc8a2d3ebb17', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + json_all = self._search_json(r'window.videoInfo\s*=', webpage, 'videoInfo', video_id) + + title = json_all.get('title') + video_list = json_all.get('videoList') or [] + video_internal_id = traverse_obj(json_all, ('currentVideoInfo', 'id')) + if video_internal_id and len(video_list) > 1: + part_idx, part_video_info = next( + (idx + 1, v) for (idx, v) in enumerate(video_list) + if v['id'] == video_internal_id) + title = f'{title} P{part_idx:02d} {part_video_info["title"]}' + + return { + **self._extract_metadata(video_id, json_all['currentVideoInfo']), + 'title': title, + 'thumbnail': json_all.get('coverUrl'), + 'description': json_all.get('description'), + 'uploader': traverse_obj(json_all, ('user', 'name')), + 'uploader_id': traverse_obj(json_all, ('user', 'href')), + 'tags': traverse_obj(json_all, ('tagList', ..., 'name')), + 'view_count': int_or_none(json_all.get('viewCount')), + 'like_count': int_or_none(json_all.get('likeCountShow')), + 'comment_count': int_or_none(json_all.get('commentCountShow')), + } + + +class AcFunBangumiIE(AcFunVideoBaseIE): + _VALID_URL = r'https?://www\.acfun\.cn/bangumi/(?Paa[_\d]+)' + + _TESTS = [{ + 'url': 
'https://www.acfun.cn/bangumi/aa6002917_36188_1745457?ac=2', + 'info_dict': { + 'id': 'aa6002917_36188_1745457__2', + 'ext': 'mp4', + 'title': '【7月】租借女友 水原千鹤角色曲『DATE』特别PV', + 'upload_date': '20200916', + 'timestamp': 1600243813, + 'duration': 92.091, + }, + }, { + 'url': 'https://www.acfun.cn/bangumi/aa5023171_36188_1750645', + 'info_dict': { + 'id': 'aa5023171_36188_1750645', + 'ext': 'mp4', + 'title': '红孩儿之趴趴蛙寻石记 第5话 ', + 'duration': 760.0, + 'season': '红孩儿之趴趴蛙寻石记', + 'season_id': 5023171, + 'season_number': 1, # series has only 1 season + 'episode': 'Episode 5', + 'episode_number': 5, + 'upload_date': '20181223', + 'timestamp': 1545552185, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)', + 'comment_count': int, + }, + }, { + 'url': 'https://www.acfun.cn/bangumi/aa6065485_36188_1885061', + 'info_dict': { + 'id': 'aa6065485_36188_1885061', + 'ext': 'mp4', + 'title': '叽歪老表(第二季) 第5话 坚不可摧', + 'season': '叽歪老表(第二季)', + 'season_number': 2, + 'season_id': 6065485, + 'episode': '坚不可摧', + 'episode_number': 5, + 'upload_date': '20220324', + 'timestamp': 1648082786, + 'duration': 105.002, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)', + 'comment_count': int, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + ac_idx = parse_qs(url).get('ac', [None])[-1] + video_id = f'{video_id}{format_field(ac_idx, None, "__%s")}' + + webpage = self._download_webpage(url, video_id) + json_bangumi_data = self._search_json(r'window.bangumiData\s*=', webpage, 'bangumiData', video_id) + + if ac_idx: + video_info = json_bangumi_data['hlVideoInfo'] + return { + **self._extract_metadata(video_id, video_info), + 'title': video_info.get('title'), + } + + video_info = json_bangumi_data['currentVideoInfo'] + + season_id = json_bangumi_data.get('bangumiId') + season_number = season_id and next(( + idx for idx, v in enumerate(json_bangumi_data.get('relatedBangumis') or [], 1) + if v.get('id') == season_id), 1) + + json_bangumi_list = self._search_json( + 
r'window\.bangumiList\s*=', webpage, 'bangumiList', video_id, fatal=False) + video_internal_id = int_or_none(traverse_obj(json_bangumi_data, ('currentVideoInfo', 'id'))) + episode_number = video_internal_id and next(( + idx for idx, v in enumerate(json_bangumi_list.get('items') or [], 1) + if v.get('videoId') == video_internal_id), None) + + return { + **self._extract_metadata(video_id, video_info), + 'title': json_bangumi_data.get('showTitle'), + 'thumbnail': json_bangumi_data.get('image'), + 'season': json_bangumi_data.get('bangumiTitle'), + 'season_id': season_id, + 'season_number': season_number, + 'episode': json_bangumi_data.get('title'), + 'episode_number': episode_number, + 'comment_count': int_or_none(json_bangumi_data.get('commentCount')), + } diff --git a/hypervideo_dl/extractor/adn.py b/hypervideo_dl/extractor/adn.py index fca6e60..e0c18c8 100644 --- a/hypervideo_dl/extractor/adn.py +++ b/hypervideo_dl/extractor/adn.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 import binascii import json @@ -31,30 +28,34 @@ from ..utils import ( class ADNIE(InfoExtractor): - IE_DESC = 'Anime Digital Network' - _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P\d+)' - _TEST = { - 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', - 'md5': '0319c99885ff5547565cacb4f3f9348d', + IE_DESC = 'Animation Digital Network' + _VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.fr/video/[^/]+/(?P\d+)' + _TESTS = [{ + 'url': 'https://animationdigitalnetwork.fr/video/fruits-basket/9841-episode-1-a-ce-soir', + 'md5': '1c9ef066ceb302c86f80c2b371615261', 'info_dict': { - 'id': '7778', + 'id': '9841', 'ext': 'mp4', - 'title': 'Blue Exorcist - Kyôto Saga - Episode 1', - 'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5', - 'series': 'Blue Exorcist - Kyôto Saga', - 'duration': 1467, - 'release_date': '20170106', + 'title': 'Fruits Basket - 
Episode 1', + 'description': 'md5:14be2f72c3c96809b0ca424b0097d336', + 'series': 'Fruits Basket', + 'duration': 1437, + 'release_date': '20190405', 'comment_count': int, 'average_rating': float, - 'season_number': 2, - 'episode': 'Début des hostilités', + 'season_number': 1, + 'episode': 'À ce soir !', 'episode_number': 1, - } - } + }, + 'skip': 'Only available in region (FR, ...)', + }, { + 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', + 'only_matching': True, + }] - _NETRC_MACHINE = 'animedigitalnetwork' - _BASE_URL = 'http://animedigitalnetwork.fr' - _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/' + _NETRC_MACHINE = 'animationdigitalnetwork' + _BASE = 'animationdigitalnetwork.fr' + _API_BASE_URL = 'https://gw.api.' + _BASE + '/' _PLAYER_BASE_URL = _API_BASE_URL + 'player/' _HEADERS = {} _LOGIN_ERR_MESSAGE = 'Unable to log in' @@ -78,14 +79,14 @@ class ADNIE(InfoExtractor): if subtitle_location: enc_subtitles = self._download_webpage( subtitle_location, video_id, 'Downloading subtitles data', - fatal=False, headers={'Origin': 'https://animedigitalnetwork.fr'}) + fatal=False, headers={'Origin': 'https://' + self._BASE}) if not enc_subtitles: return None - # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js + # http://animationdigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = unpad_pkcs7(aes_cbc_decrypt_bytes( compat_b64decode(enc_subtitles[24:]), - binascii.unhexlify(self._K + 'ab9f52f5baae7c72'), + binascii.unhexlify(self._K + '7fac1178830cfe0c'), compat_b64decode(enc_subtitles[:24]))) subtitles_json = self._parse_json(dec_subtitles.decode(), None, fatal=False) if not subtitles_json: @@ -234,7 +235,6 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' for f in m3u8_formats: f['language'] = 'fr' formats.extend(m3u8_formats) - self._sort_formats(formats) video = (self._download_json( self._API_BASE_URL + 
'video/%s' % video_id, video_id, diff --git a/hypervideo_dl/extractor/adobeconnect.py b/hypervideo_dl/extractor/adobeconnect.py index e2e6f93..8963b12 100644 --- a/hypervideo_dl/extractor/adobeconnect.py +++ b/hypervideo_dl/extractor/adobeconnect.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_parse_qs, diff --git a/hypervideo_dl/extractor/adobepass.py b/hypervideo_dl/extractor/adobepass.py index 5d98301..e5944f7 100644 --- a/hypervideo_dl/extractor/adobepass.py +++ b/hypervideo_dl/extractor/adobepass.py @@ -1,26 +1,20 @@ -# coding: utf-8 -from __future__ import unicode_literals - +import getpass import json import re import time +import urllib.error import xml.etree.ElementTree as etree from .common import InfoExtractor -from ..compat import ( - compat_kwargs, - compat_urlparse, - compat_getpass -) +from ..compat import compat_urlparse from ..utils import ( + NO_DEFAULT, + ExtractorError, unescapeHTML, - urlencode_postdata, unified_timestamp, - ExtractorError, - NO_DEFAULT, + urlencode_postdata, ) - MSO_INFO = { 'DTV': { 'name': 'DIRECTV', @@ -1350,10 +1344,15 @@ MSO_INFO = { 'username_field': 'username', 'password_field': 'password', }, + 'AlticeOne': { + 'name': 'Optimum TV', + 'username_field': 'j_username', + 'password_field': 'j_password', + }, } -class AdobePassIE(InfoExtractor): +class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' _MVPD_CACHE = 'ap-mvpd' @@ -1365,7 +1364,7 @@ class AdobePassIE(InfoExtractor): headers.update(kwargs.get('headers', {})) kwargs['headers'] = headers return super(AdobePassIE, self)._download_webpage_handle( - *args, **compat_kwargs(kwargs)) + *args, **kwargs) @staticmethod def _get_mvpd_resource(provider_id, title, guid, 
rating): @@ -1434,32 +1433,34 @@ class AdobePassIE(InfoExtractor): guid = xml_text(resource, 'guid') if '<' in resource else resource count = 0 while count < 2: - requestor_info = self._downloader.cache.load(self._MVPD_CACHE, requestor_id) or {} + requestor_info = self.cache.load(self._MVPD_CACHE, requestor_id) or {} authn_token = requestor_info.get('authn_token') if authn_token and is_expired(authn_token, 'simpleTokenExpires'): authn_token = None if not authn_token: - # TODO add support for other TV Providers mso_id = self.get_param('ap_mso') - if not mso_id: - raise_mvpd_required() - username, password = self._get_login_info('ap_username', 'ap_password', mso_id) - if not username or not password: - raise_mvpd_required() - mso_info = MSO_INFO[mso_id] + if mso_id: + username, password = self._get_login_info('ap_username', 'ap_password', mso_id) + if not username or not password: + raise_mvpd_required() + mso_info = MSO_INFO[mso_id] - provider_redirect_page_res = self._download_webpage_handle( - self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, - 'Downloading Provider Redirect Page', query={ - 'noflash': 'true', - 'mso_id': mso_id, - 'requestor_id': requestor_id, - 'no_iframe': 'false', - 'domain_name': 'adobe.com', - 'redirect_url': url, - }) + provider_redirect_page_res = self._download_webpage_handle( + self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, + 'Downloading Provider Redirect Page', query={ + 'noflash': 'true', + 'mso_id': mso_id, + 'requestor_id': requestor_id, + 'no_iframe': 'false', + 'domain_name': 'adobe.com', + 'redirect_url': url, + }) + elif not self._cookies_passed: + raise_mvpd_required() - if mso_id == 'Comcast_SSO': + if not mso_id: + pass + elif mso_id == 'Comcast_SSO': # Comcast page flow varies by video site and whether you # are on Comcast's network. 
provider_redirect_page, urlh = provider_redirect_page_res @@ -1507,7 +1508,7 @@ class AdobePassIE(InfoExtractor): 'send_confirm_link': False, 'send_token': True })) - philo_code = compat_getpass('Type auth code you have received [Return]: ') + philo_code = getpass.getpass('Type auth code you have received [Return]: ') self._download_webpage( 'https://idp.philo.com/auth/update/login_code', video_id, 'Submitting token', data=urlencode_postdata({ 'token': philo_code @@ -1709,25 +1710,30 @@ class AdobePassIE(InfoExtractor): mso_info.get('username_field', 'username'): username, mso_info.get('password_field', 'password'): password } - if mso_id == 'Cablevision': + if mso_id in ('Cablevision', 'AlticeOne'): form_data['_eventId_proceed'] = '' mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', form_data) if mso_id != 'Rogers': post_form(mvpd_confirm_page_res, 'Confirming Login') - session = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, - 'Retrieving Session', data=urlencode_postdata({ - '_method': 'GET', - 'requestor_id': requestor_id, - }), headers=mvpd_headers) + try: + session = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, + 'Retrieving Session', data=urlencode_postdata({ + '_method': 'GET', + 'requestor_id': requestor_id, + }), headers=mvpd_headers) + except ExtractorError as e: + if not mso_id and isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: + raise_mvpd_required() + raise if '\d+)' + _EMBED_REGEX = [r']+src=[\'"](?P(?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]'] _TEST = { # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners @@ -270,7 +268,6 @@ class AdobeTVVideoIE(AdobeTVBaseIE): 'width': int_or_none(source.get('width') or None), 'url': source_src, }) - self._sort_formats(formats) # For both metadata and downloaded files the duration varies among # formats. 
I just pick the max one diff --git a/hypervideo_dl/extractor/adultswim.py b/hypervideo_dl/extractor/adultswim.py index c97cfc1..bd29eb4 100644 --- a/hypervideo_dl/extractor/adultswim.py +++ b/hypervideo_dl/extractor/adultswim.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .turner import TurnerBaseIE @@ -183,7 +180,6 @@ class AdultSwimIE(TurnerBaseIE): info['subtitles'].setdefault('en', []).append({ 'url': asset_url, }) - self._sort_formats(info['formats']) return info else: diff --git a/hypervideo_dl/extractor/aenetworks.py b/hypervideo_dl/extractor/aenetworks.py index 8025de5..d7c4010 100644 --- a/hypervideo_dl/extractor/aenetworks.py +++ b/hypervideo_dl/extractor/aenetworks.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .theplatform import ThePlatformIE from ..utils import ( ExtractorError, @@ -12,7 +8,7 @@ from ..utils import ( ) -class AENetworksBaseIE(ThePlatformIE): +class AENetworksBaseIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _BASE_URL_REGEX = r'''(?x)https?:// (?:(?:www|play|watch)\.)? 
(?P @@ -32,14 +28,17 @@ class AENetworksBaseIE(ThePlatformIE): } def _extract_aen_smil(self, smil_url, video_id, auth=None): - query = {'mbr': 'true'} + query = { + 'mbr': 'true', + 'formats': 'M3U+none,MPEG-DASH+none,MPEG4,MP3', + } if auth: query['auth'] = auth TP_SMIL_QUERY = [{ 'assetTypes': 'high_video_ak', - 'switch': 'hls_high_ak' + 'switch': 'hls_high_ak', }, { - 'assetTypes': 'high_video_s3' + 'assetTypes': 'high_video_s3', }, { 'assetTypes': 'high_video_s3', 'switch': 'hls_high_fastly', @@ -63,7 +62,6 @@ class AENetworksBaseIE(ThePlatformIE): subtitles = self._merge_subtitles(subtitles, tp_subtitles) if last_e and not formats: raise last_e - self._sort_formats(formats) return { 'id': video_id, 'formats': formats, @@ -305,7 +303,6 @@ class HistoryTopicIE(AENetworksBaseIE): class HistoryPlayerIE(AENetworksBaseIE): IE_NAME = 'history:player' _VALID_URL = r'https?://(?:www\.)?(?P(?:history|biography)\.com)/player/(?P\d+)' - _TESTS = [] def _real_extract(self, url): domain, video_id = self._match_valid_url(url).groups() diff --git a/hypervideo_dl/extractor/aeonco.py b/hypervideo_dl/extractor/aeonco.py new file mode 100644 index 0000000..4655862 --- /dev/null +++ b/hypervideo_dl/extractor/aeonco.py @@ -0,0 +1,40 @@ +from .common import InfoExtractor +from .vimeo import VimeoIE + + +class AeonCoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?aeon\.co/videos/(?P[^/?]+)' + _TESTS = [{ + 'url': 'https://aeon.co/videos/raw-solar-storm-footage-is-the-punk-rock-antidote-to-sleek-james-webb-imagery', + 'md5': 'e5884d80552c9b6ea8d268a258753362', + 'info_dict': { + 'id': '1284717', + 'ext': 'mp4', + 'title': 'Brilliant Noise', + 'thumbnail': 'https://i.vimeocdn.com/video/21006315-1a1e49da8b07fd908384a982b4ba9ff0268c509a474576ebdf7b1392f4acae3b-d_960', + 'uploader': 'Semiconductor', + 'uploader_id': 'semiconductor', + 'uploader_url': 'https://vimeo.com/semiconductor', + 'duration': 348 + } + }, { + 'url': 
'https://aeon.co/videos/dazzling-timelapse-shows-how-microbes-spoil-our-food-and-sometimes-enrich-it', + 'md5': '4e5f3dad9dbda0dbfa2da41a851e631e', + 'info_dict': { + 'id': '728595228', + 'ext': 'mp4', + 'title': 'Wrought', + 'thumbnail': 'https://i.vimeocdn.com/video/1484618528-c91452611f9a4e4497735a533da60d45b2fe472deb0c880f0afaab0cd2efb22a-d_1280', + 'uploader': 'Biofilm Productions', + 'uploader_id': 'user140352216', + 'uploader_url': 'https://vimeo.com/user140352216', + 'duration': 1344 + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + vimeo_id = self._search_regex(r'hosterId":\s*"(?P[0-9]+)', webpage, 'vimeo id') + vimeo_url = VimeoIE._smuggle_referrer(f'https://player.vimeo.com/video/{vimeo_id}', 'https://aeon.co') + return self.url_result(vimeo_url, VimeoIE) diff --git a/hypervideo_dl/extractor/afreecatv.py b/hypervideo_dl/extractor/afreecatv.py index 77f0e3c..9276fe7 100644 --- a/hypervideo_dl/extractor/afreecatv.py +++ b/hypervideo_dl/extractor/afreecatv.py @@ -1,14 +1,12 @@ -# coding: utf-8 -from __future__ import unicode_literals - +import functools import re from .common import InfoExtractor -from ..compat import compat_xpath from ..utils import ( + ExtractorError, + OnDemandPagedList, date_from_str, determine_ext, - ExtractorError, int_or_none, qualities, traverse_obj, @@ -280,7 +278,7 @@ class AfreecaTVIE(InfoExtractor): else: raise ExtractorError('Unable to download video info') - video_element = video_xml.findall(compat_xpath('./track/video'))[-1] + video_element = video_xml.findall('./track/video')[-1] if video_element is None or video_element.text is None: raise ExtractorError( 'Video %s does not exist' % video_id, expected=True) @@ -310,7 +308,7 @@ class AfreecaTVIE(InfoExtractor): if not video_url: entries = [] - file_elements = video_element.findall(compat_xpath('./file')) + file_elements = video_element.findall('./file') one = len(file_elements) == 1 for file_num, 
file_element in enumerate(file_elements, start=1): file_url = url_or_none(file_element.text) @@ -340,7 +338,6 @@ class AfreecaTVIE(InfoExtractor): }] if not formats and not self.get_param('ignore_no_formats'): continue - self._sort_formats(formats) file_info = common_entry.copy() file_info.update({ 'id': format_id, @@ -382,7 +379,7 @@ class AfreecaTVIE(InfoExtractor): return info -class AfreecaTVLiveIE(AfreecaTVIE): +class AfreecaTVLiveIE(AfreecaTVIE): # XXX: Do not subclass from concrete IE IE_NAME = 'afreecatv:live' _VALID_URL = r'https?://play\.afreeca(?:tv)?\.com/(?P[^/]+)(?:/(?P\d+))?' @@ -466,8 +463,6 @@ class AfreecaTVLiveIE(AfreecaTVIE): 'quality': quality_key(quality_str), }) - self._sort_formats(formats) - station_info = self._download_json( 'https://st.afreecatv.com/api/get_station_status.php', broadcast_no, query={'szBjId': broadcaster_id}, fatal=False, @@ -482,3 +477,57 @@ class AfreecaTVLiveIE(AfreecaTVIE): 'formats': formats, 'is_live': True, } + + +class AfreecaTVUserIE(InfoExtractor): + IE_NAME = 'afreecatv:user' + _VALID_URL = r'https?://bj\.afreeca(?:tv)?\.com/(?P[^/]+)/vods/?(?P[^/]+)?' 
+ _TESTS = [{ + 'url': 'https://bj.afreecatv.com/ryuryu24/vods/review', + 'info_dict': { + '_type': 'playlist', + 'id': 'ryuryu24', + 'title': 'ryuryu24 - review', + }, + 'playlist_count': 218, + }, { + 'url': 'https://bj.afreecatv.com/parang1995/vods/highlight', + 'info_dict': { + '_type': 'playlist', + 'id': 'parang1995', + 'title': 'parang1995 - highlight', + }, + 'playlist_count': 997, + }, { + 'url': 'https://bj.afreecatv.com/ryuryu24/vods', + 'info_dict': { + '_type': 'playlist', + 'id': 'ryuryu24', + 'title': 'ryuryu24 - all', + }, + 'playlist_count': 221, + }, { + 'url': 'https://bj.afreecatv.com/ryuryu24/vods/balloonclip', + 'info_dict': { + '_type': 'playlist', + 'id': 'ryuryu24', + 'title': 'ryuryu24 - balloonclip', + }, + 'playlist_count': 0, + }] + _PER_PAGE = 60 + + def _fetch_page(self, user_id, user_type, page): + page += 1 + info = self._download_json(f'https://bjapi.afreecatv.com/api/{user_id}/vods/{user_type}', user_id, + query={'page': page, 'per_page': self._PER_PAGE, 'orderby': 'reg_date'}, + note=f'Downloading {user_type} video page {page}') + for item in info['data']: + yield self.url_result( + f'https://vod.afreecatv.com/player/{item["title_no"]}/', AfreecaTVIE, item['title_no']) + + def _real_extract(self, url): + user_id, user_type = self._match_valid_url(url).group('id', 'slug_type') + user_type = user_type or 'all' + entries = OnDemandPagedList(functools.partial(self._fetch_page, user_id, user_type), self._PER_PAGE) + return self.playlist_result(entries, user_id, f'{user_id} - {user_type}') diff --git a/hypervideo_dl/extractor/agora.py b/hypervideo_dl/extractor/agora.py new file mode 100644 index 0000000..abb2d3f --- /dev/null +++ b/hypervideo_dl/extractor/agora.py @@ -0,0 +1,251 @@ +import functools +import uuid + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + OnDemandPagedList, + int_or_none, + month_by_name, + parse_duration, + try_call, +) + + +class WyborczaVideoIE(InfoExtractor): + # this id is not 
an article id, it has to be extracted from the article + _VALID_URL = r'(?:wyborcza:video:|https?://wyborcza\.pl/(?:api-)?video/)(?P\d+)' + IE_NAME = 'wyborcza:video' + _TESTS = [{ + 'url': 'wyborcza:video:26207634', + 'info_dict': { + 'id': '26207634', + 'ext': 'mp4', + 'title': '- Polska w 2020 r. jest innym państwem niż w 2015 r. Nie zmieniła się konstytucja, ale jest to już inny ustrój - mówi Adam Bodnar', + 'description': ' ', + 'uploader': 'Dorota Roman', + 'duration': 2474, + 'thumbnail': r're:https://.+\.jpg', + }, + }, { + 'url': 'https://wyborcza.pl/video/26207634', + 'only_matching': True, + }, { + 'url': 'https://wyborcza.pl/api-video/26207634', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + meta = self._download_json(f'https://wyborcza.pl/api-video/{video_id}', video_id) + + formats = [] + base_url = meta['redirector'].replace('http://', 'https://') + meta['basePath'] + for quality in ('standard', 'high'): + if not meta['files'].get(quality): + continue + formats.append({ + 'url': base_url + meta['files'][quality], + 'height': int_or_none( + self._search_regex( + r'p(\d+)[a-z]+\.mp4$', meta['files'][quality], + 'mp4 video height', default=None)), + 'format_id': quality, + }) + if meta['files'].get('dash'): + formats.extend(self._extract_mpd_formats(base_url + meta['files']['dash'], video_id)) + + return { + 'id': video_id, + 'formats': formats, + 'title': meta.get('title'), + 'description': meta.get('lead'), + 'uploader': meta.get('signature'), + 'thumbnail': meta.get('imageUrl'), + 'duration': meta.get('duration'), + } + + +class WyborczaPodcastIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?:www\.)?(?: + wyborcza\.pl/podcast(?:/0,172673\.html)?| + wysokieobcasy\.pl/wysokie-obcasy/0,176631\.html + )(?:\?(?:[^&#]+?&)*podcast=(?P\d+))? 
+ ''' + _TESTS = [{ + 'url': 'https://wyborcza.pl/podcast/0,172673.html?podcast=100720#S.main_topic-K.C-B.6-L.1.podcast', + 'info_dict': { + 'id': '100720', + 'ext': 'mp3', + 'title': 'Cyfrodziewczyny. Kim były pionierki polskiej informatyki ', + 'uploader': 'Michał Nogaś ', + 'upload_date': '20210117', + 'description': 'md5:49f0a06ffc4c1931210d3ab1416a651d', + 'duration': 3684.0, + 'thumbnail': r're:https://.+\.jpg', + }, + }, { + 'url': 'https://www.wysokieobcasy.pl/wysokie-obcasy/0,176631.html?podcast=100673', + 'info_dict': { + 'id': '100673', + 'ext': 'mp3', + 'title': 'Czym jest ubóstwo menstruacyjne i dlaczego dotyczy każdej i każdego z nas?', + 'uploader': 'Agnieszka Urazińska ', + 'upload_date': '20210115', + 'description': 'md5:c161dc035f8dbb60077011fc41274899', + 'duration': 1803.0, + 'thumbnail': r're:https://.+\.jpg', + }, + }, { + 'url': 'https://wyborcza.pl/podcast', + 'info_dict': { + 'id': '334', + 'title': 'Gościnnie: Wyborcza, 8:10', + 'series': 'Gościnnie: Wyborcza, 8:10', + }, + 'playlist_mincount': 370, + }, { + 'url': 'https://www.wysokieobcasy.pl/wysokie-obcasy/0,176631.html', + 'info_dict': { + 'id': '395', + 'title': 'Gościnnie: Wysokie Obcasy', + 'series': 'Gościnnie: Wysokie Obcasy', + }, + 'playlist_mincount': 12, + }] + + def _real_extract(self, url): + podcast_id = self._match_id(url) + + if not podcast_id: # playlist + podcast_id = '395' if 'wysokieobcasy.pl/' in url else '334' + return self.url_result(TokFMAuditionIE._create_url(podcast_id), TokFMAuditionIE, podcast_id) + + meta = self._download_json('https://wyborcza.pl/api/podcast', podcast_id, + query={'guid': podcast_id, 'type': 'wo' if 'wysokieobcasy.pl/' in url else None}) + + day, month, year = self._search_regex(r'^(\d\d?) 
(\w+) (\d{4})$', meta.get('publishedDate'), + 'upload date', group=(1, 2, 3), default=(None, None, None)) + return { + 'id': podcast_id, + 'url': meta['url'], + 'title': meta.get('title'), + 'description': meta.get('description'), + 'thumbnail': meta.get('imageUrl'), + 'duration': parse_duration(meta.get('duration')), + 'uploader': meta.get('author'), + 'upload_date': try_call(lambda: f'{year}{month_by_name(month, lang="pl"):0>2}{day:0>2}'), + } + + +class TokFMPodcastIE(InfoExtractor): + _VALID_URL = r'(?:https?://audycje\.tokfm\.pl/podcast/|tokfm:podcast:)(?P\d+),?' + IE_NAME = 'tokfm:podcast' + _TESTS = [{ + 'url': 'https://audycje.tokfm.pl/podcast/91275,-Systemowy-rasizm-Czy-zamieszki-w-USA-po-morderstwie-w-Minneapolis-doprowadza-do-zmian-w-sluzbach-panstwowych', + 'info_dict': { + 'id': '91275', + 'ext': 'aac', + 'title': 'md5:a9b15488009065556900169fb8061cce', + 'episode': 'md5:a9b15488009065556900169fb8061cce', + 'series': 'Analizy', + }, + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + + # in case it breaks see this but it returns a lot of useless data + # https://api.podcast.radioagora.pl/api4/getPodcasts?podcast_id=100091&with_guests=true&with_leaders_for_mobile=true + metadata = self._download_json( + f'https://audycje.tokfm.pl/getp/3{media_id}', media_id, 'Downloading podcast metadata') + if not metadata: + raise ExtractorError('No such podcast', expected=True) + metadata = metadata[0] + + formats = [] + for ext in ('aac', 'mp3'): + url_data = self._download_json( + f'https://api.podcast.radioagora.pl/api4/getSongUrl?podcast_id={media_id}&device_id={uuid.uuid4()}&ppre=false&audio={ext}', + media_id, 'Downloading podcast %s URL' % ext) + # prevents inserting the mp3 (default) multiple times + if 'link_ssl' in url_data and f'.{ext}' in url_data['link_ssl']: + formats.append({ + 'url': url_data['link_ssl'], + 'ext': ext, + 'vcodec': 'none', + 'acodec': ext, + }) + + return { + 'id': media_id, + 'formats': formats, + 'title': 
metadata.get('podcast_name'), + 'series': metadata.get('series_name'), + 'episode': metadata.get('podcast_name'), + } + + +class TokFMAuditionIE(InfoExtractor): + _VALID_URL = r'(?:https?://audycje\.tokfm\.pl/audycja/|tokfm:audition:)(?P\d+),?' + IE_NAME = 'tokfm:audition' + _TESTS = [{ + 'url': 'https://audycje.tokfm.pl/audycja/218,Analizy', + 'info_dict': { + 'id': '218', + 'title': 'Analizy', + 'series': 'Analizy', + }, + 'playlist_count': 1635, + }] + + _PAGE_SIZE = 30 + _HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Linux; Android 9; Redmi 3S Build/PQ3A.190801.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/87.0.4280.101 Mobile Safari/537.36', + } + + @staticmethod + def _create_url(id): + return f'https://audycje.tokfm.pl/audycja/{id}' + + def _real_extract(self, url): + audition_id = self._match_id(url) + + data = self._download_json( + f'https://api.podcast.radioagora.pl/api4/getSeries?series_id={audition_id}', + audition_id, 'Downloading audition metadata', headers=self._HEADERS) + if not data: + raise ExtractorError('No such audition', expected=True) + data = data[0] + + entries = OnDemandPagedList(functools.partial( + self._fetch_page, audition_id, data), self._PAGE_SIZE) + + return { + '_type': 'playlist', + 'id': audition_id, + 'title': data.get('series_name'), + 'series': data.get('series_name'), + 'entries': entries, + } + + def _fetch_page(self, audition_id, data, page): + for retry in self.RetryManager(): + podcast_page = self._download_json( + f'https://api.podcast.radioagora.pl/api4/getPodcasts?series_id={audition_id}&limit=30&offset={page}&with_guests=true&with_leaders_for_mobile=true', + audition_id, f'Downloading podcast list page {page + 1}', headers=self._HEADERS) + if not podcast_page: + retry.error = ExtractorError('Agora returned empty page', expected=True) + + for podcast in podcast_page: + yield { + '_type': 'url_transparent', + 'url': podcast['podcast_sharing_url'], + 'ie_key': TokFMPodcastIE.ie_key(), + 'title': 
podcast.get('podcast_name'), + 'episode': podcast.get('podcast_name'), + 'description': podcast.get('podcast_description'), + 'timestamp': int_or_none(podcast.get('podcast_timestamp')), + 'series': data.get('series_name'), + } diff --git a/hypervideo_dl/extractor/airmozilla.py b/hypervideo_dl/extractor/airmozilla.py index 9e38136..669556b 100644 --- a/hypervideo_dl/extractor/airmozilla.py +++ b/hypervideo_dl/extractor/airmozilla.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/aliexpress.py b/hypervideo_dl/extractor/aliexpress.py index 9722fe9..2e83f2e 100644 --- a/hypervideo_dl/extractor/aliexpress.py +++ b/hypervideo_dl/extractor/aliexpress.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/aljazeera.py b/hypervideo_dl/extractor/aljazeera.py index 7bcdb7a..124bab0 100644 --- a/hypervideo_dl/extractor/aljazeera.py +++ b/hypervideo_dl/extractor/aljazeera.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/allocine.py b/hypervideo_dl/extractor/allocine.py index 403a277..2d342cf 100644 --- a/hypervideo_dl/extractor/allocine.py +++ b/hypervideo_dl/extractor/allocine.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -115,8 +112,6 @@ class AllocineIE(InfoExtractor): }) duration, view_count, timestamp = [None] * 3 - self._sort_formats(formats) - return { 'id': video_id, 'display_id': display_id, diff --git a/hypervideo_dl/extractor/alphaporno.py b/hypervideo_dl/extractor/alphaporno.py index 3a6d99f..8d5b472 100644 --- a/hypervideo_dl/extractor/alphaporno.py +++ 
b/hypervideo_dl/extractor/alphaporno.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( parse_iso8601, diff --git a/hypervideo_dl/extractor/alsace20tv.py b/hypervideo_dl/extractor/alsace20tv.py index 4aae6fe..ea3332e 100644 --- a/hypervideo_dl/extractor/alsace20tv.py +++ b/hypervideo_dl/extractor/alsace20tv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( clean_html, @@ -25,7 +22,6 @@ class Alsace20TVBaseIE(InfoExtractor): self._extract_smil_formats(fmt_url, video_id, fatal=False) if '/smil:_' in fmt_url else self._extract_mpd_formats(fmt_url, video_id, mpd_id=res, fatal=False)) - self._sort_formats(formats) webpage = (url and self._download_webpage(url, video_id, fatal=False)) or '' thumbnail = url_or_none(dict_get(info, ('image', 'preview', )) or self._og_search_thumbnail(webpage)) diff --git a/hypervideo_dl/extractor/alura.py b/hypervideo_dl/extractor/alura.py index d2e2df2..bfe066b 100644 --- a/hypervideo_dl/extractor/alura.py +++ b/hypervideo_dl/extractor/alura.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -66,8 +63,6 @@ class AluraIE(InfoExtractor): f['height'] = int('720' if m.group('res') == 'hd' else '480') formats.extend(video_format) - self._sort_formats(formats) - return { 'id': video_id, 'title': video_title, @@ -116,7 +111,7 @@ class AluraIE(InfoExtractor): raise ExtractorError('Unable to log in') -class AluraCourseIE(AluraIE): +class AluraCourseIE(AluraIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P[^/]+)' _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm' diff --git a/hypervideo_dl/extractor/amara.py b/hypervideo_dl/extractor/amara.py index 61d4695..5018710 100644 --- a/hypervideo_dl/extractor/amara.py +++ 
b/hypervideo_dl/extractor/amara.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from .youtube import YoutubeIE from .vimeo import VimeoIE diff --git a/hypervideo_dl/extractor/amazon.py b/hypervideo_dl/extractor/amazon.py index 07b1b18..4d31706 100644 --- a/hypervideo_dl/extractor/amazon.py +++ b/hypervideo_dl/extractor/amazon.py @@ -1,6 +1,5 @@ -# coding: utf-8 from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ExtractorError, int_or_none class AmazonStoreIE(InfoExtractor): @@ -10,7 +9,7 @@ class AmazonStoreIE(InfoExtractor): 'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/', 'info_dict': { 'id': 'B098XNCHLD', - 'title': 'md5:5f3194dbf75a8dcfc83079bd63a2abed', + 'title': 'md5:dae240564cbb2642170c02f7f0d7e472', }, 'playlist_mincount': 1, 'playlist': [{ @@ -19,28 +18,44 @@ class AmazonStoreIE(InfoExtractor): 'ext': 'mp4', 'title': 'mcdodo usb c cable 100W 5a', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 34, }, }] }, { 'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3', 'info_dict': { 'id': 'B0863TXGM3', - 'title': 'md5:b0bde4881d3cfd40d63af19f7898b8ff', + 'title': 'md5:d1d3352428f8f015706c84b31e132169', }, 'playlist_mincount': 4, }, { 'url': 'https://www.amazon.com/dp/B0845NXCXF/', 'info_dict': { 'id': 'B0845NXCXF', - 'title': 'md5:2145cd4e3c7782f1ee73649a3cff1171', + 'title': 'md5:f3fa12779bf62ddb6a6ec86a360a858e', }, 'playlist-mincount': 1, + }, { + 'url': 'https://www.amazon.es/Samsung-Smartphone-s-AMOLED-Quad-c%C3%A1mara-espa%C3%B1ola/dp/B08WX337PQ', + 'info_dict': { + 'id': 'B08WX337PQ', + 'title': 'md5:f3fa12779bf62ddb6a6ec86a360a858e', + }, + 'playlist_mincount': 1, }] def _real_extract(self, url): id = self._match_id(url) - webpage = self._download_webpage(url, id) - data_json = self._parse_json(self._html_search_regex(r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'(.*)\'\)', webpage, 'data'), id) + + for retry in 
self.RetryManager(): + webpage = self._download_webpage(url, id) + try: + data_json = self._search_json( + r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'', webpage, 'data', id, + transform_source=lambda x: x.replace(R'\\u', R'\u')) + except ExtractorError as e: + retry.error = e + entries = [{ 'id': video['marketPlaceID'], 'url': video['url'], @@ -50,4 +65,4 @@ class AmazonStoreIE(InfoExtractor): 'height': int_or_none(video.get('videoHeight')), 'width': int_or_none(video.get('videoWidth')), } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')] - return self.playlist_result(entries, playlist_id=id, playlist_title=data_json['title']) + return self.playlist_result(entries, playlist_id=id, playlist_title=data_json.get('title')) diff --git a/hypervideo_dl/extractor/amazonminitv.py b/hypervideo_dl/extractor/amazonminitv.py new file mode 100644 index 0000000..7309968 --- /dev/null +++ b/hypervideo_dl/extractor/amazonminitv.py @@ -0,0 +1,290 @@ +import json + +from .common import InfoExtractor +from ..utils import ExtractorError, int_or_none, traverse_obj, try_get + + +class AmazonMiniTVBaseIE(InfoExtractor): + def _real_initialize(self): + self._download_webpage( + 'https://www.amazon.in/minitv', None, + note='Fetching guest session cookies') + AmazonMiniTVBaseIE.session_id = self._get_cookies('https://www.amazon.in')['session-id'].value + + def _call_api(self, asin, data=None, note=None): + device = {'clientId': 'ATVIN', 'deviceLocale': 'en_GB'} + if data: + data['variables'].update({ + 'contentType': 'VOD', + 'sessionIdToken': self.session_id, + **device, + }) + + resp = self._download_json( + f'https://www.amazon.in/minitv/api/web/{"graphql" if data else "prs"}', + asin, note=note, headers={'Content-Type': 'application/json'}, + data=json.dumps(data).encode() if data else None, + query=None if data else { + 'deviceType': 'A1WMMUXPCUJL4N', + 'contentId': asin, + **device, + }) + + if resp.get('errors'): + raise ExtractorError(f'MiniTV 
said: {resp["errors"][0]["message"]}') + elif not data: + return resp + return resp['data'][data['operationName']] + + +class AmazonMiniTVIE(AmazonMiniTVBaseIE): + _VALID_URL = r'(?:https?://(?:www\.)?amazon\.in/minitv/tp/|amazonminitv:(?:amzn1\.dv\.gti\.)?)(?P[a-f0-9-]+)' + _TESTS = [{ + 'url': 'https://www.amazon.in/minitv/tp/75fe3a75-b8fe-4499-8100-5c9424344840?referrer=https%3A%2F%2Fwww.amazon.in%2Fminitv', + 'info_dict': { + 'id': 'amzn1.dv.gti.75fe3a75-b8fe-4499-8100-5c9424344840', + 'ext': 'mp4', + 'title': 'May I Kiss You?', + 'language': 'Hindi', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:a549bfc747973e04feb707833474e59d', + 'release_timestamp': 1644710400, + 'release_date': '20220213', + 'duration': 846, + 'chapters': 'count:2', + 'series': 'Couple Goals', + 'series_id': 'amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0', + 'season': 'Season 3', + 'season_number': 3, + 'season_id': 'amzn1.dv.gti.20331016-d9b9-4968-b991-c89fa4927a36', + 'episode': 'May I Kiss You?', + 'episode_number': 2, + 'episode_id': 'amzn1.dv.gti.75fe3a75-b8fe-4499-8100-5c9424344840', + }, + }, { + 'url': 'https://www.amazon.in/minitv/tp/280d2564-584f-452f-9c98-7baf906e01ab?referrer=https%3A%2F%2Fwww.amazon.in%2Fminitv', + 'info_dict': { + 'id': 'amzn1.dv.gti.280d2564-584f-452f-9c98-7baf906e01ab', + 'ext': 'mp4', + 'title': 'Jahaan', + 'language': 'Hindi', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:05eb765a77bf703f322f120ec6867339', + 'release_timestamp': 1647475200, + 'release_date': '20220317', + 'duration': 783, + 'chapters': [], + }, + }, { + 'url': 'https://www.amazon.in/minitv/tp/280d2564-584f-452f-9c98-7baf906e01ab', + 'only_matching': True, + }, { + 'url': 'amazonminitv:amzn1.dv.gti.280d2564-584f-452f-9c98-7baf906e01ab', + 'only_matching': True, + }, { + 'url': 'amazonminitv:280d2564-584f-452f-9c98-7baf906e01ab', + 'only_matching': True, + }] + + _GRAPHQL_QUERY_CONTENT = ''' +query content($sessionIdToken: String!, $deviceLocale: String, 
$contentId: ID!, $contentType: ContentType!, $clientId: String) { + content( + applicationContextInput: {deviceLocale: $deviceLocale, sessionIdToken: $sessionIdToken, clientId: $clientId} + contentId: $contentId + contentType: $contentType + ) { + contentId + name + ... on Episode { + contentId + vodType + name + images + description { + synopsis + contentLengthInSeconds + } + publicReleaseDateUTC + audioTracks + seasonId + seriesId + seriesName + seasonNumber + episodeNumber + timecode { + endCreditsTime + } + } + ... on MovieContent { + contentId + vodType + name + description { + synopsis + contentLengthInSeconds + } + images + publicReleaseDateUTC + audioTracks + } + } +}''' + + def _real_extract(self, url): + asin = f'amzn1.dv.gti.{self._match_id(url)}' + prs = self._call_api(asin, note='Downloading playback info') + + formats, subtitles = [], {} + for type_, asset in prs['playbackAssets'].items(): + if not traverse_obj(asset, 'manifestUrl'): + continue + if type_ == 'hls': + m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + asset['manifestUrl'], asin, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=type_, fatal=False) + formats.extend(m3u8_fmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + elif type_ == 'dash': + mpd_fmts, mpd_subs = self._extract_mpd_formats_and_subtitles( + asset['manifestUrl'], asin, mpd_id=type_, fatal=False) + formats.extend(mpd_fmts) + subtitles = self._merge_subtitles(subtitles, mpd_subs) + else: + self.report_warning(f'Unknown asset type: {type_}') + + title_info = self._call_api( + asin, note='Downloading title info', data={ + 'operationName': 'content', + 'variables': {'contentId': asin}, + 'query': self._GRAPHQL_QUERY_CONTENT, + }) + credits_time = try_get(title_info, lambda x: x['timecode']['endCreditsTime'] / 1000) + is_episode = title_info.get('vodType') == 'EPISODE' + + return { + 'id': asin, + 'title': title_info.get('name'), + 'formats': formats, + 'subtitles': subtitles, + 'language': 
traverse_obj(title_info, ('audioTracks', 0)), + 'thumbnails': [{ + 'id': type_, + 'url': url, + } for type_, url in (title_info.get('images') or {}).items()], + 'description': traverse_obj(title_info, ('description', 'synopsis')), + 'release_timestamp': int_or_none(try_get(title_info, lambda x: x['publicReleaseDateUTC'] / 1000)), + 'duration': traverse_obj(title_info, ('description', 'contentLengthInSeconds')), + 'chapters': [{ + 'start_time': credits_time, + 'title': 'End Credits', + }] if credits_time else [], + 'series': title_info.get('seriesName'), + 'series_id': title_info.get('seriesId'), + 'season_number': title_info.get('seasonNumber'), + 'season_id': title_info.get('seasonId'), + 'episode': title_info.get('name') if is_episode else None, + 'episode_number': title_info.get('episodeNumber'), + 'episode_id': asin if is_episode else None, + } + + +class AmazonMiniTVSeasonIE(AmazonMiniTVBaseIE): + IE_NAME = 'amazonminitv:season' + _VALID_URL = r'amazonminitv:season:(?:amzn1\.dv\.gti\.)?(?P[a-f0-9-]+)' + IE_DESC = 'Amazon MiniTV Series, "minitv:season:" prefix' + _TESTS = [{ + 'url': 'amazonminitv:season:amzn1.dv.gti.0aa996eb-6a1b-4886-a342-387fbd2f1db0', + 'playlist_mincount': 6, + 'info_dict': { + 'id': 'amzn1.dv.gti.0aa996eb-6a1b-4886-a342-387fbd2f1db0', + }, + }, { + 'url': 'amazonminitv:season:0aa996eb-6a1b-4886-a342-387fbd2f1db0', + 'only_matching': True, + }] + + _GRAPHQL_QUERY = ''' +query getEpisodes($sessionIdToken: String!, $clientId: String, $episodeOrSeasonId: ID!, $deviceLocale: String) { + getEpisodes( + applicationContextInput: {sessionIdToken: $sessionIdToken, deviceLocale: $deviceLocale, clientId: $clientId} + episodeOrSeasonId: $episodeOrSeasonId + ) { + episodes { + ... 
on Episode { + contentId + name + images + seriesName + seasonId + seriesId + seasonNumber + episodeNumber + description { + synopsis + contentLengthInSeconds + } + publicReleaseDateUTC + } + } + } +} +''' + + def _entries(self, asin): + season_info = self._call_api( + asin, note='Downloading season info', data={ + 'operationName': 'getEpisodes', + 'variables': {'episodeOrSeasonId': asin}, + 'query': self._GRAPHQL_QUERY, + }) + + for episode in season_info['episodes']: + yield self.url_result( + f'amazonminitv:{episode["contentId"]}', AmazonMiniTVIE, episode['contentId']) + + def _real_extract(self, url): + asin = f'amzn1.dv.gti.{self._match_id(url)}' + return self.playlist_result(self._entries(asin), asin) + + +class AmazonMiniTVSeriesIE(AmazonMiniTVBaseIE): + IE_NAME = 'amazonminitv:series' + _VALID_URL = r'amazonminitv:series:(?:amzn1\.dv\.gti\.)?(?P[a-f0-9-]+)' + _TESTS = [{ + 'url': 'amazonminitv:series:amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0', + 'playlist_mincount': 3, + 'info_dict': { + 'id': 'amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0', + }, + }, { + 'url': 'amazonminitv:series:56521d46-b040-4fd5-872e-3e70476a04b0', + 'only_matching': True, + }] + + _GRAPHQL_QUERY = ''' +query getSeasons($sessionIdToken: String!, $deviceLocale: String, $episodeOrSeasonOrSeriesId: ID!, $clientId: String) { + getSeasons( + applicationContextInput: {deviceLocale: $deviceLocale, sessionIdToken: $sessionIdToken, clientId: $clientId} + episodeOrSeasonOrSeriesId: $episodeOrSeasonOrSeriesId + ) { + seasons { + seasonId + } + } +} +''' + + def _entries(self, asin): + season_info = self._call_api( + asin, note='Downloading series info', data={ + 'operationName': 'getSeasons', + 'variables': {'episodeOrSeasonOrSeriesId': asin}, + 'query': self._GRAPHQL_QUERY, + }) + + for season in season_info['seasons']: + yield self.url_result(f'amazonminitv:season:{season["seasonId"]}', AmazonMiniTVSeasonIE, season['seasonId']) + + def _real_extract(self, url): + asin = 
f'amzn1.dv.gti.{self._match_id(url)}' + return self.playlist_result(self._entries(asin), asin) diff --git a/hypervideo_dl/extractor/amcnetworks.py b/hypervideo_dl/extractor/amcnetworks.py index e38e215..c58bc7b 100644 --- a/hypervideo_dl/extractor/amcnetworks.py +++ b/hypervideo_dl/extractor/amcnetworks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .theplatform import ThePlatformIE @@ -12,7 +9,7 @@ from ..utils import ( ) -class AMCNetworksIE(ThePlatformIE): +class AMCNetworksIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?(?Pamc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P(?:movies|shows(?:/[^/]+)+)/[^/?#&]+)' _TESTS = [{ 'url': 'https://www.bbcamerica.com/shows/the-graham-norton-show/videos/tina-feys-adorable-airline-themed-family-dinner--51631', @@ -109,7 +106,6 @@ class AMCNetworksIE(ThePlatformIE): media_url = update_url_query(media_url, query) formats, subtitles = self._extract_theplatform_smil( media_url, video_id) - self._sort_formats(formats) thumbnails = [] thumbnail_urls = [properties.get('imageDesktop')] diff --git a/hypervideo_dl/extractor/americastestkitchen.py b/hypervideo_dl/extractor/americastestkitchen.py index 6e6099a..abda55d 100644 --- a/hypervideo_dl/extractor/americastestkitchen.py +++ b/hypervideo_dl/extractor/americastestkitchen.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -14,7 +11,7 @@ from ..utils import ( class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?Pepisode|videos)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:cooks(?:country|illustrated)/)?(?Pepisode|videos)/(?P\d+)' _TESTS = [{ 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', 'md5': 'b861c3e365ac38ad319cfd509c30577f', @@ -22,15 +19,20 @@ class 
AmericasTestKitchenIE(InfoExtractor): 'id': '5b400b9ee338f922cb06450c', 'title': 'Japanese Suppers', 'ext': 'mp4', + 'display_id': 'weeknight-japanese-suppers', 'description': 'md5:64e606bfee910627efc4b5f050de92b3', - 'thumbnail': r're:^https?://', - 'timestamp': 1523318400, - 'upload_date': '20180410', - 'release_date': '20180410', - 'series': "America's Test Kitchen", - 'season_number': 18, + 'timestamp': 1523304000, + 'upload_date': '20180409', + 'release_date': '20180409', + 'series': 'America\'s Test Kitchen', + 'season': 'Season 18', 'episode': 'Japanese Suppers', + 'season_number': 18, 'episode_number': 15, + 'duration': 1376, + 'thumbnail': r're:^https?://', + 'average_rating': 0, + 'view_count': int, }, 'params': { 'skip_download': True, @@ -43,15 +45,20 @@ class AmericasTestKitchenIE(InfoExtractor): 'id': '5fbe8c61bda2010001c6763b', 'title': 'Simple Chicken Dinner', 'ext': 'mp4', + 'display_id': 'atktv_2103_simple-chicken-dinner_full-episode_web-mp4', 'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7', - 'thumbnail': r're:^https?://', - 'timestamp': 1610755200, - 'upload_date': '20210116', - 'release_date': '20210116', - 'series': "America's Test Kitchen", - 'season_number': 21, + 'timestamp': 1610737200, + 'upload_date': '20210115', + 'release_date': '20210115', + 'series': 'America\'s Test Kitchen', + 'season': 'Season 21', 'episode': 'Simple Chicken Dinner', + 'season_number': 21, 'episode_number': 3, + 'duration': 1397, + 'thumbnail': r're:^https?://', + 'view_count': int, + 'average_rating': 0, }, 'params': { 'skip_download': True, @@ -60,10 +67,10 @@ class AmericasTestKitchenIE(InfoExtractor): 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', 'only_matching': True, }, { - 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do', + 'url': 'https://www.americastestkitchen.com/cookscountry/episode/564-when-only-chocolate-will-do', 'only_matching': True, }, { - 'url': 
'https://www.cooksillustrated.com/videos/4478-beef-wellington', + 'url': 'https://www.americastestkitchen.com/cooksillustrated/videos/4478-beef-wellington', 'only_matching': True, }] @@ -93,7 +100,7 @@ class AmericasTestKitchenIE(InfoExtractor): class AmericasTestKitchenSeasonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?Pamericastestkitchen|cookscountry)\.com/episodes/browse/season_(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com(?P/cookscountry)?/episodes/browse/season_(?P\d+)' _TESTS = [{ # ATK Season 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', @@ -104,7 +111,7 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): 'playlist_count': 13, }, { # Cooks Country Season - 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'url': 'https://www.americastestkitchen.com/cookscountry/episodes/browse/season_12', 'info_dict': { 'id': 'season_12', 'title': 'Season 12', @@ -113,17 +120,17 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): }] def _real_extract(self, url): - show_name, season_number = self._match_valid_url(url).groups() + show_path, season_number = self._match_valid_url(url).group('show', 'id') season_number = int(season_number) - slug = 'atk' if show_name == 'americastestkitchen' else 'cco' + slug = 'cco' if show_path == '/cookscountry' else 'atk' season = 'Season %d' % season_number season_search = self._download_json( 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, season, headers={ - 'Origin': 'https://www.%s.com' % show_name, + 'Origin': 'https://www.americastestkitchen.com', 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', 'X-Algolia-Application-Id': 'Y1FNZXUI30', }, query={ @@ -139,12 +146,12 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): def entries(): for episode in (season_search.get('hits') or []): - search_url = episode.get('search_url') + search_url = episode.get('search_url') # always formatted like 
'/episode/123-title-of-episode' if not search_url: continue yield { '_type': 'url', - 'url': 'https://www.%s.com%s' % (show_name, search_url), + 'url': f'https://www.americastestkitchen.com{show_path or ""}{search_url}', 'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]), 'title': episode.get('title'), 'description': episode.get('description'), diff --git a/hypervideo_dl/extractor/amp.py b/hypervideo_dl/extractor/amp.py index 24c684c..b0cbd77 100644 --- a/hypervideo_dl/extractor/amp.py +++ b/hypervideo_dl/extractor/amp.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -13,7 +10,7 @@ from ..utils import ( ) -class AMPIE(InfoExtractor): +class AMPIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor # parse Akamai Adaptive Media Player feed def _extract_feed_info(self, url): feed = self._download_json( @@ -87,8 +84,6 @@ class AMPIE(InfoExtractor): 'ext': ext, }) - self._sort_formats(formats) - timestamp = unified_timestamp(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) return { diff --git a/hypervideo_dl/extractor/angel.py b/hypervideo_dl/extractor/angel.py new file mode 100644 index 0000000..306b365 --- /dev/null +++ b/hypervideo_dl/extractor/angel.py @@ -0,0 +1,56 @@ +import re + +from .common import InfoExtractor +from ..utils import url_or_none, merge_dicts + + +class AngelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?angel\.com/watch/(?P[^/?#]+)/episode/(?P[\w-]+)/season-(?P\d+)/episode-(?P\d+)/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.angel.com/watch/tuttle-twins/episode/2f3d0382-ea82-4cdc-958e-84fbadadc710/season-1/episode-1/when-laws-give-you-lemons', + 'md5': '4734e5cfdd64a568e837246aa3eaa524', + 'info_dict': { + 'id': '2f3d0382-ea82-4cdc-958e-84fbadadc710', + 'ext': 'mp4', + 'title': 'Tuttle Twins Season 1, Episode 1: When Laws Give You Lemons', + 'description': 
'md5:73b704897c20ab59c433a9c0a8202d5e', + 'thumbnail': r're:^https?://images.angelstudios.com/image/upload/angel-app/.*$', + 'duration': 1359.0 + } + }, { + 'url': 'https://www.angel.com/watch/the-chosen/episode/8dfb714d-bca5-4812-8125-24fb9514cd10/season-1/episode-1/i-have-called-you-by-name', + 'md5': 'e4774bad0a5f0ad2e90d175cafdb797d', + 'info_dict': { + 'id': '8dfb714d-bca5-4812-8125-24fb9514cd10', + 'ext': 'mp4', + 'title': 'The Chosen Season 1, Episode 1: I Have Called You By Name', + 'description': 'md5:aadfb4827a94415de5ff6426e6dee3be', + 'thumbnail': r're:^https?://images.angelstudios.com/image/upload/angel-app/.*$', + 'duration': 3276.0 + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + json_ld = self._search_json_ld(webpage, video_id) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + json_ld.pop('url'), video_id, note='Downloading HD m3u8 information') + + info_dict = { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'formats': formats, + 'subtitles': subtitles + } + + # Angel uses cloudinary in the background and supports image transformations. 
+ # We remove these transformations and return the source file + base_thumbnail_url = url_or_none(self._og_search_thumbnail(webpage)) or json_ld.pop('thumbnails') + if base_thumbnail_url: + info_dict['thumbnail'] = re.sub(r'(/upload)/.+(/angel-app/.+)$', r'\1\2', base_thumbnail_url) + + return merge_dicts(info_dict, json_ld) diff --git a/hypervideo_dl/extractor/animelab.py b/hypervideo_dl/extractor/animelab.py deleted file mode 100644 index 1c2cc47..0000000 --- a/hypervideo_dl/extractor/animelab.py +++ /dev/null @@ -1,278 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - -from ..utils import ( - ExtractorError, - urlencode_postdata, - int_or_none, - str_or_none, - determine_ext, -) - -from ..compat import compat_HTTPError - - -class AnimeLabBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.animelab.com/login' - _NETRC_MACHINE = 'animelab' - _LOGGED_IN = False - - def _is_logged_in(self, login_page=None): - if not self._LOGGED_IN: - if not login_page: - login_page = self._download_webpage(self._LOGIN_URL, None, 'Downloading login page') - AnimeLabBaseIE._LOGGED_IN = 'Sign In' not in login_page - return self._LOGGED_IN - - def _perform_login(self, username, password): - if self._is_logged_in(): - return - - login_form = { - 'email': username, - 'password': password, - } - - try: - response = self._download_webpage( - self._LOGIN_URL, None, 'Logging in', 'Wrong login info', - data=urlencode_postdata(login_form), - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - raise ExtractorError('Unable to log in (wrong credentials?)', expected=True) - raise - - if not self._is_logged_in(response): - raise ExtractorError('Unable to login (cannot verify if logged in)') - - def _real_initialize(self): - if not self._is_logged_in(): - self.raise_login_required('Login is required to access any AnimeLab content') - - 
-class AnimeLabIE(AnimeLabBaseIE): - _VALID_URL = r'https?://(?:www\.)?animelab\.com/player/(?P<id>[^/]+)' - - # the following tests require authentication, but a free account will suffice - # just set 'usenetrc' to true in test/local_parameters.json if you use a .netrc file - # or you can set 'username' and 'password' there - # the tests also select a specific format so that the same video is downloaded - # regardless of whether the user is premium or not (needs testing on a premium account) - _TEST = { - 'url': 'https://www.animelab.com/player/fullmetal-alchemist-brotherhood-episode-42', - 'md5': '05bde4b91a5d1ff46ef5b94df05b0f7f', - 'info_dict': { - 'id': '383', - 'ext': 'mp4', - 'display_id': 'fullmetal-alchemist-brotherhood-episode-42', - 'title': 'Fullmetal Alchemist: Brotherhood - Episode 42 - Signs of a Counteroffensive', - 'description': 'md5:103eb61dd0a56d3dfc5dbf748e5e83f4', - 'series': 'Fullmetal Alchemist: Brotherhood', - 'episode': 'Signs of a Counteroffensive', - 'episode_number': 42, - 'duration': 1469, - 'season': 'Season 1', - 'season_number': 1, - 'season_id': '38', - }, - 'params': { - 'format': '[format_id=21711_yeshardsubbed_ja-JP][height=480]', - }, - 'skip': 'All AnimeLab content requires authentication', - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - # unfortunately we can get different URLs for the same formats - # e.g. 
if we are using a "free" account so no dubs available - # (so _remove_duplicate_formats is not effective) - # so we use a dictionary as a workaround - formats = {} - for language_option_url in ('https://www.animelab.com/player/%s/subtitles', - 'https://www.animelab.com/player/%s/dubbed'): - actual_url = language_option_url % display_id - webpage = self._download_webpage(actual_url, display_id, 'Downloading URL ' + actual_url) - - video_collection = self._parse_json(self._search_regex(r'new\s+?AnimeLabApp\.VideoCollection\s*?\((.*?)\);', webpage, 'AnimeLab VideoCollection'), display_id) - position = int_or_none(self._search_regex(r'playlistPosition\s*?=\s*?(\d+)', webpage, 'Playlist Position')) - - raw_data = video_collection[position]['videoEntry'] - - video_id = str_or_none(raw_data['id']) - - # create a title from many sources (while grabbing other info) - # TODO use more fallback sources to get some of these - series = raw_data.get('showTitle') - video_type = raw_data.get('videoEntryType', {}).get('name') - episode_number = raw_data.get('episodeNumber') - episode_name = raw_data.get('name') - - title_parts = (series, video_type, episode_number, episode_name) - if None not in title_parts: - title = '%s - %s %s - %s' % title_parts - else: - title = episode_name - - description = raw_data.get('synopsis') or self._og_search_description(webpage, default=None) - - duration = int_or_none(raw_data.get('duration')) - - thumbnail_data = raw_data.get('images', []) - thumbnails = [] - for thumbnail in thumbnail_data: - for instance in thumbnail['imageInstances']: - image_data = instance.get('imageInfo', {}) - thumbnails.append({ - 'id': str_or_none(image_data.get('id')), - 'url': image_data.get('fullPath'), - 'width': image_data.get('width'), - 'height': image_data.get('height'), - }) - - season_data = raw_data.get('season', {}) or {} - season = str_or_none(season_data.get('name')) - season_number = int_or_none(season_data.get('seasonNumber')) - season_id = 
str_or_none(season_data.get('id')) - - for video_data in raw_data['videoList']: - current_video_list = {} - current_video_list['language'] = video_data.get('language', {}).get('languageCode') - - is_hardsubbed = video_data.get('hardSubbed') - - for video_instance in video_data['videoInstances']: - httpurl = video_instance.get('httpUrl') - url = httpurl if httpurl else video_instance.get('rtmpUrl') - if url is None: - # this video format is unavailable to the user (not premium etc.) - continue - - current_format = current_video_list.copy() - - format_id_parts = [] - - format_id_parts.append(str_or_none(video_instance.get('id'))) - - if is_hardsubbed is not None: - if is_hardsubbed: - format_id_parts.append('yeshardsubbed') - else: - format_id_parts.append('nothardsubbed') - - format_id_parts.append(current_format['language']) - - format_id = '_'.join([x for x in format_id_parts if x is not None]) - - ext = determine_ext(url) - if ext == 'm3u8': - for format_ in self._extract_m3u8_formats( - url, video_id, m3u8_id=format_id, fatal=False): - formats[format_['format_id']] = format_ - continue - elif ext == 'mpd': - for format_ in self._extract_mpd_formats( - url, video_id, mpd_id=format_id, fatal=False): - formats[format_['format_id']] = format_ - continue - - current_format['url'] = url - quality_data = video_instance.get('videoQuality') - if quality_data: - quality = quality_data.get('name') or quality_data.get('description') - else: - quality = None - - height = None - if quality: - height = int_or_none(self._search_regex(r'(\d+)p?$', quality, 'Video format height', default=None)) - - if height is None: - self.report_warning('Could not get height of video') - else: - current_format['height'] = height - current_format['format_id'] = format_id - - formats[current_format['format_id']] = current_format - - formats = list(formats.values()) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': 
description, - 'series': series, - 'episode': episode_name, - 'episode_number': int_or_none(episode_number), - 'thumbnails': thumbnails, - 'duration': duration, - 'formats': formats, - 'season': season, - 'season_number': season_number, - 'season_id': season_id, - } - - -class AnimeLabShowsIE(AnimeLabBaseIE): - _VALID_URL = r'https?://(?:www\.)?animelab\.com/shows/(?P<id>[^/]+)' - - _TEST = { - 'url': 'https://www.animelab.com/shows/attack-on-titan', - 'info_dict': { - 'id': '45', - 'title': 'Attack on Titan', - 'description': 'md5:989d95a2677e9309368d5cf39ba91469', - }, - 'playlist_count': 59, - 'skip': 'All AnimeLab content requires authentication', - } - - def _real_extract(self, url): - _BASE_URL = 'http://www.animelab.com' - _SHOWS_API_URL = '/api/videoentries/show/videos/' - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id, 'Downloading requested URL') - - show_data_str = self._search_regex(r'({"id":.*}),\svideoEntry', webpage, 'AnimeLab show data') - show_data = self._parse_json(show_data_str, display_id) - - show_id = str_or_none(show_data.get('id')) - title = show_data.get('name') - description = show_data.get('shortSynopsis') or show_data.get('longSynopsis') - - entries = [] - for season in show_data['seasons']: - season_id = season['id'] - get_data = urlencode_postdata({ - 'seasonId': season_id, - 'limit': 1000, - }) - # despite using urlencode_postdata, we are sending a GET request - target_url = _BASE_URL + _SHOWS_API_URL + show_id + "?" 
+ get_data.decode('utf-8') - response = self._download_webpage( - target_url, - None, 'Season id %s' % season_id) - - season_data = self._parse_json(response, display_id) - - for video_data in season_data['list']: - entries.append(self.url_result( - _BASE_URL + '/player/' + video_data['slug'], 'AnimeLab', - str_or_none(video_data.get('id')), video_data.get('name') - )) - - return { - '_type': 'playlist', - 'id': show_id, - 'title': title, - 'description': description, - 'entries': entries, - } - -# TODO implement myqueue diff --git a/hypervideo_dl/extractor/animeondemand.py b/hypervideo_dl/extractor/animeondemand.py deleted file mode 100644 index 2e674d5..0000000 --- a/hypervideo_dl/extractor/animeondemand.py +++ /dev/null @@ -1,284 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - determine_ext, - extract_attributes, - ExtractorError, - join_nonempty, - url_or_none, - urlencode_postdata, - urljoin, -) - - -class AnimeOnDemandIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?anime-on-demand\.de/anime/(?P<id>\d+)' - _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in' - _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' - _NETRC_MACHINE = 'animeondemand' - # German-speaking countries of Europe - _GEO_COUNTRIES = ['AT', 'CH', 'DE', 'LI', 'LU'] - _TESTS = [{ - # jap, OmU - 'url': 'https://www.anime-on-demand.de/anime/161', - 'info_dict': { - 'id': '161', - 'title': 'Grimgar, Ashes and Illusions (OmU)', - 'description': 'md5:6681ce3c07c7189d255ac6ab23812d31', - }, - 'playlist_mincount': 4, - }, { - # Film wording is used instead of Episode, ger/jap, Dub/OmU - 'url': 'https://www.anime-on-demand.de/anime/39', - 'only_matching': True, - }, { - # Episodes without titles, jap, OmU - 'url': 'https://www.anime-on-demand.de/anime/162', - 'only_matching': True, - }, { - # ger/jap, Dub/OmU, account required - 'url': 
'https://www.anime-on-demand.de/anime/169', - 'only_matching': True, - }, { - # Full length film, non-series, ger/jap, Dub/OmU, account required - 'url': 'https://www.anime-on-demand.de/anime/185', - 'only_matching': True, - }, { - # Flash videos - 'url': 'https://www.anime-on-demand.de/anime/12', - 'only_matching': True, - }] - - def _perform_login(self, username, password): - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - if '>Our licensing terms allow the distribution of animes only to German-speaking countries of Europe' in login_page: - self.raise_geo_restricted( - '%s is only available in German-speaking countries of Europe' % self.IE_NAME) - - login_form = self._form_hidden_inputs('new_user', login_page) - - login_form.update({ - 'user[login]': username, - 'user[password]': password, - }) - - post_url = self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, - 'post url', default=self._LOGIN_URL, group='url') - - if not post_url.startswith('http'): - post_url = urljoin(self._LOGIN_URL, post_url) - - response = self._download_webpage( - post_url, None, 'Logging in', - data=urlencode_postdata(login_form), headers={ - 'Referer': self._LOGIN_URL, - }) - - if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')): - error = self._search_regex( - r'<p[^>]+\bclass=(["\'])(?:(?!\1).)*\balert\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</p>', - response, 'error', default=None, group='error') - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - raise ExtractorError('Unable to log in') - - def _real_extract(self, url): - anime_id = self._match_id(url) - - webpage = self._download_webpage(url, anime_id) - - if 'data-playlist=' not in webpage: - self._download_webpage( - self._APPLY_HTML5_URL, anime_id, - 'Activating HTML5 beta', 'Unable to apply HTML5 beta') - webpage = self._download_webpage(url, anime_id) - - csrf_token = self._html_search_meta( - 'csrf-token', 
webpage, 'csrf token', fatal=True) - - anime_title = self._html_search_regex( - r'(?s)<h1[^>]+itemprop="name"[^>]*>(.+?)</h1>', - webpage, 'anime name') - anime_description = self._html_search_regex( - r'(?s)<div[^>]+itemprop="description"[^>]*>(.+?)</div>', - webpage, 'anime description', default=None) - - def extract_info(html, video_id, num=None): - title, description = [None] * 2 - formats = [] - - for input_ in re.findall( - r'<input[^>]+class=["\'].*?streamstarter[^>]+>', html): - attributes = extract_attributes(input_) - title = attributes.get('data-dialog-header') - playlist_urls = [] - for playlist_key in ('data-playlist', 'data-otherplaylist', 'data-stream'): - playlist_url = attributes.get(playlist_key) - if isinstance(playlist_url, compat_str) and re.match( - r'/?[\da-zA-Z]+', playlist_url): - playlist_urls.append(attributes[playlist_key]) - if not playlist_urls: - continue - - lang = attributes.get('data-lang') - lang_note = attributes.get('value') - - for playlist_url in playlist_urls: - kind = self._search_regex( - r'videomaterialurl/\d+/([^/]+)/', - playlist_url, 'media kind', default=None) - format_id = join_nonempty(lang, kind) if lang or kind else str(num) - format_note = join_nonempty(kind, lang_note, delim=', ') - item_id_list = [] - if format_id: - item_id_list.append(format_id) - item_id_list.append('videomaterial') - playlist = self._download_json( - urljoin(url, playlist_url), video_id, - 'Downloading %s JSON' % ' '.join(item_id_list), - headers={ - 'X-Requested-With': 'XMLHttpRequest', - 'X-CSRF-Token': csrf_token, - 'Referer': url, - 'Accept': 'application/json, text/javascript, */*; q=0.01', - }, fatal=False) - if not playlist: - continue - stream_url = url_or_none(playlist.get('streamurl')) - if stream_url: - rtmp = re.search( - r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)', - stream_url) - if rtmp: - formats.append({ - 'url': rtmp.group('url'), - 'app': rtmp.group('app'), - 'play_path': 
rtmp.group('playpath'), - 'page_url': url, - 'player_url': 'https://www.anime-on-demand.de/assets/jwplayer.flash-55abfb34080700304d49125ce9ffb4a6.swf', - 'rtmp_real_time': True, - 'format_id': 'rtmp', - 'ext': 'flv', - }) - continue - start_video = playlist.get('startvideo', 0) - playlist = playlist.get('playlist') - if not playlist or not isinstance(playlist, list): - continue - playlist = playlist[start_video] - title = playlist.get('title') - if not title: - continue - description = playlist.get('description') - for source in playlist.get('sources', []): - file_ = source.get('file') - if not file_: - continue - ext = determine_ext(file_) - format_id = join_nonempty( - lang, kind, - 'hls' if ext == 'm3u8' else None, - 'dash' if source.get('type') == 'video/dash' or ext == 'mpd' else None) - if ext == 'm3u8': - file_formats = self._extract_m3u8_formats( - file_, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) - elif source.get('type') == 'video/dash' or ext == 'mpd': - continue - file_formats = self._extract_mpd_formats( - file_, video_id, mpd_id=format_id, fatal=False) - else: - continue - for f in file_formats: - f.update({ - 'language': lang, - 'format_note': format_note, - }) - formats.extend(file_formats) - - return { - 'title': title, - 'description': description, - 'formats': formats, - } - - def extract_entries(html, video_id, common_info, num=None): - info = extract_info(html, video_id, num) - - if info['formats']: - self._sort_formats(info['formats']) - f = common_info.copy() - f.update(info) - yield f - - # Extract teaser/trailer only when full episode is not available - if not info['formats']: - m = re.search( - r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>(?P<kind>Teaser|Trailer)<', - html) - if m: - f = common_info.copy() - f.update({ - 'id': '%s-%s' % (f['id'], m.group('kind').lower()), - 'title': m.group('title'), - 'url': urljoin(url, m.group('href')), - }) - yield f - - def 
extract_episodes(html): - for num, episode_html in enumerate(re.findall( - r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', html), 1): - episodebox_title = self._search_regex( - (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', - r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), - episode_html, 'episodebox title', default=None, group='title') - if not episodebox_title: - continue - - episode_number = int(self._search_regex( - r'(?:Episode|Film)\s*(\d+)', - episodebox_title, 'episode number', default=num)) - episode_title = self._search_regex( - r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', - episodebox_title, 'episode title', default=None) - - video_id = 'episode-%d' % episode_number - - common_info = { - 'id': video_id, - 'series': anime_title, - 'episode': episode_title, - 'episode_number': episode_number, - } - - for e in extract_entries(episode_html, video_id, common_info): - yield e - - def extract_film(html, video_id): - common_info = { - 'id': anime_id, - 'title': anime_title, - 'description': anime_description, - } - for e in extract_entries(html, video_id, common_info): - yield e - - def entries(): - has_episodes = False - for e in extract_episodes(webpage): - has_episodes = True - yield e - - if not has_episodes: - for e in extract_film(webpage, anime_id): - yield e - - return self.playlist_result( - entries(), anime_id, anime_title, anime_description) diff --git a/hypervideo_dl/extractor/ant1newsgr.py b/hypervideo_dl/extractor/ant1newsgr.py index 1075b46..7b384b2 100644 --- a/hypervideo_dl/extractor/ant1newsgr.py +++ b/hypervideo_dl/extractor/ant1newsgr.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re import urllib.parse from .common import InfoExtractor @@ -10,7 +6,6 @@ from ..utils import ( ExtractorError, determine_ext, scale_thumbnails_to_max_format_width, - unescapeHTML, ) @@ -24,7 +19,6 @@ class Ant1NewsGrBaseIE(InfoExtractor): raise ExtractorError('no source found for %s' % video_id) 
formats, subs = (self._extract_m3u8_formats_and_subtitles(source, video_id, 'mp4') if determine_ext(source) == 'm3u8' else ([{'url': source}], {})) - self._sort_formats(formats) thumbnails = scale_thumbnails_to_max_format_width( formats, [{'url': info['thumb']}], r'(?<=/imgHandler/)\d+') return { @@ -94,7 +88,7 @@ class Ant1NewsGrArticleIE(Ant1NewsGrBaseIE): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle') - embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage)) + embed_urls = list(Ant1NewsGrEmbedIE._extract_embed_urls(url, webpage)) if not embed_urls: raise ExtractorError('no videos found for %s' % video_id, expected=True) return self.playlist_from_matches( @@ -107,6 +101,7 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE): IE_DESC = 'ant1news.gr embedded videos' _BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player' _VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)' + _EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)'] _API_PATH = '/news/templates/data/jsonPlayer' _TESTS = [{ @@ -120,16 +115,6 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE): }, }] - @classmethod - def _extract_urls(cls, webpage): - _EMBED_URL_RE = rf'{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+' - _EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_EMBED_URL_RE})(?P=_q1)' - for mobj in re.finditer(_EMBED_RE, webpage): - url = unescapeHTML(mobj.group('url')) - if not cls.suitable(url): - continue - yield url - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/hypervideo_dl/extractor/anvato.py b/hypervideo_dl/extractor/anvato.py index 686d453..79bfe41 100644 --- a/hypervideo_dl/extractor/anvato.py +++ b/hypervideo_dl/extractor/anvato.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 import hashlib 
import json @@ -10,38 +7,68 @@ import time from .common import InfoExtractor from ..aes import aes_encrypt -from ..compat import compat_str from ..utils import ( bytes_to_intlist, determine_ext, - intlist_to_bytes, int_or_none, + intlist_to_bytes, join_nonempty, + smuggle_url, strip_jsonp, + traverse_obj, unescapeHTML, unsmuggle_url, ) -# This import causes a ModuleNotFoundError on some systems for unknown reason. -# See issues: -# https://github.com/hypervideo/hypervideo/issues/35 -# https://github.com/ytdl-org/youtube-dl/issues/27449 -# https://github.com/animelover1984/youtube-dl/issues/17 -try: - from .anvato_token_generator import NFLTokenGenerator -except ImportError: - NFLTokenGenerator = None - def md5_text(s): - if not isinstance(s, compat_str): - s = compat_str(s) - return hashlib.md5(s.encode('utf-8')).hexdigest() + return hashlib.md5(str(s).encode()).hexdigest() class AnvatoIE(InfoExtractor): _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)' + _API_BASE_URL = 'https://tkx.mp.lura.live/rest/v2' + _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1' + _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' # from anvplayer.min.js + + _TESTS = [{ + # from https://www.nfl.com/videos/baker-mayfield-s-game-changing-plays-from-3-td-game-week-14 + 'url': 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:899441', + 'md5': '921919dab3cd0b849ff3d624831ae3e2', + 'info_dict': { + 'id': '899441', + 'ext': 'mp4', + 'title': 'Baker Mayfield\'s game-changing plays from 3-TD game Week 14', + 'description': 'md5:85e05a3cc163f8c344340f220521136d', + 'upload_date': '20201215', + 'timestamp': 1608009755, + 'thumbnail': r're:^https?://.*\.jpg', + 'uploader': 'NFL', + 'tags': ['Baltimore Ravens at Cleveland Browns (2020-REG-14)', 'Baker Mayfield', 'Game Highlights', + 'Player Highlights', 'Cleveland Browns', 'league'], + 'duration': 157, + 'categories': ['Entertainment', 'Game', 'Highlights'], + }, + }, { + # from 
https://ktla.com/news/99-year-old-woman-learns-to-fly-in-torrance-checks-off-bucket-list-dream/ + 'url': 'anvato:X8POa4zpGZMmeiq0wqiO8IP5rMqQM9VN:8032455', + 'md5': '837718bcfb3a7778d022f857f7a9b19e', + 'info_dict': { + 'id': '8032455', + 'ext': 'mp4', + 'title': '99-year-old woman learns to fly plane in Torrance, checks off bucket list dream', + 'description': 'md5:0a12bab8159445e78f52a297a35c6609', + 'upload_date': '20220928', + 'timestamp': 1664408881, + 'thumbnail': r're:^https?://.*\.jpg', + 'uploader': 'LIN', + 'tags': ['video', 'news', '5live'], + 'duration': 155, + 'categories': ['News'], + }, + }] + # Copied from anvplayer.min.js _ANVACK_TABLE = { 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', @@ -214,86 +241,74 @@ class AnvatoIE(InfoExtractor): 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582' } - _TOKEN_GENERATORS = { - 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator, + def _generate_nfl_token(self, anvack, mcp_id): + reroute = self._download_json( + 'https://api.nfl.com/v1/reroute', mcp_id, data=b'grant_type=client_credentials', + headers={'X-Domain-Id': 100}, note='Fetching token info') + token_type = reroute.get('token_type') or 'Bearer' + auth_token = f'{token_type} {reroute["access_token"]}' + response = self._download_json( + 'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({ + 'query': '''{ + viewer { + mediaToken(anvack: "%s", id: %s) { + token } + } +}''' % (anvack, mcp_id), + }).encode(), headers={ + 'Authorization': auth_token, + 'Content-Type': 'application/json', + }, note='Fetching NFL API token') + return traverse_obj(response, ('data', 'viewer', 'mediaToken', 'token')) - _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA' - - _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1' - _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' - - _TESTS = [{ - # from 
https://www.boston25news.com/news/watch-humpback-whale-breaches-right-next-to-fishing-boat-near-nh/817484874 - 'url': 'anvato:8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496', - 'info_dict': { - 'id': '4465496', - 'ext': 'mp4', - 'title': 'VIDEO: Humpback whale breaches right next to NH boat', - 'description': 'VIDEO: Humpback whale breaches right next to NH boat. Footage courtesy: Zach Fahey.', - 'duration': 22, - 'timestamp': 1534855680, - 'upload_date': '20180821', - 'uploader': 'ANV', - }, - 'params': { - 'skip_download': True, - }, - }, { - # from https://sanfrancisco.cbslocal.com/2016/06/17/source-oakland-cop-on-leave-for-having-girlfriend-help-with-police-reports/ - 'url': 'anvato:DVzl9QRzox3ZZsP9bNu5Li3X7obQOnqP:3417601', - 'only_matching': True, - }] - - def __init__(self, *args, **kwargs): - super(AnvatoIE, self).__init__(*args, **kwargs) - self.__server_time = None + _TOKEN_GENERATORS = { + 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': _generate_nfl_token, + } def _server_time(self, access_key, video_id): - if self.__server_time is not None: - return self.__server_time - - self.__server_time = int(self._download_json( - self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id, - note='Fetching server time')['server_time']) + return int_or_none(traverse_obj(self._download_json( + f'{self._API_BASE_URL}/server_time', video_id, query={'anvack': access_key}, + note='Fetching server time', fatal=False), 'server_time')) or int(time.time()) - return self.__server_time - - def _api_prefix(self, access_key): - return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage') - - def _get_video_json(self, access_key, video_id): + def _get_video_json(self, access_key, video_id, extracted_token): # See et() in anvplayer.min.js, which is an alias of getVideoJSON() - video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key) + video_data_url = 
f'{self._API_BASE_URL}/mcp/video/{video_id}?anvack={access_key}' server_time = self._server_time(access_key, video_id) - input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time)) + input_data = f'{server_time}~{md5_text(video_data_url)}~{md5_text(server_time)}' auth_secret = intlist_to_bytes(aes_encrypt( bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY))) - - video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii') + query = { + 'X-Anvato-Adst-Auth': base64.b64encode(auth_secret).decode('ascii'), + 'rtyp': 'fp', + } anvrid = md5_text(time.time() * 1000 * random.random())[:30] api = { 'anvrid': anvrid, 'anvts': server_time, } - if self._TOKEN_GENERATORS.get(access_key) is not None: - api['anvstk2'] = self._TOKEN_GENERATORS[access_key].generate(self, access_key, video_id) + if extracted_token is not None: + api['anvstk2'] = extracted_token + elif self._TOKEN_GENERATORS.get(access_key) is not None: + api['anvstk2'] = self._TOKEN_GENERATORS[access_key](self, access_key, video_id) + elif self._ANVACK_TABLE.get(access_key) is not None: + api['anvstk'] = md5_text(f'{access_key}|{anvrid}|{server_time}|{self._ANVACK_TABLE[access_key]}') else: - api['anvstk'] = md5_text('%s|%s|%d|%s' % ( - access_key, anvrid, server_time, - self._ANVACK_TABLE.get(access_key, self._API_KEY))) + api['anvstk2'] = 'default' return self._download_json( - video_data_url, video_id, transform_source=strip_jsonp, - data=json.dumps({'api': api}).encode('utf-8')) + video_data_url, video_id, transform_source=strip_jsonp, query=query, + data=json.dumps({'api': api}, separators=(',', ':')).encode('utf-8')) - def _get_anvato_videos(self, access_key, video_id): - video_data = self._get_video_json(access_key, video_id) + def _get_anvato_videos(self, access_key, video_id, token): + video_data = self._get_video_json(access_key, video_id, token) formats = [] for published_url in video_data['published_urls']: - video_url = 
published_url['embed_url'] + video_url = published_url.get('embed_url') + if not video_url: + continue media_format = published_url.get('format') ext = determine_ext(video_url) @@ -308,15 +323,27 @@ class AnvatoIE(InfoExtractor): 'tbr': tbr or None, } - if media_format == 'm3u8' and tbr is not None: + vtt_subs, hls_subs = {}, {} + if media_format == 'vtt': + _, vtt_subs = self._extract_m3u8_formats_and_subtitles( + video_url, video_id, m3u8_id='vtt', fatal=False) + continue + elif media_format == 'm3u8' and tbr is not None: a_format.update({ 'format_id': join_nonempty('hls', tbr), 'ext': 'mp4', }) elif media_format == 'm3u8-variant' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + # For some videos the initial m3u8 URL returns JSON instead + manifest_json = self._download_json( + video_url, video_id, note='Downloading manifest JSON', errnote=False) + if manifest_json: + video_url = manifest_json.get('master_m3u8') + if not video_url: + continue + hls_fmts, hls_subs = self._extract_m3u8_formats_and_subtitles( + video_url, video_id, ext='mp4', m3u8_id='hls', fatal=False) + formats.extend(hls_fmts) continue elif ext == 'mp3' or media_format == 'mp3': a_format['vcodec'] = 'none' @@ -327,8 +354,6 @@ class AnvatoIE(InfoExtractor): }) formats.append(a_format) - self._sort_formats(formats) - subtitles = {} for caption in video_data.get('captions', []): a_caption = { @@ -336,6 +361,7 @@ class AnvatoIE(InfoExtractor): 'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None } subtitles.setdefault(caption['language'], []).append(a_caption) + subtitles = self._merge_subtitles(subtitles, hls_subs, vtt_subs) return { 'id': video_id, @@ -352,30 +378,19 @@ class AnvatoIE(InfoExtractor): 'subtitles': subtitles, } - @staticmethod - def _extract_urls(ie, webpage, video_id): - entries = [] - for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage): - anvplayer_data = 
ie._parse_json( - mobj.group('anvp'), video_id, transform_source=unescapeHTML, - fatal=False) - if not anvplayer_data: - continue - video = anvplayer_data.get('video') - if not isinstance(video, compat_str) or not video.isdigit(): - continue - access_key = anvplayer_data.get('accessKey') - if not access_key: - mcp = anvplayer_data.get('mcp') - if mcp: - access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get( - mcp.lower()) + @classmethod + def _extract_from_webpage(cls, url, webpage): + for mobj in re.finditer(cls._ANVP_RE, webpage): + anvplayer_data = unescapeHTML(json.loads(mobj.group('anvp'))) or {} + video_id, access_key = anvplayer_data.get('video'), anvplayer_data.get('accessKey') if not access_key: + access_key = cls._MCP_TO_ACCESS_KEY_TABLE.get((anvplayer_data.get('mcp') or '').lower()) + if not (video_id or '').isdigit() or not access_key: continue - entries.append(ie.url_result( - 'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(), - video_id=video)) - return entries + url = f'anvato:{access_key}:{video_id}' + if anvplayer_data.get('token'): + url = smuggle_url(url, {'token': anvplayer_data['token']}) + yield cls.url_result(url, AnvatoIE, video_id) def _extract_anvato_videos(self, webpage, video_id): anvplayer_data = self._parse_json( @@ -383,7 +398,7 @@ class AnvatoIE(InfoExtractor): self._ANVP_RE, webpage, 'Anvato player data', group='anvp'), video_id) return self._get_anvato_videos( - anvplayer_data['accessKey'], anvplayer_data['video']) + anvplayer_data['accessKey'], anvplayer_data['video'], 'default') # cbslocal token = 'default' def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -391,9 +406,7 @@ class AnvatoIE(InfoExtractor): 'countries': smuggled_data.get('geo_countries'), }) - mobj = self._match_valid_url(url) - access_key, video_id = mobj.group('access_key_or_mcp', 'id') + access_key, video_id = self._match_valid_url(url).group('access_key_or_mcp', 'id') if access_key not in self._ANVACK_TABLE: - access_key = 
self._MCP_TO_ACCESS_KEY_TABLE.get( - access_key) or access_key - return self._get_anvato_videos(access_key, video_id) + access_key = self._MCP_TO_ACCESS_KEY_TABLE.get(access_key) or access_key + return self._get_anvato_videos(access_key, video_id, smuggled_data.get('token')) diff --git a/hypervideo_dl/extractor/anvato_token_generator/__init__.py b/hypervideo_dl/extractor/anvato_token_generator/__init__.py deleted file mode 100644 index 6e223db..0000000 --- a/hypervideo_dl/extractor/anvato_token_generator/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from __future__ import unicode_literals - -from .nfl import NFLTokenGenerator - -__all__ = [ - 'NFLTokenGenerator', -] diff --git a/hypervideo_dl/extractor/anvato_token_generator/common.py b/hypervideo_dl/extractor/anvato_token_generator/common.py deleted file mode 100644 index b959a90..0000000 --- a/hypervideo_dl/extractor/anvato_token_generator/common.py +++ /dev/null @@ -1,6 +0,0 @@ -from __future__ import unicode_literals - - -class TokenGenerator: - def generate(self, anvack, mcp_id): - raise NotImplementedError('This method must be implemented by subclasses') diff --git a/hypervideo_dl/extractor/anvato_token_generator/nfl.py b/hypervideo_dl/extractor/anvato_token_generator/nfl.py deleted file mode 100644 index 97a2b24..0000000 --- a/hypervideo_dl/extractor/anvato_token_generator/nfl.py +++ /dev/null @@ -1,30 +0,0 @@ -from __future__ import unicode_literals - -import json - -from .common import TokenGenerator - - -class NFLTokenGenerator(TokenGenerator): - _AUTHORIZATION = None - - def generate(ie, anvack, mcp_id): - if not NFLTokenGenerator._AUTHORIZATION: - reroute = ie._download_json( - 'https://api.nfl.com/v1/reroute', mcp_id, - data=b'grant_type=client_credentials', - headers={'X-Domain-Id': 100}) - NFLTokenGenerator._AUTHORIZATION = '%s %s' % (reroute.get('token_type') or 'Bearer', reroute['access_token']) - return ie._download_json( - 'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({ - 'query': '''{ - 
viewer { - mediaToken(anvack: "%s", id: %s) { - token - } - } -}''' % (anvack, mcp_id), - }).encode(), headers={ - 'Authorization': NFLTokenGenerator._AUTHORIZATION, - 'Content-Type': 'application/json', - })['data']['viewer']['mediaToken']['token'] diff --git a/hypervideo_dl/extractor/aol.py b/hypervideo_dl/extractor/aol.py index 4766a2c..6949ca9 100644 --- a/hypervideo_dl/extractor/aol.py +++ b/hypervideo_dl/extractor/aol.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .yahoo import YahooIE @@ -12,7 +9,7 @@ from ..utils import ( ) -class AolIE(YahooIE): +class AolIE(YahooIE): # XXX: Do not subclass from concrete IE IE_NAME = 'aol.com' _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>\d{9}|[0-9a-f]{24}|[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})' @@ -122,7 +119,6 @@ class AolIE(YahooIE): 'height': int_or_none(qs.get('h', [None])[0]), }) formats.append(f) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/apa.py b/hypervideo_dl/extractor/apa.py index 1736cdf..1ea0b1d 100644 --- a/hypervideo_dl/extractor/apa.py +++ b/hypervideo_dl/extractor/apa.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -13,6 +8,7 @@ from ..utils import ( class APAIE(InfoExtractor): _VALID_URL = r'(?P<base_url>https?://[^/]+\.apa\.at)/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1'] _TESTS = [{ 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', 'md5': '2b12292faeb0a7d930c778c7a5b4759b', @@ -33,14 +29,6 @@ class APAIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in 
re.finditer( - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1', - webpage)] - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id, base_url = mobj.group('id', 'base_url') @@ -84,7 +72,6 @@ class APAIE(InfoExtractor): 'format_id': format_id, 'height': height, }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/aparat.py b/hypervideo_dl/extractor/aparat.py index 1057233..4a989d8 100644 --- a/hypervideo_dl/extractor/aparat.py +++ b/hypervideo_dl/extractor/aparat.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( get_element_by_id, @@ -13,6 +10,7 @@ from ..utils import ( class AparatIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)' + _EMBED_REGEX = [r'<iframe .*?src="(?P<url>http://www\.aparat\.com/video/[^"]+)"'] _TESTS = [{ 'url': 'http://www.aparat.com/v/wP8On', @@ -75,7 +73,6 @@ class AparatIE(InfoExtractor): r'(\d+)[pP]', label or '', 'height', default=None)), }) - self._sort_formats(formats) info = self._search_json_ld(webpage, video_id, default={}) diff --git a/hypervideo_dl/extractor/appleconnect.py b/hypervideo_dl/extractor/appleconnect.py index 494f833..d00b0f9 100644 --- a/hypervideo_dl/extractor/appleconnect.py +++ b/hypervideo_dl/extractor/appleconnect.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( str_to_int, diff --git a/hypervideo_dl/extractor/applepodcasts.py b/hypervideo_dl/extractor/applepodcasts.py index 9139ff7..49bbeab 100644 --- a/hypervideo_dl/extractor/applepodcasts.py +++ b/hypervideo_dl/extractor/applepodcasts.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( clean_html, 
diff --git a/hypervideo_dl/extractor/appletrailers.py b/hypervideo_dl/extractor/appletrailers.py index 0abfb43..a5abb55 100644 --- a/hypervideo_dl/extractor/appletrailers.py +++ b/hypervideo_dl/extractor/appletrailers.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re import json @@ -122,7 +120,6 @@ class AppleTrailersIE(InfoExtractor): 'height': int_or_none(size_data.get('height')), 'language': version[:2], }) - self._sort_formats(formats) entries.append({ 'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(), @@ -187,8 +184,6 @@ class AppleTrailersIE(InfoExtractor): 'height': int_or_none(format['height']), }) - self._sort_formats(formats) - playlist.append({ '_type': 'video', 'id': video_id, diff --git a/hypervideo_dl/extractor/archiveorg.py b/hypervideo_dl/extractor/archiveorg.py index 2ab3c1b..90dda9f 100644 --- a/hypervideo_dl/extractor/archiveorg.py +++ b/hypervideo_dl/extractor/archiveorg.py @@ -1,39 +1,35 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re import json +import re +import urllib.parse + from .common import InfoExtractor -from .youtube import YoutubeIE, YoutubeBaseInfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, - compat_HTTPError -) +from .youtube import YoutubeBaseInfoExtractor, YoutubeIE +from ..compat import compat_HTTPError, compat_urllib_parse_unquote from ..utils import ( + KNOWN_EXTENSIONS, + ExtractorError, + HEADRequest, bug_reports_message, clean_html, dict_get, extract_attributes, - ExtractorError, get_element_by_id, - HEADRequest, int_or_none, join_nonempty, - KNOWN_EXTENSIONS, + js_to_json, merge_dicts, mimetype2ext, orderedSet, parse_duration, parse_qs, - str_to_int, str_or_none, + str_to_int, traverse_obj, try_get, unified_strdate, unified_timestamp, + url_or_none, urlhandle_detect_ext, - url_or_none ) @@ -54,6 +50,11 @@ class ArchiveOrgIE(InfoExtractor): 'upload_date': '20100315', 'creator': 'SRI International', 
'uploader': 'laura@archive.org', + 'thumbnail': r're:https://archive\.org/download/.*\.jpg', + 'release_year': 1968, + 'display_id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect.cdr', + 'track': 'XD300-23 68HighlightsAResearchCntAugHumanIntellect', + }, }, { 'url': 'https://archive.org/details/Cops1922', @@ -62,33 +63,43 @@ class ArchiveOrgIE(InfoExtractor): 'id': 'Cops1922', 'ext': 'mp4', 'title': 'Buster Keaton\'s "Cops" (1922)', - 'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c', + 'description': 'md5:cd6f9910c35aedd5fc237dbc3957e2ca', 'uploader': 'yorkmba99@hotmail.com', 'timestamp': 1387699629, 'upload_date': '20131222', + 'display_id': 'Cops-v2.mp4', + 'thumbnail': r're:https://archive\.org/download/.*\.jpg', + 'duration': 1091.96, }, }, { 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'only_matching': True, }, { 'url': 'https://archive.org/details/Election_Ads', - 'md5': '284180e857160cf866358700bab668a3', + 'md5': 'eec5cddebd4793c6a653b69c3b11f2e6', 'info_dict': { 'id': 'Election_Ads/Commercial-JFK1960ElectionAdCampaignJingle.mpg', 'title': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg', - 'ext': 'mp4', + 'ext': 'mpg', + 'thumbnail': r're:https://archive\.org/download/.*\.jpg', + 'duration': 59.77, + 'display_id': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg', }, }, { 'url': 'https://archive.org/details/Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg', - 'md5': '7915213ef02559b5501fe630e1a53f59', + 'md5': 'ea1eed8234e7d4165f38c8c769edef38', 'info_dict': { 'id': 'Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg', 'title': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg', - 'ext': 'mp4', + 'ext': 'mpg', 'timestamp': 1205588045, 'uploader': 'mikedavisstripmaster@yahoo.com', 'description': '1960 Presidential Campaign Election Commercials John F Kennedy, Richard M Nixon', 'upload_date': '20080315', + 'display_id': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg', + 'duration': 
59.51, + 'license': 'http://creativecommons.org/licenses/publicdomain/', + 'thumbnail': r're:https://archive\.org/download/.*\.jpg', }, }, { 'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16', @@ -97,6 +108,12 @@ class ArchiveOrgIE(InfoExtractor): 'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t01.flac', 'title': 'Turning', 'ext': 'flac', + 'track': 'Turning', + 'creator': 'Grateful Dead', + 'display_id': 'gd1977-05-08d01t01.flac', + 'track_number': 1, + 'album': '1977-05-08 - Barton Hall - Cornell University', + 'duration': 39.8, }, }, { 'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac', @@ -107,11 +124,20 @@ class ArchiveOrgIE(InfoExtractor): 'ext': 'flac', 'timestamp': 1205895624, 'uploader': 'mvernon54@yahoo.com', - 'description': 'md5:6a31f1996db0aa0fc9da6d6e708a1bb0', + 'description': 'md5:6c921464414814720c6593810a5c7e3d', 'upload_date': '20080319', 'location': 'Barton Hall - Cornell University', + 'duration': 438.68, + 'track': 'Deal', + 'creator': 'Grateful Dead', + 'album': '1977-05-08 - Barton Hall - Cornell University', + 'release_date': '19770508', + 'display_id': 'gd1977-05-08d01t07.flac', + 'release_year': 1977, + 'track_number': 7, }, }, { + # FIXME: give a better error message than just IndexError when all available formats are restricted 'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik', 'md5': '7cb019baa9b332e82ea7c10403acd180', 'info_dict': { @@ -119,6 +145,7 @@ class ArchiveOrgIE(InfoExtractor): 'title': 'Bells Of Rostov', 'ext': 'mp3', }, + 'skip': 'restricted' }, { 'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02.+Song+And+Chorus+In+The+Polovetsian+Camp+From+%22Prince+Igor%22+(Act+2%2C+Scene+1).mp3', 'md5': '1d0aabe03edca83ca58d9ed3b493a3c3', @@ -131,6 +158,52 @@ class ArchiveOrgIE(InfoExtractor): 'description': 
'md5:012b2d668ae753be36896f343d12a236', 'upload_date': '20190928', }, + 'skip': 'restricted' + }, { + # Original formats are private + 'url': 'https://archive.org/details/irelandthemakingofarepublic', + 'info_dict': { + 'id': 'irelandthemakingofarepublic', + 'title': 'Ireland: The Making of a Republic', + 'upload_date': '20160610', + 'description': 'md5:f70956a156645a658a0dc9513d9e78b7', + 'uploader': 'dimitrios@archive.org', + 'creator': ['British Broadcasting Corporation', 'Time-Life Films'], + 'timestamp': 1465594947, + }, + 'playlist': [ + { + 'md5': '0b211261b26590d49df968f71b90690d', + 'info_dict': { + 'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel1_01.mov', + 'ext': 'mp4', + 'title': 'irelandthemakingofarepublicreel1_01.mov', + 'duration': 130.46, + 'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel1_01_000117.jpg', + 'display_id': 'irelandthemakingofarepublicreel1_01.mov', + }, + }, { + 'md5': '67335ee3b23a0da930841981c1e79b02', + 'info_dict': { + 'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel1_02.mov', + 'ext': 'mp4', + 'duration': 1395.13, + 'title': 'irelandthemakingofarepublicreel1_02.mov', + 'display_id': 'irelandthemakingofarepublicreel1_02.mov', + 'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel1_02_001374.jpg', + }, + }, { + 'md5': 'e470e86787893603f4a341a16c281eb5', + 'info_dict': { + 'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel2.mov', + 'ext': 'mp4', + 'duration': 1602.67, + 'title': 'irelandthemakingofarepublicreel2.mov', + 'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel2_001554.jpg', + 'display_id': 'irelandthemakingofarepublicreel2.mov', + }, + } + ] }] @staticmethod @@ -146,7 +219,7 @@ class ArchiveOrgIE(InfoExtractor): 
return json.loads(extract_attributes(element)['value']) def _real_extract(self, url): - video_id = compat_urllib_parse_unquote_plus(self._match_id(url)) + video_id = urllib.parse.unquote_plus(self._match_id(url)) identifier, entry_id = (video_id.split('/', 1) + [None])[:2] # Archive.org metadata API doesn't clearly demarcate playlist entries @@ -221,17 +294,25 @@ class ArchiveOrgIE(InfoExtractor): 'filesize': int_or_none(f.get('size'))}) extension = (f['name'].rsplit('.', 1) + [None])[1] - if extension in KNOWN_EXTENSIONS: + + # We don't want to skip private formats if the user has access to them, + # however without access to an account with such privileges we can't implement/test this. + # For now to be safe, we will only skip them if there is no user logged in. + is_logged_in = bool(self._get_cookies('https://archive.org').get('logged-in-sig')) + if extension in KNOWN_EXTENSIONS and (not f.get('private') or is_logged_in): entry['formats'].append({ 'url': 'https://archive.org/download/' + identifier + '/' + f['name'], 'format': f.get('format'), 'width': int_or_none(f.get('width')), 'height': int_or_none(f.get('height')), 'filesize': int_or_none(f.get('size')), - 'protocol': 'https'}) + 'protocol': 'https', + 'source_preference': 0 if f.get('source') == 'original' else -1, + 'format_note': f.get('source') + }) for entry in entries.values(): - self._sort_formats(entry['formats']) + entry['_format_sort_fields'] = ('source', ) if len(entries) == 1: # If there's only one item, use it as the main info dict @@ -287,7 +368,9 @@ class YoutubeWebArchiveIE(InfoExtractor): 'channel_id': 'UCukCyHaD-bK3in_pKpfH9Eg', 'duration': 32, 'uploader_id': 'Zeurel', - 'uploader_url': 'http://www.youtube.com/user/Zeurel' + 'uploader_url': 'https://www.youtube.com/user/Zeurel', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'channel_url': 'https://www.youtube.com/channel/UCukCyHaD-bK3in_pKpfH9Eg', } }, { # Internal link @@ -302,7 +385,9 @@ class YoutubeWebArchiveIE(InfoExtractor): 
'channel_id': 'UCHnyfMqiRRG1u-2MsSQLbXA', 'duration': 771, 'uploader_id': '1veritasium', - 'uploader_url': 'http://www.youtube.com/user/1veritasium' + 'uploader_url': 'https://www.youtube.com/user/1veritasium', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'channel_url': 'https://www.youtube.com/channel/UCHnyfMqiRRG1u-2MsSQLbXA', } }, { # Video from 2012, webm format itag 45. Newest capture is deleted video, with an invalid description. @@ -316,7 +401,9 @@ class YoutubeWebArchiveIE(InfoExtractor): 'duration': 398, 'description': 'md5:ff4de6a7980cb65d951c2f6966a4f2f3', 'uploader_id': 'machinima', - 'uploader_url': 'http://www.youtube.com/user/machinima' + 'uploader_url': 'https://www.youtube.com/user/machinima', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader': 'machinima' } }, { # FLV video. Video file URL does not provide itag information @@ -330,7 +417,10 @@ class YoutubeWebArchiveIE(InfoExtractor): 'duration': 19, 'description': 'md5:10436b12e07ac43ff8df65287a56efb4', 'uploader_id': 'jawed', - 'uploader_url': 'http://www.youtube.com/user/jawed' + 'uploader_url': 'https://www.youtube.com/user/jawed', + 'channel_url': 'https://www.youtube.com/channel/UC4QobU6STFB0P71PMvOGN5A', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader': 'jawed', } }, { 'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA', @@ -344,7 +434,9 @@ class YoutubeWebArchiveIE(InfoExtractor): 'duration': 204, 'description': 'md5:f7535343b6eda34a314eff8b85444680', 'uploader_id': 'itsmadeon', - 'uploader_url': 'http://www.youtube.com/user/itsmadeon' + 'uploader_url': 'https://www.youtube.com/user/itsmadeon', + 'channel_url': 'https://www.youtube.com/channel/UCqMDNf3Pn5L7pcNkuSEeO3w', + 'thumbnail': r're:https?://.*\.(jpg|webp)', } }, { # First capture is of dead video, second is the oldest from CDX response. @@ -355,10 +447,13 @@ class YoutubeWebArchiveIE(InfoExtractor): 'title': 'Fake Teen Doctor Strikes AGAIN! 
- Weekly Weird News', 'upload_date': '20160218', 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA', - 'duration': 1236, + 'duration': 1235, 'description': 'md5:21032bae736421e89c2edf36d1936947', 'uploader_id': 'MachinimaETC', - 'uploader_url': 'http://www.youtube.com/user/MachinimaETC' + 'uploader_url': 'https://www.youtube.com/user/MachinimaETC', + 'channel_url': 'https://www.youtube.com/channel/UCdIaNUarhzLSXGoItz7BHVA', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader': 'ETC News', } }, { # First capture of dead video, capture date in link links to dead capture. @@ -369,10 +464,13 @@ class YoutubeWebArchiveIE(InfoExtractor): 'title': 'WTF: Video Games Still Launch BROKEN?! - T.U.G.S.', 'upload_date': '20160219', 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA', - 'duration': 798, + 'duration': 797, 'description': 'md5:a1dbf12d9a3bd7cb4c5e33b27d77ffe7', 'uploader_id': 'MachinimaETC', - 'uploader_url': 'http://www.youtube.com/user/MachinimaETC' + 'uploader_url': 'https://www.youtube.com/user/MachinimaETC', + 'channel_url': 'https://www.youtube.com/channel/UCdIaNUarhzLSXGoItz7BHVA', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader': 'ETC News', }, 'expected_warnings': [ r'unable to download capture webpage \(it may not be archived\)' @@ -392,12 +490,11 @@ class YoutubeWebArchiveIE(InfoExtractor): 'title': 'It\'s Bootleg AirPods Time.', 'upload_date': '20211021', 'channel_id': 'UC7Jwj9fkrf1adN4fMmTkpug', - 'channel_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug', + 'channel_url': 'https://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug', 'duration': 810, 'description': 'md5:7b567f898d8237b256f36c1a07d6d7bc', + 'thumbnail': r're:https?://.*\.(jpg|webp)', 'uploader': 'DankPods', - 'uploader_id': 'UC7Jwj9fkrf1adN4fMmTkpug', - 'uploader_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug' } }, { # player response contains '};' See: https://github.com/ytdl-org/youtube-dl/issues/27093 @@ -408,12 +505,135 @@ class 
YoutubeWebArchiveIE(InfoExtractor): 'title': 'bitch lasagna', 'upload_date': '20181005', 'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', - 'channel_url': 'http://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'channel_url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', 'duration': 135, 'description': 'md5:2dbe4051feeff2dab5f41f82bb6d11d0', 'uploader': 'PewDiePie', 'uploader_id': 'PewDiePie', - 'uploader_url': 'http://www.youtube.com/user/PewDiePie' + 'uploader_url': 'https://www.youtube.com/user/PewDiePie', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + } + }, { + # ~June 2010 Capture. swfconfig + 'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=8XeW5ilk-9Y', + 'info_dict': { + 'id': '8XeW5ilk-9Y', + 'ext': 'flv', + 'title': 'Story of Stuff, The Critique Part 4 of 4', + 'duration': 541, + 'description': 'md5:28157da06f2c5e94c97f7f3072509972', + 'uploader': 'HowTheWorldWorks', + 'uploader_id': 'HowTheWorldWorks', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader_url': 'https://www.youtube.com/user/HowTheWorldWorks', + 'upload_date': '20090520', + } + }, { + # Jan 2011: watch-video-date/eow-date surrounded by whitespace + 'url': 'https://web.archive.org/web/20110126141719/http://www.youtube.com/watch?v=Q_yjX80U7Yc', + 'info_dict': { + 'id': 'Q_yjX80U7Yc', + 'ext': 'flv', + 'title': 'Spray Paint Art by Clay Butler: Purple Fantasy Forest', + 'uploader_id': 'claybutlermusic', + 'description': 'md5:4595264559e3d0a0ceb3f011f6334543', + 'upload_date': '20090803', + 'uploader': 'claybutlermusic', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'duration': 132, + 'uploader_url': 'https://www.youtube.com/user/claybutlermusic', + } + }, { + # ~May 2009 swfArgs. 
ytcfg is spread out over various vars + 'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=c5uJgG05xUY', + 'info_dict': { + 'id': 'c5uJgG05xUY', + 'ext': 'webm', + 'title': 'Story of Stuff, The Critique Part 1 of 4', + 'uploader_id': 'HowTheWorldWorks', + 'uploader': 'HowTheWorldWorks', + 'uploader_url': 'https://www.youtube.com/user/HowTheWorldWorks', + 'upload_date': '20090513', + 'description': 'md5:4ca77d79538064e41e4cc464e93f44f0', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'duration': 754, + } + }, { + # ~June 2012. Upload date is in another lang so cannot extract. + 'url': 'https://web.archive.org/web/20120607174520/http://www.youtube.com/watch?v=xWTLLl-dQaA', + 'info_dict': { + 'id': 'xWTLLl-dQaA', + 'ext': 'mp4', + 'title': 'Black Nerd eHarmony Video Bio Parody (SPOOF)', + 'uploader_url': 'https://www.youtube.com/user/BlackNerdComedy', + 'description': 'md5:e25f0133aaf9e6793fb81c18021d193e', + 'uploader_id': 'BlackNerdComedy', + 'uploader': 'BlackNerdComedy', + 'duration': 182, + 'thumbnail': r're:https?://.*\.(jpg|webp)', + } + }, { + # ~July 2013 + 'url': 'https://web.archive.org/web/*/https://www.youtube.com/watch?v=9eO1aasHyTM', + 'info_dict': { + 'id': '9eO1aasHyTM', + 'ext': 'mp4', + 'title': 'Polar-oid', + 'description': 'Cameras and bears are dangerous!', + 'uploader_url': 'https://www.youtube.com/user/punkybird', + 'uploader_id': 'punkybird', + 'duration': 202, + 'channel_id': 'UC62R2cBezNBOqxSerfb1nMQ', + 'channel_url': 'https://www.youtube.com/channel/UC62R2cBezNBOqxSerfb1nMQ', + 'upload_date': '20060428', + 'uploader': 'punkybird', + } + }, { + # April 2020: Player response in player config + 'url': 'https://web.archive.org/web/20200416034815/https://www.youtube.com/watch?v=Cf7vS8jc7dY&gl=US&hl=en', + 'info_dict': { + 'id': 'Cf7vS8jc7dY', + 'ext': 'mp4', + 'title': 'A Dramatic Pool Story (by Jamie Spicer-Lewis) - Game Grumps Animated', + 'duration': 64, + 'upload_date': '20200408', + 'uploader_id': 'GameGrumps', + 
'uploader': 'GameGrumps', + 'channel_url': 'https://www.youtube.com/channel/UC9CuvdOVfMPvKCiwdGKL3cQ', + 'channel_id': 'UC9CuvdOVfMPvKCiwdGKL3cQ', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'description': 'md5:c625bb3c02c4f5fb4205971e468fa341', + 'uploader_url': 'https://www.youtube.com/user/GameGrumps', + } + }, { + # watch7-user-header with yt-user-info + 'url': 'ytarchive:kbh4T_b4Ixw:20160307085057', + 'info_dict': { + 'id': 'kbh4T_b4Ixw', + 'ext': 'mp4', + 'title': 'Shovel Knight OST - Strike the Earth! Plains of Passage 16 bit SNES style remake / remix', + 'channel_url': 'https://www.youtube.com/channel/UCnTaGvsHmMy792DWeT6HbGA', + 'uploader': 'Nelward music', + 'duration': 213, + 'description': 'md5:804b4a9ce37b050a5fefdbb23aeba54d', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'upload_date': '20150503', + 'channel_id': 'UCnTaGvsHmMy792DWeT6HbGA', + } + }, { + # April 2012 + 'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=SOm7mPoPskU', + 'info_dict': { + 'id': 'SOm7mPoPskU', + 'ext': 'mp4', + 'title': 'Boyfriend - Justin Bieber Parody', + 'uploader_url': 'https://www.youtube.com/user/thecomputernerd01', + 'uploader': 'thecomputernerd01', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'description': 'md5:dd7fa635519c2a5b4d566beaecad7491', + 'duration': 200, + 'upload_date': '20120407', + 'uploader_id': 'thecomputernerd01', } }, { 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw', @@ -445,9 +665,11 @@ class YoutubeWebArchiveIE(InfoExtractor): 'only_matching': True }, ] - _YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE - _YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE - _YT_INITIAL_BOUNDARY_RE = r'(?:(?:var\s+meta|</script|\n)|%s)' 
% YoutubeBaseInfoExtractor._YT_INITIAL_BOUNDARY_RE + _YT_INITIAL_DATA_RE = YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE + _YT_INITIAL_PLAYER_RESPONSE_RE = fr'''(?x: + (?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*| + {YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE} + )''' _YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com'] # thumbnails most likely archived on these servers _YT_ALL_THUMB_SERVERS = orderedSet( @@ -477,11 +699,6 @@ class YoutubeWebArchiveIE(InfoExtractor): elif not isinstance(res, list) or len(res) != 0: self.report_warning('Error while parsing CDX API response' + bug_reports_message()) - def _extract_yt_initial_variable(self, webpage, regex, video_id, name): - return self._parse_json(self._search_regex( - (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), - regex), webpage, name, default='{}'), video_id, fatal=False) - def _extract_webpage_title(self, webpage): page_title = self._html_extract_title(webpage, default='') # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix. 
@@ -491,10 +708,32 @@ class YoutubeWebArchiveIE(InfoExtractor): def _extract_metadata(self, video_id, webpage): search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None)) - player_response = self._extract_yt_initial_variable( - webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {} - initial_data = self._extract_yt_initial_variable( - webpage, self._YT_INITIAL_DATA_RE, video_id, 'initial player response') or {} + player_response = self._search_json( + self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', + video_id, default={}) + initial_data = self._search_json( + self._YT_INITIAL_DATA_RE, webpage, 'initial data', video_id, default={}) + + ytcfg = {} + for j in re.findall(r'yt\.setConfig\(\s*(?P<json>{\s*(?s:.+?)\s*})\s*\);', webpage): # ~June 2010 + ytcfg.update(self._parse_json(j, video_id, fatal=False, ignore_extra=True, transform_source=js_to_json, errnote='') or {}) + + # XXX: this also may contain a 'ptchn' key + player_config = ( + self._search_json( + r'(?:yt\.playerConfig|ytplayer\.config|swfConfig)\s*=', + webpage, 'player config', video_id, default=None) + or ytcfg.get('PLAYER_CONFIG') or {}) + + # XXX: this may also contain a 'creator' key. 
+ swf_args = self._search_json(r'swfArgs\s*=', webpage, 'swf config', video_id, default={}) + if swf_args and not traverse_obj(player_config, ('args',)): + player_config['args'] = swf_args + + if not player_response: + # April 2020 + player_response = self._parse_json( + traverse_obj(player_config, ('args', 'player_response')) or '{}', video_id, fatal=False) initial_data_video = traverse_obj( initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'), @@ -510,21 +749,64 @@ class YoutubeWebArchiveIE(InfoExtractor): video_details.get('title') or YoutubeBaseInfoExtractor._get_text(microformats, 'title') or YoutubeBaseInfoExtractor._get_text(initial_data_video, 'title') + or traverse_obj(player_config, ('args', 'title')) or self._extract_webpage_title(webpage) or search_meta(['og:title', 'twitter:title', 'title'])) + def id_from_url(url, type_): + return self._search_regex( + rf'(?:{type_})/([^/#&?]+)', url or '', f'{type_} id', default=None) + + # XXX: would the get_elements_by_... functions be better suited here? + _CHANNEL_URL_HREF_RE = r'href="[^"]*(?P<url>https?://www\.youtube\.com/(?:user|channel)/[^"]+)"' + uploader_or_channel_url = self._search_regex( + [fr'<(?:link\s*itemprop=\"url\"|a\s*id=\"watch-username\").*?\b{_CHANNEL_URL_HREF_RE}>', # @fd05024 + fr'<div\s*id=\"(?:watch-channel-stats|watch-headline-user-info)\"[^>]*>\s*<a[^>]*\b{_CHANNEL_URL_HREF_RE}'], # ~ May 2009, ~June 2012 + webpage, 'uploader or channel url', default=None) + + owner_profile_url = url_or_none(microformats.get('ownerProfileUrl')) # @a6211d2 + + # Uploader refers to the /user/ id ONLY + uploader_id = ( + id_from_url(owner_profile_url, 'user') + or id_from_url(uploader_or_channel_url, 'user') + or ytcfg.get('VIDEO_USERNAME')) + uploader_url = f'https://www.youtube.com/user/{uploader_id}' if uploader_id else None + + # XXX: do we want to differentiate uploader and channel? 
+ uploader = ( + self._search_regex( + [r'<a\s*id="watch-username"[^>]*>\s*<strong>([^<]+)</strong>', # June 2010 + r'var\s*watchUsername\s*=\s*\'(.+?)\';', # ~May 2009 + r'<div\s*\bid=\"watch-channel-stats"[^>]*>\s*<a[^>]*>\s*(.+?)\s*</a', # ~May 2009 + r'<a\s*id="watch-userbanner"[^>]*title="\s*(.+?)\s*"'], # ~June 2012 + webpage, 'uploader', default=None) + or self._html_search_regex( + [r'(?s)<div\s*class="yt-user-info".*?<a[^>]*[^>]*>\s*(.*?)\s*</a', # March 2016 + r'(?s)<a[^>]*yt-user-name[^>]*>\s*(.*?)\s*</a'], # july 2013 + get_element_by_id('watch7-user-header', webpage), 'uploader', default=None) + or self._html_search_regex( + r'<button\s*href="/user/[^>]*>\s*<span[^>]*>\s*(.+?)\s*<', # April 2012 + get_element_by_id('watch-headline-user-info', webpage), 'uploader', default=None) + or traverse_obj(player_config, ('args', 'creator')) + or video_details.get('author')) + channel_id = str_or_none( video_details.get('channelId') or microformats.get('externalChannelId') or search_meta('channelId') or self._search_regex( r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1', # @b45a9e6 - webpage, 'channel id', default=None, group='id')) - channel_url = f'http://www.youtube.com/channel/{channel_id}' if channel_id else None + webpage, 'channel id', default=None, group='id') + or id_from_url(owner_profile_url, 'channel') + or id_from_url(uploader_or_channel_url, 'channel') + or traverse_obj(player_config, ('args', 'ucid'))) + channel_url = f'https://www.youtube.com/channel/{channel_id}' if channel_id else None duration = int_or_none( video_details.get('lengthSeconds') or microformats.get('lengthSeconds') + or traverse_obj(player_config, ('args', ('length_seconds', 'l')), get_all=False) or parse_duration(search_meta('duration'))) description = ( video_details.get('shortDescription') @@ -532,26 +814,13 @@ class YoutubeWebArchiveIE(InfoExtractor): or clean_html(get_element_by_id('eow-description', webpage)) # @9e6dd23 or search_meta(['description', 
'og:description', 'twitter:description'])) - uploader = video_details.get('author') - - # Uploader ID and URL - uploader_mobj = re.search( - r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', # @fd05024 - webpage) - if uploader_mobj is not None: - uploader_id, uploader_url = uploader_mobj.group('uploader_id'), uploader_mobj.group('uploader_url') - else: - # @a6211d2 - uploader_url = url_or_none(microformats.get('ownerProfileUrl')) - uploader_id = self._search_regex( - r'(?:user|channel)/([^/]+)', uploader_url or '', 'uploader id', default=None) - upload_date = unified_strdate( dict_get(microformats, ('uploadDate', 'publishDate')) or search_meta(['uploadDate', 'datePublished']) or self._search_regex( - [r'(?s)id="eow-date.*?>(.*?)</span>', - r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], # @7998520 + [r'(?s)id="eow-date.*?>\s*(.*?)\s*</span>', + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']', # @7998520 + r'class\s*=\s*"(?:watch-video-date|watch-video-added post-date)"[^>]*>\s*([^<]+?)\s*<'], # ~June 2010, ~Jan 2009 (respectively) webpage, 'upload date', default=None)) return { @@ -597,7 +866,7 @@ class YoutubeWebArchiveIE(InfoExtractor): response = self._call_cdx_api( video_id, f'https://www.youtube.com/watch?v={video_id}', filters=['mimetype:text/html'], collapse=['timestamp:6', 'digest'], query={'matchType': 'prefix'}) or [] - all_captures = sorted([int_or_none(r['timestamp']) for r in response if int_or_none(r['timestamp']) is not None]) + all_captures = sorted(int_or_none(r['timestamp']) for r in response if int_or_none(r['timestamp']) is not None) # Prefer the new polymer UI captures as we support extracting more metadata from them # WBM captures seem to all switch to this layout ~July 2020 @@ -620,18 +889,22 @@ class 
YoutubeWebArchiveIE(InfoExtractor): url_date = url_date or url_date_2 urlh = None - try: - urlh = self._request_webpage( - HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id), - video_id, note='Fetching archived video file url', expected_status=True) - except ExtractorError as e: - # HTTP Error 404 is expected if the video is not saved. - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: - self.raise_no_formats( - 'The requested video is not archived, indexed, or there is an issue with web.archive.org', - expected=True) - else: - raise + retry_manager = self.RetryManager(fatal=False) + for retry in retry_manager: + try: + urlh = self._request_webpage( + HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id), + video_id, note='Fetching archived video file url', expected_status=True) + except ExtractorError as e: + # HTTP Error 404 is expected if the video is not saved. + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + self.raise_no_formats( + 'The requested video is not archived, indexed, or there is an issue with web.archive.org (try again later)', expected=True) + else: + retry.error = e + + if retry_manager.error: + self.raise_no_formats(retry_manager.error, expected=True, video_id=video_id) capture_dates = self._get_capture_dates(video_id, int_or_none(url_date)) self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', ')) diff --git a/hypervideo_dl/extractor/arcpublishing.py b/hypervideo_dl/extractor/arcpublishing.py index 8880e5c..febd3d2 100644 --- a/hypervideo_dl/extractor/arcpublishing.py +++ b/hypervideo_dl/extractor/arcpublishing.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -73,8 +70,8 @@ class ArcPublishingIE(InfoExtractor): ], 'video-api-cdn.%s.arcpublishing.com/api'), ] - @staticmethod - def _extract_urls(webpage): + 
@classmethod + def _extract_embed_urls(cls, url, webpage): entries = [] # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview for powa_el in re.findall(r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage): @@ -147,7 +144,6 @@ class ArcPublishingIE(InfoExtractor): 'url': s_url, 'quality': -10, }) - self._sort_formats(formats) subtitles = {} for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []): diff --git a/hypervideo_dl/extractor/ard.py b/hypervideo_dl/extractor/ard.py index 7ea339b..0a8a874 100644 --- a/hypervideo_dl/extractor/ard.py +++ b/hypervideo_dl/extractor/ard.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re @@ -43,8 +40,6 @@ class ARDMediathekBaseIE(InfoExtractor): 'This video is not available due to geoblocking', countries=self._GEO_COUNTRIES, metadata_available=True) - self._sort_formats(formats) - subtitles = {} subtitle_url = media_info.get('_subtitleUrl') if subtitle_url: @@ -265,7 +260,6 @@ class ARDMediathekIE(ARDMediathekBaseIE): 'format_id': fid, 'url': furl, }) - self._sort_formats(formats) info = { 'formats': formats, } @@ -374,7 +368,6 @@ class ARDIE(InfoExtractor): continue f['url'] = format_url formats.append(f) - self._sort_formats(formats) _SUB_FORMATS = ( ('./dataTimedText', 'ttml'), diff --git a/hypervideo_dl/extractor/arkena.py b/hypervideo_dl/extractor/arkena.py index 4f4f457..de36ec8 100644 --- a/hypervideo_dl/extractor/arkena.py +++ b/hypervideo_dl/extractor/arkena.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -22,6 +17,8 @@ class ArkenaIE(InfoExtractor): play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+) ) ''' + # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video + _EMBED_REGEX = 
[r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1'] _TESTS = [{ 'url': 'https://video.qbrick.com/play2/embed/player?accountId=1034090&mediaId=d8ab4607-00090107-aab86310', 'md5': '97f117754e5f3c020f5f26da4a44ebaf', @@ -53,15 +50,6 @@ class ArkenaIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1', - webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') @@ -148,7 +136,6 @@ class ArkenaIE(InfoExtractor): elif mime_type == 'application/vnd.ms-sstr+xml': formats.extend(self._extract_ism_formats( href, video_id, ism_id='mss', fatal=False)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/arnes.py b/hypervideo_dl/extractor/arnes.py index 050c252..a493714 100644 --- a/hypervideo_dl/extractor/arnes.py +++ b/hypervideo_dl/extractor/arnes.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_parse_qs, @@ -76,7 +73,6 @@ class ArnesIE(InfoExtractor): 'width': int_or_none(media.get('width')), 'height': int_or_none(media.get('height')), }) - self._sort_formats(formats) channel = video.get('channel') or {} channel_id = channel.get('url') @@ -93,7 +89,7 @@ class ArnesIE(InfoExtractor): 'timestamp': parse_iso8601(video.get('creationTime')), 'channel': channel.get('name'), 'channel_id': channel_id, - 'channel_url': format_field(channel_id, template=f'{self._BASE_URL}/?channel=%s'), + 'channel_url': format_field(channel_id, None, f'{self._BASE_URL}/?channel=%s'), 'duration': float_or_none(video.get('duration'), 1000), 'view_count': int_or_none(video.get('views')), 'tags': video.get('hashtags'), diff --git 
a/hypervideo_dl/extractor/arte.py b/hypervideo_dl/extractor/arte.py index c2f2c1b..54e4d2d 100644 --- a/hypervideo_dl/extractor/arte.py +++ b/hypervideo_dl/extractor/arte.py @@ -1,193 +1,216 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor -from ..compat import ( - compat_str, -) from ..utils import ( ExtractorError, + GeoRestrictedError, int_or_none, + parse_iso8601, parse_qs, - qualities, strip_or_none, - try_get, - unified_strdate, + traverse_obj, url_or_none, ) class ArteTVBaseIE(InfoExtractor): _ARTE_LANGUAGES = 'fr|de|en|es|it|pl' - _API_BASE = 'https://api.arte.tv/api/player/v1' + _API_BASE = 'https://api.arte.tv/api/player/v2' class ArteTVIE(ArteTVBaseIE): _VALID_URL = r'''(?x) - https?:// + (?:https?:// (?: (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos| api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s) ) - /(?P<id>\d{6}-\d{3}-[AF]) + |arte://program) + /(?P<id>\d{6}-\d{3}-[AF]|LIVE) ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES} _TESTS = [{ 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', + 'only_matching': True, + }, { + 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/', 'info_dict': { - 'id': '088501-000-A', + 'id': '100103-000-A', + 'title': 'USA: Dyskryminacja na porodówce', + 'description': 'md5:242017b7cce59ffae340a54baefcafb1', + 'alt_title': 'ARTE Reportage', + 'upload_date': '20201103', + 'duration': 554, + 'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530', + 'timestamp': 1604417980, 'ext': 'mp4', - 'title': 'Mexico: Stealing Petrol to Survive', - 'upload_date': '20190628', }, + 'params': {'skip_download': 'm3u8'} }, { - 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/', - 'only_matching': True, + 'note': 'No alt_title', + 'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/', + 'info_dict': { + 'id': '110371-000-A', + 'ext': 'mp4', + 
'upload_date': '20220718', + 'duration': 154, + 'timestamp': 1658162460, + 'description': 'md5:5890f36fe7dccfadb8b7c0891de54786', + 'title': 'La chaleur, supplice des arbres de rue', + 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530', + }, + 'params': {'skip_download': 'm3u8'} }, { 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A', 'only_matching': True, + }, { + 'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE', + 'only_matching': True, }] + _GEO_BYPASS = True + + _LANG_MAP = { # ISO639 -> French abbreviations + 'fr': 'F', + 'de': 'A', + 'en': 'E[ANG]', + 'es': 'E[ESP]', + 'it': 'E[ITA]', + 'pl': 'E[POL]', + # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/> + # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed) + 'mul': 'EU', + } + + _VERSION_CODE_RE = re.compile(r'''(?x) + V + (?P<original_voice>O?) + (?P<vlang>[FA]|E\[[A-Z]+\]|EU)? + (?P<audio_desc>AUD|) + (?: + (?P<has_sub>-ST) + (?P<sdh_sub>M?) + (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU) + )? 
+ ''') + + # all obtained by exhaustive testing + _COUNTRIES_MAP = { + 'DE_FR': ( + 'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC', + 'PF', 'PM', 'RE', 'WF', 'YT', + ), + # with both of the below 'BE' sometimes works, sometimes doesn't + 'EUR_DE_FR': ( + 'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI', + 'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF', + 'YT', + ), + 'SAT': ( + 'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ', + 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF', + 'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI', + 'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC', + 'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO', + 'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT', + ), + } + def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') lang = mobj.group('lang') or mobj.group('lang_2') - - info = self._download_json( - '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id) - player_info = info['videoJsonPlayer'] - - vsr = try_get(player_info, lambda x: x['VSR'], dict) - if not vsr: - error = None - if try_get(player_info, lambda x: x['custom_msg']['type']) == 'error': - error = try_get( - player_info, lambda x: x['custom_msg']['msg'], compat_str) - if not error: - error = 'Video %s is not available' % player_info.get('VID') or video_id - raise ExtractorError(error, expected=True) - - upload_date_str = player_info.get('shootingDate') - if not upload_date_str: - upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] - - title = (player_info.get('VTI') or player_info['VID']).strip() - subtitle = player_info.get('VSU', '').strip() - if subtitle: - title += ' - %s' % subtitle - - qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ']) - - LANGS = { - 'fr': 'F', - 'de': 'A', - 'en': 'E[ANG]', - 'es': 'E[ESP]', - 'it': 'E[ITA]', - 'pl': 'E[POL]', - } - - langcode = LANGS.get(lang, lang) - - formats = [] - for format_id, format_dict in vsr.items(): - f = dict(format_dict) - format_url = url_or_none(f.get('url')) - 
streamer = f.get('streamer') - if not format_url and not streamer: - continue - versionCode = f.get('versionCode') - l = re.escape(langcode) - - # Language preference from most to least priority - # Reference: section 6.8 of - # https://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-07-1.pdf - PREFERENCES = ( - # original version in requested language, without subtitles - r'VO{0}$'.format(l), - # original version in requested language, with partial subtitles in requested language - r'VO{0}-ST{0}$'.format(l), - # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language - r'VO{0}-STM{0}$'.format(l), - # non-original (dubbed) version in requested language, without subtitles - r'V{0}$'.format(l), - # non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language - r'V{0}-ST{0}$'.format(l), - # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language - r'V{0}-STM{0}$'.format(l), - # original version in requested language, with partial subtitles in different language - r'VO{0}-ST(?!{0}).+?$'.format(l), - # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language - r'VO{0}-STM(?!{0}).+?$'.format(l), - # original version in different language, with partial subtitles in requested language - r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l), - # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language - r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l), - # original version in different language, without subtitles - r'VO(?:(?!{0}))?$'.format(l), - # original version in different language, with partial subtitles in different language - r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l), - # original version in different language, with subtitles for the deaf and hard-of-hearing in different language - 
r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l), - ) - - for pref, p in enumerate(PREFERENCES): - if re.match(p, versionCode): - lang_pref = len(PREFERENCES) - pref - break - else: - lang_pref = -1 - format_note = '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')) - - media_type = f.get('mediaType') - if media_type == 'hls': - m3u8_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False) - for m3u8_format in m3u8_formats: - m3u8_format.update({ + langauge_code = self._LANG_MAP.get(lang) + + config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id) + + geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {} + if geoblocking.get('restrictedArea'): + raise GeoRestrictedError(f'Video restricted to {geoblocking["code"]!r}', + countries=self._COUNTRIES_MAP.get(geoblocking['code'], ('DE', 'FR'))) + + if not traverse_obj(config, ('data', 'attributes', 'rights')): + # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten + # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23 + raise ExtractorError( + 'Video is not available in this language edition of Arte or broadcast rights expired', expected=True) + + formats, subtitles = [], {} + secondary_formats = [] + for stream in config['data']['attributes']['streams']: + # official player contains code like `e.get("versions")[0].eStat.ml5` + stream_version = stream['versions'][0] + stream_version_code = stream_version['eStat']['ml5'] + + lang_pref = -1 + m = self._VERSION_CODE_RE.match(stream_version_code) + if m: + lang_pref = int(''.join('01'[x] for x in ( + m.group('vlang') == langauge_code, # we prefer voice in the requested language + not m.group('audio_desc'), # and not the audio description version + bool(m.group('original_voice')), # but if voice is not in the requested language, at least choose the original voice + m.group('sub_lang') == langauge_code, 
# if subtitles are present, we prefer them in the requested language + not m.group('has_sub'), # but we prefer no subtitles otherwise + not m.group('sdh_sub'), # and we prefer not the hard-of-hearing subtitles if there are subtitles + ))) + + short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?') + if stream['protocol'].startswith('HLS'): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False) + for fmt in fmts: + fmt.update({ + 'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]', 'language_preference': lang_pref, - 'format_note': format_note, }) - formats.extend(m3u8_formats) - continue + if any(map(short_label.startswith, ('cc', 'OGsub'))): + secondary_formats.extend(fmts) + else: + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + elif stream['protocol'] in ('HTTPS', 'RTMP'): + formats.append({ + 'format_id': f'{stream["protocol"]}-{stream_version_code}', + 'url': stream['url'], + 'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]', + 'language_preference': lang_pref, + # 'ext': 'mp4', # XXX: may or may not be necessary, at least for HTTPS + }) - format = { - 'format_id': format_id, - 'language_preference': lang_pref, - 'format_note': format_note, - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'tbr': int_or_none(f.get('bitrate')), - 'quality': qfunc(f.get('quality')), - } - - if media_type == 'rtmp': - format['url'] = f['streamer'] - format['play_path'] = 'mp4:' + f['url'] - format['ext'] = 'flv' else: - format['url'] = f['url'] + self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}') - formats.append(format) + # TODO: chapters from stream['segments']? 
+ # The JS also looks for chapters in config['data']['attributes']['chapters'], + # but I am yet to find a video having those - # For this extractor, quality only represents the relative quality - # with respect to other formats with the same resolution - self._sort_formats(formats, ('res', 'quality')) + formats.extend(secondary_formats) + self._remove_duplicate_formats(formats) + + metadata = config['data']['attributes']['metadata'] return { - 'id': player_info.get('VID') or video_id, - 'title': title, - 'description': player_info.get('VDE') or player_info.get('V7T'), - 'upload_date': unified_strdate(upload_date_str), - 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), + 'id': metadata['providerId'], + 'webpage_url': traverse_obj(metadata, ('link', 'url')), + 'title': traverse_obj(metadata, 'subtitle', 'title'), + 'alt_title': metadata.get('subtitle') and metadata.get('title'), + 'description': metadata.get('description'), + 'duration': traverse_obj(metadata, ('duration', 'seconds')), + 'language': metadata.get('language'), + 'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601), + 'is_live': config['data']['attributes'].get('live', False), 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': [ + {'url': image['url'], 'id': image.get('caption')} + for image in metadata.get('images') or [] if url_or_none(image.get('url')) + ], } class ArteTVEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+' + _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1'] _TESTS = [{ 'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A', 'info_dict': { @@ -197,17 +220,12 @@ class ArteTVEmbedIE(InfoExtractor): 'description': 
'md5:be40b667f45189632b78c1425c7c2ce1', 'upload_date': '20201116', }, + 'skip': 'No video available' }, { 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A', 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1', - webpage)] - def _real_extract(self, url): qs = parse_qs(url) json_url = qs['json_url'][0] @@ -220,44 +238,36 @@ class ArteTVPlaylistIE(ArteTVBaseIE): _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES _TESTS = [{ 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/', - 'info_dict': { - 'id': 'RC-016954', - 'title': 'Earn a Living', - 'description': 'md5:d322c55011514b3a7241f7fb80d494c2', - }, - 'playlist_mincount': 6, + 'only_matching': True, }, { 'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/', - 'only_matching': True, + 'playlist_mincount': 100, + 'info_dict': { + 'description': 'md5:84e7bf1feda248bc325ebfac818c476e', + 'id': 'RC-014123', + 'title': 'ARTE Reportage - najlepsze reportaże', + }, }] def _real_extract(self, url): - lang, playlist_id = self._match_valid_url(url).groups() - collection = self._download_json( - '%s/collectionData/%s/%s?source=videos' - % (self._API_BASE, lang, playlist_id), playlist_id) - entries = [] - for video in collection['videos']: - if not isinstance(video, dict): - continue - video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl')) - if not video_url: - continue - video_id = video.get('programId') - entries.append({ - '_type': 'url_transparent', - 'url': video_url, - 'id': video_id, - 'title': video.get('title'), - 'alt_title': video.get('subtitle'), - 'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)), - 'duration': 
int_or_none(video.get('durationSeconds')), - 'view_count': int_or_none(video.get('views')), - 'ie_key': ArteTVIE.ie_key(), - }) - title = collection.get('title') - description = collection.get('shortDescription') or collection.get('teaserText') - return self.playlist_result(entries, playlist_id, title, description) + lang, playlist_id = self._match_valid_url(url).group('lang', 'id') + playlist = self._download_json( + f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)['data']['attributes'] + + entries = [{ + '_type': 'url_transparent', + 'url': video['config']['url'], + 'ie_key': ArteTVIE.ie_key(), + 'id': video.get('providerId'), + 'title': video.get('title'), + 'alt_title': video.get('subtitle'), + 'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))), + 'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))), + } for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))] + + return self.playlist_result(entries, playlist_id, + traverse_obj(playlist, ('metadata', 'title')), + traverse_obj(playlist, ('metadata', 'description'))) class ArteTVCategoryIE(ArteTVBaseIE): @@ -270,14 +280,13 @@ class ArteTVCategoryIE(ArteTVBaseIE): 'description': 'Investigative documentary series, geopolitical analysis, and international commentary', }, 'playlist_mincount': 13, - }, - ] + }] @classmethod def suitable(cls, url): return ( not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, )) - and super(ArteTVCategoryIE, cls).suitable(url)) + and super().suitable(url)) def _real_extract(self, url): lang, playlist_id = self._match_valid_url(url).groups() @@ -293,9 +302,7 @@ class ArteTVCategoryIE(ArteTVBaseIE): if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )): items.append(video) - title = (self._og_search_title(webpage, default=None) - or self._html_search_regex(r'<title\b[^>]*>([^<]+)', default=None)) - title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url) + title = 
strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title, description=self._og_search_description(webpage, default=None)) diff --git a/hypervideo_dl/extractor/asiancrush.py b/hypervideo_dl/extractor/asiancrush.py index 7f1940f..23f310e 100644 --- a/hypervideo_dl/extractor/asiancrush.py +++ b/hypervideo_dl/extractor/asiancrush.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools import re diff --git a/hypervideo_dl/extractor/atresplayer.py b/hypervideo_dl/extractor/atresplayer.py index 465af4e..a20e7f9 100644 --- a/hypervideo_dl/extractor/atresplayer.py +++ b/hypervideo_dl/extractor/atresplayer.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( @@ -88,7 +84,6 @@ class AtresPlayerIE(InfoExtractor): elif src_type == 'application/dash+xml': formats, subtitles = self._extract_mpd_formats( src, video_id, mpd_id='dash', fatal=False) - self._sort_formats(formats) heartbeat = episode.get('heartbeat') or {} omniture = episode.get('omniture') or {} diff --git a/hypervideo_dl/extractor/atscaleconf.py b/hypervideo_dl/extractor/atscaleconf.py new file mode 100644 index 0000000..3f7b1e9 --- /dev/null +++ b/hypervideo_dl/extractor/atscaleconf.py @@ -0,0 +1,34 @@ +import re + +from .common import InfoExtractor + + +class AtScaleConfEventIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?atscaleconference\.com/events/(?P[^/&$?]+)' + + _TESTS = [{ + 'url': 'https://atscaleconference.com/events/data-scale-spring-2022/', + 'playlist_mincount': 13, + 'info_dict': { + 'id': 'data-scale-spring-2022', + 'title': 'Data @Scale Spring 2022', + 'description': 'md5:7d7ca1c42ac9c6d8a785092a1aea4b55' + }, + }, { + 'url': 'https://atscaleconference.com/events/video-scale-2021/', + 'playlist_mincount': 14, + 
'info_dict': { + 'id': 'video-scale-2021', + 'title': 'Video @Scale 2021', + 'description': 'md5:7d7ca1c42ac9c6d8a785092a1aea4b55' + }, + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + + return self.playlist_from_matches( + re.findall(r'data-url\s*=\s*"(https?://(?:www\.)?atscaleconference\.com/videos/[^"]+)"', webpage), + ie='Generic', playlist_id=id, + title=self._og_search_title(webpage), description=self._og_search_description(webpage)) diff --git a/hypervideo_dl/extractor/atttechchannel.py b/hypervideo_dl/extractor/atttechchannel.py index 8f93fb3..6ff4ec0 100644 --- a/hypervideo_dl/extractor/atttechchannel.py +++ b/hypervideo_dl/extractor/atttechchannel.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import unified_strdate diff --git a/hypervideo_dl/extractor/atvat.py b/hypervideo_dl/extractor/atvat.py index 481a097..d6ed9e4 100644 --- a/hypervideo_dl/extractor/atvat.py +++ b/hypervideo_dl/extractor/atvat.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import datetime from .common import InfoExtractor @@ -52,7 +49,6 @@ class ATVAtIE(InfoExtractor): 'url': source_url, 'format_id': protocol, }) - self._sort_formats(formats) return { 'id': clip_id, diff --git a/hypervideo_dl/extractor/audimedia.py b/hypervideo_dl/extractor/audimedia.py index 6bd48ef..35114e5 100644 --- a/hypervideo_dl/extractor/audimedia.py +++ b/hypervideo_dl/extractor/audimedia.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -79,7 +76,6 @@ class AudiMediaIE(InfoExtractor): 'format_id': 'http-%s' % bitrate, }) formats.append(f) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/audioboom.py b/hypervideo_dl/extractor/audioboom.py index c51837b..a23fcd2 100644 --- a/hypervideo_dl/extractor/audioboom.py 
+++ b/hypervideo_dl/extractor/audioboom.py @@ -1,27 +1,33 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor -from ..utils import ( - clean_html, - float_or_none, -) +from ..utils import clean_html, float_or_none, traverse_obj, unescapeHTML class AudioBoomIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?audioboom\.com/(?:boos|posts)/(?P[0-9]+)' _TESTS = [{ 'url': 'https://audioboom.com/posts/7398103-asim-chaudhry', - 'md5': '7b00192e593ff227e6a315486979a42d', + 'md5': '4d68be11c9f9daf3dab0778ad1e010c3', 'info_dict': { 'id': '7398103', 'ext': 'mp3', 'title': 'Asim Chaudhry', - 'description': 'md5:2f3fef17dacc2595b5362e1d7d3602fc', + 'description': 'md5:0ed714ae0e81e5d9119cac2f618ad679', 'duration': 4000.99, 'uploader': 'Sue Perkins: An hour or so with...', 'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/perkins', } + }, { # Direct mp3-file link + 'url': 'https://audioboom.com/posts/8128496.mp3', + 'md5': 'e329edf304d450def95c7f86a9165ee1', + 'info_dict': { + 'id': '8128496', + 'ext': 'mp3', + 'title': 'TCRNo8 / DAILY 03 - In Control', + 'description': 'md5:44665f142db74858dfa21c5b34787948', + 'duration': 1689.7, + 'uploader': 'Lost Dot Podcast: The Trans Pyrenees and Transcontinental Race', + 'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channels/5003904', + } }, { 'url': 'https://audioboom.com/posts/4279833-3-09-2016-czaban-hour-3?t=0', 'only_matching': True, @@ -29,45 +35,23 @@ class AudioBoomIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(f'https://audioboom.com/posts/{video_id}', video_id) - webpage = self._download_webpage(url, video_id) - - clip = None - - clip_store = self._parse_json( - self._html_search_regex( - r'data-new-clip-store=(["\'])(?P{.+?})\1', - webpage, 'clip store', default='{}', group='json'), - video_id, fatal=False) - if clip_store: - clips = clip_store.get('clips') - if clips and isinstance(clips, 
list) and isinstance(clips[0], dict): - clip = clips[0] - - def from_clip(field): - if clip: - return clip.get(field) - - audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property( - 'audio', webpage, 'audio url') - title = from_clip('title') or self._html_search_meta( - ['og:title', 'og:audio:title', 'audio_title'], webpage) - description = from_clip('description') or clean_html(from_clip('formattedDescription')) or self._og_search_description(webpage) - - duration = float_or_none(from_clip('duration') or self._html_search_meta( - 'weibo:audio:duration', webpage)) - - uploader = from_clip('author') or self._html_search_meta( - ['og:audio:artist', 'twitter:audio:artist_name', 'audio_artist'], webpage, 'uploader') - uploader_url = from_clip('author_url') or self._html_search_meta( - 'audioboo:channel', webpage, 'uploader url') + clip_store = self._search_json( + r'data-react-class="V5DetailPagePlayer"\s*data-react-props=["\']', + webpage, 'clip store', video_id, fatal=False, transform_source=unescapeHTML) + clip = traverse_obj(clip_store, ('clips', 0), expected_type=dict) or {} return { 'id': video_id, - 'url': audio_url, - 'title': title, - 'description': description, - 'duration': duration, - 'uploader': uploader, - 'uploader_url': uploader_url, + 'url': clip.get('clipURLPriorToLoading') or self._og_search_property('audio', webpage, 'audio url'), + 'title': clip.get('title') or self._html_search_meta(['og:title', 'og:audio:title', 'audio_title'], webpage), + 'description': (clip.get('description') or clean_html(clip.get('formattedDescription')) + or self._og_search_description(webpage)), + 'duration': float_or_none(clip.get('duration') or self._html_search_meta('weibo:audio:duration', webpage)), + 'uploader': clip.get('author') or self._html_search_meta( + ['og:audio:artist', 'twitter:audio:artist_name', 'audio_artist'], webpage, 'uploader'), + 'uploader_url': clip.get('author_url') or self._html_search_regex( + r'
\s*\d+)' + + _TESTS = [{ + 'url': 'http://nokiatune.audiodraft.com/entry/5874', + 'info_dict': { + 'id': '9485', + 'ext': 'mp3', + 'title': 'Hula Hula Calls', + 'uploader': 'unclemaki', + 'uploader_id': '13512', + 'average_rating': 5, + 'like_count': int, + }, + }, { + 'url': 'http://vikinggrace.audiodraft.com/entry/501', + 'info_dict': { + 'id': '22241', + 'ext': 'mp3', + 'title': 'MVG Happy', + 'uploader': 'frog', + 'uploader_id': '19142', + 'average_rating': 5, + 'like_count': int, + }, + }, { + 'url': 'http://timferriss.audiodraft.com/entry/765', + 'info_dict': { + 'id': '19710', + 'ext': 'mp3', + 'title': 'ferris03', + 'uploader': 'malex', + 'uploader_id': '17335', + 'average_rating': 5, + 'like_count': int, + }, + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + player_entry_id = self._search_regex(r'playAudio\(\'(player_entry_\d+)\'\);', webpage, id, 'play entry id') + return self._audiodraft_extract_from_id(player_entry_id) + + +class AudiodraftGenericIE(AudiodraftBaseIE): + IE_NAME = 'Audiodraft:generic' + _VALID_URL = r'https?://www\.audiodraft\.com/contests/[^/#]+#entries&eid=(?P\d+)' + + _TESTS = [{ + 'url': 'https://www.audiodraft.com/contests/570-Score-A-Video-Surprise-Us#entries&eid=30138', + 'info_dict': { + 'id': '30138', + 'ext': 'mp3', + 'title': 'DROP in sound_V2', + 'uploader': 'TiagoSilva', + 'uploader_id': '19452', + 'average_rating': 4, + 'like_count': int, + }, + }] + + def _real_extract(self, url): + id = self._match_id(url) + return self._audiodraft_extract_from_id(f'player_entry_{id}') diff --git a/hypervideo_dl/extractor/audiomack.py b/hypervideo_dl/extractor/audiomack.py index 19775cf..5c4160f 100644 --- a/hypervideo_dl/extractor/audiomack.py +++ b/hypervideo_dl/extractor/audiomack.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools import time diff --git a/hypervideo_dl/extractor/audius.py b/hypervideo_dl/extractor/audius.py index 
fa64995..6448b44 100644 --- a/hypervideo_dl/extractor/audius.py +++ b/hypervideo_dl/extractor/audius.py @@ -1,11 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - import random from .common import InfoExtractor -from ..utils import ExtractorError, try_get, compat_str, str_or_none -from ..compat import compat_urllib_parse_unquote +from ..compat import compat_str, compat_urllib_parse_unquote +from ..utils import ExtractorError, str_or_none, try_get class AudiusBaseIE(InfoExtractor): @@ -171,7 +168,7 @@ class AudiusIE(AudiusBaseIE): } -class AudiusTrackIE(AudiusIE): +class AudiusTrackIE(AudiusIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'''(?x)(?:audius:)(?:https?://(?:www\.)?.+/v1/tracks/)?(?P\w+)''' IE_NAME = 'audius:track' IE_DESC = 'Audius track ID or API link. Prepend with "audius:"' @@ -246,7 +243,7 @@ class AudiusPlaylistIE(AudiusBaseIE): playlist_data.get('description')) -class AudiusProfileIE(AudiusPlaylistIE): +class AudiusProfileIE(AudiusPlaylistIE): # XXX: Do not subclass from concrete IE IE_NAME = 'audius:artist' IE_DESC = 'Audius.co profile/artist pages' _VALID_URL = r'https?://(?:www)?audius\.co/(?P[^\/]+)/?(?:[?#]|$)' diff --git a/hypervideo_dl/extractor/awaan.py b/hypervideo_dl/extractor/awaan.py index f5e559c..6fc938d 100644 --- a/hypervideo_dl/extractor/awaan.py +++ b/hypervideo_dl/extractor/awaan.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 from .common import InfoExtractor @@ -44,7 +41,7 @@ class AWAANBaseIE(InfoExtractor): 'id': video_id, 'title': title, 'description': video_data.get('description_en') or video_data.get('description_ar'), - 'thumbnail': format_field(img, template='http://admin.mangomolo.com/analytics/%s'), + 'thumbnail': format_field(img, None, 'http://admin.mangomolo.com/analytics/%s'), 'duration': int_or_none(video_data.get('duration')), 'timestamp': parse_iso8601(video_data.get('create_time'), ' '), 'is_live': is_live, diff --git 
a/hypervideo_dl/extractor/aws.py b/hypervideo_dl/extractor/aws.py index dccfeaf..eb831a1 100644 --- a/hypervideo_dl/extractor/aws.py +++ b/hypervideo_dl/extractor/aws.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import datetime import hashlib import hmac @@ -9,7 +6,7 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse_urlencode -class AWSIE(InfoExtractor): +class AWSIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor _AWS_ALGORITHM = 'AWS4-HMAC-SHA256' _AWS_REGION = 'us-east-1' diff --git a/hypervideo_dl/extractor/azmedien.py b/hypervideo_dl/extractor/azmedien.py index 0168340..d1686ee 100644 --- a/hypervideo_dl/extractor/azmedien.py +++ b/hypervideo_dl/extractor/azmedien.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/baidu.py b/hypervideo_dl/extractor/baidu.py index 364fd94..8786d67 100644 --- a/hypervideo_dl/extractor/baidu.py +++ b/hypervideo_dl/extractor/baidu.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import unescapeHTML diff --git a/hypervideo_dl/extractor/banbye.py b/hypervideo_dl/extractor/banbye.py index 3d4d36e..c873425 100644 --- a/hypervideo_dl/extractor/banbye.py +++ b/hypervideo_dl/extractor/banbye.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import math from .common import InfoExtractor @@ -83,8 +80,6 @@ class BanByeIE(BanByeBaseIE): 'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.mp4', } for quality in data['quality']] - self._sort_formats(formats) - return { 'id': video_id, 'title': data.get('title'), diff --git a/hypervideo_dl/extractor/bandaichannel.py b/hypervideo_dl/extractor/bandaichannel.py index f1bcdef..d7fcf44 100644 --- a/hypervideo_dl/extractor/bandaichannel.py +++ 
b/hypervideo_dl/extractor/bandaichannel.py @@ -1,11 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .brightcove import BrightcoveNewIE +from .brightcove import BrightcoveNewBaseIE from ..utils import extract_attributes -class BandaiChannelIE(BrightcoveNewIE): +class BandaiChannelIE(BrightcoveNewBaseIE): IE_NAME = 'bandaichannel' _VALID_URL = r'https?://(?:www\.)?b-ch\.com/titles/(?P\d+/\d+)' _TESTS = [{ diff --git a/hypervideo_dl/extractor/bandcamp.py b/hypervideo_dl/extractor/bandcamp.py index 745055e..de81e0d 100644 --- a/hypervideo_dl/extractor/bandcamp.py +++ b/hypervideo_dl/extractor/bandcamp.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import random import re import time @@ -8,23 +5,24 @@ import time from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + KNOWN_EXTENSIONS, ExtractorError, float_or_none, int_or_none, - KNOWN_EXTENSIONS, parse_filesize, str_or_none, try_get, - update_url_query, unified_strdate, unified_timestamp, + update_url_query, url_or_none, urljoin, ) class BandcampIE(InfoExtractor): - _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?P[^/]+)\.bandcamp\.com/track/(?P[^/?#&]+)' + _EMBED_REGEX = [r']*?content="(?P.*?bandcamp\.com.*?)"'] _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'md5': 'c557841d5e50261777a6585648adf439', @@ -87,7 +85,7 @@ class BandcampIE(InfoExtractor): attr + ' data', group=2), video_id, fatal=fatal) def _real_extract(self, url): - title = self._match_id(url) + title, uploader = self._match_valid_url(url).group('id', 'uploader') webpage = self._download_webpage(url, title) tralbum = self._extract_data_attr(webpage, title) thumbnail = self._og_search_thumbnail(webpage) @@ -186,8 +184,6 @@ class BandcampIE(InfoExtractor): 'acodec': format_id.split('-')[0], }) - self._sort_formats(formats) - title = '%s - %s' % (artist, track) if artist else track if not 
duration: @@ -199,6 +195,8 @@ class BandcampIE(InfoExtractor): 'title': title, 'thumbnail': thumbnail, 'uploader': artist, + 'uploader_id': uploader, + 'uploader_url': f'https://{uploader}.bandcamp.com', 'timestamp': timestamp, 'release_timestamp': unified_timestamp(tralbum.get('album_release_date')), 'duration': duration, @@ -211,7 +209,7 @@ class BandcampIE(InfoExtractor): } -class BandcampAlbumIE(BandcampIE): +class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE IE_NAME = 'Bandcamp:album' _VALID_URL = r'https?://(?:(?P[^.]+)\.)?bandcamp\.com/album/(?P[^/?#&]+)' @@ -314,7 +312,7 @@ class BandcampAlbumIE(BandcampIE): } -class BandcampWeeklyIE(BandcampIE): +class BandcampWeeklyIE(BandcampIE): # XXX: Do not subclass from concrete IE IE_NAME = 'Bandcamp:weekly' _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P\d+)' _TESTS = [{ @@ -363,7 +361,6 @@ class BandcampWeeklyIE(BandcampIE): 'ext': ext, 'vcodec': 'none', }) - self._sort_formats(formats) title = show.get('audio_title') or 'Bandcamp Weekly' subtitle = show.get('subtitle') @@ -439,7 +436,7 @@ class BandcampUserIE(InfoExtractor): uploader = self._match_id(url) webpage = self._download_webpage(url, uploader) - discography_data = (re.findall(r'
  • ]+>\s*]+>\s*]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage)) return self.playlist_from_matches( diff --git a/hypervideo_dl/extractor/bannedvideo.py b/hypervideo_dl/extractor/bannedvideo.py index 3db1151..51e7220 100644 --- a/hypervideo_dl/extractor/bannedvideo.py +++ b/hypervideo_dl/extractor/bannedvideo.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -137,7 +135,6 @@ query GetCommentReplies($id: String!) { formats.extend(self._extract_m3u8_formats( video_info.get('streamUrl'), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', live=True)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/bbc.py b/hypervideo_dl/extractor/bbc.py index 29ad7de..9d28e70 100644 --- a/hypervideo_dl/extractor/bbc.py +++ b/hypervideo_dl/extractor/bbc.py @@ -1,19 +1,12 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools import itertools import json import re +import urllib.error +import xml.etree.ElementTree from .common import InfoExtractor -from ..compat import ( - compat_etree_Element, - compat_HTTPError, - compat_str, - compat_urllib_error, - compat_urlparse, -) +from ..compat import compat_HTTPError, compat_str, compat_urlparse from ..utils import ( ExtractorError, OnDemandPagedList, @@ -53,6 +46,7 @@ class BBCCoUkIE(InfoExtractor): ) (?P%s)(?!/(?:episodes|broadcasts|clips)) ''' % _ID_REGEX + _EMBED_REGEX = [r'setPlaylist\("(?Phttps?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)'] _LOGIN_URL = 'https://account.bbc.com/signin' _NETRC_MACHINE = 'bbc' @@ -318,7 +312,7 @@ class BBCCoUkIE(InfoExtractor): continue captions = self._download_xml( cc_url, programme_id, 'Downloading captions', fatal=False) - if not isinstance(captions, compat_etree_Element): + if not isinstance(captions, xml.etree.ElementTree.Element): continue subtitles['en'] = [ { @@ -394,7 +388,7 @@ class BBCCoUkIE(InfoExtractor): href, programme_id, ext='mp4', 
entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) except ExtractorError as e: - if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError) + if not (isinstance(e.exc_info[1], urllib.error.HTTPError) and e.exc_info[1].code in (403, 404)): raise fmts = [] @@ -581,8 +575,6 @@ class BBCCoUkIE(InfoExtractor): else: programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) - self._sort_formats(formats) - return { 'id': programme_id, 'title': title, @@ -594,10 +586,15 @@ class BBCCoUkIE(InfoExtractor): } -class BBCIE(BBCCoUkIE): +class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE IE_NAME = 'bbc' IE_DESC = 'BBC' - _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' + _VALID_URL = r'''(?x) + https?://(?:www\.)?(?: + bbc\.(?:com|co\.uk)| + bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion| + bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion + )/(?:[^/]+/)+(?P[^/#?]+)''' _MEDIA_SETS = [ 'pc', @@ -847,6 +844,12 @@ class BBCIE(BBCCoUkIE): 'upload_date': '20190604', 'categories': ['Psychology'], }, + }, { # onion routes + 'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576', + 'only_matching': True, + }, { + 'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681', + 'only_matching': True, }] @classmethod @@ -885,7 +888,6 @@ class BBCIE(BBCCoUkIE): def _extract_from_playlist_sxml(self, url, playlist_id, timestamp): programme_id, title, description, duration, formats, subtitles = \ self._process_legacy_playlist_url(url, playlist_id) - self._sort_formats(formats) return { 'id': programme_id, 'title': title, @@ -904,12 +906,8 @@ class BBCIE(BBCCoUkIE): json_ld_info = self._search_json_ld(webpage, playlist_id, default={}) timestamp = json_ld_info.get('timestamp') - playlist_title = json_ld_info.get('title') - if not playlist_title: - 
playlist_title = (self._og_search_title(webpage, default=None) - or self._html_extract_title(webpage, 'playlist title', default=None)) - if playlist_title: - playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() + playlist_title = json_ld_info.get('title') or re.sub( + r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None playlist_description = json_ld_info.get( 'description') or self._og_search_description(webpage, default=None) @@ -953,7 +951,6 @@ class BBCIE(BBCCoUkIE): duration = int_or_none(items[0].get('duration')) programme_id = items[0].get('vpid') formats, subtitles = self._download_media_selector(programme_id) - self._sort_formats(formats) entries.append({ 'id': programme_id, 'title': title, @@ -990,7 +987,6 @@ class BBCIE(BBCCoUkIE): continue raise if entry: - self._sort_formats(entry['formats']) entries.append(entry) if entries: @@ -1014,7 +1010,6 @@ class BBCIE(BBCCoUkIE): if programme_id: formats, subtitles = self._download_media_selector(programme_id) - self._sort_formats(formats) # digitalData may be missing (e.g. 
http://www.bbc.com/autos/story/20130513-hyundais-rock-star) digital_data = self._parse_json( self._search_regex( @@ -1046,7 +1041,6 @@ class BBCIE(BBCCoUkIE): if version_id: title = smp_data['title'] formats, subtitles = self._download_media_selector(version_id) - self._sort_formats(formats) image_url = smp_data.get('holdingImageURL') display_date = init_data.get('displayDate') topic_title = init_data.get('topicTitle') @@ -1088,7 +1082,6 @@ class BBCIE(BBCCoUkIE): continue title = lead_media.get('title') or self._og_search_title(webpage) formats, subtitles = self._download_media_selector(programme_id) - self._sort_formats(formats) description = lead_media.get('summary') uploader = lead_media.get('masterBrand') uploader_id = lead_media.get('mid') @@ -1117,7 +1110,6 @@ class BBCIE(BBCCoUkIE): if current_programme and programme_id and current_programme.get('type') == 'playable_item': title = current_programme.get('titles', {}).get('tertiary') or playlist_title formats, subtitles = self._download_media_selector(programme_id) - self._sort_formats(formats) synopses = current_programme.get('synopses') or {} network = current_programme.get('network') or {} duration = int_or_none( @@ -1150,7 +1142,6 @@ class BBCIE(BBCCoUkIE): clip_title = clip.get('title') if clip_vpid and clip_title: formats, subtitles = self._download_media_selector(clip_vpid) - self._sort_formats(formats) return { 'id': clip_vpid, 'title': clip_title, @@ -1172,7 +1163,6 @@ class BBCIE(BBCCoUkIE): if not programme_id: continue formats, subtitles = self._download_media_selector(programme_id) - self._sort_formats(formats) entries.append({ 'id': programme_id, 'title': playlist_title, @@ -1204,7 +1194,6 @@ class BBCIE(BBCCoUkIE): if not (item_id and item_title): continue formats, subtitles = self._download_media_selector(item_id) - self._sort_formats(formats) item_desc = None blocks = try_get(media, lambda x: x['summary']['blocks'], list) if blocks: @@ -1238,7 +1227,7 @@ class BBCIE(BBCCoUkIE): (lambda x: 
x['data']['blocks'], lambda x: x['data']['content']['model']['blocks'],), list) or []): - if block.get('type') != 'media': + if block.get('type') not in ['media', 'video']: continue parse_media(block.get('model')) return self.playlist_result( @@ -1305,7 +1294,6 @@ class BBCIE(BBCCoUkIE): formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id) if not formats and not self.get_param('ignore_no_formats'): continue - self._sort_formats(formats) video_id = media_meta.get('externalId') if not video_id: diff --git a/hypervideo_dl/extractor/beatport.py b/hypervideo_dl/extractor/beatport.py index e1cf8b4..0aecbd0 100644 --- a/hypervideo_dl/extractor/beatport.py +++ b/hypervideo_dl/extractor/beatport.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -77,7 +74,6 @@ class BeatportIE(InfoExtractor): fmt['abr'] = 96 fmt['asr'] = 44100 formats.append(fmt) - self._sort_formats(formats) images = [] for name, info in track['images'].items(): diff --git a/hypervideo_dl/extractor/beeg.py b/hypervideo_dl/extractor/beeg.py index 717fff3..52ee68e 100644 --- a/hypervideo_dl/extractor/beeg.py +++ b/hypervideo_dl/extractor/beeg.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( @@ -78,8 +76,6 @@ class BeegIE(InfoExtractor): f['height'] = height formats.extend(current_formats) - self._sort_formats(formats) - return { 'id': video_id, 'display_id': first_fact.get('id'), diff --git a/hypervideo_dl/extractor/behindkink.py b/hypervideo_dl/extractor/behindkink.py index 2c97f98..ca44981 100644 --- a/hypervideo_dl/extractor/behindkink.py +++ b/hypervideo_dl/extractor/behindkink.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import url_basename diff --git a/hypervideo_dl/extractor/bellmedia.py b/hypervideo_dl/extractor/bellmedia.py index 904c17e..5ae4b91 100644 
--- a/hypervideo_dl/extractor/bellmedia.py +++ b/hypervideo_dl/extractor/bellmedia.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor @@ -28,7 +24,7 @@ class BellMediaIE(InfoExtractor): )/.*?(?:\b(?:vid(?:eoid)?|clipId)=|-vid|~|%7E|/(?:episode)?)(?P[0-9]{6,})''' _TESTS = [{ 'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070', - 'md5': '36d3ef559cfe8af8efe15922cd3ce950', + 'md5': '3e5b8e38370741d5089da79161646635', 'info_dict': { 'id': '1403070', 'ext': 'flv', @@ -36,6 +32,14 @@ class BellMediaIE(InfoExtractor): 'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3', 'upload_date': '20180525', 'timestamp': 1527288600, + 'season_id': 73997, + 'season': '2018', + 'thumbnail': 'http://images2.9c9media.com/image_asset/2018_5_25_baf30cbd-b28d-4a18-9903-4bb8713b00f5_PNG_956x536.jpg', + 'tags': [], + 'categories': ['ETFs'], + 'season_number': 8, + 'duration': 272.038, + 'series': 'Market Call Tonight', }, }, { 'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582', diff --git a/hypervideo_dl/extractor/berufetv.py b/hypervideo_dl/extractor/berufetv.py new file mode 100644 index 0000000..8160cbd --- /dev/null +++ b/hypervideo_dl/extractor/berufetv.py @@ -0,0 +1,70 @@ +from .common import InfoExtractor +from ..utils import float_or_none, mimetype2ext, traverse_obj + + +class BerufeTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?web\.arbeitsagentur\.de/berufetv/[^?#]+/film;filmId=(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://web.arbeitsagentur.de/berufetv/studienberufe/wirtschaftswissenschaften/wirtschaftswissenschaften-volkswirtschaft/film;filmId=DvKC3DUpMKvUZ_6fEnfg3u', + 'md5': '041b6432ec8e6838f84a5c30f31cc795', + 'info_dict': { + 'id': 'DvKC3DUpMKvUZ_6fEnfg3u', + 'ext': 'mp4', + 'title': 'Volkswirtschaftslehre', + 'description': 'md5:6bd87d0c63163480a6489a37526ee1c1', + 'categories': ['Studien­beruf'], + 'tags': ['Studienfilm'], + 'duration': 602.440, + 'thumbnail': 
r're:^https://asset-out-cdn\.video-cdn\.net/private/videos/DvKC3DUpMKvUZ_6fEnfg3u/thumbnails/793063\?quality=thumbnail&__token__=[^\s]+$', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + movie_metadata = self._download_json( + 'https://rest.arbeitsagentur.de/infosysbub/berufetv/pc/v1/film-metadata', + video_id, 'Downloading JSON metadata', + headers={'X-API-Key': '79089773-4892-4386-86e6-e8503669f426'}, fatal=False) + + meta = traverse_obj( + movie_metadata, ('metadaten', lambda _, i: video_id == i['miId']), + get_all=False, default={}) + + video = self._download_json( + f'https://d.video-cdn.net/play/player/8YRzUk6pTzmBdrsLe9Y88W/video/{video_id}', + video_id, 'Downloading video JSON') + + formats, subtitles = [], {} + for key, source in video['videoSources']['html'].items(): + if key == 'auto': + fmts, subs = self._extract_m3u8_formats_and_subtitles(source[0]['source'], video_id) + formats += fmts + subtitles = subs + else: + formats.append({ + 'url': source[0]['source'], + 'ext': mimetype2ext(source[0]['mimeType']), + 'format_id': key, + }) + + for track in video.get('videoTracks') or []: + if track.get('type') != 'SUBTITLES': + continue + subtitles.setdefault(track['language'], []).append({ + 'url': track['source'], + 'name': track.get('label'), + 'ext': 'vtt' + }) + + return { + 'id': video_id, + 'title': meta.get('titel') or traverse_obj(video, ('videoMetaData', 'title')), + 'description': meta.get('beschreibung'), + 'thumbnail': meta.get('thumbnail') or f'https://asset-out-cdn.video-cdn.net/private/videos/{video_id}/thumbnails/active', + 'duration': float_or_none(video.get('duration'), scale=1000), + 'categories': [meta['kategorie']] if meta.get('kategorie') else None, + 'tags': meta.get('themengebiete'), + 'subtitles': subtitles, + 'formats': formats, + } diff --git a/hypervideo_dl/extractor/bet.py b/hypervideo_dl/extractor/bet.py index 2c71442..6b867d1 100644 --- a/hypervideo_dl/extractor/bet.py +++ 
b/hypervideo_dl/extractor/bet.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .mtv import MTVServicesInfoExtractor from ..utils import unified_strdate diff --git a/hypervideo_dl/extractor/bfi.py b/hypervideo_dl/extractor/bfi.py index 60c8944..76f0516 100644 --- a/hypervideo_dl/extractor/bfi.py +++ b/hypervideo_dl/extractor/bfi.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/bfmtv.py b/hypervideo_dl/extractor/bfmtv.py index 501f69d..d86d283 100644 --- a/hypervideo_dl/extractor/bfmtv.py +++ b/hypervideo_dl/extractor/bfmtv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -45,7 +42,7 @@ class BFMTVIE(BFMTVBaseIE): return self._brightcove_url_result(video_block['videoid'], video_block) -class BFMTVLiveIE(BFMTVIE): +class BFMTVLiveIE(BFMTVIE): # XXX: Do not subclass from concrete IE IE_NAME = 'bfmtv:live' _VALID_URL = BFMTVBaseIE._VALID_URL_BASE + '(?P(?:[^/]+/)?en-direct)' _TESTS = [{ diff --git a/hypervideo_dl/extractor/bibeltv.py b/hypervideo_dl/extractor/bibeltv.py index 56c2bfe..fd20aad 100644 --- a/hypervideo_dl/extractor/bibeltv.py +++ b/hypervideo_dl/extractor/bibeltv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/bigflix.py b/hypervideo_dl/extractor/bigflix.py index 28e3e59..02d1ba0 100644 --- a/hypervideo_dl/extractor/bigflix.py +++ b/hypervideo_dl/extractor/bigflix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -66,8 +63,6 @@ class BigflixIE(InfoExtractor): 'url': decode_url(file_url), }) - self._sort_formats(formats) - description = self._html_search_meta('description', webpage) return { diff --git a/hypervideo_dl/extractor/bigo.py b/hypervideo_dl/extractor/bigo.py index 
ddf76ac..1cb6e58 100644 --- a/hypervideo_dl/extractor/bigo.py +++ b/hypervideo_dl/extractor/bigo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ExtractorError, urlencode_postdata @@ -31,7 +28,7 @@ class BigoIE(InfoExtractor): user_id = self._match_id(url) info_raw = self._download_json( - 'https://bigo.tv/studio/getInternalStudioInfo', + 'https://ta.bigo.tv/official_website/studio/getInternalStudioInfo', user_id, data=urlencode_postdata({'siteId': user_id})) if not isinstance(info_raw, dict): @@ -44,14 +41,14 @@ class BigoIE(InfoExtractor): if not info.get('alive'): raise ExtractorError('This user is offline.', expected=True) + formats, subs = self._extract_m3u8_formats_and_subtitles( + info.get('hls_src'), user_id, 'mp4', 'm3u8') + return { 'id': info.get('roomId') or user_id, 'title': info.get('roomTopic') or info.get('nick_name') or user_id, - 'formats': [{ - 'url': info.get('hls_src'), - 'ext': 'mp4', - 'protocol': 'm3u8', - }], + 'formats': formats, + 'subtitles': subs, 'thumbnail': info.get('snapshot'), 'uploader': info.get('nick_name'), 'uploader_id': user_id, diff --git a/hypervideo_dl/extractor/bild.py b/hypervideo_dl/extractor/bild.py index b8dfbd4..f3dea33 100644 --- a/hypervideo_dl/extractor/bild.py +++ b/hypervideo_dl/extractor/bild.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, diff --git a/hypervideo_dl/extractor/bilibili.py b/hypervideo_dl/extractor/bilibili.py index 909f7f8..bc04241 100644 --- a/hypervideo_dl/extractor/bilibili.py +++ b/hypervideo_dl/extractor/bilibili.py @@ -1,509 +1,561 @@ -# coding: utf-8 - import base64 -import hashlib -import itertools import functools -import re +import itertools import math +import urllib.error +import urllib.parse from .common import InfoExtractor, SearchInfoExtractor -from ..compat import ( - compat_parse_qs, - 
compat_urlparse, - compat_urllib_parse_urlparse -) from ..utils import ( ExtractorError, + GeoRestrictedError, + InAdvancePagedList, + OnDemandPagedList, filter_dict, - int_or_none, float_or_none, + format_field, + int_or_none, + make_archive_id, mimetype2ext, - parse_iso8601, - traverse_obj, parse_count, - smuggle_url, + parse_qs, + qualities, srt_subtitles_timecode, str_or_none, - strip_jsonp, - unified_timestamp, - unsmuggle_url, - urlencode_postdata, + traverse_obj, url_or_none, - OnDemandPagedList + urlencode_postdata, ) -class BiliBiliIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?:(?:www|bangumi)\.)? - bilibili\.(?:tv|com)/ - (?: - (?: - video/[aA][vV]| - anime/(?P\d+)/play\# - )(?P\d+)| - (s/)?video/[bB][vV](?P[^/?#&]+) - ) - (?:/?\?p=(?P\d+))? - ''' +class BilibiliBaseIE(InfoExtractor): + def extract_formats(self, play_info): + format_names = { + r['quality']: traverse_obj(r, 'new_description', 'display_desc') + for r in traverse_obj(play_info, ('support_formats', lambda _, v: v['quality'])) + } + + audios = traverse_obj(play_info, ('dash', 'audio', ...)) + flac_audio = traverse_obj(play_info, ('dash', 'flac', 'audio')) + if flac_audio: + audios.append(flac_audio) + formats = [{ + 'url': traverse_obj(audio, 'baseUrl', 'base_url', 'url'), + 'ext': mimetype2ext(traverse_obj(audio, 'mimeType', 'mime_type')), + 'acodec': audio.get('codecs'), + 'vcodec': 'none', + 'tbr': float_or_none(audio.get('bandwidth'), scale=1000), + 'filesize': int_or_none(audio.get('size')) + } for audio in audios] + + formats.extend({ + 'url': traverse_obj(video, 'baseUrl', 'base_url', 'url'), + 'ext': mimetype2ext(traverse_obj(video, 'mimeType', 'mime_type')), + 'fps': float_or_none(traverse_obj(video, 'frameRate', 'frame_rate')), + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + 'vcodec': video.get('codecs'), + 'acodec': 'none' if audios else None, + 'tbr': float_or_none(video.get('bandwidth'), scale=1000), + 'filesize': 
int_or_none(video.get('size')), + 'quality': int_or_none(video.get('id')), + 'format': format_names.get(video.get('id')), + } for video in traverse_obj(play_info, ('dash', 'video', ...))) + + missing_formats = format_names.keys() - set(traverse_obj(formats, (..., 'quality'))) + if missing_formats: + self.to_screen(f'Format(s) {", ".join(format_names[i] for i in missing_formats)} are missing; ' + f'you have to login or become premium member to download them. {self._login_hint()}') + + return formats + + def json2srt(self, json_data): + srt_data = '' + for idx, line in enumerate(json_data.get('body') or []): + srt_data += (f'{idx + 1}\n' + f'{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n' + f'{line["content"]}\n\n') + return srt_data + + def _get_subtitles(self, video_id, initial_state, cid): + subtitles = { + 'danmaku': [{ + 'ext': 'xml', + 'url': f'https://comment.bilibili.com/{cid}.xml', + }] + } + + for s in traverse_obj(initial_state, ('videoData', 'subtitle', 'list')) or []: + subtitles.setdefault(s['lan'], []).append({ + 'ext': 'srt', + 'data': self.json2srt(self._download_json(s['subtitle_url'], video_id)) + }) + return subtitles + + def _get_chapters(self, aid, cid): + chapters = aid and cid and self._download_json( + 'https://api.bilibili.com/x/player/v2', aid, query={'aid': aid, 'cid': cid}, + note='Extracting chapters', fatal=False) + return traverse_obj(chapters, ('data', 'view_points', ..., { + 'title': 'content', + 'start_time': 'from', + 'end_time': 'to', + })) or None + + def _get_comments(self, aid): + for idx in itertools.count(1): + replies = traverse_obj( + self._download_json( + f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={aid}&type=1&jsonp=jsonp&sort=2&_=1567227301685', + aid, note=f'Extracting comments from page {idx}', fatal=False), + ('data', 'replies')) + if not replies: + return + for children in map(self._get_all_children, replies): + yield from children + + def _get_all_children(self, reply): + 
yield { + 'author': traverse_obj(reply, ('member', 'uname')), + 'author_id': traverse_obj(reply, ('member', 'mid')), + 'id': reply.get('rpid'), + 'text': traverse_obj(reply, ('content', 'message')), + 'timestamp': reply.get('ctime'), + 'parent': reply.get('parent') or 'root', + } + for children in map(self._get_all_children, traverse_obj(reply, ('replies', ...))): + yield from children + + +class BiliBiliIE(BilibiliBaseIE): + _VALID_URL = r'https?://www\.bilibili\.com/video/[aAbB][vV](?P[^/?#&]+)' _TESTS = [{ + 'url': 'https://www.bilibili.com/video/BV13x41117TL', + 'info_dict': { + 'id': 'BV13x41117TL', + 'title': '阿滴英文|英文歌分享#6 "Closer', + 'ext': 'mp4', + 'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文', + 'uploader_id': '65880958', + 'uploader': '阿滴英文', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'duration': 554.117, + 'tags': list, + 'comment_count': int, + 'upload_date': '20170301', + 'timestamp': 1488353834, + 'like_count': int, + 'view_count': int, + }, + }, { + # old av URL version 'url': 'http://www.bilibili.com/video/av1074402/', - 'md5': '5f7d29e1a2872f3df0cf76b1f87d3788', 'info_dict': { - 'id': '1074402_part1', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg)$', 'ext': 'mp4', - 'title': '【金坷垃】金泡沫', - 'uploader_id': '156160', 'uploader': '菊子桑', + 'uploader_id': '156160', + 'id': 'BV11x411K7CN', + 'title': '【金坷垃】金泡沫', + 'duration': 308.36, 'upload_date': '20140420', + 'timestamp': 1397983878, 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', - 'timestamp': 1398012678, + 'like_count': int, + 'comment_count': int, + 'view_count': int, + 'tags': list, }, + 'params': {'skip_download': True}, }, { - # Tested in BiliBiliBangumiIE - 'url': 'http://bangumi.bilibili.com/anime/1869/play#40062', - 'only_matching': True, + 'note': 'Anthology', + 'url': 'https://www.bilibili.com/video/BV1bK411W797', + 'info_dict': { + 'id': 'BV1bK411W797', + 'title': '物语中的人物是如何吐槽自己的OP的' + }, + 'playlist_count': 18, + 'playlist': [{ + 
'info_dict': { + 'id': 'BV1bK411W797_p1', + 'ext': 'mp4', + 'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川', + 'tags': 'count:11', + 'timestamp': 1589601697, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'uploader': '打牌还是打桩', + 'uploader_id': '150259984', + 'like_count': int, + 'comment_count': int, + 'upload_date': '20200516', + 'view_count': int, + 'description': 'md5:e3c401cf7bc363118d1783dd74068a68', + 'duration': 90.314, + } + }] }, { - # bilibili.tv - 'url': 'http://www.bilibili.tv/video/av1074402/', - 'only_matching': True, + 'note': 'Specific page of Anthology', + 'url': 'https://www.bilibili.com/video/BV1bK411W797?p=1', + 'info_dict': { + 'id': 'BV1bK411W797_p1', + 'ext': 'mp4', + 'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川', + 'tags': 'count:11', + 'timestamp': 1589601697, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'uploader': '打牌还是打桩', + 'uploader_id': '150259984', + 'like_count': int, + 'comment_count': int, + 'upload_date': '20200516', + 'view_count': int, + 'description': 'md5:e3c401cf7bc363118d1783dd74068a68', + 'duration': 90.314, + } }, { - 'url': 'http://bangumi.bilibili.com/anime/5802/play#100643', - 'md5': '3f721ad1e75030cc06faf73587cfec57', + 'note': 'video has subtitles', + 'url': 'https://www.bilibili.com/video/BV12N4y1M7rh', 'info_dict': { - 'id': '100643_part1', + 'id': 'BV12N4y1M7rh', 'ext': 'mp4', - 'title': 'CHAOS;CHILD', - 'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...', + 'title': 'md5:96e8bb42c2b432c0d4ce3434a61479c1', + 'tags': list, + 'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4', + 'duration': 313.557, + 'upload_date': '20220709', + 'uploader': '小夫Tech', + 'timestamp': 1657347907, + 'uploader_id': '1326814124', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'subtitles': 'count:2' }, - 'skip': 'Geo-restricted to China', + 'params': {'listsubtitles': 
True}, }, { - 'url': 'http://www.bilibili.com/video/av8903802/', + 'url': 'https://www.bilibili.com/video/av8903802/', 'info_dict': { - 'id': '8903802_part1', + 'id': 'BV13x41117TL', 'ext': 'mp4', 'title': '阿滴英文|英文歌分享#6 "Closer', 'upload_date': '20170301', - 'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文', - 'timestamp': 1488382634, + 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', + 'timestamp': 1488353834, 'uploader_id': '65880958', 'uploader': '阿滴英文', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'duration': 554.117, + 'tags': list, + 'comment_count': int, + 'view_count': int, + 'like_count': int, }, 'params': { 'skip_download': True, }, }, { - # new BV video id format - 'url': 'https://www.bilibili.com/video/BV1JE411F741', - 'only_matching': True, - }, { - # Anthology - 'url': 'https://www.bilibili.com/video/BV1bK411W797', + 'note': 'video has chapter', + 'url': 'https://www.bilibili.com/video/BV1vL411G7N7/', 'info_dict': { - 'id': 'BV1bK411W797', - 'title': '物语中的人物是如何吐槽自己的OP的' + 'id': 'BV1vL411G7N7', + 'ext': 'mp4', + 'title': '如何为你的B站视频添加进度条分段', + 'timestamp': 1634554558, + 'upload_date': '20211018', + 'description': 'md5:a9a3d6702b3a94518d419b2e9c320a6d', + 'tags': list, + 'uploader': '爱喝咖啡的当麻', + 'duration': 669.482, + 'uploader_id': '1680903', + 'chapters': 'count:6', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', }, - 'playlist_count': 17, + 'params': {'skip_download': True}, }] - _APP_KEY = 'iVGUTjsxvpLeuDCf' - _BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt' + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) + play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] - def _report_error(self, result): - if 'message' in result: 
- raise ExtractorError('%s said: %s' % (self.IE_NAME, result['message']), expected=True) - elif 'code' in result: - raise ExtractorError('%s returns error %d' % (self.IE_NAME, result['code']), expected=True) - else: - raise ExtractorError('Can\'t extract Bangumi episode ID') + video_data = initial_state['videoData'] + video_id, title = video_data['bvid'], video_data.get('title') - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) + # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself. + page_list_json = traverse_obj( + self._download_json( + 'https://api.bilibili.com/x/player/pagelist', video_id, + fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'}, + note='Extracting videos in anthology'), + 'data', expected_type=list) or [] + is_anthology = len(page_list_json) > 1 + + part_id = int_or_none(parse_qs(url).get('p', [None])[-1]) + if is_anthology and not part_id and self._yes_playlist(video_id, video_id): + return self.playlist_from_matches( + page_list_json, video_id, title, ie=BiliBiliIE, + getter=lambda entry: f'https://www.bilibili.com/video/{video_id}?p={entry["page"]}') - mobj = self._match_valid_url(url) - video_id = mobj.group('id_bv') or mobj.group('id') + if is_anthology: + title += f' p{part_id:02d} {traverse_obj(page_list_json, ((part_id or 1) - 1, "part")) or ""}' - av_id, bv_id = self._get_video_id_set(video_id, mobj.group('id_bv') is not None) - video_id = av_id + aid = video_data.get('aid') + old_video_id = format_field(aid, None, f'%s_part{part_id or 1}') - info = {} - anime_id = mobj.group('anime_id') - page_id = mobj.group('page') - webpage = self._download_webpage(url, video_id) + cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid') - # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself. 
- # If the video has no page argument, check to see if it's an anthology - if page_id is None: - if not self.get_param('noplaylist'): - r = self._extract_anthology_entries(bv_id, video_id, webpage) - if r is not None: - self.to_screen('Downloading anthology %s - add --no-playlist to just download video' % video_id) - return r - else: - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - - if 'anime/' not in url: - cid = self._search_regex( - r'\bcid(?:["\']:|=)(\d+),["\']page(?:["\']:|=)' + str(page_id), webpage, 'cid', - default=None - ) or self._search_regex( - r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid', - default=None - ) or compat_parse_qs(self._search_regex( - [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', - r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)', - r']+src="https://secure\.bilibili\.com/secure,([^"]+)"'], - webpage, 'player parameters'))['cid'][0] - else: - if 'no_bangumi_tip' not in smuggled_data: - self.to_screen('Downloading episode %s. To download all videos in anime %s, re-run hypervideo with %s' % ( - video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id))) - headers = { - 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', - 'Referer': url - } - headers.update(self.geo_verification_headers()) - - js = self._download_json( - 'http://bangumi.bilibili.com/web_api/get_source', video_id, - data=urlencode_postdata({'episode_id': video_id}), - headers=headers) - if 'result' not in js: - self._report_error(js) - cid = js['result']['cid'] - - headers = { - 'Accept': 'application/json', - 'Referer': url + return { + 'id': f'{video_id}{format_field(part_id, None, "_p%d")}', + 'formats': self.extract_formats(play_info), + '_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None, + 'title': title, + 'description': traverse_obj(initial_state, ('videoData', 'desc')), + 'view_count': traverse_obj(initial_state, ('videoData', 'stat', 'view')), + 'uploader': 
traverse_obj(initial_state, ('upData', 'name')), + 'uploader_id': traverse_obj(initial_state, ('upData', 'mid')), + 'like_count': traverse_obj(initial_state, ('videoData', 'stat', 'like')), + 'comment_count': traverse_obj(initial_state, ('videoData', 'stat', 'reply')), + 'tags': traverse_obj(initial_state, ('tags', ..., 'tag_name')), + 'thumbnail': traverse_obj(initial_state, ('videoData', 'pic')), + 'timestamp': traverse_obj(initial_state, ('videoData', 'pubdate')), + 'duration': float_or_none(play_info.get('timelength'), scale=1000), + 'chapters': self._get_chapters(aid, cid), + 'subtitles': self.extract_subtitles(video_id, initial_state, cid), + '__post_extractor': self.extract_comments(aid), + 'http_headers': {'Referer': url}, } - headers.update(self.geo_verification_headers()) - video_info = self._parse_json( - self._search_regex(r'window.__playinfo__\s*=\s*({.+?})', webpage, 'video info', default=None) or '{}', - video_id, fatal=False) - video_info = video_info.get('data') or {} - durl = traverse_obj(video_info, ('dash', 'video')) - audios = traverse_obj(video_info, ('dash', 'audio')) or [] - entries = [] +class BiliBiliBangumiIE(BilibiliBaseIE): + _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/(?P(?:ss|ep)\d+)' - RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4') - for num, rendition in enumerate(RENDITIONS, start=1): - payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition) - sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() - if not video_info: - video_info = self._download_json( - 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign), - video_id, note='Downloading video info page', - headers=headers, fatal=num == len(RENDITIONS)) - if not video_info: - continue - - if not durl and 'durl' not in video_info: - if num < len(RENDITIONS): - continue - self._report_error(video_info) - - formats = [] - for idx, durl in enumerate(durl or video_info['durl']): - 
formats.append({ - 'url': durl.get('baseUrl') or durl.get('base_url') or durl.get('url'), - 'ext': mimetype2ext(durl.get('mimeType') or durl.get('mime_type')), - 'fps': int_or_none(durl.get('frameRate') or durl.get('frame_rate')), - 'width': int_or_none(durl.get('width')), - 'height': int_or_none(durl.get('height')), - 'vcodec': durl.get('codecs'), - 'acodec': 'none' if audios else None, - 'tbr': float_or_none(durl.get('bandwidth'), scale=1000), - 'filesize': int_or_none(durl.get('size')), - }) - for backup_url in traverse_obj(durl, 'backup_url', expected_type=list) or []: - formats.append({ - 'url': backup_url, - 'quality': -2 if 'hd.mp4' in backup_url else -3, - }) - - for audio in audios: - formats.append({ - 'url': audio.get('baseUrl') or audio.get('base_url') or audio.get('url'), - 'ext': mimetype2ext(audio.get('mimeType') or audio.get('mime_type')), - 'fps': int_or_none(audio.get('frameRate') or audio.get('frame_rate')), - 'width': int_or_none(audio.get('width')), - 'height': int_or_none(audio.get('height')), - 'acodec': audio.get('codecs'), - 'vcodec': 'none', - 'tbr': float_or_none(audio.get('bandwidth'), scale=1000), - 'filesize': int_or_none(audio.get('size')) - }) - for backup_url in traverse_obj(audio, 'backup_url', expected_type=list) or []: - formats.append({ - 'url': backup_url, - # backup URLs have lower priorities - 'quality': -3, - }) - - info.update({ - 'id': video_id, - 'duration': float_or_none(durl.get('length'), 1000), - 'formats': formats, - 'http_headers': { - 'Referer': url, - }, - }) - break - - self._sort_formats(formats) - - title = self._html_search_regex(( - r']+title=(["\'])(?P[^"\']+)', - r'(?s)]*>(?P.+?)', - self._meta_regex('title') - ), webpage, 'title', group='content', fatal=False) - - # Get part title for anthologies - if page_id is not None: - # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video. 
- part_info = traverse_obj(self._download_json( - f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp', - video_id, note='Extracting videos in anthology'), 'data', expected_type=list) - title = title if len(part_info) == 1 else traverse_obj(part_info, (int(page_id) - 1, 'part')) or title - - description = self._html_search_meta('description', webpage) - timestamp = unified_timestamp(self._html_search_regex( - r']+datetime="([^"]+)"', webpage, 'upload time', - default=None) or self._html_search_meta( - 'uploadDate', webpage, 'timestamp', default=None)) - thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage) - - # TODO 'view_count' requires deobfuscating Javascript - info.update({ - 'id': f'{video_id}_part{page_id or 1}', - 'cid': cid, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'thumbnail': thumbnail, - 'duration': float_or_none(video_info.get('timelength'), scale=1000), - }) - - uploader_mobj = re.search( - r']+href="(?:https?:)?//space\.bilibili\.com/(?P\d+)"[^>]*>\s*(?P[^<]+?)\s*<', - webpage) - if uploader_mobj: - info.update({ - 'uploader': uploader_mobj.group('name').strip(), - 'uploader_id': uploader_mobj.group('id'), - }) + _TESTS = [{ + 'url': 'https://www.bilibili.com/bangumi/play/ss897', + 'info_dict': { + 'id': 'ss897', + 'ext': 'mp4', + 'series': '神的记事本', + 'season': '神的记事本', + 'season_id': 897, + 'season_number': 1, + 'episode': '你与旅行包', + 'episode_number': 2, + 'title': '神的记事本:第2话 你与旅行包', + 'duration': 1428.487, + 'timestamp': 1310809380, + 'upload_date': '20110716', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + }, { + 'url': 'https://www.bilibili.com/bangumi/play/ep508406', + 'only_matching': True, + }] - if not info.get('uploader'): - info['uploader'] = self._html_search_meta( - 'author', webpage, 'uploader', default=None) + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - top_level_info = { - 'tags': 
traverse_obj(self._download_json( - f'https://api.bilibili.com/x/tag/archive/tags?aid={video_id}', - video_id, fatal=False, note='Downloading tags'), ('data', ..., 'tag_name')), - } + if '您所在的地区无法观看本片' in webpage: + raise GeoRestrictedError('This video is restricted') + elif ('开通大会员观看' in webpage and '__playinfo__' not in webpage + or '正在观看预览,大会员免费看全片' in webpage): + self.raise_login_required('This video is for premium members only') - info['subtitles'] = { - 'danmaku': [{ - 'ext': 'xml', - 'url': f'https://comment.bilibili.com/{cid}.xml', - }] - } + play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] + formats = self.extract_formats(play_info) + if (not formats and '成为大会员抢先看' in webpage + and play_info.get('durl') and not play_info.get('dash')): + self.raise_login_required('This video is for premium members only') - r''' - # Requires https://github.com/m13253/danmaku2ass which is licenced under GPL3 - # See https://github.com/animelover1984/youtube-dl + initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) - raw_danmaku = self._download_webpage( - f'https://comment.bilibili.com/{cid}.xml', video_id, fatal=False, note='Downloading danmaku comments') - danmaku = NiconicoIE.CreateDanmaku(raw_danmaku, commentType='Bilibili', x=1024, y=576) - entries[0]['subtitles'] = { - 'danmaku': [{ - 'ext': 'ass', - 'data': danmaku - }] + season_id = traverse_obj(initial_state, ('mediaInfo', 'season_id')) + season_number = season_id and next(( + idx + 1 for idx, e in enumerate( + traverse_obj(initial_state, ('mediaInfo', 'seasons', ...))) + if e.get('season_id') == season_id + ), None) + + return { + 'id': video_id, + 'formats': formats, + 'title': traverse_obj(initial_state, 'h1Title'), + 'episode': traverse_obj(initial_state, ('epInfo', 'long_title')), + 'episode_number': int_or_none(traverse_obj(initial_state, ('epInfo', 'title'))), + 'series': traverse_obj(initial_state, 
('mediaInfo', 'series')), + 'season': traverse_obj(initial_state, ('mediaInfo', 'season_title')), + 'season_id': season_id, + 'season_number': season_number, + 'thumbnail': traverse_obj(initial_state, ('epInfo', 'cover')), + 'timestamp': traverse_obj(initial_state, ('epInfo', 'pub_time')), + 'duration': float_or_none(play_info.get('timelength'), scale=1000), + 'subtitles': self.extract_subtitles( + video_id, initial_state, traverse_obj(initial_state, ('epInfo', 'cid'))), + '__post_extractor': self.extract_comments(traverse_obj(initial_state, ('epInfo', 'aid'))), + 'http_headers': {'Referer': url, **self.geo_verification_headers()}, } - ''' - top_level_info['__post_extractor'] = self.extract_comments(video_id) - for entry in entries: - entry.update(info) +class BiliBiliBangumiMediaIE(InfoExtractor): + _VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P\d+)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/bangumi/media/md24097891', + 'info_dict': { + 'id': '24097891', + }, + 'playlist_mincount': 25, + }] - if len(entries) == 1: - entries[0].update(top_level_info) - return entries[0] + def _real_extract(self, url): + media_id = self._match_id(url) + webpage = self._download_webpage(url, media_id) - for idx, entry in enumerate(entries): - entry['id'] = '%s_part%d' % (video_id, (idx + 1)) + initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id) + episode_list = self._download_json( + 'https://api.bilibili.com/pgc/web/season/section', media_id, + query={'season_id': initial_state['mediaInfo']['season_id']}, + note='Downloading season info')['result']['main_section']['episodes'] - return { - 'id': str(video_id), - 'bv_id': bv_id, - 'title': title, - 'description': description, - **info, **top_level_info - } + return self.playlist_result(( + self.url_result(entry['share_url'], BiliBiliBangumiIE, entry['aid']) + for entry in episode_list), media_id) - def _extract_anthology_entries(self, bv_id, video_id, webpage): 
- title = self._html_search_regex( - (r']+\btitle=(["\'])(?P(?:(?!\1).)+)\1', - r'(?s)<h1[^>]*>(?P<title>.+?)</h1>', - r'<title>(?P<title>.+?)'), webpage, 'title', - group='title') - json_data = self._download_json( - f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp', - video_id, note='Extracting videos in anthology') - - if json_data['data']: - return self.playlist_from_matches( - json_data['data'], bv_id, title, ie=BiliBiliIE.ie_key(), - getter=lambda entry: 'https://www.bilibili.com/video/%s?p=%d' % (bv_id, entry['page'])) - - def _get_video_id_set(self, id, is_bv): - query = {'bvid': id} if is_bv else {'aid': id} - response = self._download_json( - "http://api.bilibili.cn/x/web-interface/view", - id, query=query, - note='Grabbing original ID via API') - - if response['code'] == -400: - raise ExtractorError('Video ID does not exist', expected=True, video_id=id) - elif response['code'] != 0: - raise ExtractorError(f'Unknown error occurred during API check (code {response["code"]})', - expected=True, video_id=id) - return response['data']['aid'], response['data']['bvid'] - - def _get_comments(self, video_id, commentPageNumber=0): - for idx in itertools.count(1): - replies = traverse_obj( - self._download_json( - f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={video_id}&type=1&jsonp=jsonp&sort=2&_=1567227301685', - video_id, note=f'Extracting comments from page {idx}', fatal=False), - ('data', 'replies')) - if not replies: - return - for children in map(self._get_all_children, replies): - yield from children - def _get_all_children(self, reply): - yield { - 'author': traverse_obj(reply, ('member', 'uname')), - 'author_id': traverse_obj(reply, ('member', 'mid')), - 'id': reply.get('rpid'), - 'text': traverse_obj(reply, ('content', 'message')), - 'timestamp': reply.get('ctime'), - 'parent': reply.get('parent') or 'root', - } - for children in map(self._get_all_children, reply.get('replies') or []): - yield from children +class 
BilibiliSpaceBaseIE(InfoExtractor): + def _extract_playlist(self, fetch_page, get_metadata, get_entries): + first_page = fetch_page(0) + metadata = get_metadata(first_page) + paged_list = InAdvancePagedList( + lambda idx: get_entries(fetch_page(idx) if idx else first_page), + metadata['page_count'], metadata['page_size']) -class BiliBiliBangumiIE(InfoExtractor): - _VALID_URL = r'https?://bangumi\.bilibili\.com/anime/(?P\d+)' + return metadata, paged_list - IE_NAME = 'bangumi.bilibili.com' - IE_DESC = 'BiliBili番剧' +class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE): + _VALID_URL = r'https?://space\.bilibili\.com/(?P\d+)(?P', + ] + if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS): + return 18 return 0 def _media_rating_search(self, html): @@ -1401,27 +1435,25 @@ class InfoExtractor(object): return self._html_search_meta('twitter:player', html, 'twitter card player') - def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): - json_ld_list = list(re.finditer(JSON_LD_RE, html)) - default = kwargs.get('default', NO_DEFAULT) - # JSON-LD may be malformed and thus `fatal` should be respected. - # At the same time `default` may be passed that assumes `fatal=False` - # for _search_regex. Let's simulate the same behavior here as well. 
- fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False - json_ld = [] - for mobj in json_ld_list: - json_ld_item = self._parse_json( - mobj.group('json_ld'), video_id, fatal=fatal) - if not json_ld_item: - continue - if isinstance(json_ld_item, dict): - json_ld.append(json_ld_item) - elif isinstance(json_ld_item, (list, tuple)): - json_ld.extend(json_ld_item) - if json_ld: - json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) - if json_ld: - return json_ld + def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT): + """Yield all json ld objects in the html""" + if default is not NO_DEFAULT: + fatal = False + for mobj in re.finditer(JSON_LD_RE, html): + json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal) + for json_ld in variadic(json_ld_item): + if isinstance(json_ld, dict): + yield json_ld + + def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT): + """Search for a video in any json ld in the html""" + if default is not NO_DEFAULT: + fatal = False + info = self._json_ld( + list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)), + video_id, fatal=fatal, expected_type=expected_type) + if info: + return info if default is not NO_DEFAULT: return default elif fatal: @@ -1431,15 +1463,11 @@ class InfoExtractor(object): return {} def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): - if isinstance(json_ld, compat_str): + if isinstance(json_ld, str): json_ld = self._parse_json(json_ld, video_id, fatal=fatal) if not json_ld: return {} info = {} - if not isinstance(json_ld, (list, tuple, dict)): - return info - if isinstance(json_ld, dict): - json_ld = [json_ld] INTERACTION_TYPE_MAP = { 'CommentAction': 'comment', @@ -1452,6 +1480,10 @@ class InfoExtractor(object): 'ViewAction': 'view', } + def is_type(e, *expected_types): + type = variadic(traverse_obj(e, '@type')) + return any(x in type for x in 
expected_types) + def extract_interaction_type(e): interaction_type = e.get('interactionType') if isinstance(interaction_type, dict): @@ -1465,9 +1497,7 @@ class InfoExtractor(object): if not isinstance(interaction_statistic, list): return for is_e in interaction_statistic: - if not isinstance(is_e, dict): - continue - if is_e.get('@type') != 'InteractionCounter': + if not is_type(is_e, 'InteractionCounter'): continue interaction_type = extract_interaction_type(is_e) if not interaction_type: @@ -1504,44 +1534,53 @@ class InfoExtractor(object): info['chapters'] = chapters def extract_video_object(e): - assert e['@type'] == 'VideoObject' author = e.get('author') info.update({ 'url': url_or_none(e.get('contentUrl')), + 'ext': mimetype2ext(e.get('encodingFormat')), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), - 'thumbnails': [{'url': url_or_none(url)} - for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))], + 'thumbnails': [{'url': unescapeHTML(url)} + for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL')) + if url_or_none(url)], 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), # author can be an instance of 'Organization' or 'Person' types. # both types can have 'name' property(inherited from 'Thing' type). [1] # however some websites are using 'Text' type instead. # 1. 
https://schema.org/VideoObject - 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None, - 'filesize': float_or_none(e.get('contentSize')), + 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None, + 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str), + 'filesize': int_or_none(float_or_none(e.get('contentSize'))), 'tbr': int_or_none(e.get('bitrate')), 'width': int_or_none(e.get('width')), 'height': int_or_none(e.get('height')), 'view_count': int_or_none(e.get('interactionCount')), + 'tags': try_call(lambda: e.get('keywords').split(',')), }) + if is_type(e, 'AudioObject'): + info.update({ + 'vcodec': 'none', + 'abr': int_or_none(e.get('bitrate')), + }) extract_interaction_statistic(e) extract_chapter_information(e) def traverse_json_ld(json_ld, at_top_level=True): - for e in json_ld: + for e in variadic(json_ld): + if not isinstance(e, dict): + continue if at_top_level and '@context' not in e: continue if at_top_level and set(e.keys()) == {'@context', '@graph'}: - traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False) - break - item_type = e.get('@type') - if expected_type is not None and expected_type != item_type: + traverse_json_ld(e['@graph'], at_top_level=False) + continue + if expected_type is not None and not is_type(e, expected_type): continue rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none) if rating is not None: info['average_rating'] = rating - if item_type in ('TVEpisode', 'Episode'): + if is_type(e, 'TVEpisode', 'Episode'): episode_name = unescapeHTML(e.get('name')) info.update({ 'episode': episode_name, @@ -1551,44 +1590,46 @@ class InfoExtractor(object): if not info.get('title') and episode_name: info['title'] = episode_name part_of_season = e.get('partOfSeason') - if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 
'CreativeWorkSeason'): + if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'): info.update({ 'season': unescapeHTML(part_of_season.get('name')), 'season_number': int_or_none(part_of_season.get('seasonNumber')), }) part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') - if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): + if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'): info['series'] = unescapeHTML(part_of_series.get('name')) - elif item_type == 'Movie': + elif is_type(e, 'Movie'): info.update({ 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('dateCreated')), }) - elif item_type in ('Article', 'NewsArticle'): + elif is_type(e, 'Article', 'NewsArticle'): info.update({ 'timestamp': parse_iso8601(e.get('datePublished')), 'title': unescapeHTML(e.get('headline')), 'description': unescapeHTML(e.get('articleBody') or e.get('description')), }) - if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject': + if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'): extract_video_object(e['video'][0]) - elif item_type == 'VideoObject': + elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'): + extract_video_object(e['subjectOf'][0]) + elif is_type(e, 'VideoObject', 'AudioObject'): extract_video_object(e) if expected_type is None: continue else: break video = e.get('video') - if isinstance(video, dict) and video.get('@type') == 'VideoObject': + if is_type(video, 'VideoObject'): extract_video_object(video) if expected_type is None: continue else: break - traverse_json_ld(json_ld) + traverse_json_ld(json_ld) return filter_dict(info) def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw): @@ -1598,15 +1639,16 @@ class InfoExtractor(object): webpage, 'next.js data', fatal=fatal, **kw), video_id, 
transform_source=transform_source, fatal=fatal) - def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'): - ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. ''' - # not all website do this, but it can be changed - # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source + def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)): + """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function""" rectx = re.escape(context_name) + FUNCTION_RE = r'\(function\((?P.*?)\){return\s+(?P{.*?})\s*;?\s*}\((?P.*?)\)' js, arg_keys, arg_vals = self._search_regex( - (r'' % rectx, - r'%s\(.*?\(function\((?P.*?)\)\{return\s(?P\{.*?\})\}\((?P.*?)\)' % rectx), - webpage, context_name, group=['js', 'arg_keys', 'arg_vals']) + (rf'', rf'{rectx}\(.*?{FUNCTION_RE}'), + webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), + default=NO_DEFAULT if fatal else (None, None, None)) + if js is None: + return {} args = dict(zip(arg_keys.split(','), arg_vals.split(','))) @@ -1614,7 +1656,8 @@ class InfoExtractor(object): if val in ('undefined', 'void 0'): args[key] = 'null' - return self._parse_json(js_to_json(js, args), video_id)['data'][0] + ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal) + return traverse_obj(ret, traverse) or {} @staticmethod def _hidden_inputs(html): @@ -1638,296 +1681,27 @@ class InfoExtractor(object): html, '%s form' % form_id, group='form') return self._hidden_inputs(form) - class FormatSort: - regex = r' *((?P\+)?(?P[a-zA-Z0-9_]+)((?P[~:])(?P.*?))?)? 
*$' - - default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality', - 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr', - 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases - ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr', - 'height', 'width', 'proto', 'vext', 'abr', 'aext', - 'fps', 'fs_approx', 'source', 'id') - - settings = { - 'vcodec': {'type': 'ordered', 'regex': True, - 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, - 'acodec': {'type': 'ordered', 'regex': True, - 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']}, - 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range', - 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]}, - 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol', - 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']}, - 'vext': {'type': 'ordered', 'field': 'video_ext', - 'order': ('mp4', 'webm', 'flv', '', 'none'), - 'order_free': ('webm', 'mp4', 'flv', '', 'none')}, - 'aext': {'type': 'ordered', 'field': 'audio_ext', - 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'), - 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')}, - 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000}, - 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple', - 'field': ('vcodec', 'acodec'), - 'function': lambda it: int(any(v != 'none' for v in it))}, - 'ie_pref': {'priority': True, 'type': 'extractor'}, - 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)}, - 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)}, - 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1}, - 
'quality': {'convert': 'float', 'default': -1}, - 'filesize': {'convert': 'bytes'}, - 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'}, - 'id': {'convert': 'string', 'field': 'format_id'}, - 'height': {'convert': 'float_none'}, - 'width': {'convert': 'float_none'}, - 'fps': {'convert': 'float_none'}, - 'tbr': {'convert': 'float_none'}, - 'vbr': {'convert': 'float_none'}, - 'abr': {'convert': 'float_none'}, - 'asr': {'convert': 'float_none'}, - 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1}, - - 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')}, - 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True}, - 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')}, - 'ext': {'type': 'combined', 'field': ('vext', 'aext')}, - 'res': {'type': 'multiple', 'field': ('height', 'width'), - 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))}, - - # For compatibility with youtube-dl - 'format_id': {'type': 'alias', 'field': 'id'}, - 'preference': {'type': 'alias', 'field': 'ie_pref'}, - 'language_preference': {'type': 'alias', 'field': 'lang'}, - 'source_preference': {'type': 'alias', 'field': 'source'}, - 'protocol': {'type': 'alias', 'field': 'proto'}, - 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'}, - - # Deprecated - 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True}, - 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True}, - 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True}, - 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True}, - 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True}, - 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True}, - 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True}, - 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True}, - 'filesize_estimate': {'type': 'alias', 'field': 'size', 
'deprecated': True}, - 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True}, - 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True}, - 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True}, - 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True}, - 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True}, - 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}, - 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}, - 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}, - 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}, - 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}, - 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}, - } + @classproperty(cache=True) + def FormatSort(cls): + class FormatSort(FormatSorter): + def __init__(ie, *args, **kwargs): + super().__init__(ie._downloader, *args, **kwargs) - def __init__(self, ie, field_preference): - self._order = [] - self.ydl = ie._downloader - self.evaluate_params(self.ydl.params, field_preference) - if ie.get_param('verbose'): - self.print_verbose_info(self.ydl.write_debug) - - def _get_field_setting(self, field, key): - if field not in self.settings: - if key in ('forced', 'priority'): - return False - self.ydl.deprecation_warning( - f'Using arbitrary fields ({field}) for format sorting is deprecated ' - 'and may be removed in a future version') - self.settings[field] = {} - propObj = self.settings[field] - if key not in propObj: - type = propObj.get('type') - if key == 'field': - default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field - elif key == 'convert': - default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore' - else: - default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None) - propObj[key] = default - 
return propObj[key] - - def _resolve_field_value(self, field, value, convertNone=False): - if value is None: - if not convertNone: - return None - else: - value = value.lower() - conversion = self._get_field_setting(field, 'convert') - if conversion == 'ignore': - return None - if conversion == 'string': - return value - elif conversion == 'float_none': - return float_or_none(value) - elif conversion == 'bytes': - return FileDownloader.parse_bytes(value) - elif conversion == 'order': - order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order') - use_regex = self._get_field_setting(field, 'regex') - list_length = len(order_list) - empty_pos = order_list.index('') if '' in order_list else list_length + 1 - if use_regex and value is not None: - for i, regex in enumerate(order_list): - if regex and re.match(regex, value): - return list_length - i - return list_length - empty_pos # not in list - else: # not regex or value = None - return list_length - (order_list.index(value) if value in order_list else empty_pos) - else: - if value.isnumeric(): - return float(value) - else: - self.settings[field]['convert'] = 'string' - return value - - def evaluate_params(self, params, sort_extractor): - self._use_free_order = params.get('prefer_free_formats', False) - self._sort_user = params.get('format_sort', []) - self._sort_extractor = sort_extractor - - def add_item(field, reverse, closest, limit_text): - field = field.lower() - if field in self._order: - return - self._order.append(field) - limit = self._resolve_field_value(field, limit_text) - data = { - 'reverse': reverse, - 'closest': False if limit is None else closest, - 'limit_text': limit_text, - 'limit': limit} - if field in self.settings: - self.settings[field].update(data) - else: - self.settings[field] = data - - sort_list = ( - tuple(field for field in self.default if self._get_field_setting(field, 'forced')) - + (tuple() if 
params.get('format_sort_force', False) - else tuple(field for field in self.default if self._get_field_setting(field, 'priority'))) - + tuple(self._sort_user) + tuple(sort_extractor) + self.default) - - for item in sort_list: - match = re.match(self.regex, item) - if match is None: - raise ExtractorError('Invalid format sort string "%s" given by extractor' % item) - field = match.group('field') - if field is None: - continue - if self._get_field_setting(field, 'type') == 'alias': - alias, field = field, self._get_field_setting(field, 'field') - if self._get_field_setting(alias, 'deprecated'): - self.ydl.deprecation_warning( - f'Format sorting alias {alias} is deprecated ' - f'and may be removed in a future version. Please use {field} instead') - reverse = match.group('reverse') is not None - closest = match.group('separator') == '~' - limit_text = match.group('limit') - - has_limit = limit_text is not None - has_multiple_fields = self._get_field_setting(field, 'type') == 'combined' - has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit') - - fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,) - limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple() - limit_count = len(limits) - for (i, f) in enumerate(fields): - add_item(f, reverse, closest, - limits[i] if i < limit_count - else limits[0] if has_limit and not has_multiple_limits - else None) - - def print_verbose_info(self, write_debug): - if self._sort_user: - write_debug('Sort order given by user: %s' % ', '.join(self._sort_user)) - if self._sort_extractor: - write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor)) - write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % ( - '+' if self._get_field_setting(field, 'reverse') else '', field, - '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':', - self._get_field_setting(field, 
'limit_text'), - self._get_field_setting(field, 'limit')) - if self._get_field_setting(field, 'limit_text') is not None else '') - for field in self._order if self._get_field_setting(field, 'visible')])) - - def _calculate_field_preference_from_value(self, format, field, type, value): - reverse = self._get_field_setting(field, 'reverse') - closest = self._get_field_setting(field, 'closest') - limit = self._get_field_setting(field, 'limit') - - if type == 'extractor': - maximum = self._get_field_setting(field, 'max') - if value is None or (maximum is not None and value >= maximum): - value = -1 - elif type == 'boolean': - in_list = self._get_field_setting(field, 'in_list') - not_in_list = self._get_field_setting(field, 'not_in_list') - value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1 - elif type == 'ordered': - value = self._resolve_field_value(field, value, True) - - # try to convert to number - val_num = float_or_none(value, default=self._get_field_setting(field, 'default')) - is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None - if is_num: - value = val_num - - return ((-10, 0) if value is None - else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher - else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest - else (0, value, 0) if not reverse and (limit is None or value <= limit) - else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit - else (-1, value, 0)) - - def _calculate_field_preference(self, format, field): - type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple - get_value = lambda f: format.get(self._get_field_setting(f, 'field')) - if type == 'multiple': - type = 'field' # Only 'field' is allowed in multiple for now - actual_fields = self._get_field_setting(field, 'field') - - value = 
self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields) - else: - value = get_value(field) - return self._calculate_field_preference_from_value(format, field, type, value) - - def calculate_preference(self, format): - # Determine missing protocol - if not format.get('protocol'): - format['protocol'] = determine_protocol(format) - - # Determine missing ext - if not format.get('ext') and 'url' in format: - format['ext'] = determine_ext(format['url']) - if format.get('vcodec') == 'none': - format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none' - format['video_ext'] = 'none' - else: - format['video_ext'] = format['ext'] - format['audio_ext'] = 'none' - # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported? - # format['preference'] = -1000 - - # Determine missing bitrates - if format.get('tbr') is None: - if format.get('vbr') is not None and format.get('abr') is not None: - format['tbr'] = format.get('vbr', 0) + format.get('abr', 0) - else: - if format.get('vcodec') != 'none' and format.get('vbr') is None: - format['vbr'] = format.get('tbr') - format.get('abr', 0) - if format.get('acodec') != 'none' and format.get('abr') is None: - format['abr'] = format.get('tbr') - format.get('vbr', 0) - - return tuple(self._calculate_field_preference(format, field) for field in self._order) + deprecation_warning( + 'hypervideo_dl.InfoExtractor.FormatSort is deprecated and may be removed in the future. 
' + 'Use hypervideo_dl.utils.FormatSorter instead') + return FormatSort def _sort_formats(self, formats, field_preference=[]): - if not formats: + if not field_preference: + self._downloader.deprecation_warning( + 'hypervideo_dl.InfoExtractor._sort_formats is deprecated and is no longer required') return - format_sort = self.FormatSort(self, field_preference) - formats.sort(key=lambda f: format_sort.calculate_preference(f)) + self._downloader.deprecation_warning( + 'hypervideo_dl.InfoExtractor._sort_formats is deprecated and no longer works as expected. ' + 'Return _format_sort_fields in the info_dict instead') + if formats: + formats[0]['__sort_fields'] = field_preference def _check_formats(self, formats, video_id): if formats: @@ -1969,14 +1743,9 @@ class InfoExtractor(object): else 'https:') def _proto_relative_url(self, url, scheme=None): - if url is None: - return url - if url.startswith('//'): - if scheme is None: - scheme = self.http_scheme() - return scheme + url - else: - return url + scheme = scheme or self.http_scheme() + assert scheme.endswith(':') + return sanitize_url(url, scheme=scheme[:-1]) def _sleep(self, timeout, video_id, msg_template=None): if msg_template is None: @@ -1988,17 +1757,19 @@ class InfoExtractor(object): def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None, transform_source=lambda s: fix_xml_ampersands(s).strip(), fatal=True, m3u8_id=None, data=None, headers={}, query={}): - manifest = self._download_xml( + res = self._download_xml_handle( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', # Some manifests may be malformed, e.g. 
prosiebensat1 generated manifests # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244) transform_source=transform_source, fatal=fatal, data=data, headers=headers, query=query) - - if manifest is False: + if res is False: return [] + manifest, urlh = res + manifest_url = urlh.geturl() + return self._parse_f4m_formats( manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id, transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id) @@ -2006,7 +1777,7 @@ class InfoExtractor(object): def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None, transform_source=lambda s: fix_xml_ampersands(s).strip(), fatal=True, m3u8_id=None): - if not isinstance(manifest, compat_etree_Element) and not fatal: + if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal: return [] # currently hypervideo cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy @@ -2166,7 +1937,7 @@ class InfoExtractor(object): ]), m3u8_doc) def format_url(url): - return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url) + return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url) if self.get_param('hls_split_discontinuity', False): def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None): @@ -2342,7 +2113,7 @@ class InfoExtractor(object): audio_group_id = last_stream_inf.get('AUDIO') # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which # references a rendition group MUST have a CODECS attribute. - # However, this is not always respected, for example, [2] + # However, this is not always respected. E.g. 
[2] # contains EXT-X-STREAM-INF tag which references AUDIO # rendition group but does not have CODECS and despite # referencing an audio group it represents a complete @@ -2406,12 +2177,14 @@ class InfoExtractor(object): return '/'.join(out) def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): - smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) - - if smil is False: + res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) + if res is False: assert not fatal return [], {} + smil, urlh = res + smil_url = urlh.geturl() + namespace = self._parse_smil_namespace(smil) fmts = self._parse_smil_formats( @@ -2428,13 +2201,17 @@ class InfoExtractor(object): return fmts def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): - smil = self._download_smil(smil_url, video_id, fatal=fatal) - if smil is False: + res = self._download_smil(smil_url, video_id, fatal=fatal) + if res is False: return {} + + smil, urlh = res + smil_url = urlh.geturl() + return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None): - return self._download_xml( + return self._download_xml_handle( smil_url, video_id, 'Downloading SMIL file', 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source) @@ -2533,7 +2310,7 @@ class InfoExtractor(object): }) continue - src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) + src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src) src_url = src_url.strip() if proto == 'm3u8' or src_ext == 'm3u8': @@ -2556,7 +2333,7 @@ class InfoExtractor(object): 'plugin': 'flowplayer-3.2.0.1', } f4m_url += '&' if '?' in f4m_url else '?' 
- f4m_url += compat_urllib_parse_urlencode(f4m_params) + f4m_url += urllib.parse.urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) elif src_ext == 'mpd': formats.extend(self._extract_mpd_formats( @@ -2613,11 +2390,15 @@ class InfoExtractor(object): return subtitles def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True): - xspf = self._download_xml( + res = self._download_xml_handle( xspf_url, playlist_id, 'Downloading xpsf playlist', 'Unable to download xspf manifest', fatal=fatal) - if xspf is False: + if res is False: return [] + + xspf, urlh = res + xspf_url = urlh.geturl() + return self._parse_xspf( xspf, playlist_id, xspf_url=xspf_url, xspf_base_url=base_url(xspf_url)) @@ -2651,7 +2432,6 @@ class InfoExtractor(object): 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), }) - self._sort_formats(formats) entries.append({ 'id': playlist_id, @@ -2682,7 +2462,10 @@ class InfoExtractor(object): mpd_doc, urlh = res if mpd_doc is None: return [], {} - mpd_base_url = base_url(urlh.geturl()) + + # We could have been redirected to a new url when we retrieved our mpd file. 
+ mpd_url = urlh.geturl() + mpd_base_url = base_url(mpd_url) return self._parse_mpd_formats_and_subtitles( mpd_doc, mpd_id, mpd_base_url, mpd_url) @@ -2790,15 +2573,20 @@ class InfoExtractor(object): mime_type = representation_attrib['mimeType'] content_type = representation_attrib.get('contentType', mime_type.split('/')[0]) - codecs = parse_codecs(representation_attrib.get('codecs', '')) + codec_str = representation_attrib.get('codecs', '') + # Some kind of binary subtitle found in some youtube livestreams + if mime_type == 'application/x-rawcc': + codecs = {'scodec': codec_str} + else: + codecs = parse_codecs(codec_str) if content_type not in ('video', 'audio', 'text'): if mime_type == 'image/jpeg': content_type = mime_type - elif codecs['vcodec'] != 'none': + elif codecs.get('vcodec', 'none') != 'none': content_type = 'video' - elif codecs['acodec'] != 'none': + elif codecs.get('acodec', 'none') != 'none': content_type = 'audio' - elif codecs.get('tcodec', 'none') != 'none': + elif codecs.get('scodec', 'none') != 'none': content_type = 'text' elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'): content_type = 'text' @@ -2809,12 +2597,12 @@ class InfoExtractor(object): base_url = '' for element in (representation, adaptation_set, period, mpd_doc): base_url_e = element.find(_add_ns('BaseURL')) - if base_url_e is not None: + if try_call(lambda: base_url_e.text) is not None: base_url = base_url_e.text + base_url if re.match(r'^https?://', base_url): break if mpd_base_url and base_url.startswith('/'): - base_url = compat_urlparse.urljoin(mpd_base_url, base_url) + base_url = urllib.parse.urljoin(mpd_base_url, base_url) elif mpd_base_url and not re.match(r'^https?://', base_url): if not mpd_base_url.endswith('/'): mpd_base_url += '/' @@ -2869,6 +2657,8 @@ class InfoExtractor(object): def prepare_template(template_name, identifiers): tmpl = representation_ms_info[template_name] + if representation_id is not None: + tmpl = 
tmpl.replace('$RepresentationID$', representation_id) # First of, % characters outside $...$ templates # must be escaped by doubling for proper processing # by % operator string formatting used further (see @@ -2883,8 +2673,6 @@ class InfoExtractor(object): t += c # Next, $...$ templates are translated to their # %(...) counterparts to be used with % operator - if representation_id is not None: - t = t.replace('$RepresentationID$', representation_id) t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) t.replace('$$', '$') @@ -2960,8 +2748,8 @@ class InfoExtractor(object): segment_number += 1 segment_time += segment_d elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info: - # No media template - # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI + # No media template, + # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI # or any YouTube dashsegments video fragments = [] segment_index = 0 @@ -2978,7 +2766,7 @@ class InfoExtractor(object): representation_ms_info['fragments'] = fragments elif 'segment_urls' in representation_ms_info: # Segment URLs with no SegmentTimeline - # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 + # E.g. 
https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 # https://github.com/ytdl-org/youtube-dl/pull/14844 fragments = [] segment_duration = float_or_none( @@ -3070,9 +2858,10 @@ class InfoExtractor(object): stream_name = stream.get('Name') stream_language = stream.get('Language', 'und') for track in stream.findall('QualityLevel'): - fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None) + KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'} + fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag')) # TODO: add support for WVC1 and WMAP - if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'): + if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'): self.report_warning('%s is not a supported codec' % fourcc) continue tbr = int(track.attrib['Bitrate']) // 1000 @@ -3084,7 +2873,7 @@ class InfoExtractor(object): sampling_rate = int_or_none(track.get('SamplingRate')) track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) - track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern) + track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern) fragments = [] fragment_ctx = { @@ -3103,7 +2892,7 @@ class InfoExtractor(object): fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat for _ in range(fragment_repeat): fragments.append({ - 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern), + 'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern), 'duration': fragment_ctx['duration'] / stream_timescale, }) fragment_ctx['time'] += fragment_ctx['duration'] @@ -3171,7 +2960,8 @@ class InfoExtractor(object): return f return {} - def _media_formats(src, cur_media_type, type_info={}): + def _media_formats(src, cur_media_type, type_info=None): + type_info = type_info or {} full_url = absolute_url(src) ext = type_info.get('ext') or determine_ext(full_url) if 
ext == 'm3u8': @@ -3189,12 +2979,13 @@ class InfoExtractor(object): formats = [{ 'url': full_url, 'vcodec': 'none' if cur_media_type == 'audio' else None, + 'ext': ext, }] return is_plain_url, formats entries = [] # amp-video and amp-audio are very similar to their HTML5 counterparts - # so we wll include them right here (see + # so we will include them right here (see # https://www.ampproject.org/docs/reference/components/amp-video) # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/ _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)' @@ -3204,8 +2995,8 @@ class InfoExtractor(object): media_tags.extend(re.findall( # We only allow video|audio followed by a whitespace or '>'. # Allowing more characters may end up in significant slow down (see - # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL: - # http://www.porntrex.com/maps/videositemap.xml). + # https://github.com/ytdl-org/youtube-dl/issues/11979, + # e.g. http://www.porntrex.com/maps/videositemap.xml). 
r'(?s)(<(?P%s)(?:\s+[^>]*)?>)(.*?)' % _MEDIA_TAG_NAME_RE, webpage)) for media_tag, _, media_type, media_content in media_tags: media_info = { @@ -3213,9 +3004,10 @@ class InfoExtractor(object): 'subtitles': {}, } media_attributes = extract_attributes(media_tag) - src = strip_or_none(media_attributes.get('src')) + src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source'))) if src: - _, formats = _media_formats(src, media_type) + f = parse_content_type(media_attributes.get('type')) + _, formats = _media_formats(src, media_type, f) media_info['formats'].extend(formats) media_info['thumbnail'] = absolute_url(media_attributes.get('poster')) if media_content: @@ -3223,7 +3015,7 @@ class InfoExtractor(object): s_attr = extract_attributes(source_tag) # data-video-src and data-src are non standard but seen # several times in the wild - src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src'))) + src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source'))) if not src: continue f = parse_content_type(s_attr.get('type')) @@ -3332,7 +3124,7 @@ class InfoExtractor(object): http_f = f.copy() del http_f['manifest_url'] http_url = re.sub( - REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url']) + REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url']) http_f.update({ 'format_id': http_f['format_id'].replace('hls-', protocol + '-'), 'url': http_url, @@ -3344,7 +3136,7 @@ class InfoExtractor(object): return formats, subtitles def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): - query = compat_urlparse.urlparse(url).query + query = urllib.parse.urlparse(url).query url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) mobj = re.search( r'(?:(?:http|rtmp|rtsp)(?Ps)?:)?(?P//[^?]+)', url) @@ -3353,7 +3145,7 @@ class InfoExtractor(object): formats = [] def 
manifest_url(manifest): - m_url = '%s/%s' % (http_base_url, manifest) + m_url = f'{http_base_url}/{manifest}' if query: m_url += '?%s' % query return m_url @@ -3390,7 +3182,7 @@ class InfoExtractor(object): for protocol in ('rtmp', 'rtsp'): if protocol not in skip_protocols: formats.append({ - 'url': '%s:%s' % (protocol, url_base), + 'url': f'{protocol}:{url_base}', 'format_id': protocol, 'protocol': protocol, }) @@ -3450,7 +3242,7 @@ class InfoExtractor(object): if not isinstance(track, dict): continue track_kind = track.get('kind') - if not track_kind or not isinstance(track_kind, compat_str): + if not track_kind or not isinstance(track_kind, str): continue if track_kind.lower() not in ('captions', 'subtitles'): continue @@ -3477,7 +3269,6 @@ class InfoExtractor(object): 'url': formats[0]['url'], }) else: - self._sort_formats(formats) entry['formats'] = formats entries.append(entry) if len(entries) == 1: @@ -3523,13 +3314,14 @@ class InfoExtractor(object): # Often no height is provided but there is a label in # format like "1080p", "720p SD", or 1080. 
height = int_or_none(self._search_regex( - r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''), + r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''), 'height', default=None)) a_format = { 'url': source_url, 'width': int_or_none(source.get('width')), 'height': height, - 'tbr': int_or_none(source.get('bitrate')), + 'tbr': int_or_none(source.get('bitrate'), scale=1000), + 'filesize': int_or_none(source.get('filesize')), 'ext': ext, } if source_url.startswith('rtmp'): @@ -3556,7 +3348,7 @@ class InfoExtractor(object): def _int(self, v, name, fatal=False, **kwargs): res = int_or_none(v, **kwargs) if res is None: - msg = 'Failed to extract %s: Could not parse value %r' % (name, v) + msg = f'Failed to extract {name}: Could not parse value {v!r}' if fatal: raise ExtractorError(msg) else: @@ -3566,7 +3358,7 @@ class InfoExtractor(object): def _float(self, v, name, fatal=False, **kwargs): res = float_or_none(v, **kwargs) if res is None: - msg = 'Failed to extract %s: Could not parse value %r' % (name, v) + msg = f'Failed to extract {name}: Could not parse value {v!r}' if fatal: raise ExtractorError(msg) else: @@ -3575,17 +3367,15 @@ class InfoExtractor(object): def _set_cookie(self, domain, name, value, expire_time=None, port=None, path='/', secure=False, discard=False, rest={}, **kwargs): - cookie = compat_cookiejar_Cookie( + cookie = http.cookiejar.Cookie( 0, name, value, port, port is not None, domain, True, domain.startswith('.'), path, True, secure, expire_time, discard, None, None, rest) - self._downloader.cookiejar.set_cookie(cookie) + self.cookiejar.set_cookie(cookie) def _get_cookies(self, url): - """ Return a compat_cookies_SimpleCookie with the cookies for the url """ - req = sanitized_Request(url) - self._downloader.cookiejar.add_cookie_header(req) - return compat_cookies_SimpleCookie(req.get_header('Cookie')) + """ Return a http.cookies.SimpleCookie with the cookies for the url """ + return 
LenientSimpleCookie(self._downloader._calc_cookies(url)) def _apply_first_set_cookie_header(self, url_handle, cookie): """ @@ -3604,9 +3394,7 @@ class InfoExtractor(object): for header, cookies in url_handle.headers.items(): if header.lower() != 'set-cookie': continue - if sys.version_info[0] >= 3: - cookies = cookies.encode('iso-8859-1') - cookies = cookies.decode('utf-8') + cookies = cookies.encode('iso-8859-1').decode('utf-8') cookie_value = re.search( r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies) if cookie_value: @@ -3614,34 +3402,82 @@ class InfoExtractor(object): self._set_cookie(domain, cookie, value) break - def get_testcases(self, include_onlymatching=False): - t = getattr(self, '_TEST', None) + @classmethod + def get_testcases(cls, include_onlymatching=False): + # Do not look in super classes + t = vars(cls).get('_TEST') if t: - assert not hasattr(self, '_TESTS'), \ - '%s has _TEST and _TESTS' % type(self).__name__ + assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS' tests = [t] else: - tests = getattr(self, '_TESTS', []) + tests = vars(cls).get('_TESTS', []) for t in tests: if not include_onlymatching and t.get('only_matching', False): continue - t['name'] = type(self).__name__[:-len('IE')] + t['name'] = cls.ie_key() yield t - def is_suitable(self, age_limit): - """ Test whether the extractor is generally suitable for the given - age limit (i.e. 
pornographic sites are not, all others usually are) """ - - any_restricted = False - for tc in self.get_testcases(include_onlymatching=False): - if tc.get('playlist', []): - tc = tc['playlist'][0] - is_restricted = age_restricted( - tc.get('info_dict', {}).get('age_limit'), age_limit) - if not is_restricted: - return True - any_restricted = any_restricted or is_restricted - return not any_restricted + @classmethod + def get_webpage_testcases(cls): + tests = vars(cls).get('_WEBPAGE_TESTS', []) + for t in tests: + t['name'] = cls.ie_key() + return tests + + @classproperty(cache=True) + def age_limit(cls): + """Get age limit from the testcases""" + return max(traverse_obj( + (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()), + (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0]) + + @classproperty(cache=True) + def _RETURN_TYPE(cls): + """What the extractor returns: "video", "playlist", "any", or None (Unknown)""" + tests = tuple(cls.get_testcases(include_onlymatching=False)) + if not tests: + return None + elif not any(k.startswith('playlist') for test in tests for k in test): + return 'video' + elif all(any(k.startswith('playlist') for k in test) for test in tests): + return 'playlist' + return 'any' + + @classmethod + def is_single_video(cls, url): + """Returns whether the URL is of a single video, None if unknown""" + assert cls.suitable(url), 'The URL must be suitable for the extractor' + return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE) + + @classmethod + def is_suitable(cls, age_limit): + """Test whether the extractor is generally suitable for the given age limit""" + return not age_restricted(cls.age_limit, age_limit) + + @classmethod + def description(cls, *, markdown=True, search_examples=None): + """Description of the extractor""" + desc = '' + if cls._NETRC_MACHINE: + if markdown: + desc += f' [{cls._NETRC_MACHINE}]' + else: + desc += f' [{cls._NETRC_MACHINE}]' + if cls.IE_DESC is False: + desc += ' 
[HIDDEN]' + elif cls.IE_DESC: + desc += f' {cls.IE_DESC}' + if cls.SEARCH_KEY: + desc += f'; "{cls.SEARCH_KEY}:" prefix' + if search_examples: + _COUNTS = ('', '5', '10', 'all') + desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")' + if not cls.working(): + desc += ' (**Currently broken**)' if markdown else ' (Currently broken)' + + # Escape emojis. Ref: https://github.com/github/markup/issues/1153 + name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME + return f'{name}:{desc}' if desc else name def extract_subtitles(self, *args, **kwargs): if (self.get_param('writesubtitles', False) @@ -3652,6 +3488,9 @@ class InfoExtractor(object): def _get_subtitles(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') + class CommentsDisabled(Exception): + """Raise in _get_comments if comments are disabled for the video""" + def extract_comments(self, *args, **kwargs): if not self.get_param('getcomments'): return None @@ -3667,6 +3506,8 @@ class InfoExtractor(object): interrupted = False except KeyboardInterrupt: self.to_screen('Interrupted by user') + except self.CommentsDisabled: + return {'comments': None, 'comment_count': None} except Exception as e: if self.get_param('ignoreerrors') is not True: raise @@ -3686,7 +3527,7 @@ class InfoExtractor(object): def _merge_subtitle_items(subtitle_list1, subtitle_list2): """ Merge subtitle items for one language. Items with duplicated URLs/data will be dropped. 
""" - list1_data = set((item.get('url'), item.get('data')) for item in subtitle_list1) + list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1} ret = list(subtitle_list1) ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data) return ret @@ -3710,11 +3551,15 @@ class InfoExtractor(object): def _get_automatic_captions(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') + @functools.cached_property + def _cookies_passed(self): + """Whether cookies have been passed to YoutubeDL""" + return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None + def mark_watched(self, *args, **kwargs): if not self.get_param('mark_watched', False): return - if (self.supports_login() and self._get_login_info()[0] is not None - or self.get_param('cookiefile') or self.get_param('cookiesfrombrowser')): + if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed: self._mark_watched(*args, **kwargs) def _mark_watched(self, *args, **kwargs): @@ -3727,11 +3572,15 @@ class InfoExtractor(object): headers['Ytdl-request-proxy'] = geo_verification_proxy return headers - def _generic_id(self, url): - return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) + @staticmethod + def _generic_id(url): + return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) - def _generic_title(self, url): - return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) + def _generic_title(self, url='', webpage='', *, default=None): + return (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, default=None) + or urllib.parse.unquote(os.path.splitext(url_basename(url))[0]) + or default) @staticmethod def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None): @@ -3754,8 +3603,8 @@ 
class InfoExtractor(object): @param default The default value to return when the key is not present (default: []) @param casesense When false, the values are converted to lower case ''' - val = traverse_obj( - self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key)) + ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key() + val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key)) if val is None: return [] if default is NO_DEFAULT else default return list(val) if casesense else [x.lower() for x in val] @@ -3776,6 +3625,72 @@ class InfoExtractor(object): self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}') return True + def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True): + RetryManager.report_retry( + err, _count or int(fatal), _retries, + info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning, + sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor')) + + def RetryManager(self, **kwargs): + return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs) + + def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs): + display_id = traverse_obj(info_dict, 'display_id', 'id') + self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}') + return self._downloader.get_info_extractor('Generic')._extract_embeds( + smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs) + + @classmethod + def extract_from_webpage(cls, ydl, url, webpage): + ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType) + else ydl.get_info_extractor(cls.ie_key())) + for info in ie._extract_from_webpage(url, webpage) or []: + # url = None since we do not want to set (webpage/original)_url + ydl.add_default_extra_info(info, ie, None) + yield info + + @classmethod + def 
_extract_from_webpage(cls, url, webpage): + for embed_url in orderedSet( + cls._extract_embed_urls(url, webpage) or [], lazy=True): + yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls) + + @classmethod + def _extract_embed_urls(cls, url, webpage): + """@returns all the embed urls on the webpage""" + if '_EMBED_URL_RE' not in cls.__dict__: + assert isinstance(cls._EMBED_REGEX, (list, tuple)) + for idx, regex in enumerate(cls._EMBED_REGEX): + assert regex.count('(?P') == 1, \ + f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}' + cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX)) + + for regex in cls._EMBED_URL_RE: + for mobj in regex.finditer(webpage): + embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url'))) + if cls._VALID_URL is False or cls.suitable(embed_url): + yield embed_url + + class StopExtraction(Exception): + pass + + @classmethod + def _extract_url(cls, webpage): # TODO: Remove + """Only for compatibility with some older extractors""" + return next(iter(cls._extract_embed_urls(None, webpage) or []), None) + + @classmethod + def __init_subclass__(cls, *, plugin_name=None, **kwargs): + if plugin_name: + mro = inspect.getmro(cls) + super_class = cls.__wrapped__ = mro[mro.index(cls) + 1] + cls.IE_NAME, cls.ie_key = f'{super_class.IE_NAME}+{plugin_name}', super_class.ie_key + while getattr(super_class, '__wrapped__', None): + super_class = super_class.__wrapped__ + setattr(sys.modules[super_class.__module__], super_class.__name__, cls) + + return super().__init_subclass__(**kwargs) + class SearchInfoExtractor(InfoExtractor): """ @@ -3785,9 +3700,10 @@ class SearchInfoExtractor(InfoExtractor): """ _MAX_RESULTS = float('inf') + _RETURN_TYPE = 'playlist' - @classmethod - def _make_valid_url(cls): + @classproperty + def _VALID_URL(cls): return r'%s(?P|[1-9][0-9]*|all):(?P[\s\S]+)' % cls._SEARCH_KEY def _real_extract(self, query): @@ -3799,7 +3715,7 @@ class 
SearchInfoExtractor(InfoExtractor): else: n = int(prefix) if n <= 0: - raise ExtractorError('invalid download number %s for query "%s"' % (n, query)) + raise ExtractorError(f'invalid download number {n} for query "{query}"') elif n > self._MAX_RESULTS: self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n)) n = self._MAX_RESULTS @@ -3816,6 +3732,15 @@ class SearchInfoExtractor(InfoExtractor): """Returns an iterator of search results""" raise NotImplementedError('This method must be implemented by subclasses') - @property - def SEARCH_KEY(self): - return self._SEARCH_KEY + @classproperty + def SEARCH_KEY(cls): + return cls._SEARCH_KEY + + +class UnsupportedURLIE(InfoExtractor): + _VALID_URL = '.*' + _ENABLED = False + IE_DESC = False + + def _real_extract(self, url): + raise UnsupportedError(url) diff --git a/hypervideo_dl/extractor/commonmistakes.py b/hypervideo_dl/extractor/commonmistakes.py index eb76fe5..a4a38cf 100644 --- a/hypervideo_dl/extractor/commonmistakes.py +++ b/hypervideo_dl/extractor/commonmistakes.py @@ -1,16 +1,10 @@ -from __future__ import unicode_literals - -import sys - from .common import InfoExtractor from ..utils import ExtractorError class CommonMistakesIE(InfoExtractor): IE_DESC = False # Do not list - _VALID_URL = r'''(?x) - (?:url|URL)$ - ''' + _VALID_URL = r'(?:url|URL|hypervideo)$' _TESTS = [{ 'url': 'url', @@ -35,9 +29,7 @@ class UnicodeBOMIE(InfoExtractor): IE_DESC = False _VALID_URL = r'(?P\ufeff)(?P.*)$' - # Disable test for python 3.2 since BOM is broken in re in this version - # (see https://github.com/ytdl-org/youtube-dl/issues/9751) - _TESTS = [] if (3, 0) < sys.version_info <= (3, 3) else [{ + _TESTS = [{ 'url': '\ufeffhttp://www.youtube.com/watch?v=BaW_jenozKc', 'only_matching': True, }] diff --git a/hypervideo_dl/extractor/commonprotocols.py b/hypervideo_dl/extractor/commonprotocols.py index 3708c6a..2f93e8e 100644 --- a/hypervideo_dl/extractor/commonprotocols.py +++ 
b/hypervideo_dl/extractor/commonprotocols.py @@ -1,10 +1,6 @@ -from __future__ import unicode_literals - +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_urlparse, -) class RtmpIE(InfoExtractor): @@ -28,7 +24,7 @@ class RtmpIE(InfoExtractor): 'formats': [{ 'url': url, 'ext': 'flv', - 'format_id': compat_urlparse.urlparse(url).scheme, + 'format_id': urllib.parse.urlparse(url).scheme, }], } diff --git a/hypervideo_dl/extractor/condenast.py b/hypervideo_dl/extractor/condenast.py index 54e7af8..3170c29 100644 --- a/hypervideo_dl/extractor/condenast.py +++ b/hypervideo_dl/extractor/condenast.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -61,7 +58,10 @@ class CondeNastIE(InfoExtractor): )''' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) - EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys()) + _EMBED_REGEX = [r'''(?x) + <(?:iframe|script)[^>]+?src=(["\'])(?P + (?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+? 
+ )\1''' % '|'.join(_SITES.keys())] _TESTS = [{ 'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', @@ -197,7 +197,6 @@ class CondeNastIE(InfoExtractor): 'ext': ext, 'quality': 1 if quality == 'high' else 0, }) - self._sort_formats(formats) subtitles = {} for t, caption in video_info.get('captions', {}).items(): diff --git a/hypervideo_dl/extractor/contv.py b/hypervideo_dl/extractor/contv.py index 84b462d..d69e816 100644 --- a/hypervideo_dl/extractor/contv.py +++ b/hypervideo_dl/extractor/contv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( float_or_none, @@ -72,8 +69,6 @@ class CONtvIE(InfoExtractor): 'url': media_mp4_url, }) - self._sort_formats(formats) - subtitles = {} captions = m_details.get('captions') or {} for caption_url in captions.values(): diff --git a/hypervideo_dl/extractor/corus.py b/hypervideo_dl/extractor/corus.py index 1194613..c03d653 100644 --- a/hypervideo_dl/extractor/corus.py +++ b/hypervideo_dl/extractor/corus.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .theplatform import ThePlatformFeedIE from ..utils import ( dict_get, @@ -11,7 +7,7 @@ from ..utils import ( ) -class CorusIE(ThePlatformFeedIE): +class CorusIE(ThePlatformFeedIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'''(?x) https?:// (?:www\.)? 
@@ -130,7 +126,6 @@ class CorusIE(ThePlatformFeedIE): smil, smil_url, video_id, namespace)) if not formats and video.get('drm'): self.report_drm(video_id) - self._sort_formats(formats) subtitles = {} for track in video.get('tracks', []): diff --git a/hypervideo_dl/extractor/coub.py b/hypervideo_dl/extractor/coub.py index e90aa19..9bab698 100644 --- a/hypervideo_dl/extractor/coub.py +++ b/hypervideo_dl/extractor/coub.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -107,8 +104,6 @@ class CoubIE(InfoExtractor): 'source_preference': preference_key(MOBILE), }) - self._sort_formats(formats) - thumbnail = coub.get('picture') duration = float_or_none(coub.get('duration')) timestamp = parse_iso8601(coub.get('published_at') or coub.get('created_at')) diff --git a/hypervideo_dl/extractor/cozytv.py b/hypervideo_dl/extractor/cozytv.py index d49f1ca..5ef5afc 100644 --- a/hypervideo_dl/extractor/cozytv.py +++ b/hypervideo_dl/extractor/cozytv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import unified_strdate diff --git a/hypervideo_dl/extractor/cpac.py b/hypervideo_dl/extractor/cpac.py index 2274115..0f23f2b 100644 --- a/hypervideo_dl/extractor/cpac.py +++ b/hypervideo_dl/extractor/cpac.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -12,13 +9,6 @@ from ..utils import ( urljoin, ) -# compat_range -try: - if callable(xrange): - range = xrange -except (NameError, TypeError): - pass - class CPACIE(InfoExtractor): IE_NAME = 'cpac' @@ -64,8 +54,6 @@ class CPACIE(InfoExtractor): else: fmt['language_preference'] = -10 - self._sort_formats(formats) - category = str_or_none(content['details']['category_%s_t' % (url_lang, )]) def is_live(v_type): diff --git 
a/hypervideo_dl/extractor/cracked.py b/hypervideo_dl/extractor/cracked.py index f77a68e..c6aabcc 100644 --- a/hypervideo_dl/extractor/cracked.py +++ b/hypervideo_dl/extractor/cracked.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/crackle.py b/hypervideo_dl/extractor/crackle.py index db4962c..4610015 100644 --- a/hypervideo_dl/extractor/crackle.py +++ b/hypervideo_dl/extractor/crackle.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals, division - import hashlib import hmac import re @@ -180,7 +177,6 @@ class CrackleIE(InfoExtractor): }) if not formats and has_drm: self.report_drm(video_id) - self._sort_formats(formats) description = media.get('Description') duration = int_or_none(media.get( diff --git a/hypervideo_dl/extractor/craftsy.py b/hypervideo_dl/extractor/craftsy.py index ed2f442..307bfb9 100644 --- a/hypervideo_dl/extractor/craftsy.py +++ b/hypervideo_dl/extractor/craftsy.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .brightcove import BrightcoveNewIE from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/crooksandliars.py b/hypervideo_dl/extractor/crooksandliars.py index 7fb782d..4de7e3d 100644 --- a/hypervideo_dl/extractor/crooksandliars.py +++ b/hypervideo_dl/extractor/crooksandliars.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -9,6 +7,8 @@ from ..utils import ( class CrooksAndLiarsIE(InfoExtractor): _VALID_URL = r'https?://embed\.crooksandliars\.com/(?:embed|v)/(?P[A-Za-z0-9]+)' + _EMBED_REGEX = [r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1'] + _TESTS = [{ 'url': 'https://embed.crooksandliars.com/embed/8RUoRhRi', 'info_dict': { @@ -45,7 +45,6 @@ class CrooksAndLiarsIE(InfoExtractor): 'format_id': item['type'], 'quality': 
quality(item['type']), } for item in manifest['flavors'] if item['mime'].startswith('video/')] - self._sort_formats(formats) return { 'url': url, diff --git a/hypervideo_dl/extractor/crowdbunker.py b/hypervideo_dl/extractor/crowdbunker.py index 72906af..d83c015 100644 --- a/hypervideo_dl/extractor/crowdbunker.py +++ b/hypervideo_dl/extractor/crowdbunker.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools from .common import InfoExtractor @@ -63,7 +60,6 @@ class CrowdBunkerIE(InfoExtractor): 'width': int_or_none(image.get('width')), } for image in video_json.get('thumbnails') or [] if image.get('url')] - self._sort_formats(formats) return { 'id': id, 'title': video_json.get('title'), diff --git a/hypervideo_dl/extractor/crunchyroll.py b/hypervideo_dl/extractor/crunchyroll.py index 7edb645..d226050 100644 --- a/hypervideo_dl/extractor/crunchyroll.py +++ b/hypervideo_dl/extractor/crunchyroll.py @@ -1,44 +1,16 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 -import re -import json -import zlib +import urllib.parse -from hashlib import sha1 -from math import pow, sqrt, floor from .common import InfoExtractor -from .vrv import VRVBaseIE -from ..compat import ( - compat_b64decode, - compat_etree_Element, - compat_etree_fromstring, - compat_str, - compat_urllib_parse_urlencode, - compat_urllib_request, - compat_urlparse, -) from ..utils import ( ExtractorError, - bytes_to_intlist, - extract_attributes, float_or_none, format_field, - intlist_to_bytes, - int_or_none, join_nonempty, - lowercase_escape, - merge_dicts, + parse_iso8601, qualities, - remove_end, - sanitized_Request, traverse_obj, try_get, - xpath_text, -) -from ..aes import ( - aes_cbc_decrypt, ) @@ -46,16 +18,7 @@ class CrunchyrollBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login' _API_BASE = 'https://api.crunchyroll.com' _NETRC_MACHINE = 'crunchyroll' - - def _call_rpc_api(self, method, video_id, note=None, 
data=None): - data = data or {} - data['req'] = 'RpcApi' + method - data = compat_urllib_parse_urlencode(data).encode('utf-8') - return self._download_xml( - 'https://www.crunchyroll.com/xml/', - video_id, note, fatal=False, data=data, headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }) + params = None def _perform_login(self, username, password): if self._get_cookies(self._LOGIN_URL).get('etp_rt'): @@ -76,7 +39,7 @@ class CrunchyrollBaseIE(InfoExtractor): login_response = self._download_json( f'{self._API_BASE}/login.1.json', None, 'Logging in', - data=compat_urllib_parse_urlencode({ + data=urllib.parse.urlencode({ 'account': username, 'password': password, 'session_id': session_id @@ -86,800 +49,173 @@ class CrunchyrollBaseIE(InfoExtractor): if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): raise ExtractorError('Login succeeded but did not set etp_rt cookie') - # Beta-specific, but needed for redirects - def _get_beta_embedded_json(self, webpage, display_id): + def _get_embedded_json(self, webpage, display_id): initial_state = self._parse_json(self._search_regex( r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id) app_config = self._parse_json(self._search_regex( r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id) return initial_state, app_config - def _redirect_to_beta(self, webpage, iekey, video_id): - if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): - raise ExtractorError('Received a beta page from non-beta url when not logged in.') - initial_state, app_config = self._get_beta_embedded_json(webpage, video_id) - url = app_config['baseSiteUrl'] + initial_state['router']['locations']['current']['pathname'] - self.to_screen(f'{video_id}: Redirected to beta site - {url}') - return self.url_result(f'{url}', iekey, video_id) - - @staticmethod - def _add_skip_wall(url): - parsed_url = compat_urlparse.urlparse(url) - qs = compat_urlparse.parse_qs(parsed_url.query) - # Always force 
skip_wall to bypass maturity wall, namely 18+ confirmation message: - # > This content may be inappropriate for some people. - # > Are you sure you want to continue? - # since it's not disabled by default in crunchyroll account's settings. - # See https://github.com/ytdl-org/youtube-dl/issues/7202. - qs['skip_wall'] = ['1'] - return compat_urlparse.urlunparse( - parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) - - -class CrunchyrollIE(CrunchyrollBaseIE, VRVBaseIE): - IE_NAME = 'crunchyroll' - _VALID_URL = r'https?://(?:(?Pwww|m)\.)?(?Pcrunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P[0-9]+))(?:[/?&]|$)' - _TESTS = [{ - 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', - 'info_dict': { - 'id': '645513', - 'ext': 'mp4', - 'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!', - 'description': 'md5:2d17137920c64f2f49981a7797d275ef', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Yomiuri Telecasting Corporation (YTV)', - 'upload_date': '20131013', - 'url': 're:(?!.*&)', - }, - 'params': { - # rtmp - 'skip_download': True, - }, - 'skip': 'Video gone', - }, { - 'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1', - 'info_dict': { - 'id': '589804', - 'ext': 'flv', - 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11', - 'description': 'md5:2fbc01f90b87e8e9137296f37b461c12', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Danny Choo Network', - 'upload_date': '20120213', - }, - 'params': { - # rtmp - 'skip_download': True, - }, - 'skip': 'Video gone', - }, { - 'url': 'http://www.crunchyroll.com/rezero-starting-life-in-another-world-/episode-5-the-morning-of-our-promise-is-still-distant-702409', - 'info_dict': { - 'id': '702409', - 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Re:Zero Partners', - 'timestamp': 
1462098900, - 'upload_date': '20160501', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.crunchyroll.com/konosuba-gods-blessing-on-this-wonderful-world/episode-1-give-me-deliverance-from-this-judicial-injustice-727589', - 'info_dict': { - 'id': '727589', - 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Kadokawa Pictures Inc.', - 'timestamp': 1484130900, - 'upload_date': '20170111', - 'series': compat_str, - 'season': "KONOSUBA -God's blessing on this wonderful world! 2", - 'season_number': 2, - 'episode': 'Give Me Deliverance From This Judicial Injustice!', - 'episode_number': 1, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', - 'only_matching': True, - }, { - # geo-restricted (US), 18+ maturity wall, non-premium available - 'url': 'http://www.crunchyroll.com/cosplay-complex-ova/episode-1-the-birth-of-the-cosplay-club-565617', - 'only_matching': True, - }, { - # A description with double quotes - 'url': 'http://www.crunchyroll.com/11eyes/episode-1-piros-jszaka-red-night-535080', - 'info_dict': { - 'id': '535080', - 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, - 'uploader': 'Marvelous AQL Inc.', - 'timestamp': 1255512600, - 'upload_date': '20091014', - }, - 'params': { - # Just test metadata extraction - 'skip_download': True, - }, - }, { - # make sure we can extract an uploader name that's not a link - 'url': 'http://www.crunchyroll.com/hakuoki-reimeiroku/episode-1-dawn-of-the-divine-warriors-606899', - 'info_dict': { - 'id': '606899', - 'ext': 'mp4', - 'title': 'Hakuoki Reimeiroku Episode 1 – Dawn of the Divine Warriors', - 'description': 'Ryunosuke was left to die, but Serizawa-san asked him a simple question "Do you want to live?"', - 'uploader': 'Geneon Entertainment', - 'upload_date': '20120717', - }, - 
'params': { - # just test metadata extraction - 'skip_download': True, - }, - 'skip': 'Video gone', - }, { - # A video with a vastly different season name compared to the series name - 'url': 'http://www.crunchyroll.com/nyarko-san-another-crawling-chaos/episode-1-test-590532', - 'info_dict': { - 'id': '590532', - 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, - 'uploader': 'TV TOKYO', - 'timestamp': 1330956000, - 'upload_date': '20120305', - 'series': 'Nyarko-san: Another Crawling Chaos', - 'season': 'Haiyoru! Nyaruani (ONA)', - }, - 'params': { - # Just test metadata extraction - 'skip_download': True, - }, - }, { - 'url': 'http://www.crunchyroll.com/media-723735', - 'only_matching': True, - }, { - 'url': 'https://www.crunchyroll.com/en-gb/mob-psycho-100/episode-2-urban-legends-encountering-rumors-780921', - 'only_matching': True, - }] - - _FORMAT_IDS = { - '360': ('60', '106'), - '480': ('61', '106'), - '720': ('62', '106'), - '1080': ('80', '108'), - } - - def _download_webpage(self, url_or_request, *args, **kwargs): - request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) - else sanitized_Request(url_or_request)) - # Accept-Language must be set explicitly to accept any language to avoid issues - # similar to https://github.com/ytdl-org/youtube-dl/issues/6797. - # Along with IP address Crunchyroll uses Accept-Language to guess whether georestriction - # should be imposed or not (from what I can see it just takes the first language - # ignoring the priority and requires it to correspond the IP). By the way this causes - # Crunchyroll to not work in georestriction cases in some browsers that don't place - # the locale lang first in header. However allowing any language seems to workaround the issue. 
- request.add_header('Accept-Language', '*') - return super(CrunchyrollBaseIE, self)._download_webpage(request, *args, **kwargs) - - def _decrypt_subtitles(self, data, iv, id): - data = bytes_to_intlist(compat_b64decode(data)) - iv = bytes_to_intlist(compat_b64decode(iv)) - id = int(id) - - def obfuscate_key_aux(count, modulo, start): - output = list(start) - for _ in range(count): - output.append(output[-1] + output[-2]) - # cut off start values - output = output[2:] - output = list(map(lambda x: x % modulo + 33, output)) - return output - - def obfuscate_key(key): - num1 = int(floor(pow(2, 25) * sqrt(6.9))) - num2 = (num1 ^ key) << 5 - num3 = key ^ num1 - num4 = num3 ^ (num3 >> 3) ^ num2 - prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2))) - shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest()) - # Extend 160 Bit hash to 256 Bit - return shaHash + [0] * 12 - - key = obfuscate_key(id) - - decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv)) - return zlib.decompress(decrypted_data) - - def _convert_subtitles_to_srt(self, sub_root): - output = '' - - for i, event in enumerate(sub_root.findall('./events/event'), 1): - start = event.attrib['start'].replace('.', ',') - end = event.attrib['end'].replace('.', ',') - text = event.attrib['text'].replace('\\N', '\n') - output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text) - return output - - def _convert_subtitles_to_ass(self, sub_root): - output = '' - - def ass_bool(strvalue): - assvalue = '0' - if strvalue == '1': - assvalue = '-1' - return assvalue - - output = '[Script Info]\n' - output += 'Title: %s\n' % sub_root.attrib['title'] - output += 'ScriptType: v4.00+\n' - output += 'WrapStyle: %s\n' % sub_root.attrib['wrap_style'] - output += 'PlayResX: %s\n' % sub_root.attrib['play_res_x'] - output += 'PlayResY: %s\n' % sub_root.attrib['play_res_y'] - output += """ -[V4+ Styles] -Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, 
Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding -""" - for style in sub_root.findall('./styles/style'): - output += 'Style: ' + style.attrib['name'] - output += ',' + style.attrib['font_name'] - output += ',' + style.attrib['font_size'] - output += ',' + style.attrib['primary_colour'] - output += ',' + style.attrib['secondary_colour'] - output += ',' + style.attrib['outline_colour'] - output += ',' + style.attrib['back_colour'] - output += ',' + ass_bool(style.attrib['bold']) - output += ',' + ass_bool(style.attrib['italic']) - output += ',' + ass_bool(style.attrib['underline']) - output += ',' + ass_bool(style.attrib['strikeout']) - output += ',' + style.attrib['scale_x'] - output += ',' + style.attrib['scale_y'] - output += ',' + style.attrib['spacing'] - output += ',' + style.attrib['angle'] - output += ',' + style.attrib['border_style'] - output += ',' + style.attrib['outline'] - output += ',' + style.attrib['shadow'] - output += ',' + style.attrib['alignment'] - output += ',' + style.attrib['margin_l'] - output += ',' + style.attrib['margin_r'] - output += ',' + style.attrib['margin_v'] - output += ',' + style.attrib['encoding'] - output += '\n' - - output += """ -[Events] -Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text -""" - for event in sub_root.findall('./events/event'): - output += 'Dialogue: 0' - output += ',' + event.attrib['start'] - output += ',' + event.attrib['end'] - output += ',' + event.attrib['style'] - output += ',' + event.attrib['name'] - output += ',' + event.attrib['margin_l'] - output += ',' + event.attrib['margin_r'] - output += ',' + event.attrib['margin_v'] - output += ',' + event.attrib['effect'] - output += ',' + event.attrib['text'] - output += '\n' - - return output - - def _extract_subtitles(self, subtitle): - sub_root = compat_etree_fromstring(subtitle) - return [{ - 'ext': 'srt', - 'data': 
self._convert_subtitles_to_srt(sub_root), - }, { - 'ext': 'ass', - 'data': self._convert_subtitles_to_ass(sub_root), - }] - - def _get_subtitles(self, video_id, webpage): - subtitles = {} - for sub_id, sub_name in re.findall(r'\bssid=([0-9]+)"[^>]+?\btitle="([^"]+)', webpage): - sub_doc = self._call_rpc_api( - 'Subtitle_GetXml', video_id, - 'Downloading subtitles for ' + sub_name, data={ - 'subtitle_script_id': sub_id, - }) - if not isinstance(sub_doc, compat_etree_Element): - continue - sid = sub_doc.get('id') - iv = xpath_text(sub_doc, 'iv', 'subtitle iv') - data = xpath_text(sub_doc, 'data', 'subtitle data') - if not sid or not iv or not data: - continue - subtitle = self._decrypt_subtitles(data, iv, sid).decode('utf-8') - lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) - if not lang_code: - continue - subtitles[lang_code] = self._extract_subtitles(subtitle) - return subtitles - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - - if mobj.group('prefix') == 'm': - mobile_webpage = self._download_webpage(url, video_id, 'Downloading mobile webpage') - webpage_url = self._search_regex(r'', mobile_webpage, 'webpage_url') - else: - webpage_url = 'http://www.' + mobj.group('url') - - webpage = self._download_webpage( - self._add_skip_wall(webpage_url), video_id, - headers=self.geo_verification_headers()) - if re.search(r'
    ', webpage): - return self._redirect_to_beta(webpage, CrunchyrollBetaIE.ie_key(), video_id) - note_m = self._html_search_regex( - r'
    (.+?)
    ', - webpage, 'trailer-notice', default='') - if note_m: - raise ExtractorError(note_m, expected=True) - - mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P{.+?})\]\)', webpage) - if mobj: - msg = json.loads(mobj.group('msg')) - if msg.get('type') == 'error': - raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True) - - if 'To view this, please log in to verify you are 18 or older.' in webpage: - self.raise_login_required() - - media = self._parse_json(self._search_regex( - r'vilos\.config\.media\s*=\s*({.+?});', - webpage, 'vilos media', default='{}'), video_id) - media_metadata = media.get('metadata') or {} - - language = self._search_regex( - r'(?:vilos\.config\.player\.language|LOCALE)\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'language', default=None, group='lang') - - video_title = self._html_search_regex( - (r'(?s)]*>((?:(?!]+itemprop=["\']title["\']|meta[^>]+itemprop=["\']position["\'])[^>]*>(?:(?!', - r'(.+?),\s+-\s+.+? 
Crunchyroll'), - webpage, 'video_title', default=None) - if not video_title: - video_title = re.sub(r'^Watch\s+', '', self._og_search_description(webpage)) - video_title = re.sub(r' {2,}', ' ', video_title) - video_description = (self._parse_json(self._html_search_regex( - r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id, - webpage, 'description', default='{}'), video_id) or media_metadata).get('description') - - thumbnails = [] - thumbnail_url = (self._parse_json(self._html_search_regex( - r'<script type="application\/ld\+json">\n\s*(.+?)<\/script>', - webpage, 'thumbnail_url', default='{}'), video_id)).get('image') - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - 'width': 1920, - 'height': 1080 - }) - - if video_description: - video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) - video_uploader = self._html_search_regex( - # try looking for both an uploader that's a link and one that's not - [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'], - webpage, 'video_uploader', default=False) - - requested_languages = self._configuration_arg('language') - requested_hardsubs = [('' if val == 'none' else val) for val in self._configuration_arg('hardsub')] - language_preference = qualities((requested_languages or [language or ''])[::-1]) - hardsub_preference = qualities((requested_hardsubs or ['', language or ''])[::-1]) - - formats = [] - for stream in media.get('streams', []): - audio_lang = stream.get('audio_lang') or '' - hardsub_lang = stream.get('hardsub_lang') or '' - if (requested_languages and audio_lang.lower() not in requested_languages - or requested_hardsubs and hardsub_lang.lower() not in requested_hardsubs): - continue - vrv_formats = self._extract_vrv_formats( - stream.get('url'), video_id, stream.get('format'), - audio_lang, hardsub_lang) - for f in vrv_formats: - f['language_preference'] = 
language_preference(audio_lang) - f['quality'] = hardsub_preference(hardsub_lang) - formats.extend(vrv_formats) - if not formats: - available_fmts = [] - for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): - attrs = extract_attributes(a) - href = attrs.get('href') - if href and '/freetrial' in href: - continue - available_fmts.append(fmt) - if not available_fmts: - for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'): - available_fmts = re.findall(p, webpage) - if available_fmts: - break - if not available_fmts: - available_fmts = self._FORMAT_IDS.keys() - video_encode_ids = [] - - for fmt in available_fmts: - stream_quality, stream_format = self._FORMAT_IDS[fmt] - video_format = fmt + 'p' - stream_infos = [] - streamdata = self._call_rpc_api( - 'VideoPlayer_GetStandardConfig', video_id, - 'Downloading media info for %s' % video_format, data={ - 'media_id': video_id, - 'video_format': stream_format, - 'video_quality': stream_quality, - 'current_page': url, - }) - if isinstance(streamdata, compat_etree_Element): - stream_info = streamdata.find('./{default}preload/stream_info') - if stream_info is not None: - stream_infos.append(stream_info) - stream_info = self._call_rpc_api( - 'VideoEncode_GetStreamInfo', video_id, - 'Downloading stream info for %s' % video_format, data={ - 'media_id': video_id, - 'video_format': stream_format, - 'video_encode_quality': stream_quality, - }) - if isinstance(stream_info, compat_etree_Element): - stream_infos.append(stream_info) - for stream_info in stream_infos: - video_encode_id = xpath_text(stream_info, './video_encode_id') - if video_encode_id in video_encode_ids: - continue - video_encode_ids.append(video_encode_id) - - video_file = xpath_text(stream_info, './file') - if not video_file: - continue - if video_file.startswith('http'): - formats.extend(self._extract_m3u8_formats( - video_file, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', 
fatal=False)) - continue - - video_url = xpath_text(stream_info, './host') - if not video_url: - continue - metadata = stream_info.find('./metadata') - format_info = { - 'format': video_format, - 'height': int_or_none(xpath_text(metadata, './height')), - 'width': int_or_none(xpath_text(metadata, './width')), - } - - if '.fplive.net/' in video_url: - video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) - parsed_video_url = compat_urlparse.urlparse(video_url) - direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( - netloc='v.lvlt.crcdn.net', - path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) - if self._is_valid_url(direct_video_url, video_id, video_format): - format_info.update({ - 'format_id': 'http-' + video_format, - 'url': direct_video_url, - }) - formats.append(format_info) - continue - - format_info.update({ - 'format_id': 'rtmp-' + video_format, - 'url': video_url, - 'play_path': video_file, - 'ext': 'flv', - }) - formats.append(format_info) - self._sort_formats(formats) - - metadata = self._call_rpc_api( - 'VideoPlayer_GetMediaMetadata', video_id, - note='Downloading media info', data={ - 'media_id': video_id, - }) - - subtitles = {} - for subtitle in media.get('subtitles', []): - subtitle_url = subtitle.get('url') - if not subtitle_url: - continue - subtitles.setdefault(subtitle.get('language', 'enUS'), []).append({ - 'url': subtitle_url, - 'ext': subtitle.get('format', 'ass'), - }) - if not subtitles: - subtitles = self.extract_subtitles(video_id, webpage) - - # webpage provide more accurate data than series_title from XML - series = self._html_search_regex( - r'(?s)<h\d[^>]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)</h\d', - webpage, 'series', fatal=False) - - season = episode = episode_number = duration = None - - if isinstance(metadata, compat_etree_Element): - season = xpath_text(metadata, 'series_title') - episode = xpath_text(metadata, 'episode_title') - episode_number = 
int_or_none(xpath_text(metadata, 'episode_number')) - duration = float_or_none(media_metadata.get('duration'), 1000) - - if not episode: - episode = media_metadata.get('title') - if not episode_number: - episode_number = int_or_none(media_metadata.get('episode_number')) - thumbnail_url = try_get(media, lambda x: x['thumbnail']['url']) - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - 'width': 640, - 'height': 360 - }) - - season_number = int_or_none(self._search_regex( - r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)', - webpage, 'season number', default=None)) - - info = self._search_json_ld(webpage, video_id, default={}) - - return merge_dicts({ - 'id': video_id, - 'title': video_title, - 'description': video_description, - 'duration': duration, - 'thumbnails': thumbnails, - 'uploader': video_uploader, - 'series': series, - 'season': season, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, - 'subtitles': subtitles, - 'formats': formats, - }, info) - - -class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): - IE_NAME = 'crunchyroll:playlist' - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:\w{1,2}/)?(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)' - - _TESTS = [{ - 'url': 'https://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', - 'info_dict': { - 'id': 'a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', - 'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi' - }, - 'playlist_count': 13, - }, { - # geo-restricted (US), 18+ maturity wall, non-premium available - 'url': 'http://www.crunchyroll.com/cosplay-complex-ova', - 'info_dict': { - 'id': 'cosplay-complex-ova', - 'title': 'Cosplay Complex OVA' - }, - 'playlist_count': 3, - 'skip': 'Georestricted', - }, { - # geo-restricted (US), 18+ maturity wall, non-premium will 
be available since 2015.11.14 - 'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1', - 'only_matching': True, - }, { - 'url': 'http://www.crunchyroll.com/fr/ladies-versus-butlers', - 'only_matching': True, - }] - - def _real_extract(self, url): - show_id = self._match_id(url) - - webpage = self._download_webpage( - # https:// gives a 403, but http:// does not - self._add_skip_wall(url).replace('https://', 'http://'), show_id, - headers=self.geo_verification_headers()) - if re.search(r'<div id="preload-data">', webpage): - return self._redirect_to_beta(webpage, CrunchyrollBetaShowIE.ie_key(), show_id) - title = self._html_search_meta('name', webpage, default=None) - - episode_re = r'<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"' - season_re = r'<a [^>]+season-dropdown[^>]+>([^<]+)' - paths = re.findall(f'(?s){episode_re}|{season_re}', webpage) - - entries, current_season = [], None - for ep_id, ep, season in paths: - if season: - current_season = season - continue - entries.append(self.url_result( - f'http://www.crunchyroll.com{ep}', CrunchyrollIE.ie_key(), ep_id, season=current_season)) - - return { - '_type': 'playlist', - 'id': show_id, - 'title': title, - 'entries': reversed(entries), - } + def _get_params(self, lang): + if not CrunchyrollBaseIE.params: + if self._get_cookies(f'https://www.crunchyroll.com/{lang}').get('etp_rt'): + grant_type, key = 'etp_rt_cookie', 'accountAuthClientId' + else: + grant_type, key = 'client_id', 'anonClientId' + initial_state, app_config = self._get_embedded_json(self._download_webpage( + f'https://www.crunchyroll.com/{lang}', None, note='Retrieving main page'), None) + api_domain = app_config['cxApiParams']['apiDomain'].replace('beta.crunchyroll.com', 'www.crunchyroll.com') -class CrunchyrollBetaBaseIE(CrunchyrollBaseIE): - params = None - - def _get_params(self, lang): - if not CrunchyrollBetaBaseIE.params: - initial_state, app_config = self._get_beta_embedded_json(self._download_webpage( - 
f'https://beta.crunchyroll.com/{lang}', None, note='Retrieving main page'), None) - api_domain = app_config['cxApiParams']['apiDomain'] - basic_token = str(base64.b64encode(('%s:' % app_config['cxApiParams']['accountAuthClientId']).encode('ascii')), 'ascii') auth_response = self._download_json( - f'{api_domain}/auth/v1/token', None, note='Authenticating with cookie', + f'{api_domain}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', headers={ - 'Authorization': 'Basic ' + basic_token - }, data='grant_type=etp_rt_cookie'.encode('ascii')) + 'Authorization': 'Basic ' + str(base64.b64encode(('%s:' % app_config['cxApiParams'][key]).encode('ascii')), 'ascii') + }, data=f'grant_type={grant_type}'.encode('ascii')) policy_response = self._download_json( f'{api_domain}/index/v2', None, note='Retrieving signed policy', headers={ 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token'] }) - bucket = policy_response['cms']['bucket'] + cms = policy_response.get('cms_web') + bucket = cms['bucket'] params = { - 'Policy': policy_response['cms']['policy'], - 'Signature': policy_response['cms']['signature'], - 'Key-Pair-Id': policy_response['cms']['key_pair_id'] + 'Policy': cms['policy'], + 'Signature': cms['signature'], + 'Key-Pair-Id': cms['key_pair_id'] } locale = traverse_obj(initial_state, ('localization', 'locale')) if locale: params['locale'] = locale - CrunchyrollBetaBaseIE.params = (api_domain, bucket, params) - return CrunchyrollBetaBaseIE.params - - def _redirect_from_beta(self, url, lang, internal_id, display_id, is_episode, iekey): - initial_state, app_config = self._get_beta_embedded_json(self._download_webpage(url, display_id), display_id) - content_data = initial_state['content']['byId'][internal_id] - if is_episode: - video_id = content_data['external_id'].split('.')[1] - series_id = content_data['episode_metadata']['series_slug_title'] - else: - series_id = content_data['slug_title'] - series_id = re.sub(r'-{2,}', 
'-', series_id) - url = f'https://www.crunchyroll.com/{lang}{series_id}' - if is_episode: - url = url + f'/{display_id}-{video_id}' - self.to_screen(f'{display_id}: Not logged in. Redirecting to non-beta site - {url}') - return self.url_result(url, iekey, display_id) + CrunchyrollBaseIE.params = (api_domain, bucket, params) + return CrunchyrollBaseIE.params -class CrunchyrollBetaIE(CrunchyrollBetaBaseIE): - IE_NAME = 'crunchyroll:beta' - _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' +class CrunchyrollBetaIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll' + _VALID_URL = r'''(?x) + https?://(?:beta|www)\.crunchyroll\.com/ + (?P<lang>(?:\w{2}(?:-\w{2})?/)?) + watch/(?P<id>\w+) + (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)''' _TESTS = [{ - 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', + 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', 'info_dict': { - 'id': '696363', + 'id': 'GY2P1Q98Y', 'ext': 'mp4', - 'timestamp': 1459610100, + 'duration': 1380.241, + 'timestamp': 1459632600, 'description': 'md5:a022fbec4fbb023d43631032c91ed64b', - 'uploader': 'Toei Animation', 'title': 'World Trigger Episode 73 – To the Future', 'upload_date': '20160402', - 'episode_number': 73, 'series': 'World Trigger', - 'average_rating': 4.9, - 'episode': 'To the Future', + 'series_id': 'GR757DMKY', 'season': 'World Trigger', - 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/c870dedca1a83137c2d3d144984155ed1459527119_main.jpg', + 'season_id': 'GR9P39NJ6', 'season_number': 1, + 'episode': 'To the Future', + 'episode_number': 73, + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$', }, - 'params': {'skip_download': 'm3u8'}, - 'expected_warnings': ['Unable to download XML'] + 'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'}, }, { - 'url': 'https://beta.crunchyroll.com/watch/GYK53DMPR/wicked-lord-shingan-reborn', + 'url': 
'https://www.crunchyroll.com/watch/GYE5WKQGR', 'info_dict': { - 'id': '648781', + 'id': 'GYE5WKQGR', 'ext': 'mp4', - 'episode_number': 1, - 'timestamp': 1389173400, - 'series': 'Love, Chunibyo & Other Delusions - Heart Throb -', - 'description': 'md5:5579d1a0355cc618558ba23d27067a62', - 'uploader': 'TBS', - 'episode': 'Wicked Lord Shingan... Reborn', - 'average_rating': 4.9, - 'season': 'Love, Chunibyo & Other Delusions - Heart Throb -', - 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/2ba0384e225a5370d5f0ee9496d91ea51389046521_main.jpg', - 'title': 'Love, Chunibyo & Other Delusions - Heart Throb - Episode 1 – Wicked Lord Shingan... Reborn', - 'season_number': 2, - 'upload_date': '20140108', + 'duration': 366.459, + 'timestamp': 1476788400, + 'description': 'md5:74b67283ffddd75f6e224ca7dc031e76', + 'title': 'SHELTER Episode – Porter Robinson presents Shelter the Animation', + 'upload_date': '20161018', + 'series': 'SHELTER', + 'series_id': 'GYGG09WWY', + 'season': 'SHELTER', + 'season_id': 'GR09MGK4R', + 'season_number': 1, + 'episode': 'Porter Robinson presents Shelter the Animation', + 'episode_number': 0, + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$', }, - 'params': {'skip_download': 'm3u8'}, - 'expected_warnings': ['Unable to download XML'] + 'params': {'skip_download': True}, + 'skip': 'Video is Premium only', + }, { + 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y', + 'only_matching': True, }, { - 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/', + 'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy', 'only_matching': True, }] def _real_extract(self, url): lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - - if not self._get_cookies(url).get('etp_rt'): - return self._redirect_from_beta(url, lang, internal_id, display_id, True, CrunchyrollIE.ie_key()) - api_domain, bucket, params = self._get_params(lang) episode_response = self._download_json( 
f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id, - note='Retrieving episode metadata', - query=params) + note='Retrieving episode metadata', query=params) if episode_response.get('is_premium_only') and not episode_response.get('playback'): raise ExtractorError('This video is for premium members only.', expected=True) - stream_response = self._download_json( - episode_response['playback'], display_id, - note='Retrieving stream info') - thumbnails = [] - for thumbnails_data in traverse_obj(episode_response, ('images', 'thumbnail')): - for thumbnail_data in thumbnails_data: - thumbnails.append({ - 'url': thumbnail_data.get('source'), - 'width': thumbnail_data.get('width'), - 'height': thumbnail_data.get('height'), - }) - subtitles = {} - for lang, subtitle_data in stream_response.get('subtitles').items(): - subtitles[lang] = [{ - 'url': subtitle_data.get('url'), - 'ext': subtitle_data.get('format') - }] + stream_response = self._download_json( + f'{api_domain}{episode_response["__links__"]["streams"]["href"]}', display_id, + note='Retrieving stream info', query=params) + get_streams = lambda name: (traverse_obj(stream_response, name) or {}).items() requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] hardsub_preference = qualities(requested_hardsubs[::-1]) requested_formats = self._configuration_arg('format') or ['adaptive_hls'] - formats = [] - for stream_type, streams in stream_response.get('streams', {}).items(): + available_formats = {} + for stream_type, streams in get_streams('streams'): if stream_type not in requested_formats: continue for stream in streams.values(): - hardsub_lang = stream.get('hardsub_locale') or '' - if hardsub_lang.lower() not in requested_hardsubs: - continue - format_id = join_nonempty( - stream_type, - format_field(stream, 'hardsub_locale', 'hardsub-%s')) if not stream.get('url'): continue - if stream_type.split('_')[-1] == 'hls': + hardsub_lang = 
stream.get('hardsub_locale') or '' + format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s')) + available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url']) + + if '' in available_formats and 'all' not in requested_hardsubs: + full_format_langs = set(requested_hardsubs) + self.to_screen( + 'To get all formats of a hardsub language, use ' + '"--extractor-args crunchyrollbeta:hardsub=<language_code or all>". ' + 'See https://github.com/hypervideo/hypervideo#crunchyrollbeta for more info', + only_once=True) + else: + full_format_langs = set(map(str.lower, available_formats)) + + formats = [] + for stream_type, format_id, hardsub_lang, stream_url in available_formats.values(): + if stream_type.endswith('hls'): + if hardsub_lang.lower() in full_format_langs: adaptive_formats = self._extract_m3u8_formats( - stream['url'], display_id, 'mp4', m3u8_id=format_id, - note='Downloading %s information' % format_id, - fatal=False) - elif stream_type.split('_')[-1] == 'dash': - adaptive_formats = self._extract_mpd_formats( - stream['url'], display_id, mpd_id=format_id, - note='Downloading %s information' % format_id, - fatal=False) - for f in adaptive_formats: - if f.get('acodec') != 'none': - f['language'] = stream_response.get('audio_locale') - f['quality'] = hardsub_preference(hardsub_lang.lower()) - formats.extend(adaptive_formats) - self._sort_formats(formats) + stream_url, display_id, 'mp4', m3u8_id=format_id, + fatal=False, note=f'Downloading {format_id} HLS manifest') + else: + adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),) + elif stream_type.endswith('dash'): + adaptive_formats = self._extract_mpd_formats( + stream_url, display_id, mpd_id=format_id, + fatal=False, note=f'Downloading {format_id} MPD manifest') + else: + self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True) + continue + for f in adaptive_formats: + if 
f.get('acodec') != 'none': + f['language'] = stream_response.get('audio_locale') + f['quality'] = hardsub_preference(hardsub_lang.lower()) + formats.extend(adaptive_formats) return { 'id': internal_id, - 'title': '%s Episode %s – %s' % (episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')), - 'description': episode_response.get('description').replace(r'\r\n', '\n'), + 'title': '%s Episode %s – %s' % ( + episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')), + 'description': try_get(episode_response, lambda x: x['description'].replace(r'\r\n', '\n')), 'duration': float_or_none(episode_response.get('duration_ms'), 1000), - 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(episode_response.get('upload_date')), 'series': episode_response.get('series_title'), 'series_id': episode_response.get('series_id'), 'season': episode_response.get('season_title'), @@ -887,39 +223,42 @@ class CrunchyrollBetaIE(CrunchyrollBetaBaseIE): 'season_number': episode_response.get('season_number'), 'episode': episode_response.get('title'), 'episode_number': episode_response.get('sequence_number'), - 'subtitles': subtitles, - 'formats': formats + 'formats': formats, + 'thumbnails': [{ + 'url': thumb.get('source'), + 'width': thumb.get('width'), + 'height': thumb.get('height'), + } for thumb in traverse_obj(episode_response, ('images', 'thumbnail', ..., ...)) or []], + 'subtitles': { + lang: [{ + 'url': subtitle_data.get('url'), + 'ext': subtitle_data.get('format') + }] for lang, subtitle_data in get_streams('subtitles') + }, } -class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE): - IE_NAME = 'crunchyroll:playlist:beta' - _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' +class CrunchyrollBetaShowIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:playlist' + _VALID_URL = r'''(?x) + https?://(?:beta|www)\.crunchyroll\.com/ + 
(?P<lang>(?:\w{2}(?:-\w{2})?/)?) + series/(?P<id>\w+) + (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)''' _TESTS = [{ - 'url': 'https://beta.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', + 'url': 'https://www.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', 'info_dict': { - 'id': 'girl-friend-beta', + 'id': 'GY19NQ2QR', 'title': 'Girl Friend BETA', }, 'playlist_mincount': 10, }, { - 'url': 'https://beta.crunchyroll.com/series/GYJQV73V6/love-chunibyo--other-delusions---heart-throb--', - 'info_dict': { - 'id': 'love-chunibyo-other-delusions-heart-throb-', - 'title': 'Love, Chunibyo & Other Delusions - Heart Throb -', - }, - 'playlist_mincount': 10, - }, { - 'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR/Girl-Friend-BETA', + 'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR', 'only_matching': True, }] def _real_extract(self, url): lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - - if not self._get_cookies(url).get('etp_rt'): - return self._redirect_from_beta(url, lang, internal_id, display_id, False, CrunchyrollShowPlaylistIE.ie_key()) - api_domain, bucket, params = self._get_params(lang) series_response = self._download_json( @@ -940,7 +279,7 @@ class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE): episode_display_id = episode['slug_title'] yield { '_type': 'url', - 'url': f'https://beta.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}', + 'url': f'https://www.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}', 'ie_key': CrunchyrollBetaIE.ie_key(), 'id': episode_id, 'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')), diff --git a/hypervideo_dl/extractor/cspan.py b/hypervideo_dl/extractor/cspan.py index f51159b..0075680 100644 --- a/hypervideo_dl/extractor/cspan.py +++ b/hypervideo_dl/extractor/cspan.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -165,7 +163,7 @@ 
class CSpanIE(InfoExtractor): video_id = m.group('id') video_type = 'program' if m.group('type') == 'prog' else 'clip' else: - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) + senate_isvp_url = SenateISVPIE._extract_url(webpage) if senate_isvp_url: title = self._og_search_title(webpage) surl = smuggle_url(senate_isvp_url, {'force_title': title}) @@ -220,7 +218,6 @@ class CSpanIE(InfoExtractor): path, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }] add_referer(formats) - self._sort_formats(formats) entries.append({ 'id': '%s_%d' % (video_id, partnum + 1), 'title': ( @@ -277,8 +274,7 @@ class CSpanCongressIE(InfoExtractor): self._search_regex(r'jwsetup\s*=\s*({(?:.|\n)[^;]+});', webpage, 'player config'), video_id, transform_source=js_to_json) - title = (self._og_search_title(webpage, default=None) - or self._html_extract_title(webpage, 'video title')) + title = self._generic_title('', webpage) description = (self._og_search_description(webpage, default=None) or self._html_search_meta('description', webpage, 'description', default=None)) diff --git a/hypervideo_dl/extractor/ctsnews.py b/hypervideo_dl/extractor/ctsnews.py index 679f1d9..cec178f 100644 --- a/hypervideo_dl/extractor/ctsnews.py +++ b/hypervideo_dl/extractor/ctsnews.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import unified_timestamp from .youtube import YoutubeIE diff --git a/hypervideo_dl/extractor/ctv.py b/hypervideo_dl/extractor/ctv.py index 756bcc2..f125c1c 100644 --- a/hypervideo_dl/extractor/ctv.py +++ b/hypervideo_dl/extractor/ctv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/ctvnews.py b/hypervideo_dl/extractor/ctvnews.py index 952f4c7..ad3f0d8 100644 --- a/hypervideo_dl/extractor/ctvnews.py +++ b/hypervideo_dl/extractor/ctvnews.py 
@@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/cultureunplugged.py b/hypervideo_dl/extractor/cultureunplugged.py index 9002e4c..2fb2280 100644 --- a/hypervideo_dl/extractor/cultureunplugged.py +++ b/hypervideo_dl/extractor/cultureunplugged.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import time from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/curiositystream.py b/hypervideo_dl/extractor/curiositystream.py index b8abcf7..26cf24f 100644 --- a/hypervideo_dl/extractor/curiositystream.py +++ b/hypervideo_dl/extractor/curiositystream.py @@ -1,15 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor -from ..utils import ( - int_or_none, - urlencode_postdata, - compat_str, - ExtractorError, -) +from ..compat import compat_str +from ..utils import ExtractorError, int_or_none, urlencode_postdata class CuriosityStreamBaseIE(InfoExtractor): @@ -26,6 +19,11 @@ class CuriosityStreamBaseIE(InfoExtractor): def _call_api(self, path, video_id, query=None): headers = {} + if not self._auth_token: + auth_cookie = self._get_cookies('https://curiositystream.com').get('auth_token') + if auth_cookie: + self.write_debug('Obtained auth_token cookie') + self._auth_token = auth_cookie.value if self._auth_token: headers['X-Auth-Token'] = self._auth_token result = self._download_json( @@ -48,7 +46,7 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): IE_NAME = 'curiositystream' _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)' _TESTS = [{ - 'url': 'https://app.curiositystream.com/video/2', + 'url': 'http://app.curiositystream.com/video/2', 'info_dict': { 'id': '2', 'ext': 'mp4', @@ -119,7 +117,6 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): 'format_id': 'http', }) formats.append(fmt) - self._sort_formats(formats) title = media['title'] diff --git 
a/hypervideo_dl/extractor/cwtv.py b/hypervideo_dl/extractor/cwtv.py index 7338243..9b83264 100644 --- a/hypervideo_dl/extractor/cwtv.py +++ b/hypervideo_dl/extractor/cwtv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -94,4 +91,5 @@ class CWTVIE(InfoExtractor): 'timestamp': parse_iso8601(video_data.get('start_time')), 'age_limit': parse_age_limit(video_data.get('rating')), 'ie_key': 'ThePlatform', + 'thumbnail': video_data.get('large_thumbnail') } diff --git a/hypervideo_dl/extractor/cybrary.py b/hypervideo_dl/extractor/cybrary.py index c278f0f..73f2439 100644 --- a/hypervideo_dl/extractor/cybrary.py +++ b/hypervideo_dl/extractor/cybrary.py @@ -1,12 +1,10 @@ -# coding: utf-8 from .common import InfoExtractor - from ..utils import ( ExtractorError, smuggle_url, str_or_none, traverse_obj, - urlencode_postdata + urlencode_postdata, ) diff --git a/hypervideo_dl/extractor/daftsex.py b/hypervideo_dl/extractor/daftsex.py index 6037fd9..551d5e3 100644 --- a/hypervideo_dl/extractor/daftsex.py +++ b/hypervideo_dl/extractor/daftsex.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_b64decode from ..utils import ( @@ -84,7 +81,6 @@ class DaftsexIE(InfoExtractor): 'height': int_or_none(height), 'ext': ext, }) - self._sort_formats(formats) return { 'id': video_id, @@ -120,7 +116,6 @@ class DaftsexIE(InfoExtractor): 'height': int_or_none(height), 'ext': ext, }) - self._sort_formats(formats) thumbnails = [] for k, v in item.items(): diff --git a/hypervideo_dl/extractor/dailymail.py b/hypervideo_dl/extractor/dailymail.py index 67b88fd..43401e1 100644 --- a/hypervideo_dl/extractor/dailymail.py +++ b/hypervideo_dl/extractor/dailymail.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..compat import compat_str 
from ..utils import ( @@ -15,6 +10,7 @@ from ..utils import ( class DailyMailIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/(?:video/[^/]+/video-|embed/video/)(?P<id>[0-9]+)' + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)'] _TESTS = [{ 'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html', 'md5': 'f6129624562251f628296c3a9ffde124', @@ -29,12 +25,6 @@ class DailyMailIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)', - webpage) - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -73,7 +63,6 @@ class DailyMailIE(InfoExtractor): 'protocol': protocol, 'ext': 'mp4' if is_hls else None, }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/dailymotion.py b/hypervideo_dl/extractor/dailymotion.py index 9cb5618..2a44718 100644 --- a/hypervideo_dl/extractor/dailymotion.py +++ b/hypervideo_dl/extractor/dailymotion.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools import json import re @@ -8,13 +5,15 @@ import re from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( + ExtractorError, + OnDemandPagedList, age_restricted, clean_html, - ExtractorError, int_or_none, - OnDemandPagedList, + traverse_obj, try_get, unescapeHTML, + unsmuggle_url, urlencode_postdata, ) @@ -100,6 +99,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): [/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))? 
''' IE_NAME = 'dailymotion' + _EMBED_REGEX = [r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1'] _TESTS = [{ 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news', 'md5': '074b95bdee76b9e3654137aee9c79dfe', @@ -209,20 +209,16 @@ class DailymotionIE(DailymotionBaseInfoExtractor): } xid''' - @staticmethod - def _extract_urls(webpage): - urls = [] - # Look for embedded Dailymotion player + @classmethod + def _extract_embed_urls(cls, url, webpage): # https://developer.dailymotion.com/player#player-parameters - for mobj in re.finditer( - r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage): - urls.append(unescapeHTML(mobj.group('url'))) + yield from super()._extract_embed_urls(url, webpage) for mobj in re.finditer( r'(?s)DM\.player\([^,]+,\s*{.*?video[\'"]?\s*:\s*["\']?(?P<id>[0-9a-zA-Z]+).+?}\s*\);', webpage): - urls.append('https://www.dailymotion.com/embed/video/' + mobj.group('id')) - return urls + yield from 'https://www.dailymotion.com/embed/video/' + mobj.group('id') def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url) video_id, playlist_id = self._match_valid_url(url).groups() if playlist_id: @@ -255,7 +251,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): metadata = self._download_json( 'https://www.dailymotion.com/player/metadata/video/' + xid, xid, 'Downloading metadata JSON', - query={'app': 'com.dailymotion.neon'}) + query=traverse_obj(smuggled_data, 'query') or {'app': 'com.dailymotion.neon'}) error = metadata.get('error') if error: @@ -297,7 +293,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor): f['url'] = f['url'].split('#')[0] if not f.get('fps') and f['format_id'].endswith('@60'): f['fps'] = 60 - 
self._sort_formats(formats) subtitles = {} subtitles_data = try_get(metadata, lambda x: x['subtitles']['data'], dict) or {} @@ -378,6 +373,15 @@ class DailymotionPlaylistIE(DailymotionPlaylistBaseIE): }] _OBJECT_TYPE = 'collection' + @classmethod + def _extract_embed_urls(cls, url, webpage): + # Look for embedded Dailymotion playlist player (#3822) + for mobj in re.finditer( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', + webpage): + for p in re.findall(r'list\[\]=/playlist/([^/]+)/', unescapeHTML(mobj.group('url'))): + yield '//dailymotion.com/playlist/%s' % p + class DailymotionUserIE(DailymotionPlaylistBaseIE): IE_NAME = 'dailymotion:user' diff --git a/hypervideo_dl/extractor/dailywire.py b/hypervideo_dl/extractor/dailywire.py new file mode 100644 index 0000000..f177c9d --- /dev/null +++ b/hypervideo_dl/extractor/dailywire.py @@ -0,0 +1,113 @@ +from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + join_nonempty, + traverse_obj, + url_or_none, +) + + +class DailyWireBaseIE(InfoExtractor): + _JSON_PATH = { + 'episode': ('props', 'pageProps', 'episodeData', 'episode'), + 'videos': ('props', 'pageProps', 'videoData', 'video'), + 'podcasts': ('props', 'pageProps', 'episode'), + } + + def _get_json(self, url): + sites_type, slug = self._match_valid_url(url).group('sites_type', 'id') + json_data = self._search_nextjs_data(self._download_webpage(url, slug), slug) + return slug, traverse_obj(json_data, self._JSON_PATH[sites_type]) + + +class DailyWireIE(DailyWireBaseIE): + _VALID_URL = r'https?://(?:www\.)dailywire(?:\.com)/(?P<sites_type>episode|videos)/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.dailywire.com/episode/1-fauci', + 'info_dict': { + 'id': 'ckzsl50xnqpy30850in3v4bu7', + 'ext': 'mp4', + 'display_id': '1-fauci', + 'title': '1. 
Fauci', + 'description': 'md5:9df630347ef85081b7e97dd30bc22853', + 'thumbnail': 'https://daily-wire-production.imgix.net/episodes/ckzsl50xnqpy30850in3v4bu7/ckzsl50xnqpy30850in3v4bu7-1648237399554.jpg', + 'creator': 'Caroline Roberts', + 'series_id': 'ckzplm0a097fn0826r2vc3j7h', + 'series': 'China: The Enemy Within', + } + }, { + 'url': 'https://www.dailywire.com/episode/ep-124-bill-maher', + 'info_dict': { + 'id': 'cl0ngbaalplc80894sfdo9edf', + 'ext': 'mp3', + 'display_id': 'ep-124-bill-maher', + 'title': 'Ep. 124 - Bill Maher', + 'thumbnail': 'https://daily-wire-production.imgix.net/episodes/cl0ngbaalplc80894sfdo9edf/cl0ngbaalplc80894sfdo9edf-1647065568518.jpg', + 'creator': 'Caroline Roberts', + 'description': 'md5:adb0de584bcfa9c41374999d9e324e98', + 'series_id': 'cjzvep7270hp00786l9hwccob', + 'series': 'The Sunday Special', + } + }, { + 'url': 'https://www.dailywire.com/videos/the-hyperions', + 'only_matching': True, + }] + + def _real_extract(self, url): + slug, episode_info = self._get_json(url) + urls = traverse_obj( + episode_info, (('segments', 'videoUrl'), ..., ('video', 'audio')), expected_type=url_or_none) + + formats, subtitles = [], {} + for url in urls: + if determine_ext(url) != 'm3u8': + formats.append({'url': url}) + continue + format_, subs_ = self._extract_m3u8_formats_and_subtitles(url, slug) + formats.extend(format_) + self._merge_subtitles(subs_, target=subtitles) + return { + 'id': episode_info['id'], + 'display_id': slug, + 'title': traverse_obj(episode_info, 'title', 'name'), + 'description': episode_info.get('description'), + 'creator': join_nonempty(('createdBy', 'firstName'), ('createdBy', 'lastName'), from_dict=episode_info, delim=' '), + 'duration': float_or_none(episode_info.get('duration')), + 'is_live': episode_info.get('isLive'), + 'thumbnail': traverse_obj(episode_info, 'thumbnail', 'image', expected_type=url_or_none), + 'formats': formats, + 'subtitles': subtitles, + 'series_id': traverse_obj(episode_info, ('show', 'id')), + 
'series': traverse_obj(episode_info, ('show', 'name')), + } + + +class DailyWirePodcastIE(DailyWireBaseIE): + _VALID_URL = r'https?://(?:www\.)dailywire(?:\.com)/(?P<sites_type>podcasts)/(?P<podcaster>[\w-]+/(?P<id>[\w-]+))' + _TESTS = [{ + 'url': 'https://www.dailywire.com/podcasts/morning-wire/get-ready-for-recession-6-15-22', + 'info_dict': { + 'id': 'cl4f01d0w8pbe0a98ydd0cfn1', + 'ext': 'm4a', + 'display_id': 'get-ready-for-recession-6-15-22', + 'title': 'Get Ready for Recession | 6.15.22', + 'description': 'md5:c4afbadda4e1c38a4496f6d62be55634', + 'thumbnail': 'https://daily-wire-production.imgix.net/podcasts/ckx4otgd71jm508699tzb6hf4-1639506575562.jpg', + 'duration': 900.117667, + } + }] + + def _real_extract(self, url): + slug, episode_info = self._get_json(url) + audio_id = traverse_obj(episode_info, 'audioMuxPlaybackId', 'VUsAipTrBVSgzw73SpC2DAJD401TYYwEp') + + return { + 'id': episode_info['id'], + 'url': f'https://stream.media.dailywire.com/{audio_id}/audio.m4a', + 'display_id': slug, + 'title': episode_info.get('title'), + 'duration': float_or_none(episode_info.get('duration')), + 'thumbnail': episode_info.get('thumbnail'), + 'description': episode_info.get('description'), + } diff --git a/hypervideo_dl/extractor/damtomo.py b/hypervideo_dl/extractor/damtomo.py index 456cd35..0e08e4f 100644 --- a/hypervideo_dl/extractor/damtomo.py +++ b/hypervideo_dl/extractor/damtomo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -39,7 +36,6 @@ class DamtomoBaseIE(InfoExtractor): if not m3u8_url: raise ExtractorError('Failed to obtain m3u8 URL') formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/daum.py b/hypervideo_dl/extractor/daum.py index 4362e92..3ef5140 100644 --- a/hypervideo_dl/extractor/daum.py +++ b/hypervideo_dl/extractor/daum.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from 
__future__ import unicode_literals - import itertools from .common import InfoExtractor @@ -129,7 +125,7 @@ class DaumClipIE(DaumBaseIE): self._KAKAO_EMBED_BASE + video_id, 'Kakao', video_id) -class DaumListIE(InfoExtractor): +class DaumListIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor def _get_entries(self, list_id, list_id_type): name = None entries = [] diff --git a/hypervideo_dl/extractor/daystar.py b/hypervideo_dl/extractor/daystar.py index 4f59d90..ef3520a 100644 --- a/hypervideo_dl/extractor/daystar.py +++ b/hypervideo_dl/extractor/daystar.py @@ -36,7 +36,6 @@ class DaystarClipIE(InfoExtractor): video_id, 'mp4', fatal=False, headers={'Referer': src_iframe}) formats.extend(fmts) subtitles = self._merge_subtitles(subtitles, subs) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/dbtv.py b/hypervideo_dl/extractor/dbtv.py index 8e73176..18be46f 100644 --- a/hypervideo_dl/extractor/dbtv.py +++ b/hypervideo_dl/extractor/dbtv.py @@ -1,13 +1,9 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor class DBTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dagbladet\.no/video/(?:(?:embed|(?P<display_id>[^/]+))/)?(?P<id>[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8})' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dagbladet\.no/video/embed/(?:[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8}).*?)\1'] _TESTS = [{ 'url': 'https://www.dagbladet.no/video/PynxJnNWChE/', 'md5': 'b8f850ba1860adbda668d367f9b77699', @@ -31,12 +27,6 @@ class DBTVIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?dagbladet\.no/video/embed/(?:[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8}).*?)\1', - webpage)] - def _real_extract(self, url): display_id, video_id = self._match_valid_url(url).groups() info = { diff --git 
a/hypervideo_dl/extractor/dctp.py b/hypervideo_dl/extractor/dctp.py index e700f8d..24bb6ac 100644 --- a/hypervideo_dl/extractor/dctp.py +++ b/hypervideo_dl/extractor/dctp.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/deezer.py b/hypervideo_dl/extractor/deezer.py index 7ba02e5..f61f12a 100644 --- a/hypervideo_dl/extractor/deezer.py +++ b/hypervideo_dl/extractor/deezer.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -64,7 +62,6 @@ class DeezerPlaylistIE(DeezerBaseInfoExtractor): 'preference': -100, # Only the first 30 seconds 'ext': 'mp3', }] - self._sort_formats(formats) artists = ', '.join( orderedSet(a.get('ART_NAME') for a in s.get('ARTISTS'))) entries.append({ @@ -117,7 +114,6 @@ class DeezerAlbumIE(DeezerBaseInfoExtractor): 'preference': -100, # Only the first 30 seconds 'ext': 'mp3', }] - self._sort_formats(formats) artists = ', '.join( orderedSet(a.get('ART_NAME') for a in s.get('ARTISTS'))) entries.append({ diff --git a/hypervideo_dl/extractor/defense.py b/hypervideo_dl/extractor/defense.py index 9fe144e..7d73ea8 100644 --- a/hypervideo_dl/extractor/defense.py +++ b/hypervideo_dl/extractor/defense.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/democracynow.py b/hypervideo_dl/extractor/democracynow.py index 5c9c0ec..1624d08 100644 --- a/hypervideo_dl/extractor/democracynow.py +++ b/hypervideo_dl/extractor/democracynow.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re import os.path @@ -62,8 +59,6 @@ class DemocracynowIE(InfoExtractor): 'vcodec': 'none' if key == 'audio' else None, }) - self._sort_formats(formats) - default_lang = 'en' subtitles = {} diff --git a/hypervideo_dl/extractor/detik.py 
b/hypervideo_dl/extractor/detik.py new file mode 100644 index 0000000..f148054 --- /dev/null +++ b/hypervideo_dl/extractor/detik.py @@ -0,0 +1,159 @@ +from .common import InfoExtractor +from ..utils import int_or_none, merge_dicts, try_call, url_basename + + +class DetikEmbedIE(InfoExtractor): + _VALID_URL = False + _WEBPAGE_TESTS = [{ + # cnn embed + 'url': 'https://www.cnnindonesia.com/embed/video/846189', + 'info_dict': { + 'id': '846189', + 'ext': 'mp4', + 'description': 'md5:ece7b003b3ee7d81c6a5cfede7d5397d', + 'thumbnail': r're:https?://akcdn\.detik\.net\.id/visual/2022/09/11/thumbnail-video-1_169.jpeg', + 'title': 'Video CNN Indonesia - VIDEO: Momen Charles Disambut Meriah usai Dilantik jadi Raja Inggris', + 'age_limit': 0, + 'tags': ['raja charles', ' raja charles iii', ' ratu elizabeth', ' ratu elizabeth meninggal dunia', ' raja inggris', ' inggris'], + 'release_timestamp': 1662869995, + 'release_date': '20220911', + 'uploader': 'REUTERS' + } + }, { + # 20.detik + 'url': 'https://20.detik.com/otobuzz/20220704-220704093/mulai-rp-10-jutaan-ini-skema-kredit-mitsubishi-pajero-sport', + 'info_dict': { + 'display_id': 'mulai-rp-10-jutaan-ini-skema-kredit-mitsubishi-pajero-sport', + 'id': '220704093', + 'ext': 'mp4', + 'description': 'md5:9b2257341b6f375cdcf90106146d5ffb', + 'thumbnail': r're:https?://cdnv\.detik\.com/videoservice/AdminTV/2022/07/04/5d6187e402ec4a91877755a5886ff5b6-20220704161859-0s.jpg', + 'title': 'Mulai Rp 10 Jutaan! 
Ini Skema Kredit Mitsubishi Pajero Sport', + 'timestamp': 1656951521, + 'upload_date': '20220704', + 'duration': 83.0, + 'tags': ['cicilan mobil', 'mitsubishi pajero sport', 'mitsubishi', 'pajero sport'], + 'release_timestamp': 1656926321, + 'release_date': '20220704', + 'age_limit': 0, + 'uploader': 'Ridwan Arifin ' # TODO: strip trailling whitespace at uploader + } + }, { + # pasangmata.detik + 'url': 'https://pasangmata.detik.com/contribution/366649', + 'info_dict': { + 'id': '366649', + 'ext': 'mp4', + 'title': 'Saling Dorong Aparat dan Pendemo di Aksi Tolak Kenaikan BBM', + 'description': 'md5:7a6580876c8381c454679e028620bea7', + 'age_limit': 0, + 'tags': 'count:17', + 'thumbnail': 'https://akcdn.detik.net.id/community/data/media/thumbs-pasangmata/2022/09/08/366649-16626229351533009620.mp4-03.jpg', + } + }, { + # insertlive embed + 'url': 'https://www.insertlive.com/embed/video/290482', + 'info_dict': { + 'id': '290482', + 'ext': 'mp4', + 'release_timestamp': 1663063704, + 'thumbnail': 'https://akcdn.detik.net.id/visual/2022/09/13/leonardo-dicaprio_169.png?w=600&q=90', + 'age_limit': 0, + 'description': 'Aktor Leonardo DiCaprio memang baru saja putus dari kekasihnya yang bernama Camilla Morrone.', + 'release_date': '20220913', + 'title': 'Diincar Leonardo DiCaprio, Gigi Hadid Ngaku Tertarik Tapi Belum Cinta', + 'tags': ['leonardo dicaprio', ' gigi hadid', ' hollywood'], + 'uploader': '!nsertlive', + } + }, { + # beautynesia embed + 'url': 'https://www.beautynesia.id/embed/video/261636', + 'info_dict': { + 'id': '261636', + 'ext': 'mp4', + 'age_limit': 0, + 'release_timestamp': 1662375600, + 'description': 'Menurut ramalan astrologi, tiga zodiak ini bakal hoki sepanjang September 2022.', + 'title': '3 Zodiak Paling Beruntung Selama September 2022', + 'release_date': '20220905', + 'tags': ['zodiac update', ' zodiak', ' ramalan bintang', ' zodiak beruntung 2022', ' zodiak hoki september 2022', ' zodiak beruntung september 2022'], + 'thumbnail': 
'https://akcdn.detik.net.id/visual/2022/09/05/3-zodiak-paling-beruntung-selama-september-2022_169.jpeg?w=600&q=90', + 'uploader': 'amh', + } + }, { + # cnbcindonesia embed + 'url': 'https://www.cnbcindonesia.com/embed/video/371839', + 'info_dict': { + 'id': '371839', + 'ext': 'mp4', + 'title': 'Puluhan Pejabat Rusia Tuntut Putin Mundur', + 'tags': ['putin'], + 'age_limit': 0, + 'thumbnail': 'https://awsimages.detik.net.id/visual/2022/09/13/cnbc-indonesia-tv-3_169.png?w=600&q=80', + 'description': 'md5:8b9111e37555fcd95fe549a9b4ae6fdc', + } + }, { + # detik shortlink (we can get it from https://dtk.id/?<url>) + 'url': 'https://dtk.id/NkISKr', + 'info_dict': { + 'id': '220914049', + 'ext': 'mp4', + 'release_timestamp': 1663114488, + 'uploader': 'Tim 20Detik', + 'title': 'Pakar Bicara soal Tim Khusus Jokowi dan Mereka yang Pro ke Bjorka', + 'age_limit': 0, + 'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/09/14/f15cae71d7b640c58e75b254ecbb1ce1-20220914071613-0s.jpg?w=400&q=80', + 'display_id': 'pakar-bicara-soal-tim-khusus-jokowi-dan-mereka-yang-pro-ke-bjorka', + 'upload_date': '20220914', + 'release_date': '20220914', + 'description': 'md5:5eb03225f7ee40207dd3a1e18a73f1ff', + 'timestamp': 1663139688, + 'duration': 213.0, + 'tags': ['hacker bjorka', 'bjorka', 'hacker bjorka bocorkan data rahasia presiden jokowi', 'jokowi'], + } + }] + + def _extract_from_webpage(self, url, webpage): + player_type, video_data = self._search_regex( + r'<script\s*[^>]+src="https?://(aws)?cdn\.detik\.net\.id/(?P<type>flowplayer|detikVideo)[^>]+>\s*(?P<video_data>{[^}]+})', + webpage, 'playerjs', group=('type', 'video_data'), default=(None, '')) + if not player_type: + return + + display_id, extra_info_dict = url_basename(url), {} + + if player_type == 'flowplayer': + video_json_data = self._parse_json(video_data.replace('\'', '"'), display_id) + video_url = video_json_data['videoUrl'] + + extra_info_dict = { + 'id': self._search_regex(r'identifier\s*:\s*\'([^\']+)', 
webpage, 'identifier'), + 'thumbnail': video_json_data.get('imageUrl'), + } + + elif player_type == 'detikVideo': + video_url = self._search_regex( + r'videoUrl\s*:\s*[\'"]?([^"\']+)', video_data, 'videoUrl') + extra_info_dict = { + 'id': self._html_search_meta(['video_id', 'dtk:video_id'], webpage), + 'thumbnail': self._search_regex(r'imageUrl\s*:\s*[\'"]?([^"\']+)', video_data, 'videoUrl'), + 'duration': int_or_none(self._html_search_meta('duration', webpage, fatal=False, default=None)), + 'release_timestamp': int_or_none(self._html_search_meta('dtk:publishdateunix', webpage, fatal=False, default=None), 1000), + 'timestamp': int_or_none(self._html_search_meta('dtk:createdateunix', webpage, fatal=False, default=None), 1000), + 'uploader': self._search_regex( + r'([^-]+)', self._html_search_meta('dtk:author', webpage, default='').strip(), 'uploader', + default=None) + } + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, display_id) + + json_ld_data = self._search_json_ld(webpage, display_id, default={}) + yield merge_dicts(json_ld_data, extra_info_dict, { + 'display_id': display_id, + 'title': self._html_search_meta(['og:title', 'originalTitle'], webpage) or self._html_extract_title(webpage), + 'description': self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage), + 'formats': formats, + 'subtitles': subtitles, + 'tags': try_call(lambda: self._html_search_meta( + ['keywords', 'keyword', 'dtk:keywords'], webpage).split(',')), + }) diff --git a/hypervideo_dl/extractor/deuxm.py b/hypervideo_dl/extractor/deuxm.py new file mode 100644 index 0000000..74a6da6 --- /dev/null +++ b/hypervideo_dl/extractor/deuxm.py @@ -0,0 +1,76 @@ +from .common import InfoExtractor +from ..utils import url_or_none + + +class DeuxMIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?2m\.ma/[^/]+/replay/single/(?P<id>([\w.]{1,24})+)' + + _TESTS = [{ + 'url': 'https://2m.ma/fr/replay/single/6351d439b15e1a613b3debe8', + 'md5': 
'5f761f04c9d686e553b685134dca5d32', + 'info_dict': { + 'id': '6351d439b15e1a613b3debe8', + 'ext': 'mp4', + 'title': 'Grand Angle : Jeudi 20 Octobre 2022', + 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$' + } + }, { + 'url': 'https://2m.ma/fr/replay/single/635c0aeab4eec832622356da', + 'md5': 'ad6af2f5e4d5b2ad2194a84b6e890b4c', + 'info_dict': { + 'id': '635c0aeab4eec832622356da', + 'ext': 'mp4', + 'title': 'Journal Amazigh : Vendredi 28 Octobre 2022', + 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$' + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video = self._download_json( + f'https://2m.ma/api/watchDetail/{video_id}', video_id)['response']['News'] + return { + 'id': video_id, + 'title': video.get('titre'), + 'url': video['url'], + 'description': video.get('description'), + 'thumbnail': url_or_none(video.get('image')), + } + + +class DeuxMNewsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?2m\.ma/(?P<lang>\w+)/news/(?P<id>[^/#?]+)' + + _TESTS = [{ + 'url': 'https://2m.ma/fr/news/Kan-Ya-Mkan-d%C3%A9poussi%C3%A8re-l-histoire-du-phare-du-Cap-Beddouza-20221028', + 'md5': '43d5e693a53fa0b71e8a5204c7d4542a', + 'info_dict': { + 'id': '635c5d1233b83834e35b282e', + 'ext': 'mp4', + 'title': 'Kan Ya Mkan d\u00e9poussi\u00e8re l\u2019histoire du phare du Cap Beddouza', + 'description': 'md5:99dcf29b82f1d7f2a4acafed1d487527', + 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$' + } + }, { + 'url': 'https://2m.ma/fr/news/Interview-Casablanca-hors-des-sentiers-battus-avec-Abderrahim-KASSOU-Replay--20221017', + 'md5': '7aca29f02230945ef635eb8290283c0c', + 'info_dict': { + 'id': '634d9e108b70d40bc51a844b', + 'ext': 'mp4', + 'title': 'Interview: Casablanca hors des sentiers battus avec Abderrahim KASSOU (Replay) ', + 'description': 'md5:3b8e78111de9fcc6ef7f7dd6cff2430c', + 'thumbnail': 
r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$' + } + }] + + def _real_extract(self, url): + article_name, lang = self._match_valid_url(url).group('id', 'lang') + video = self._download_json( + f'https://2m.ma/api/articlesByUrl?lang={lang}&url=/news/{article_name}', article_name)['response']['article'][0] + return { + 'id': video['id'], + 'title': video.get('title'), + 'url': video['image'][0], + 'description': video.get('content'), + 'thumbnail': url_or_none(video.get('cover')), + } diff --git a/hypervideo_dl/extractor/dfb.py b/hypervideo_dl/extractor/dfb.py index 97f70fc..c4fb5c2 100644 --- a/hypervideo_dl/extractor/dfb.py +++ b/hypervideo_dl/extractor/dfb.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import unified_strdate @@ -44,7 +41,6 @@ class DFBIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( manifest_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/dhm.py b/hypervideo_dl/extractor/dhm.py index aee72a6..3d42fc2 100644 --- a/hypervideo_dl/extractor/dhm.py +++ b/hypervideo_dl/extractor/dhm.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import parse_duration diff --git a/hypervideo_dl/extractor/digg.py b/hypervideo_dl/extractor/digg.py index 913c175..86e8a6f 100644 --- a/hypervideo_dl/extractor/digg.py +++ b/hypervideo_dl/extractor/digg.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import js_to_json diff --git a/hypervideo_dl/extractor/digitalconcerthall.py b/hypervideo_dl/extractor/digitalconcerthall.py index 8398ae3..3461e36 100644 --- a/hypervideo_dl/extractor/digitalconcerthall.py +++ b/hypervideo_dl/extractor/digitalconcerthall.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common 
import InfoExtractor from ..utils import ( @@ -89,9 +86,8 @@ class DigitalConcertHallIE(InfoExtractor): }) m3u8_url = traverse_obj( - stream_info, ('channel', lambda x: x.startswith('vod_mixed'), 'stream', 0, 'url'), get_all=False) + stream_info, ('channel', lambda k, _: k.startswith('vod_mixed'), 'stream', 0, 'url'), get_all=False) formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native', fatal=False) - self._sort_formats(formats) yield { 'id': video_id, diff --git a/hypervideo_dl/extractor/digiteka.py b/hypervideo_dl/extractor/digiteka.py index d632047..912e33b 100644 --- a/hypervideo_dl/extractor/digiteka.py +++ b/hypervideo_dl/extractor/digiteka.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import int_or_none @@ -28,6 +23,7 @@ class DigitekaIE(InfoExtractor): ) /id )/(?P<id>[\d+a-z]+)''' + _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)'] _TESTS = [{ # news 'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r', @@ -61,14 +57,6 @@ class DigitekaIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)', - webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') @@ -93,8 +81,6 @@ class DigitekaIE(InfoExtractor): 'format_id': source.get('label'), }) - self._sort_formats(formats) - title = deliver_info['title'] thumbnail = jwconf.get('image') duration = int_or_none(deliver_info.get('duration')) diff --git a/hypervideo_dl/extractor/discovery.py b/hypervideo_dl/extractor/discovery.py index fd3ad75..fd3fc8f 100644 --- 
a/hypervideo_dl/extractor/discovery.py +++ b/hypervideo_dl/extractor/discovery.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import random import string diff --git a/hypervideo_dl/extractor/discoverygo.py b/hypervideo_dl/extractor/discoverygo.py index 9e7b14a..1f3d8e3 100644 --- a/hypervideo_dl/extractor/discoverygo.py +++ b/hypervideo_dl/extractor/discoverygo.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -52,7 +50,6 @@ class DiscoveryGoBaseIE(InfoExtractor): elif stream_kind == 'hds': formats.extend(self._extract_f4m_formats( stream_url, display_id, f4m_id=stream_kind, fatal=False)) - self._sort_formats(formats) video_id = video.get('id') or display_id description = video.get('description', {}).get('detailed') diff --git a/hypervideo_dl/extractor/discoverynetworks.py b/hypervideo_dl/extractor/discoverynetworks.py deleted file mode 100644 index f43c871..0000000 --- a/hypervideo_dl/extractor/discoverynetworks.py +++ /dev/null @@ -1,42 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - - -from .dplay import DPlayIE - - -class DiscoveryNetworksDeIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)' - - _TESTS = [{ - 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', - 'info_dict': { - 'id': '78867', - 'ext': 'mp4', - 'title': 'Die Welt da draußen', - 'description': 'md5:61033c12b73286e409d99a41742ef608', - 'timestamp': 1554069600, - 'upload_date': '20190331', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - }, { - 'url': 'https://www.dmax.de/programme/dmax-highlights/video/tuning-star-sidney-hoffmann-exklusiv-bei-dmax/191023082312316', - 'only_matching': True, - }, { - 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', - 'only_matching': 
True, - }, { - 'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/', - 'only_matching': True, - }] - - def _real_extract(self, url): - domain, programme, alternate_id = self._match_valid_url(url).groups() - country = 'GB' if domain == 'dplay.co.uk' else 'DE' - realm = 'questuk' if country == 'GB' else domain.replace('.', '') - return self._get_disco_api_info( - url, '%s/%s' % (programme, alternate_id), - 'sonic-eu1-prod.disco-api.com', realm, country) diff --git a/hypervideo_dl/extractor/discoveryplusindia.py b/hypervideo_dl/extractor/discoveryplusindia.py deleted file mode 100644 index 5180140..0000000 --- a/hypervideo_dl/extractor/discoveryplusindia.py +++ /dev/null @@ -1,98 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json - -from ..compat import compat_str -from ..utils import try_get -from .common import InfoExtractor -from .dplay import DPlayIE - - -class DiscoveryPlusIndiaIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/videos?' 
+ DPlayIE._PATH_REGEX - _TESTS = [{ - 'url': 'https://www.discoveryplus.in/videos/how-do-they-do-it/fugu-and-more?seasonId=8&type=EPISODE', - 'info_dict': { - 'id': '27104', - 'ext': 'mp4', - 'display_id': 'how-do-they-do-it/fugu-and-more', - 'title': 'Fugu and More', - 'description': 'The Japanese catch, prepare and eat the deadliest fish on the planet.', - 'duration': 1319, - 'timestamp': 1582309800, - 'upload_date': '20200221', - 'series': 'How Do They Do It?', - 'season_number': 8, - 'episode_number': 2, - 'creator': 'Discovery Channel', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - 'skip': 'Cookies (not necessarily logged in) are needed' - }] - - def _update_disco_api_headers(self, headers, disco_base, display_id, realm): - headers['x-disco-params'] = 'realm=%s' % realm - headers['x-disco-client'] = 'WEB:UNKNOWN:dplus-india:17.0.0' - - def _download_video_playback_info(self, disco_base, video_id, headers): - return self._download_json( - disco_base + 'playback/v3/videoPlaybackInfo', - video_id, headers=headers, data=json.dumps({ - 'deviceInfo': { - 'adBlocker': False, - }, - 'videoId': video_id, - }).encode('utf-8'))['data']['attributes']['streaming'] - - def _real_extract(self, url): - display_id = self._match_id(url) - return self._get_disco_api_info( - url, display_id, 'ap2-prod-direct.discoveryplus.in', 'dplusindia', 'in') - - -class DiscoveryPlusIndiaShowIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/show/(?P<show_name>[^/]+)/?(?:[?#]|$)' - _TESTS = [{ - 'url': 'https://www.discoveryplus.in/show/how-do-they-do-it', - 'playlist_mincount': 140, - 'info_dict': { - 'id': 'how-do-they-do-it', - }, - }] - - def _entries(self, show_name): - headers = { - 'x-disco-client': 'WEB:UNKNOWN:dplus-india:prod', - 'x-disco-params': 'realm=dplusindia', - 'referer': 'https://www.discoveryplus.in/', - } - show_url = 'https://ap2-prod-direct.discoveryplus.in/cms/routes/show/{}?include=default'.format(show_name) - 
show_json = self._download_json(show_url, - video_id=show_name, - headers=headers)['included'][4]['attributes']['component'] - show_id = show_json['mandatoryParams'].split('=')[-1] - season_url = 'https://ap2-prod-direct.discoveryplus.in/content/videos?sort=episodeNumber&filter[seasonNumber]={}&filter[show.id]={}&page[size]=100&page[number]={}' - for season in show_json['filters'][0]['options']: - season_id = season['id'] - total_pages, page_num = 1, 0 - while page_num < total_pages: - season_json = self._download_json(season_url.format(season_id, show_id, compat_str(page_num + 1)), - video_id=show_id, headers=headers, - note='Downloading JSON metadata%s' % (' page %d' % page_num if page_num else '')) - if page_num == 0: - total_pages = try_get(season_json, lambda x: x['meta']['totalPages'], int) or 1 - episodes_json = season_json['data'] - for episode in episodes_json: - video_id = episode['attributes']['path'] - yield self.url_result( - 'https://discoveryplus.in/videos/%s' % video_id, - ie=DiscoveryPlusIndiaIE.ie_key(), video_id=video_id) - page_num += 1 - - def _real_extract(self, url): - show_name = self._match_valid_url(url).group('show_name') - return self.playlist_result(self._entries(show_name), playlist_id=show_name) diff --git a/hypervideo_dl/extractor/discoveryvr.py b/hypervideo_dl/extractor/discoveryvr.py deleted file mode 100644 index cb63c26..0000000 --- a/hypervideo_dl/extractor/discoveryvr.py +++ /dev/null @@ -1,59 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import parse_duration - - -class DiscoveryVRIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?discoveryvr\.com/watch/(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://www.discoveryvr.com/watch/discovery-vr-an-introduction', - 'md5': '32b1929798c464a54356378b7912eca4', - 'info_dict': { - 'id': 'discovery-vr-an-introduction', - 'ext': 'mp4', - 'title': 'Discovery VR - An Introduction', - 'description': 
'md5:80d418a10efb8899d9403e61d8790f06', - } - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - bootstrap_data = self._search_regex( - r'root\.DVR\.bootstrapData\s+=\s+"({.+?})";', - webpage, 'bootstrap data') - bootstrap_data = self._parse_json( - bootstrap_data.encode('utf-8').decode('unicode_escape'), - display_id) - videos = self._parse_json(bootstrap_data['videos'], display_id)['allVideos'] - video_data = next(video for video in videos if video.get('slug') == display_id) - - series = video_data.get('showTitle') - title = episode = video_data.get('title') or series - if series and series != title: - title = '%s - %s' % (series, title) - - formats = [] - for f, format_id in (('cdnUriM3U8', 'mobi'), ('webVideoUrlSd', 'sd'), ('webVideoUrlHd', 'hd')): - f_url = video_data.get(f) - if not f_url: - continue - formats.append({ - 'format_id': format_id, - 'url': f_url, - }) - - return { - 'id': display_id, - 'display_id': display_id, - 'title': title, - 'description': video_data.get('description'), - 'thumbnail': video_data.get('thumbnail'), - 'duration': parse_duration(video_data.get('runTime')), - 'formats': formats, - 'episode': episode, - 'series': series, - } diff --git a/hypervideo_dl/extractor/disney.py b/hypervideo_dl/extractor/disney.py index 0ad7b1f..430de32 100644 --- a/hypervideo_dl/extractor/disney.py +++ b/hypervideo_dl/extractor/disney.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -137,7 +134,6 @@ class DisneyIE(InfoExtractor): self.raise_no_formats( '%s said: %s' % (self.IE_NAME, page_data['translations']['video_expired']), expected=True) - self._sort_formats(formats) subtitles = {} for caption in video_data.get('captions', []): diff --git a/hypervideo_dl/extractor/dispeak.py b/hypervideo_dl/extractor/dispeak.py index 3d651f3..37f89b9 100644 --- a/hypervideo_dl/extractor/dispeak.py +++ 
b/hypervideo_dl/extractor/dispeak.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -119,7 +117,6 @@ class DigitallySpeakingIE(InfoExtractor): video_formats = self._parse_mp4(metadata) if video_formats is None: video_formats = self._parse_flv(metadata) - self._sort_formats(video_formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/dlive.py b/hypervideo_dl/extractor/dlive.py index 7410eb6..30fcf9f 100644 --- a/hypervideo_dl/extractor/dlive.py +++ b/hypervideo_dl/extractor/dlive.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -42,7 +40,6 @@ class DLiveVODIE(InfoExtractor): title = broadcast['title'] formats = self._extract_m3u8_formats( broadcast['playbackUrl'], vod_id, 'mp4', 'm3u8_native') - self._sort_formats(formats) return { 'id': vod_id, 'title': title, @@ -81,7 +78,6 @@ class DLiveStreamIE(InfoExtractor): formats = self._extract_m3u8_formats( 'https://live.prd.dlive.tv/hls/live/%s.m3u8' % username, display_name, 'mp4') - self._sort_formats(formats) return { 'id': display_name, 'title': title, diff --git a/hypervideo_dl/extractor/doodstream.py b/hypervideo_dl/extractor/doodstream.py deleted file mode 100644 index f692127..0000000 --- a/hypervideo_dl/extractor/doodstream.py +++ /dev/null @@ -1,76 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import string -import random -import time - -from .common import InfoExtractor - - -class DoodStreamIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dood\.(?:to|watch)/[ed]/(?P<id>[a-z0-9]+)' - _TESTS = [{ - 'url': 'http://dood.to/e/5s1wmbdacezb', - 'md5': '4568b83b31e13242b3f1ff96c55f0595', - 'info_dict': { - 'id': '5s1wmbdacezb', - 'ext': 'mp4', - 'title': 'Kat Wonders - Monthly May 2020', - 'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com', - 'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg', - } - }, { - 'url': 
'http://dood.watch/d/5s1wmbdacezb', - 'md5': '4568b83b31e13242b3f1ff96c55f0595', - 'info_dict': { - 'id': '5s1wmbdacezb', - 'ext': 'mp4', - 'title': 'Kat Wonders - Monthly May 2020', - 'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com', - 'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg', - } - }, { - 'url': 'https://dood.to/d/jzrxn12t2s7n', - 'md5': '3207e199426eca7c2aa23c2872e6728a', - 'info_dict': { - 'id': 'jzrxn12t2s7n', - 'ext': 'mp4', - 'title': 'Stacy Cruz Cute ALLWAYSWELL', - 'description': 'Stacy Cruz Cute ALLWAYSWELL | DoodStream.com', - 'thumbnail': 'https://img.doodcdn.com/snaps/8edqd5nppkac3x8u.jpg', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - url = f'https://dood.to/e/{video_id}' - webpage = self._download_webpage(url, video_id) - - title = self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None) - thumb = self._html_search_meta(['og:image', 'twitter:image'], webpage, default=None) - token = self._html_search_regex(r'[?&]token=([a-z0-9]+)[&\']', webpage, 'token') - description = self._html_search_meta( - ['og:description', 'description', 'twitter:description'], webpage, default=None) - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/66.0', - 'referer': url - } - - pass_md5 = self._html_search_regex(r'(/pass_md5.*?)\'', webpage, 'pass_md5') - final_url = ''.join(( - self._download_webpage(f'https://dood.to{pass_md5}', video_id, headers=headers), - *(random.choice(string.ascii_letters + string.digits) for _ in range(10)), - f'?token={token}&expiry={int(time.time() * 1000)}', - )) - - return { - 'id': video_id, - 'title': title, - 'url': final_url, - 'http_headers': headers, - 'ext': 'mp4', - 'description': description, - 'thumbnail': thumb, - } diff --git a/hypervideo_dl/extractor/dotsub.py b/hypervideo_dl/extractor/dotsub.py index 148605c..079f837 100644 --- a/hypervideo_dl/extractor/dotsub.py +++ 
b/hypervideo_dl/extractor/dotsub.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( float_or_none, diff --git a/hypervideo_dl/extractor/douyutv.py b/hypervideo_dl/extractor/douyutv.py index 26a8d64..477f468 100644 --- a/hypervideo_dl/extractor/douyutv.py +++ b/hypervideo_dl/extractor/douyutv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import time import hashlib import re diff --git a/hypervideo_dl/extractor/dplay.py b/hypervideo_dl/extractor/dplay.py index a25f27c..8eb4d8f 100644 --- a/hypervideo_dl/extractor/dplay.py +++ b/hypervideo_dl/extractor/dplay.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import uuid @@ -11,6 +8,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + remove_start, strip_or_none, try_get, unified_timestamp, @@ -128,7 +126,6 @@ class DPlayBaseIE(InfoExtractor): 'url': format_url, 'format_id': format_id, }) - self._sort_formats(formats) creator = series = None tags = [] @@ -314,7 +311,7 @@ class DPlayIE(DPlayBaseIE): def _real_extract(self, url): mobj = self._match_valid_url(url) display_id = mobj.group('id') - domain = mobj.group('domain').lstrip('www.') + domain = remove_start(mobj.group('domain'), 'www.') country = mobj.group('country') or mobj.group('subdomain_country') or mobj.group('plus_country') host = 'disco-api.' 
+ domain if domain[0] == 'd' else 'eu2-prod.disco-api.com' return self._get_disco_api_info( @@ -720,6 +717,72 @@ class TLCIE(DiscoveryPlusBaseIE): } +class MotorTrendIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:watch\.)?motortrend\.com/video' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://watch.motortrend.com/video/car-issues-motortrend-atve-us/double-dakotas', + 'info_dict': { + 'id': '"4859182"', + 'display_id': 'double-dakotas', + 'ext': 'mp4', + 'title': 'Double Dakotas', + 'description': 'Tylers buy-one-get-one Dakota deal has the Wizard pulling double duty.', + 'season_number': 2, + 'episode_number': 3, + }, + 'skip': 'Available for Premium users', + }, { + 'url': 'https://watch.motortrend.com/video/car-issues-motortrend-atve-us/double-dakotas', + 'only_matching': True, + }] + + _PRODUCT = 'vel' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.watch.motortrend.com', + 'realm': 'go', + 'country': 'us', + } + + +class MotorTrendOnDemandIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:www\.)?motortrendondemand\.com/detail' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://www.motortrendondemand.com/detail/wheelstanding-dump-truck-stubby-bobs-comeback/37699/784', + 'info_dict': { + 'id': '37699', + 'display_id': 'wheelstanding-dump-truck-stubby-bobs-comeback/37699', + 'ext': 'mp4', + 'title': 'Wheelstanding Dump Truck! 
Stubby Bob’s Comeback', + 'description': 'md5:996915abe52a1c3dfc83aecea3cce8e7', + 'season_number': 5, + 'episode_number': 52, + 'episode': 'Episode 52', + 'season': 'Season 5', + 'thumbnail': r're:^https?://.+\.jpe?g$', + 'timestamp': 1388534401, + 'duration': 1887.345, + 'creator': 'Originals', + 'series': 'Roadkill', + 'upload_date': '20140101', + 'tags': [], + }, + }] + + _PRODUCT = 'MTOD' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.motortrendondemand.com', + 'realm': 'motortrend', + 'country': 'us', + } + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers.update({ + 'x-disco-params': f'realm={realm}', + 'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:4.39.1-gi1', + 'Authorization': self._get_auth(disco_base, display_id, realm), + }) + + class DiscoveryPlusIE(DiscoveryPlusBaseIE): _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/(?!it/)(?:\w{2}/)?video' + DPlayBaseIE._PATH_REGEX _TESTS = [{ @@ -882,6 +945,9 @@ class DiscoveryPlusItalyIE(DiscoveryPlusBaseIE): _TESTS = [{ 'url': 'https://www.discoveryplus.com/it/video/i-signori-della-neve/stagione-2-episodio-1-i-preparativi', 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.com/it/video/super-benny/trailer', + 'only_matching': True, }] _PRODUCT = 'dplus_us' @@ -891,6 +957,13 @@ class DiscoveryPlusItalyIE(DiscoveryPlusBaseIE): 'country': 'it', } + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers.update({ + 'x-disco-params': 'realm=%s' % realm, + 'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:25.2.6', + 'Authorization': self._get_auth(disco_base, display_id, realm), + }) + class DiscoveryPlusItalyShowIE(DiscoveryPlusShowBaseIE): _VALID_URL = r'https?://(?:www\.)?discoveryplus\.it/programmi/(?P<show_name>[^/]+)/?(?:[?#]|$)' diff --git a/hypervideo_dl/extractor/drbonanza.py b/hypervideo_dl/extractor/drbonanza.py index ea0f06d..824d70d 100644 --- a/hypervideo_dl/extractor/drbonanza.py +++ 
b/hypervideo_dl/extractor/drbonanza.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( js_to_json, @@ -33,7 +30,6 @@ class DRBonanzaIE(InfoExtractor): info = self._parse_html5_media_entries( url, webpage, display_id, m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0] - self._sort_formats(info['formats']) asset = self._parse_json( self._search_regex( diff --git a/hypervideo_dl/extractor/dreisat.py b/hypervideo_dl/extractor/dreisat.py index 5a07c18..8a59c23 100644 --- a/hypervideo_dl/extractor/dreisat.py +++ b/hypervideo_dl/extractor/dreisat.py @@ -1,9 +1,7 @@ -from __future__ import unicode_literals - from .zdf import ZDFIE -class DreiSatIE(ZDFIE): +class DreiSatIE(ZDFIE): # XXX: Do not subclass from concrete IE IE_NAME = '3sat' _VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html' _TESTS = [{ diff --git a/hypervideo_dl/extractor/drooble.py b/hypervideo_dl/extractor/drooble.py index 0584250..106e5c4 100644 --- a/hypervideo_dl/extractor/drooble.py +++ b/hypervideo_dl/extractor/drooble.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/dropbox.py b/hypervideo_dl/extractor/dropbox.py index 2559657..214b309 100644 --- a/hypervideo_dl/extractor/dropbox.py +++ b/hypervideo_dl/extractor/dropbox.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import os.path import re @@ -56,8 +53,8 @@ class DropboxIE(InfoExtractor): else: raise ExtractorError('Password protected video, use --video-password <password>', expected=True) - json_string = self._html_search_regex(r'InitReact\.mountComponent\(.*?,\s*(\{.+\})\s*?\)', webpage, 'Info JSON') - info_json = self._parse_json(json_string, video_id).get('props') + info_json = self._search_json(r'InitReact\.mountComponent\(.*?,', webpage, 'mountComponent', video_id, + contains_pattern=r'{.+?"preview".+?}', 
end_pattern=r'\)')['props'] transcode_url = traverse_obj(info_json, ((None, 'preview'), 'file', 'preview', 'content', 'transcode_url'), get_all=False) formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id) @@ -66,7 +63,6 @@ class DropboxIE(InfoExtractor): video_url = re.sub(r'[?&]dl=0', '', url) video_url += ('?' if '?' not in video_url else '&') + 'dl=1' formats.append({'url': video_url, 'format_id': 'original', 'format_note': 'Original', 'quality': 1}) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/dropout.py b/hypervideo_dl/extractor/dropout.py index 2fa6195..e280b1c 100644 --- a/hypervideo_dl/extractor/dropout.py +++ b/hypervideo_dl/extractor/dropout.py @@ -1,9 +1,8 @@ -# coding: utf-8 from .common import InfoExtractor from .vimeo import VHXEmbedIE from ..utils import ( - clean_html, ExtractorError, + clean_html, get_element_by_class, get_element_by_id, get_elements_by_class, @@ -97,11 +96,12 @@ class DropoutIE(InfoExtractor): def _login(self, display_id): username, password = self._get_login_info() - if not (username and password): - self.raise_login_required(method='password') + if not username: + return True response = self._download_webpage( - self._LOGIN_URL, display_id, note='Logging in', data=urlencode_postdata({ + self._LOGIN_URL, display_id, note='Logging in', fatal=False, + data=urlencode_postdata({ 'email': username, 'password': password, 'authenticity_token': self._get_authenticity_token(display_id), @@ -111,19 +111,25 @@ class DropoutIE(InfoExtractor): user_has_subscription = self._search_regex( r'user_has_subscription:\s*["\'](.+?)["\']', response, 'subscription status', default='none') if user_has_subscription.lower() == 'true': - return response + return elif user_has_subscription.lower() == 'false': - raise ExtractorError('Account is not subscribed') + return 'Account is not subscribed' else: - raise ExtractorError('Incorrect username/password') + return 'Incorrect 
username/password' def _real_extract(self, url): display_id = self._match_id(url) - try: - self._login(display_id) - webpage = self._download_webpage(url, display_id, note='Downloading video webpage') - finally: - self._download_webpage('https://www.dropout.tv/logout', display_id, note='Logging out', fatal=False) + + webpage = None + if self._get_cookies('https://www.dropout.tv').get('_session'): + webpage = self._download_webpage(url, display_id) + if not webpage or '<div id="watch-unauthorized"' in webpage: + login_err = self._login(display_id) + webpage = self._download_webpage(url, display_id) + if login_err and '<div id="watch-unauthorized"' in webpage: + if login_err is True: + self.raise_login_required(method='any') + raise ExtractorError(login_err, expected=True) embed_url = self._search_regex(r'embed_url:\s*["\'](.+?)["\']', webpage, 'embed url') thumbnail = self._og_search_thumbnail(webpage) @@ -138,7 +144,7 @@ class DropoutIE(InfoExtractor): return { '_type': 'url_transparent', 'ie_key': VHXEmbedIE.ie_key(), - 'url': embed_url, + 'url': VHXEmbedIE._smuggle_referrer(embed_url, 'https://www.dropout.tv'), 'id': self._search_regex(r'embed\.vhx\.tv/videos/(.+?)\?', embed_url, 'id'), 'display_id': display_id, 'title': title, diff --git a/hypervideo_dl/extractor/drtuber.py b/hypervideo_dl/extractor/drtuber.py index 540b86a..e5dab6a 100644 --- a/hypervideo_dl/extractor/drtuber.py +++ b/hypervideo_dl/extractor/drtuber.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -13,6 +11,7 @@ from ..utils import ( class DrTuberIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|m)\.)?drtuber\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[\w-]+))?' 
+ _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?drtuber\.com/embed/\d+)'] _TESTS = [{ 'url': 'http://www.drtuber.com/video/1740434/hot-perky-blonde-naked-golf', 'md5': '93e680cf2536ad0dfb7e74d94a89facd', @@ -35,12 +34,6 @@ class DrTuberIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?drtuber\.com/embed/\d+)', - webpage) - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') @@ -65,7 +58,6 @@ class DrTuberIE(InfoExtractor): 'quality': 2 if format_id == 'hq' else 1, 'url': video_url }) - self._sort_formats(formats) duration = int_or_none(video_data.get('duration')) or parse_duration( video_data.get('duration_format')) diff --git a/hypervideo_dl/extractor/drtv.py b/hypervideo_dl/extractor/drtv.py index 37e4d5b..128f439 100644 --- a/hypervideo_dl/extractor/drtv.py +++ b/hypervideo_dl/extractor/drtv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import binascii import hashlib import re @@ -26,7 +23,7 @@ class DRTVIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*| + (?:www\.)?dr\.dk/(?:tv/se|nyheder|(?:radio|lyd)(?:/ondemand)?)/(?:[^/]+/)*| (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/ ) (?P<id>[\da-z_-]+) @@ -54,6 +51,7 @@ class DRTVIE(InfoExtractor): 'release_year': 2016, }, 'expected_warnings': ['Unable to download f4m manifest'], + 'skip': 'this video has been removed', }, { # embed 'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang', @@ -74,31 +72,41 @@ class DRTVIE(InfoExtractor): # with SignLanguage formats 'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder', 'info_dict': { - 'id': 'historien-om-danmark-stenalder', + 'id': '00831690010', 'ext': 'mp4', 'title': 'Historien om 
Danmark: Stenalder', 'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a', 'timestamp': 1546628400, 'upload_date': '20190104', - 'duration': 3502.56, + 'duration': 3504.618, 'formats': 'mincount:20', + 'release_year': 2017, + 'season_id': 'urn:dr:mu:bundle:5afc03ad6187a4065ca5fd35', + 'season_number': 1, + 'season': 'Historien om Danmark', + 'series': 'Historien om Danmark', }, 'params': { 'skip_download': True, }, }, { - 'url': 'https://www.dr.dk/radio/p4kbh/regionale-nyheder-kh4/p4-nyheder-2019-06-26-17-30-9', + 'url': 'https://www.dr.dk/lyd/p4kbh/regionale-nyheder-kh4/p4-nyheder-2019-06-26-17-30-9', 'only_matching': True, }, { 'url': 'https://www.dr.dk/drtv/se/bonderoeven_71769', 'info_dict': { 'id': '00951930010', 'ext': 'mp4', - 'title': 'Bonderøven (1:8)', - 'description': 'md5:3cf18fc0d3b205745d4505f896af8121', - 'timestamp': 1546542000, - 'upload_date': '20190103', + 'title': 'Bonderøven 2019 (1:8)', + 'description': 'md5:b6dcfe9b6f0bea6703e9a0092739a5bd', + 'timestamp': 1603188600, + 'upload_date': '20201020', 'duration': 2576.6, + 'season': 'Bonderøven 2019', + 'season_id': 'urn:dr:mu:bundle:5c201667a11fa01ca4528ce5', + 'release_year': 2019, + 'season_number': 2019, + 'series': 'Frank & Kastaniegaarden' }, 'params': { 'skip_download': True, @@ -112,6 +120,24 @@ class DRTVIE(InfoExtractor): }, { 'url': 'https://www.dr.dk/drtv/program/jagten_220924', 'only_matching': True, + }, { + 'url': 'https://www.dr.dk/lyd/p4aarhus/regionale-nyheder-ar4/regionale-nyheder-2022-05-05-12-30-3', + 'info_dict': { + 'id': 'urn:dr:mu:programcard:6265cb2571401424d0360113', + 'title': "Regionale nyheder", + 'ext': 'mp4', + 'duration': 120.043, + 'series': 'P4 Østjylland regionale nyheder', + 'timestamp': 1651746600, + 'season': 'Regionale nyheder', + 'release_year': 0, + 'season_id': 'urn:dr:mu:bundle:61c26889539f0201586b73c5', + 'description': '', + 'upload_date': '20220505', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -274,8 +300,6 @@ 
class DRTVIE(InfoExtractor): 'Unfortunately, DR is not allowed to show this program outside Denmark.', countries=self._GEO_COUNTRIES) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, @@ -340,7 +364,6 @@ class DRTVLiveIE(InfoExtractor): formats.extend(self._extract_f4m_formats(update_url_query( '%s/%s' % (server, stream_path), {'hdcore': '3.7.0'}), channel_id, f4m_id=link_type, fatal=False)) - self._sort_formats(formats) return { 'id': channel_id, diff --git a/hypervideo_dl/extractor/dtube.py b/hypervideo_dl/extractor/dtube.py index ad247b7..25a98f6 100644 --- a/hypervideo_dl/extractor/dtube.py +++ b/hypervideo_dl/extractor/dtube.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from socket import timeout diff --git a/hypervideo_dl/extractor/duboku.py b/hypervideo_dl/extractor/duboku.py index a875978..fb0546c 100644 --- a/hypervideo_dl/extractor/duboku.py +++ b/hypervideo_dl/extractor/duboku.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -54,31 +51,39 @@ def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, e class DubokuIE(InfoExtractor): IE_NAME = 'duboku' - IE_DESC = 'www.duboku.co' + IE_DESC = 'www.duboku.io' - _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*' + _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*' _TESTS = [{ - 'url': 'https://www.duboku.co/vodplay/1575-1-1.html', + 'url': 'https://w.duboku.io/vodplay/1575-1-1.html', 'info_dict': { 'id': '1575-1-1', - 'ext': 'ts', + 'ext': 'mp4', 'series': '白色月光', 'title': 'contains:白色月光', 'season_number': 1, 'episode_number': 1, + 'season': 'Season 1', + 'episode_id': '1', + 'season_id': '1', + 'episode': 'Episode 1', }, 'params': { 'skip_download': 'm3u8 download', }, }, { - 'url': 'https://www.duboku.co/vodplay/1588-1-1.html', + 'url': 
'https://w.duboku.io/vodplay/1588-1-1.html', 'info_dict': { 'id': '1588-1-1', - 'ext': 'ts', + 'ext': 'mp4', 'series': '亲爱的自己', - 'title': 'contains:预告片', + 'title': 'contains:第1集', 'season_number': 1, 'episode_number': 1, + 'episode': 'Episode 1', + 'season': 'Season 1', + 'episode_id': '1', + 'season_id': '1', }, 'params': { 'skip_download': 'm3u8 download', @@ -94,7 +99,7 @@ class DubokuIE(InfoExtractor): season_id = temp[1] episode_id = temp[2] - webpage_url = 'https://www.duboku.co/vodplay/%s.html' % video_id + webpage_url = 'https://w.duboku.io/vodplay/%s.html' % video_id webpage_html = self._download_webpage(webpage_url, video_id) # extract video url @@ -127,12 +132,13 @@ class DubokuIE(InfoExtractor): data_from = player_data.get('from') # if it is an embedded iframe, maybe it's an external source + headers = {'Referer': webpage_url} if data_from == 'iframe': # use _type url_transparent to retain the meaningful details # of the video. return { '_type': 'url_transparent', - 'url': smuggle_url(data_url, {'http_headers': {'Referer': webpage_url}}), + 'url': smuggle_url(data_url, {'http_headers': headers}), 'id': video_id, 'title': title, 'series': series_title, @@ -142,7 +148,7 @@ class DubokuIE(InfoExtractor): 'episode_id': episode_id, } - formats = self._extract_m3u8_formats(data_url, video_id, 'mp4') + formats = self._extract_m3u8_formats(data_url, video_id, 'mp4', headers=headers) return { 'id': video_id, @@ -153,36 +159,29 @@ class DubokuIE(InfoExtractor): 'episode_number': int_or_none(episode_id), 'episode_id': episode_id, 'formats': formats, - 'http_headers': {'Referer': 'https://www.duboku.co/static/player/videojs.html'} + 'http_headers': headers } class DubokuPlaylistIE(InfoExtractor): IE_NAME = 'duboku:list' - IE_DESC = 'www.duboku.co entire series' + IE_DESC = 'www.duboku.io entire series' - _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*' + _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*' 
_TESTS = [{ - 'url': 'https://www.duboku.co/voddetail/1575.html', + 'url': 'https://w.duboku.io/voddetail/1575.html', 'info_dict': { 'id': 'startswith:1575', 'title': '白色月光', }, 'playlist_count': 12, }, { - 'url': 'https://www.duboku.co/voddetail/1554.html', + 'url': 'https://w.duboku.io/voddetail/1554.html', 'info_dict': { 'id': 'startswith:1554', 'title': '以家人之名', }, 'playlist_mincount': 30, - }, { - 'url': 'https://www.duboku.co/voddetail/1554.html#playlist2', - 'info_dict': { - 'id': '1554#playlist2', - 'title': '以家人之名', - }, - 'playlist_mincount': 27, }] def _real_extract(self, url): @@ -192,7 +191,7 @@ class DubokuPlaylistIE(InfoExtractor): series_id = mobj.group('id') fragment = compat_urlparse.urlparse(url).fragment - webpage_url = 'https://www.duboku.co/voddetail/%s.html' % series_id + webpage_url = 'https://w.duboku.io/voddetail/%s.html' % series_id webpage_html = self._download_webpage(webpage_url, series_id) # extract title @@ -237,6 +236,6 @@ class DubokuPlaylistIE(InfoExtractor): # return url results return self.playlist_result([ self.url_result( - compat_urlparse.urljoin('https://www.duboku.co', x['href']), + compat_urlparse.urljoin('https://w.duboku.io', x['href']), ie=DubokuIE.ie_key(), video_title=x.get('title')) for x in playlist], series_id + '#' + playlist_id, title) diff --git a/hypervideo_dl/extractor/dumpert.py b/hypervideo_dl/extractor/dumpert.py index d9d9afd..010c2d0 100644 --- a/hypervideo_dl/extractor/dumpert.py +++ b/hypervideo_dl/extractor/dumpert.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -51,7 +48,6 @@ class DumpertIE(InfoExtractor): 'format_id': version, 'quality': quality(version), }) - self._sort_formats(formats) thumbnails = [] stills = item.get('stills') or {} diff --git a/hypervideo_dl/extractor/dvtv.py b/hypervideo_dl/extractor/dvtv.py index 08663cf..e671433 100644 --- a/hypervideo_dl/extractor/dvtv.py +++ 
b/hypervideo_dl/extractor/dvtv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -145,7 +142,6 @@ class DVTVIE(InfoExtractor): 'format_id': join_nonempty('http', ext, label), 'height': int_or_none(height), }) - self._sort_formats(formats) return { 'id': data.get('mediaid') or video_id, diff --git a/hypervideo_dl/extractor/dw.py b/hypervideo_dl/extractor/dw.py index 6eaee07..9c4a08e 100644 --- a/hypervideo_dl/extractor/dw.py +++ b/hypervideo_dl/extractor/dw.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -65,7 +62,6 @@ class DWIE(InfoExtractor): transform_source=lambda s: s.replace( 'rtmp://tv-od.dw.de/flash/', 'http://tv-download.dw.de/dwtv_video/flv/')) - self._sort_formats(formats) upload_date = hidden_inputs.get('display_date') if not upload_date: diff --git a/hypervideo_dl/extractor/eagleplatform.py b/hypervideo_dl/extractor/eagleplatform.py index f86731a..9ebd24d 100644 --- a/hypervideo_dl/extractor/eagleplatform.py +++ b/hypervideo_dl/extractor/eagleplatform.py @@ -1,6 +1,4 @@ -# coding: utf-8 -from __future__ import unicode_literals - +import functools import re from .common import InfoExtractor @@ -8,6 +6,7 @@ from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, + smuggle_url, unsmuggle_url, url_or_none, ) @@ -21,6 +20,7 @@ class EaglePlatformIE(InfoExtractor): ) (?P<id>\d+) ''' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1'] _TESTS = [{ # http://lenta.ru/news/2015/03/06/navalny/ 'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201', @@ -55,14 +55,14 @@ class EaglePlatformIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - # Regular iframe embedding - mobj = re.search( - 
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1', - webpage) - if mobj is not None: - return mobj.group('url') + @classmethod + def _extract_embed_urls(cls, url, webpage): + add_referer = functools.partial(smuggle_url, data={'referrer': url}) + + res = tuple(super()._extract_embed_urls(url, webpage)) + if res: + return map(add_referer, res) + PLAYER_JS_RE = r''' <script[^>]+ src=(?P<qjs>["\'])(?:https?:)?//(?P<host>(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs) @@ -77,7 +77,7 @@ class EaglePlatformIE(InfoExtractor): data-id=["\'](?P<id>\d+) ''' % PLAYER_JS_RE, webpage) if mobj is not None: - return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() + return [add_referer('eagleplatform:%(host)s:%(id)s' % mobj.groupdict())] # Generalization of "Javascript code usage", "Combined usage" and # "Usage without attaching to DOM" embeddings (see # http://dultonmedia.github.io/eplayer/) @@ -98,7 +98,7 @@ class EaglePlatformIE(InfoExtractor): </script> ''' % PLAYER_JS_RE, webpage) if mobj is not None: - return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() + return [add_referer('eagleplatform:%(host)s:%(id)s' % mobj.groupdict())] @staticmethod def _handle_error(response): @@ -192,8 +192,6 @@ class EaglePlatformIE(InfoExtractor): f['url'] = format_url formats.append(f) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, @@ -204,3 +202,14 @@ class EaglePlatformIE(InfoExtractor): 'age_limit': age_limit, 'formats': formats, } + + +class ClipYouEmbedIE(InfoExtractor): + _VALID_URL = False + + @classmethod + def _extract_embed_urls(cls, url, webpage): + mobj = re.search( + r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) + if mobj is not None: + yield smuggle_url('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), {'referrer': url}) diff --git a/hypervideo_dl/extractor/ebaumsworld.py 
b/hypervideo_dl/extractor/ebaumsworld.py index c97682c..0854d03 100644 --- a/hypervideo_dl/extractor/ebaumsworld.py +++ b/hypervideo_dl/extractor/ebaumsworld.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/echomsk.py b/hypervideo_dl/extractor/echomsk.py index 6b7cc65..850eabb 100644 --- a/hypervideo_dl/extractor/echomsk.py +++ b/hypervideo_dl/extractor/echomsk.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/egghead.py b/hypervideo_dl/extractor/egghead.py index b6b8676..a4b2a12 100644 --- a/hypervideo_dl/extractor/egghead.py +++ b/hypervideo_dl/extractor/egghead.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -120,7 +117,6 @@ class EggheadLessonIE(EggheadBaseIE): formats.append({ 'url': format_url, }) - self._sort_formats(formats) return { 'id': lesson_id, diff --git a/hypervideo_dl/extractor/ehow.py b/hypervideo_dl/extractor/ehow.py index b1cd4f5..74469ce 100644 --- a/hypervideo_dl/extractor/ehow.py +++ b/hypervideo_dl/extractor/ehow.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote diff --git a/hypervideo_dl/extractor/eighttracks.py b/hypervideo_dl/extractor/eighttracks.py index 9a44f89..3dd9ab1 100644 --- a/hypervideo_dl/extractor/eighttracks.py +++ b/hypervideo_dl/extractor/eighttracks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import random diff --git a/hypervideo_dl/extractor/einthusan.py b/hypervideo_dl/extractor/einthusan.py index 7af279a..53bc253 100644 --- a/hypervideo_dl/extractor/einthusan.py +++ b/hypervideo_dl/extractor/einthusan.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import 
unicode_literals - import json from .common import InfoExtractor @@ -92,8 +89,6 @@ class EinthusanIE(InfoExtractor): 'url': mp4_url, }) - self._sort_formats(formats) - description = get_elements_by_class('synopsis', webpage)[0] thumbnail = self._html_search_regex( r'''<img[^>]+src=(["'])(?P<url>(?!\1).+?/moviecovers/(?!\1).+?)\1''', diff --git a/hypervideo_dl/extractor/eitb.py b/hypervideo_dl/extractor/eitb.py index ee5ead1..bd027da 100644 --- a/hypervideo_dl/extractor/eitb.py +++ b/hypervideo_dl/extractor/eitb.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( float_or_none, @@ -74,8 +71,6 @@ class EitbIE(InfoExtractor): '%s?hdcore=3.7.0' % hds_url.replace('euskalsvod', 'euskalvod'), video_id, f4m_id='hds', fatal=False)) - self._sort_formats(formats) - return { 'id': video_id, 'title': media.get('NAME_ES') or media.get('name') or media['NAME_EU'], diff --git a/hypervideo_dl/extractor/ellentube.py b/hypervideo_dl/extractor/ellentube.py index d451bc0..6eb00f9 100644 --- a/hypervideo_dl/extractor/ellentube.py +++ b/hypervideo_dl/extractor/ellentube.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( clean_html, @@ -31,7 +28,6 @@ class EllenTubeBaseIE(InfoExtractor): entry_protocol='m3u8_native', m3u8_id='hls') duration = int_or_none(entry.get('duration')) break - self._sort_formats(formats) def get_insight(kind): return int_or_none(try_get( diff --git a/hypervideo_dl/extractor/elonet.py b/hypervideo_dl/extractor/elonet.py index 9c6aea2..c5558ff 100644 --- a/hypervideo_dl/extractor/elonet.py +++ b/hypervideo_dl/extractor/elonet.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import determine_ext @@ -56,7 +53,6 @@ class ElonetIE(InfoExtractor): else: formats, subtitles = [], {} self.raise_no_formats(f'Unknown streaming format 
{ext}') - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/elpais.py b/hypervideo_dl/extractor/elpais.py index b89f6db..7c6c880 100644 --- a/hypervideo_dl/extractor/elpais.py +++ b/hypervideo_dl/extractor/elpais.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import strip_jsonp, unified_strdate diff --git a/hypervideo_dl/extractor/embedly.py b/hypervideo_dl/extractor/embedly.py index a5820b2..483d018 100644 --- a/hypervideo_dl/extractor/embedly.py +++ b/hypervideo_dl/extractor/embedly.py @@ -1,6 +1,5 @@ -# coding: utf-8 -from __future__ import unicode_literals - +import re +import urllib.parse from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote @@ -12,5 +11,14 @@ class EmbedlyIE(InfoExtractor): 'only_matching': True, }] + @classmethod + def _extract_embed_urls(cls, url, webpage): + # Bypass suitable check + for mobj in re.finditer(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage): + yield mobj.group('url') + + for mobj in re.finditer(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage): + yield urllib.parse.unquote(mobj.group('url')) + def _real_extract(self, url): return self.url_result(compat_urllib_parse_unquote(self._match_id(url))) diff --git a/hypervideo_dl/extractor/engadget.py b/hypervideo_dl/extractor/engadget.py index 733bf32..e7c5d7b 100644 --- a/hypervideo_dl/extractor/engadget.py +++ b/hypervideo_dl/extractor/engadget.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/epicon.py b/hypervideo_dl/extractor/epicon.py index cd19325..3bfcc54 100644 --- a/hypervideo_dl/extractor/epicon.py +++ b/hypervideo_dl/extractor/epicon.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -62,7 +59,6 @@ class 
EpiconIE(InfoExtractor): description = self._og_search_description(webpage) or None thumbnail = self._og_search_thumbnail(webpage) or None formats = self._extract_m3u8_formats(data_json['url']['video_url'], id) - self._sort_formats(formats) subtitles = {} for subtitle in data_json.get('subtitles', []): diff --git a/hypervideo_dl/extractor/epoch.py b/hypervideo_dl/extractor/epoch.py new file mode 100644 index 0000000..110e78c --- /dev/null +++ b/hypervideo_dl/extractor/epoch.py @@ -0,0 +1,55 @@ +from .common import InfoExtractor +from ..utils import extract_attributes, get_element_html_by_id + + +class EpochIE(InfoExtractor): + _VALID_URL = r'https?://www.theepochtimes\.com/[\w-]+_(?P<id>\d+).html' + _TESTS = [ + { + 'url': 'https://www.theepochtimes.com/they-can-do-audio-video-physical-surveillance-on-you-24h-365d-a-year-rex-lee-on-intrusive-apps_4661688.html', + 'info_dict': { + 'id': 'a3dd732c-4750-4bc8-8156-69180668bda1', + 'ext': 'mp4', + 'title': '‘They Can Do Audio, Video, Physical Surveillance on You 24H/365D a Year’: Rex Lee on Intrusive Apps', + } + }, + { + 'url': 'https://www.theepochtimes.com/the-communist-partys-cyberattacks-on-america-explained-rex-lee-talks-tech-hybrid-warfare_4342413.html', + 'info_dict': { + 'id': '276c7f46-3bbf-475d-9934-b9bbe827cf0a', + 'ext': 'mp4', + 'title': 'The Communist Party’s Cyberattacks on America Explained; Rex Lee Talks Tech Hybrid Warfare', + } + }, + { + 'url': 'https://www.theepochtimes.com/kash-patel-a-6-year-saga-of-government-corruption-from-russiagate-to-mar-a-lago_4690250.html', + 'info_dict': { + 'id': 'aa9ceecd-a127-453d-a2de-7153d6fd69b6', + 'ext': 'mp4', + 'title': 'Kash Patel: A ‘6-Year-Saga’ of Government Corruption, From Russiagate to Mar-a-Lago', + } + }, + { + 'url': 'https://www.theepochtimes.com/dick-morris-discusses-his-book-the-return-trumps-big-2024-comeback_4819205.html', + 'info_dict': { + 'id': '9489f994-2a20-4812-b233-ac0e5c345632', + 'ext': 'mp4', + 'title': 'Dick Morris Discusses His Book 
‘The Return: Trump’s Big 2024 Comeback’', + } + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + youmaker_video_id = extract_attributes(get_element_html_by_id('videobox', webpage))['data-id'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'http://vs1.youmaker.com/assets/{youmaker_video_id}/playlist.m3u8', video_id, 'mp4', m3u8_id='hls') + + return { + 'id': youmaker_video_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': self._html_extract_title(webpage) + } diff --git a/hypervideo_dl/extractor/eporner.py b/hypervideo_dl/extractor/eporner.py index 25a0d97..a233797 100644 --- a/hypervideo_dl/extractor/eporner.py +++ b/hypervideo_dl/extractor/eporner.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( encode_base_n, @@ -110,7 +106,6 @@ class EpornerIE(InfoExtractor): 'height': height, 'fps': fps, }) - self._sort_formats(formats) json_ld = self._search_json_ld(webpage, display_id, default={}) diff --git a/hypervideo_dl/extractor/eroprofile.py b/hypervideo_dl/extractor/eroprofile.py index 5d5e7f2..2b61f3b 100644 --- a/hypervideo_dl/extractor/eroprofile.py +++ b/hypervideo_dl/extractor/eroprofile.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/ertgr.py b/hypervideo_dl/extractor/ertgr.py index 19ce23f..9ecdf5d 100644 --- a/hypervideo_dl/extractor/ertgr.py +++ b/hypervideo_dl/extractor/ertgr.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re @@ -18,7 +15,6 @@ from ..utils import ( parse_iso8601, str_or_none, try_get, - unescapeHTML, url_or_none, variadic, ) @@ -77,7 +73,7 @@ class ERTFlixCodenameIE(ERTFlixBaseIE): }, ] - def _extract_formats_and_subs(self, video_id, allow_none=True): + def _extract_formats_and_subs(self, video_id): 
media_info = self._call_api(video_id, codename=video_id) formats, subs = [], {} for media_file in try_get(media_info, lambda x: x['MediaFiles'], list) or []: @@ -101,8 +97,6 @@ class ERTFlixCodenameIE(ERTFlixBaseIE): formats.extend(formats_) self._merge_subtitles(subs_, target=subs) - if formats or not allow_none: - self._sort_formats(formats) return formats, subs def _real_extract(self, url): @@ -122,7 +116,7 @@ class ERTFlixCodenameIE(ERTFlixBaseIE): class ERTFlixIE(ERTFlixBaseIE): IE_NAME = 'ertflix' IE_DESC = 'ERTFLIX videos' - _VALID_URL = r'https?://www\.ertflix\.gr/(?:series|vod)/(?P<id>[a-z]{3}\.\d+)' + _VALID_URL = r'https?://www\.ertflix\.gr/(?:[^/]+/)?(?:series|vod)/(?P<id>[a-z]{3}\.\d+)' _TESTS = [{ 'url': 'https://www.ertflix.gr/vod/vod.173258-aoratoi-ergates', 'md5': '6479d5e60fd7e520b07ba5411dcdd6e7', @@ -174,6 +168,9 @@ class ERTFlixIE(ERTFlixBaseIE): 'title': 'Το δίκτυο', }, 'playlist_mincount': 9, + }, { + 'url': 'https://www.ertflix.gr/en/vod/vod.127652-ta-kalytera-mas-chronia-ep1-mia-volta-sto-feggari', + 'only_matching': True, }] def _extract_episode(self, episode): @@ -275,6 +272,7 @@ class ERTWebtvEmbedIE(InfoExtractor): IE_DESC = 'ert.gr webtv embedded videos' _BASE_PLAYER_URL_RE = re.escape('//www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php') _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?f=(?P<id>[^#&]+)' + _EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>(?:https?:)?{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)'] _TESTS = [{ 'url': 'https://www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php?f=trailers/E2251_TO_DIKTYO_E09_16-01_1900.mp4&bgimg=/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg', @@ -287,23 +285,11 @@ class ERTWebtvEmbedIE(InfoExtractor): }, }] - @classmethod - def _extract_urls(cls, webpage): - EMBED_URL_RE = rf'(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+' - EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{EMBED_URL_RE})(?P=_q1)' - - for mobj in 
re.finditer(EMBED_RE, webpage): - url = unescapeHTML(mobj.group('url')) - if not cls.suitable(url): - continue - yield url - def _real_extract(self, url): video_id = self._match_id(url) formats, subs = self._extract_m3u8_formats_and_subtitles( f'https://mediastream.ert.gr/vodedge/_definst_/mp4:dvrorigin/{video_id}/playlist.m3u8', video_id, 'mp4') - self._sort_formats(formats) thumbnail_id = parse_qs(url).get('bgimg', [None])[0] if thumbnail_id and not thumbnail_id.startswith('http'): thumbnail_id = f'https://program.ert.gr{thumbnail_id}' diff --git a/hypervideo_dl/extractor/escapist.py b/hypervideo_dl/extractor/escapist.py index 4cd815e..85a1cbf 100644 --- a/hypervideo_dl/extractor/escapist.py +++ b/hypervideo_dl/extractor/escapist.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -97,7 +95,6 @@ class EscapistIE(InfoExtractor): 'format_id': '%s-%sp' % (determine_ext(video['src']), video['res']), 'height': int_or_none(video.get('res')), } for video in data['files']['videos']] - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/espn.py b/hypervideo_dl/extractor/espn.py index dc50f3b..f4b0134 100644 --- a/hypervideo_dl/extractor/espn.py +++ b/hypervideo_dl/extractor/espn.py @@ -1,14 +1,16 @@ -from __future__ import unicode_literals - +import base64 +import json import re +import urllib.parse +from .adobepass import AdobePassIE from .common import InfoExtractor from .once import OnceIE -from ..compat import compat_str from ..utils import ( determine_ext, dict_get, int_or_none, + traverse_obj, unified_strdate, unified_timestamp, ) @@ -26,7 +28,6 @@ class ESPNIE(OnceIE): (?: (?: video/(?:clip|iframe/twitter)| - watch/player ) (?: .*?\?.*?\bid=| @@ -49,6 +50,8 @@ class ESPNIE(OnceIE): 'description': 'md5:39370c2e016cb4ecf498ffe75bef7f0f', 'timestamp': 1390936111, 'upload_date': '20140128', + 'duration': 1302, + 'thumbnail': r're:https://.+\.jpg', 
}, 'params': { 'skip_download': True, @@ -73,15 +76,6 @@ class ESPNIE(OnceIE): }, { 'url': 'https://cdn.espn.go.com/video/clip/_/id/19771774', 'only_matching': True, - }, { - 'url': 'http://www.espn.com/watch/player?id=19141491', - 'only_matching': True, - }, { - 'url': 'http://www.espn.com/watch/player?bucketId=257&id=19505875', - 'only_matching': True, - }, { - 'url': 'http://www.espn.com/watch/player/_/id/19141491', - 'only_matching': True, }, { 'url': 'http://www.espn.com/video/clip?id=10365079', 'only_matching': True, @@ -100,7 +94,13 @@ class ESPNIE(OnceIE): }, { 'url': 'http://www.espn.com/espnw/video/26066627/arkansas-gibson-completes-hr-cycle-four-innings', 'only_matching': True, - }] + }, { + 'url': 'http://www.espn.com/watch/player?id=19141491', + 'only_matching': True, + }, { + 'url': 'http://www.espn.com/watch/player?bucketId=257&id=19505875', + 'only_matching': True, + }, ] def _real_extract(self, url): video_id = self._match_id(url) @@ -118,7 +118,7 @@ class ESPNIE(OnceIE): for source_id, source in source.items(): if source_id == 'alert': continue - elif isinstance(source, compat_str): + elif isinstance(source, str): extract_source(source, base_source_id) elif isinstance(source, dict): traverse_source( @@ -162,7 +162,6 @@ class ESPNIE(OnceIE): links = clip.get('links', {}) traverse_source(links.get('source', {})) traverse_source(links.get('mobile', {})) - self._sort_formats(formats) description = clip.get('caption') or clip.get('description') thumbnail = clip.get('thumbnail') @@ -198,7 +197,7 @@ class ESPNArticleIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if ESPNIE.suitable(url) else super(ESPNArticleIE, cls).suitable(url) + return False if (ESPNIE.suitable(url) or WatchESPNIE.suitable(url)) else super().suitable(url) def _real_extract(self, url): video_id = self._match_id(url) @@ -269,7 +268,6 @@ class ESPNCricInfoIE(InfoExtractor): 'url': item['url'], 'vcodec': 'none', }) - self._sort_formats(formats) return { 'id': id, 
'title': data_json.get('title'), @@ -279,3 +277,134 @@ class ESPNCricInfoIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } + + +class WatchESPNIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?espn\.com/(?:watch|espnplus)/player/_/id/(?P<id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})' + _TESTS = [{ + 'url': 'https://www.espn.com/watch/player/_/id/dbbc6b1d-c084-4b47-9878-5f13c56ce309', + 'info_dict': { + 'id': 'dbbc6b1d-c084-4b47-9878-5f13c56ce309', + 'ext': 'mp4', + 'title': 'Huddersfield vs. Burnley', + 'duration': 7500, + 'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/dbbc6b1d-c084-4b47-9878-5f13c56ce309/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.espn.com/watch/player/_/id/a049a56e-a7ce-477e-aef3-c7e48ef8221c', + 'info_dict': { + 'id': 'a049a56e-a7ce-477e-aef3-c7e48ef8221c', + 'ext': 'mp4', + 'title': 'Dynamo Dresden vs. VfB Stuttgart (Round #1) (German Cup)', + 'duration': 8335, + 'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/bd1f3d12-0654-47d9-852e-71b85ea695c7/16x9.jpg?timestamp=202201112217&showBadge=true&cb=12&package=ESPN_PLUS', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.espn.com/espnplus/player/_/id/317f5fd1-c78a-4ebe-824a-129e0d348421', + 'info_dict': { + 'id': '317f5fd1-c78a-4ebe-824a-129e0d348421', + 'ext': 'mp4', + 'title': 'The Wheel - Episode 10', + 'duration': 3352, + 'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/317f5fd1-c78a-4ebe-824a-129e0d348421/16x9.jpg?timestamp=202205031523&showBadge=true&cb=12&package=ESPN_PLUS', + }, + 'params': { + 'skip_download': True, + }, + }] + + _API_KEY = 'ZXNwbiZicm93c2VyJjEuMC4w.ptUt7QxsteaRruuPmGZFaJByOoqKvDP2a5YkInHrc7c' + + def _call_bamgrid_api(self, path, video_id, payload=None, headers={}): + if 'Authorization' not in headers: + headers['Authorization'] = f'Bearer 
{self._API_KEY}' + parse = urllib.parse.urlencode if path == 'token' else json.dumps + return self._download_json( + f'https://espn.api.edge.bamgrid.com/{path}', video_id, headers=headers, data=parse(payload).encode()) + + def _real_extract(self, url): + video_id = self._match_id(url) + cdn_data = self._download_json( + f'https://watch-cdn.product.api.espn.com/api/product/v3/watchespn/web/playback/event?id={video_id}', + video_id) + video_data = cdn_data['playbackState'] + + # ESPN+ subscription required, through cookies + if 'DTC' in video_data.get('sourceId'): + cookie = self._get_cookies(url).get('ESPN-ONESITE.WEB-PROD.token') + if not cookie: + self.raise_login_required(method='cookies') + + assertion = self._call_bamgrid_api( + 'devices', video_id, + headers={'Content-Type': 'application/json; charset=UTF-8'}, + payload={ + 'deviceFamily': 'android', + 'applicationRuntime': 'android', + 'deviceProfile': 'tv', + 'attributes': {}, + })['assertion'] + token = self._call_bamgrid_api( + 'token', video_id, payload={ + 'subject_token': assertion, + 'subject_token_type': 'urn:bamtech:params:oauth:token-type:device', + 'platform': 'android', + 'grant_type': 'urn:ietf:params:oauth:grant-type:token-exchange' + })['access_token'] + + assertion = self._call_bamgrid_api( + 'accounts/grant', video_id, payload={'id_token': cookie.value.split('|')[1]}, + headers={ + 'Authorization': token, + 'Content-Type': 'application/json; charset=UTF-8' + })['assertion'] + token = self._call_bamgrid_api( + 'token', video_id, payload={ + 'subject_token': assertion, + 'subject_token_type': 'urn:bamtech:params:oauth:token-type:account', + 'platform': 'android', + 'grant_type': 'urn:ietf:params:oauth:grant-type:token-exchange' + })['access_token'] + + playback = self._download_json( + video_data['videoHref'].format(scenario='browser~ssai'), video_id, + headers={ + 'Accept': 'application/vnd.media-service+json; version=5', + 'Authorization': token + }) + m3u8_url, headers = 
playback['stream']['complete'][0]['url'], {'authorization': token} + + # No login required + elif video_data.get('sourceId') == 'ESPN_FREE': + asset = self._download_json( + f'https://watch.auth.api.espn.com/video/auth/media/{video_id}/asset?apikey=uiqlbgzdwuru14v627vdusswb', + video_id) + m3u8_url, headers = asset['stream'], {} + + # TV Provider required + else: + resource = self._get_mvpd_resource('ESPN', video_data['name'], video_id, None) + auth = self._extract_mvpd_auth(url, video_id, 'ESPN', resource).encode() + + asset = self._download_json( + f'https://watch.auth.api.espn.com/video/auth/media/{video_id}/asset?apikey=uiqlbgzdwuru14v627vdusswb', + video_id, data=f'adobeToken={urllib.parse.quote_plus(base64.b64encode(auth))}&drmSupport=HLS'.encode()) + m3u8_url, headers = asset['stream'], {} + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'duration': traverse_obj(cdn_data, ('tracking', 'duration')), + 'title': video_data.get('name'), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': video_data.get('posterHref'), + 'http_headers': headers, + } diff --git a/hypervideo_dl/extractor/esri.py b/hypervideo_dl/extractor/esri.py index e9dcaeb..02e7efa 100644 --- a/hypervideo_dl/extractor/esri.py +++ b/hypervideo_dl/extractor/esri.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -46,7 +43,6 @@ class EsriVideoIE(InfoExtractor): 'height': int(height), 'filesize_approx': parse_filesize(filesize), }) - self._sort_formats(formats) title = self._html_search_meta('title', webpage, 'title') description = self._html_search_meta( diff --git a/hypervideo_dl/extractor/europa.py b/hypervideo_dl/extractor/europa.py index 60ab2ce..c2b4937 100644 --- a/hypervideo_dl/extractor/europa.py +++ b/hypervideo_dl/extractor/europa.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from 
.common import InfoExtractor from ..utils import ( int_or_none, @@ -79,7 +76,6 @@ class EuropaIE(InfoExtractor): 'format_note': xpath_text(file_, './lglabel'), 'language_preference': language_preference(lang) }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/europeantour.py b/hypervideo_dl/extractor/europeantour.py index e28f067..1995a74 100644 --- a/hypervideo_dl/extractor/europeantour.py +++ b/hypervideo_dl/extractor/europeantour.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/eurosport.py b/hypervideo_dl/extractor/eurosport.py new file mode 100644 index 0000000..654e112 --- /dev/null +++ b/hypervideo_dl/extractor/eurosport.py @@ -0,0 +1,101 @@ +from .common import InfoExtractor +from ..utils import traverse_obj + + +class EurosportIE(InfoExtractor): + _VALID_URL = r'https?://www\.eurosport\.com/\w+/[\w-]+/\d+/[\w-]+_(?P<id>vid\d+)' + _TESTS = [{ + 'url': 'https://www.eurosport.com/tennis/roland-garros/2022/highlights-rafael-nadal-brushes-aside-caper-ruud-to-win-record-extending-14th-french-open-title_vid1694147/video.shtml', + 'info_dict': { + 'id': '2480939', + 'ext': 'mp4', + 'title': 'Highlights: Rafael Nadal brushes aside Caper Ruud to win record-extending 14th French Open title', + 'description': 'md5:b564db73ecfe4b14ebbd8e62a3692c76', + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2022/06/05/3388285-69245968-2560-1440.png', + 'duration': 195.0, + 'display_id': 'vid1694147', + 'timestamp': 1654446698, + 'upload_date': '20220605', + } + }, { + 'url': 'https://www.eurosport.com/tennis/roland-garros/2022/watch-the-top-five-shots-from-men-s-final-as-rafael-nadal-beats-casper-ruud-to-seal-14th-french-open_vid1694283/video.shtml', + 'info_dict': { + 'id': '2481254', + 'ext': 'mp4', + 'title': 'md5:149dcc5dfb38ab7352acc008cc9fb071', 
'duration': 130.0, + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2022/06/05/3388422-69248708-2560-1440.png', + 'description': 'md5:a0c8a7f6b285e48ae8ddbe7aa85cfee6', + 'display_id': 'vid1694283', + 'timestamp': 1654456090, + 'upload_date': '20220605', + } + }, { + # geo-fenced, but can be bypassed by xff + 'url': 'https://www.eurosport.com/cycling/tour-de-france-femmes/2022/incredible-ride-marlen-reusser-storms-to-stage-4-win-at-tour-de-france-femmes_vid1722221/video.shtml', + 'info_dict': { + 'id': '2582552', + 'ext': 'mp4', + 'title': '‘Incredible ride!’ - Marlen Reusser storms to Stage 4 win at Tour de France Femmes', + 'duration': 188.0, + 'display_id': 'vid1722221', + 'timestamp': 1658936167, + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2022/07/27/3423347-69852108-2560-1440.jpg', + 'description': 'md5:32bbe3a773ac132c57fb1e8cca4b7c71', + 'upload_date': '20220727', + } + }] + + _TOKEN = None + + # actually defined in https://netsport.eurosport.io/?variables={"databaseId":<databaseId>,"playoutType":"VDP"}&extensions={"persistedQuery":{"version":1 .. 
+ # but this method requires getting the sha256 hash + _GEO_COUNTRIES = ['DE', 'NL', 'EU', 'IT', 'FR'] # Not complete list but it should work + + def _real_initialize(self): + # Fetch the API token once and cache it on the class so later extractions reuse it + if EurosportIE._TOKEN is None: + EurosportIE._TOKEN = self._download_json( + 'https://eu3-prod-direct.eurosport.com/token?realm=eurosport', None, + 'Trying to get token')['data']['attributes']['token'] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + json_data = self._download_json( + f'https://eu3-prod-direct.eurosport.com/playback/v2/videoPlaybackInfo/sourceSystemId/eurosport-{display_id}', + display_id, query={'usePreAuth': True}, headers={'Authorization': f'Bearer {EurosportIE._TOKEN}'})['data'] + + json_ld_data = self._search_json_ld(webpage, display_id) + + formats, subtitles = [], {} + for stream_type in json_data['attributes']['streaming']: + if stream_type == 'hls': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id, ext='mp4') + elif stream_type == 'dash': + fmts, subs = self._extract_mpd_formats_and_subtitles( + traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id) + elif stream_type == 'mss': + fmts, subs = self._extract_ism_formats_and_subtitles( + traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id) + else: + # Unknown stream type: skip it, otherwise fmts/subs would be unbound or stale from the previous iteration + continue + + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': json_data['id'], + 'title': json_ld_data.get('title') or self._og_search_title(webpage), + 'display_id': display_id, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': json_ld_data.get('thumbnails'), + 'description': (json_ld_data.get('description') + or 
self._html_search_meta(['og:description', 'description'], webpage)), + 'duration': json_ld_data.get('duration'), + 'timestamp': json_ld_data.get('timestamp'), + } diff --git
a/hypervideo_dl/extractor/euscreen.py b/hypervideo_dl/extractor/euscreen.py index 2759e74..65a1dc7 100644 --- a/hypervideo_dl/extractor/euscreen.py +++ b/hypervideo_dl/extractor/euscreen.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( @@ -48,7 +45,6 @@ class EUScreenIE(InfoExtractor): formats = [{ 'url': source['src'], } for source in video_json.get('sources', [])] - self._sort_formats(formats) return { 'id': id, diff --git a/hypervideo_dl/extractor/everyonesmixtape.py b/hypervideo_dl/extractor/everyonesmixtape.py deleted file mode 100644 index 80cb032..0000000 --- a/hypervideo_dl/extractor/everyonesmixtape.py +++ /dev/null @@ -1,76 +0,0 @@ -from __future__ import unicode_literals - - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - sanitized_Request, -) - - -class EveryonesMixtapeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?everyonesmixtape\.com/#/mix/(?P<id>[0-9a-zA-Z]+)(?:/(?P<songnr>[0-9]))?$' - - _TESTS = [{ - 'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi/5', - 'info_dict': { - 'id': '5bfseWNmlds', - 'ext': 'mp4', - 'title': "Passion Pit - \"Sleepyhead\" (Official Music Video)", - 'uploader': 'FKR.TV', - 'uploader_id': 'frenchkissrecords', - 'description': "Music video for \"Sleepyhead\" from Passion Pit's debut EP Chunk Of Change.\nBuy on iTunes: https://itunes.apple.com/us/album/chunk-of-change-ep/id300087641\n\nDirected by The Wilderness.\n\nhttp://www.passionpitmusic.com\nhttp://www.frenchkissrecords.com", - 'upload_date': '20081015' - }, - 'params': { - 'skip_download': True, # This is simply YouTube - } - }, { - 'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi', - 'info_dict': { - 'id': 'm7m0jJAbMQi', - 'title': 'Driving', - }, - 'playlist_count': 24 - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - playlist_id = mobj.group('id') - - pllist_url = 
'http://everyonesmixtape.com/mixtape.php?a=getMixes&u=-1&linked=%s&explore=' % playlist_id - pllist_req = sanitized_Request(pllist_url) - pllist_req.add_header('X-Requested-With', 'XMLHttpRequest') - - playlist_list = self._download_json( - pllist_req, playlist_id, note='Downloading playlist metadata') - try: - playlist_no = next(playlist['id'] - for playlist in playlist_list - if playlist['code'] == playlist_id) - except StopIteration: - raise ExtractorError('Playlist id not found') - - pl_url = 'http://everyonesmixtape.com/mixtape.php?a=getMix&id=%s&userId=null&code=' % playlist_no - pl_req = sanitized_Request(pl_url) - pl_req.add_header('X-Requested-With', 'XMLHttpRequest') - playlist = self._download_json( - pl_req, playlist_id, note='Downloading playlist info') - - entries = [{ - '_type': 'url', - 'url': t['url'], - 'title': t['title'], - } for t in playlist['tracks']] - - if mobj.group('songnr'): - songnr = int(mobj.group('songnr')) - 1 - return entries[songnr] - - playlist_title = playlist['mixData']['name'] - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': playlist_title, - 'entries': entries, - } diff --git a/hypervideo_dl/extractor/expotv.py b/hypervideo_dl/extractor/expotv.py index 95a8977..bda6e3c 100644 --- a/hypervideo_dl/extractor/expotv.py +++ b/hypervideo_dl/extractor/expotv.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -51,7 +49,6 @@ class ExpoTVIE(InfoExtractor): r'filename=.*\.([a-z0-9_A-Z]+)&', media_url, 'file extension', default=None) or fcfg.get('type'), }) - self._sort_formats(formats) title = self._og_search_title(webpage) description = self._og_search_description(webpage) diff --git a/hypervideo_dl/extractor/expressen.py b/hypervideo_dl/extractor/expressen.py index dc8b855..86967b6 100644 --- a/hypervideo_dl/extractor/expressen.py +++ b/hypervideo_dl/extractor/expressen.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import 
unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -20,11 +15,13 @@ class ExpressenIE(InfoExtractor): tv/(?:[^/]+/)* (?P<id>[^/?#&]+) ''' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1'] _TESTS = [{ 'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/', - 'md5': '2fbbe3ca14392a6b1b36941858d33a45', + 'md5': 'deb2ca62e7b1dcd19fa18ba37523f66e', 'info_dict': { - 'id': '8690962', + 'id': 'ba90f5a9-78d1-4511-aa02-c177b9c99136', + 'display_id': 'ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden', 'ext': 'mp4', 'title': 'Ledarsnack: Om arbetslösheten bland kvinnor i speciellt utsatta områden', 'description': 'md5:f38c81ff69f3de4d269bbda012fcbbba', @@ -47,13 +44,6 @@ class ExpressenIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') for mobj in re.finditer( - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1', - webpage)] - def _real_extract(self, url): display_id = self._match_id(url) @@ -67,7 +57,7 @@ class ExpressenIE(InfoExtractor): display_id, transform_source=unescapeHTML) info = extract_data('video-tracking-info') - video_id = info['videoId'] + video_id = info['contentId'] data = extract_data('article-data') stream = data['stream'] @@ -80,7 +70,6 @@ class ExpressenIE(InfoExtractor): formats = [{ 'url': stream, }] - self._sort_formats(formats) title = info.get('titleRaw') or data['title'] description = info.get('descriptionRaw') diff --git a/hypervideo_dl/extractor/extractors.py b/hypervideo_dl/extractor/extractors.py index 457f4c2..610e02f 100644 --- a/hypervideo_dl/extractor/extractors.py +++ b/hypervideo_dl/extractor/extractors.py @@ -1,2144 +1,26 @@ -# flake8: noqa -from 
__future__ import unicode_literals +import contextlib +import os -from .abc import ( - ABCIE, - ABCIViewIE, - ABCIViewShowSeriesIE, -) -from .abcnews import ( - AbcNewsIE, - AbcNewsVideoIE, -) -from .abcotvs import ( - ABCOTVSIE, - ABCOTVSClipsIE, -) -from .abematv import ( - AbemaTVIE, - AbemaTVTitleIE, -) -from .academicearth import AcademicEarthCourseIE -from .acast import ( - ACastIE, - ACastChannelIE, -) -from .adn import ADNIE -from .adobeconnect import AdobeConnectIE -from .adobetv import ( - AdobeTVEmbedIE, - AdobeTVIE, - AdobeTVShowIE, - AdobeTVChannelIE, - AdobeTVVideoIE, -) -from .adultswim import AdultSwimIE -from .aenetworks import ( - AENetworksIE, - AENetworksCollectionIE, - AENetworksShowIE, - HistoryTopicIE, - HistoryPlayerIE, - BiographyIE, -) -from .afreecatv import ( - AfreecaTVIE, - AfreecaTVLiveIE, -) -from .airmozilla import AirMozillaIE -from .aljazeera import AlJazeeraIE -from .alphaporno import AlphaPornoIE -from .amara import AmaraIE -from .alura import ( - AluraIE, - AluraCourseIE -) -from .amcnetworks import AMCNetworksIE -from .animelab import ( - AnimeLabIE, - AnimeLabShowsIE, -) -from .amazon import AmazonStoreIE -from .americastestkitchen import ( - AmericasTestKitchenIE, - AmericasTestKitchenSeasonIE, -) -from .animeondemand import AnimeOnDemandIE -from .anvato import AnvatoIE -from .aol import AolIE -from .allocine import AllocineIE -from .aliexpress import AliExpressLiveIE -from .alsace20tv import ( - Alsace20TVIE, - Alsace20TVEmbedIE, -) -from .apa import APAIE -from .aparat import AparatIE -from .appleconnect import AppleConnectIE -from .appletrailers import ( - AppleTrailersIE, - AppleTrailersSectionIE, -) -from .applepodcasts import ApplePodcastsIE -from .archiveorg import ( - ArchiveOrgIE, - YoutubeWebArchiveIE, -) -from .arcpublishing import ArcPublishingIE -from .arkena import ArkenaIE -from .ard import ( - ARDBetaMediathekIE, - ARDIE, - ARDMediathekIE, -) -from .arte import ( - ArteTVIE, - ArteTVEmbedIE, - 
ArteTVPlaylistIE, - ArteTVCategoryIE, -) -from .arnes import ArnesIE -from .asiancrush import ( - AsianCrushIE, - AsianCrushPlaylistIE, -) -from .atresplayer import AtresPlayerIE -from .atttechchannel import ATTTechChannelIE -from .atvat import ATVAtIE -from .audimedia import AudiMediaIE -from .audioboom import AudioBoomIE -from .audiomack import AudiomackIE, AudiomackAlbumIE -from .audius import ( - AudiusIE, - AudiusTrackIE, - AudiusPlaylistIE, - AudiusProfileIE, -) -from .awaan import ( - AWAANIE, - AWAANVideoIE, - AWAANLiveIE, - AWAANSeasonIE, -) -from .azmedien import AZMedienIE -from .baidu import BaiduVideoIE -from .banbye import ( - BanByeIE, - BanByeChannelIE, -) -from .bandaichannel import BandaiChannelIE -from .bandcamp import ( - BandcampIE, - BandcampAlbumIE, - BandcampWeeklyIE, - BandcampUserIE, -) -from .bannedvideo import BannedVideoIE -from .bbc import ( - BBCCoUkIE, - BBCCoUkArticleIE, - BBCCoUkIPlayerEpisodesIE, - BBCCoUkIPlayerGroupIE, - BBCCoUkPlaylistIE, - BBCIE, -) -from .beeg import BeegIE -from .behindkink import BehindKinkIE -from .bellmedia import BellMediaIE -from .beatport import BeatportIE -from .bet import BetIE -from .bfi import BFIPlayerIE -from .bfmtv import ( - BFMTVIE, - BFMTVLiveIE, - BFMTVArticleIE, -) -from .bibeltv import BibelTVIE -from .bigflix import BigflixIE -from .bigo import BigoIE -from .bild import BildIE -from .bilibili import ( - BiliBiliIE, - BiliBiliSearchIE, - BilibiliCategoryIE, - BiliBiliBangumiIE, - BilibiliAudioIE, - BilibiliAudioAlbumIE, - BiliBiliPlayerIE, - BilibiliChannelIE, - BiliIntlIE, - BiliIntlSeriesIE, -) -from .biobiochiletv import BioBioChileTVIE -from .bitchute import ( - BitChuteIE, - BitChuteChannelIE, -) -from .bitwave import ( - BitwaveReplayIE, - BitwaveStreamIE, -) -from .biqle import BIQLEIE -from .blackboardcollaborate import BlackboardCollaborateIE -from .bleacherreport import ( - BleacherReportIE, - BleacherReportCMSIE, -) -from .blogger import BloggerIE -from .bloomberg import 
BloombergIE -from .bokecc import BokeCCIE -from .bongacams import BongaCamsIE -from .bostonglobe import BostonGlobeIE -from .box import BoxIE -from .bpb import BpbIE -from .br import ( - BRIE, - BRMediathekIE, -) -from .bravotv import BravoTVIE -from .breakcom import BreakIE -from .breitbart import BreitBartIE -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) -from .businessinsider import BusinessInsiderIE -from .buzzfeed import BuzzFeedIE -from .byutv import BYUtvIE -from .c56 import C56IE -from .cableav import CableAVIE -from .callin import CallinIE -from .caltrans import CaltransIE -from .cam4 import CAM4IE -from .camdemy import ( - CamdemyIE, - CamdemyFolderIE -) -from .cammodels import CamModelsIE -from .camwithher import CamWithHerIE -from .canalalpha import CanalAlphaIE -from .canalplus import CanalplusIE -from .canalc2 import Canalc2IE -from .canvas import ( - CanvasIE, - CanvasEenIE, - VrtNUIE, - DagelijkseKostIE, -) -from .carambatv import ( - CarambaTVIE, - CarambaTVPageIE, -) -from .cartoonnetwork import CartoonNetworkIE -from .cbc import ( - CBCIE, - CBCPlayerIE, - CBCGemIE, - CBCGemPlaylistIE, - CBCGemLiveIE, -) -from .cbs import CBSIE -from .cbslocal import ( - CBSLocalIE, - CBSLocalArticleIE, -) -from .cbsinteractive import CBSInteractiveIE -from .cbsnews import ( - CBSNewsEmbedIE, - CBSNewsIE, - CBSNewsLiveVideoIE, -) -from .cbssports import ( - CBSSportsEmbedIE, - CBSSportsIE, - TwentyFourSevenSportsIE, -) -from .ccc import ( - CCCIE, - CCCPlaylistIE, -) -from .ccma import CCMAIE -from .cctv import CCTVIE -from .cda import CDAIE -from .ceskatelevize import CeskaTelevizeIE -from .cgtn import CGTNIE -from .channel9 import Channel9IE -from .charlierose import CharlieRoseIE -from .chaturbate import ChaturbateIE -from .chilloutzone import ChilloutzoneIE -from .chingari import ( - ChingariIE, - ChingariUserIE, -) -from .chirbit import ( - ChirbitIE, - ChirbitProfileIE, -) -from .cinchcast import CinchcastIE -from .cinemax import 
CinemaxIE -from .ciscolive import ( - CiscoLiveSessionIE, - CiscoLiveSearchIE, -) -from .ciscowebex import CiscoWebexIE -from .cjsw import CJSWIE -from .cliphunter import CliphunterIE -from .clippit import ClippitIE -from .cliprs import ClipRsIE -from .clipsyndicate import ClipsyndicateIE -from .closertotruth import CloserToTruthIE -from .cloudflarestream import CloudflareStreamIE -from .cloudy import CloudyIE -from .clubic import ClubicIE -from .clyp import ClypIE -from .cmt import CMTIE -from .cnbc import ( - CNBCIE, - CNBCVideoIE, -) -from .cnn import ( - CNNIE, - CNNBlogsIE, - CNNArticleIE, -) -from .coub import CoubIE -from .comedycentral import ( - ComedyCentralIE, - ComedyCentralTVIE, -) -from .commonmistakes import CommonMistakesIE, UnicodeBOMIE -from .commonprotocols import ( - MmsIE, - RtmpIE, - ViewSourceIE, -) -from .condenast import CondeNastIE -from .contv import CONtvIE -from .corus import CorusIE -from .cpac import ( - CPACIE, - CPACPlaylistIE, -) -from .cozytv import CozyTVIE -from .cracked import CrackedIE -from .crackle import CrackleIE -from .craftsy import CraftsyIE -from .crooksandliars import CrooksAndLiarsIE -from .crowdbunker import ( - CrowdBunkerIE, - CrowdBunkerChannelIE, -) -from .crunchyroll import ( - CrunchyrollIE, - CrunchyrollShowPlaylistIE, - CrunchyrollBetaIE, - CrunchyrollBetaShowIE, -) -from .cspan import CSpanIE, CSpanCongressIE -from .ctsnews import CtsNewsIE -from .ctv import CTVIE -from .ctvnews import CTVNewsIE -from .cultureunplugged import CultureUnpluggedIE -from .curiositystream import ( - CuriosityStreamIE, - CuriosityStreamCollectionsIE, - CuriosityStreamSeriesIE, -) -from .cwtv import CWTVIE -from .cybrary import ( - CybraryIE, - CybraryCourseIE -) -from .daftsex import DaftsexIE -from .dailymail import DailyMailIE -from .dailymotion import ( - DailymotionIE, - DailymotionPlaylistIE, - DailymotionUserIE, -) -from .damtomo import ( - DamtomoRecordIE, - DamtomoVideoIE, -) -from .daum import ( - DaumIE, - DaumClipIE, - 
DaumPlaylistIE, - DaumUserIE, -) -from .daystar import DaystarClipIE -from .dbtv import DBTVIE -from .dctp import DctpTvIE -from .deezer import ( - DeezerPlaylistIE, - DeezerAlbumIE, -) -from .democracynow import DemocracynowIE -from .dfb import DFBIE -from .dhm import DHMIE -from .digg import DiggIE -from .dotsub import DotsubIE -from .douyutv import ( - DouyuShowIE, - DouyuTVIE, -) -from .dplay import ( - DPlayIE, - DiscoveryPlusIE, - HGTVDeIE, - GoDiscoveryIE, - TravelChannelIE, - CookingChannelIE, - HGTVUsaIE, - FoodNetworkIE, - InvestigationDiscoveryIE, - DestinationAmericaIE, - AmHistoryChannelIE, - ScienceChannelIE, - DIYNetworkIE, - DiscoveryLifeIE, - AnimalPlanetIE, - TLCIE, - DiscoveryPlusIndiaIE, - DiscoveryNetworksDeIE, - DiscoveryPlusItalyIE, - DiscoveryPlusItalyShowIE, - DiscoveryPlusIndiaShowIE, -) -from .dreisat import DreiSatIE -from .drbonanza import DRBonanzaIE -from .drtuber import DrTuberIE -from .drtv import ( - DRTVIE, - DRTVLiveIE, -) -from .dtube import DTubeIE -from .dvtv import DVTVIE -from .duboku import ( - DubokuIE, - DubokuPlaylistIE -) -from .dumpert import DumpertIE -from .defense import DefenseGouvFrIE -from .digitalconcerthall import DigitalConcertHallIE -from .discovery import DiscoveryIE -from .disney import DisneyIE -from .dispeak import DigitallySpeakingIE -from .doodstream import DoodStreamIE -from .dropbox import DropboxIE -from .dropout import ( - DropoutSeasonIE, - DropoutIE -) -from .dw import ( - DWIE, - DWArticleIE, -) -from .eagleplatform import EaglePlatformIE -from .ebaumsworld import EbaumsWorldIE -from .echomsk import EchoMskIE -from .egghead import ( - EggheadCourseIE, - EggheadLessonIE, -) -from .ehow import EHowIE -from .eighttracks import EightTracksIE -from .einthusan import EinthusanIE -from .eitb import EitbIE -from .ellentube import ( - EllenTubeIE, - EllenTubeVideoIE, - EllenTubePlaylistIE, -) -from .elonet import ElonetIE -from .elpais import ElPaisIE -from .embedly import EmbedlyIE -from .engadget import 
EngadgetIE -from .epicon import ( - EpiconIE, - EpiconSeriesIE, -) -from .eporner import EpornerIE -from .eroprofile import ( - EroProfileIE, - EroProfileAlbumIE, -) -from .ertgr import ( - ERTFlixCodenameIE, - ERTFlixIE, - ERTWebtvEmbedIE, -) -from .escapist import EscapistIE -from .espn import ( - ESPNIE, - ESPNArticleIE, - FiveThirtyEightIE, - ESPNCricInfoIE, -) -from .esri import EsriVideoIE -from .europa import EuropaIE -from .europeantour import EuropeanTourIE -from .euscreen import EUScreenIE -from .expotv import ExpoTVIE -from .expressen import ExpressenIE -from .extremetube import ExtremeTubeIE -from .eyedotv import EyedoTVIE -from .facebook import ( - FacebookIE, - FacebookPluginsVideoIE, - FacebookRedirectURLIE, -) -from .fancode import ( - FancodeVodIE, - FancodeLiveIE -) +from ..utils import load_plugins -from .faz import FazIE -from .fc2 import ( - FC2IE, - FC2EmbedIE, - FC2LiveIE, -) -from .fczenit import FczenitIE -from .filmmodu import FilmmoduIE -from .filmon import ( - FilmOnIE, - FilmOnChannelIE, -) -from .filmweb import FilmwebIE -from .firsttv import FirstTVIE -from .fivetv import FiveTVIE -from .flickr import FlickrIE -from .folketinget import FolketingetIE -from .footyroom import FootyRoomIE -from .formula1 import Formula1IE -from .fourtube import ( - FourTubeIE, - PornTubeIE, - PornerBrosIE, - FuxIE, -) -from .fox import FOXIE -from .fox9 import ( - FOX9IE, - FOX9NewsIE, -) -from .foxgay import FoxgayIE -from .foxnews import ( - FoxNewsIE, - FoxNewsArticleIE, -) -from .foxsports import FoxSportsIE -from .fptplay import FptplayIE -from .franceculture import FranceCultureIE -from .franceinter import FranceInterIE -from .francetv import ( - FranceTVIE, - FranceTVSiteIE, - FranceTVInfoIE, -) -from .freesound import FreesoundIE -from .freespeech import FreespeechIE -from .frontendmasters import ( - FrontendMastersIE, - FrontendMastersLessonIE, - FrontendMastersCourseIE -) -from .fujitv import FujiTVFODPlus7IE -from .funimation import ( - 
FunimationIE, - FunimationPageIE, - FunimationShowIE, -) -from .funk import FunkIE -from .fusion import FusionIE -from .gab import ( - GabTVIE, - GabIE, -) -from .gaia import GaiaIE -from .gameinformer import GameInformerIE -from .gamejolt import ( - GameJoltIE, - GameJoltUserIE, - GameJoltGameIE, - GameJoltGameSoundtrackIE, - GameJoltCommunityIE, - GameJoltSearchIE, -) -from .gamespot import GameSpotIE -from .gamestar import GameStarIE -from .gaskrank import GaskrankIE -from .gazeta import GazetaIE -from .gdcvault import GDCVaultIE -from .gedidigital import GediDigitalIE -from .generic import GenericIE -from .gettr import ( - GettrIE, - GettrStreamingIE, -) -from .gfycat import GfycatIE -from .giantbomb import GiantBombIE -from .giga import GigaIE -from .glide import GlideIE -from .globo import ( - GloboIE, - GloboArticleIE, -) -from .go import GoIE -from .godtube import GodTubeIE -from .gofile import GofileIE -from .golem import GolemIE -from .googledrive import GoogleDriveIE -from .googlepodcasts import ( - GooglePodcastsIE, - GooglePodcastsFeedIE, -) -from .googlesearch import GoogleSearchIE -from .gopro import GoProIE -from .goshgay import GoshgayIE -from .gotostage import GoToStageIE -from .gputechconf import GPUTechConfIE -from .gronkh import GronkhIE -from .groupon import GrouponIE -from .hbo import HBOIE -from .hearthisat import HearThisAtIE -from .heise import HeiseIE -from .hellporno import HellPornoIE -from .helsinki import HelsinkiIE -from .hentaistigma import HentaiStigmaIE -from .hgtv import HGTVComShowIE -from .hketv import HKETVIE -from .hidive import HiDiveIE -from .historicfilms import HistoricFilmsIE -from .hitbox import HitboxIE, HitboxLiveIE -from .hitrecord import HitRecordIE -from .hotnewhiphop import HotNewHipHopIE -from .hotstar import ( - HotStarIE, - HotStarPlaylistIE, - HotStarSeriesIE, -) -from .howcast import HowcastIE -from .howstuffworks import HowStuffWorksIE -from .hrfensehen import HRFernsehenIE -from .hrti import ( - HRTiIE, - 
HRTiPlaylistIE, -) -from .hse import ( - HSEShowIE, - HSEProductIE, -) -from .huajiao import HuajiaoIE -from .huya import HuyaLiveIE -from .huffpost import HuffPostIE -from .hungama import ( - HungamaIE, - HungamaSongIE, - HungamaAlbumPlaylistIE, -) -from .hypem import HypemIE -from .ichinanalive import ( - IchinanaLiveIE, - IchinanaLiveClipIE, -) -from .ign import ( - IGNIE, - IGNVideoIE, - IGNArticleIE, -) -from .iheart import ( - IHeartRadioIE, - IHeartRadioPodcastIE, -) -from .imdb import ( - ImdbIE, - ImdbListIE -) -from .imgur import ( - ImgurIE, - ImgurAlbumIE, - ImgurGalleryIE, -) -from .ina import InaIE -from .inc import IncIE -from .indavideo import IndavideoEmbedIE -from .infoq import InfoQIE -from .instagram import ( - InstagramIE, - InstagramIOSIE, - InstagramUserIE, - InstagramTagIE, - InstagramStoryIE, -) -from .internazionale import InternazionaleIE -from .internetvideoarchive import InternetVideoArchiveIE -from .iprima import ( - IPrimaIE, - IPrimaCNNIE -) -from .iqiyi import ( - IqiyiIE, - IqIE, - IqAlbumIE -) +# NB: Must be before other imports so that plugins can be correctly injected +_PLUGIN_CLASSES = load_plugins('extractor', 'IE', {}) -from .itprotv import ( - ITProTVIE, - ITProTVCourseIE -) +_LAZY_LOADER = False +if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): + with contextlib.suppress(ImportError): + from .lazy_extractors import * # noqa: F403 + from .lazy_extractors import _ALL_CLASSES + _LAZY_LOADER = True -from .itv import ( - ITVIE, - ITVBTCCIE, -) -from .ivi import ( - IviIE, - IviCompilationIE -) -from .ivideon import IvideonIE -from .iwara import IwaraIE -from .izlesene import IzleseneIE -from .jamendo import ( - JamendoIE, - JamendoAlbumIE, -) -from .jeuxvideo import JeuxVideoIE -from .jove import JoveIE -from .joj import JojIE -from .jwplatform import JWPlatformIE -from .kakao import KakaoIE -from .kaltura import KalturaIE -from .karaoketv import KaraoketvIE -from .karrierevideos import KarriereVideosIE -from .keezmovies 
import KeezMoviesIE -from .kelbyone import KelbyOneIE -from .ketnet import KetnetIE -from .khanacademy import ( - KhanAcademyIE, - KhanAcademyUnitIE, -) -from .kickstarter import KickStarterIE -from .kinja import KinjaEmbedIE -from .kinopoisk import KinoPoiskIE -from .konserthusetplay import KonserthusetPlayIE -from .koo import KooIE -from .krasview import KrasViewIE -from .ku6 import Ku6IE -from .kusi import KUSIIE -from .kuwo import ( - KuwoIE, - KuwoAlbumIE, - KuwoChartIE, - KuwoSingerIE, - KuwoCategoryIE, - KuwoMvIE, -) -from .la7 import ( - LA7IE, - LA7PodcastEpisodeIE, - LA7PodcastIE, -) -from .laola1tv import ( - Laola1TvEmbedIE, - Laola1TvIE, - EHFTVIE, - ITTFIE, -) -from .lastfm import ( - LastFMIE, - LastFMPlaylistIE, - LastFMUserIE, -) -from .lbry import ( - LBRYIE, - LBRYChannelIE, -) -from .lci import LCIIE -from .lcp import ( - LcpPlayIE, - LcpIE, -) -from .lecture2go import Lecture2GoIE -from .lecturio import ( - LecturioIE, - LecturioCourseIE, - LecturioDeCourseIE, -) -from .leeco import ( - LeIE, - LePlaylistIE, - LetvCloudIE, -) -from .lego import LEGOIE -from .lemonde import LemondeIE -from .lenta import LentaIE -from .libraryofcongress import LibraryOfCongressIE -from .libsyn import LibsynIE -from .lifenews import ( - LifeNewsIE, - LifeEmbedIE, -) -from .limelight import ( - LimelightMediaIE, - LimelightChannelIE, - LimelightChannelListIE, -) -from .line import ( - LineLiveIE, - LineLiveChannelIE, -) -from .linkedin import ( - LinkedInIE, - LinkedInLearningIE, - LinkedInLearningCourseIE, -) -from .linuxacademy import LinuxAcademyIE -from .litv import LiTVIE -from .livejournal import LiveJournalIE -from .livestream import ( - LivestreamIE, - LivestreamOriginalIE, - LivestreamShortenerIE, -) -from .lnkgo import ( - LnkGoIE, - LnkIE, -) -from .localnews8 import LocalNews8IE -from .lovehomeporn import LoveHomePornIE -from .lrt import LRTIE -from .lynda import ( - LyndaIE, - LyndaCourseIE -) -from .m6 import M6IE -from .magentamusik360 import 
MagentaMusik360IE -from .mailru import ( - MailRuIE, - MailRuMusicIE, - MailRuMusicSearchIE, -) -from .mainstreaming import MainStreamingIE -from .malltv import MallTVIE -from .mangomolo import ( - MangomoloVideoIE, - MangomoloLiveIE, -) -from .manoto import ( - ManotoTVIE, - ManotoTVShowIE, - ManotoTVLiveIE, -) -from .manyvids import ManyVidsIE -from .maoritv import MaoriTVIE -from .markiza import ( - MarkizaIE, - MarkizaPageIE, -) -from .massengeschmacktv import MassengeschmackTVIE -from .matchtv import MatchTVIE -from .mdr import MDRIE -from .medaltv import MedalTVIE -from .mediaite import MediaiteIE -from .mediaklikk import MediaKlikkIE -from .mediaset import ( - MediasetIE, - MediasetShowIE, -) -from .mediasite import ( - MediasiteIE, - MediasiteCatalogIE, - MediasiteNamedCatalogIE, -) -from .medici import MediciIE -from .megaphone import MegaphoneIE -from .meipai import MeipaiIE -from .melonvod import MelonVODIE -from .meta import METAIE -from .metacafe import MetacafeIE -from .metacritic import MetacriticIE -from .mgoon import MgoonIE -from .mgtv import MGTVIE -from .miaopai import MiaoPaiIE -from .microsoftstream import MicrosoftStreamIE -from .microsoftvirtualacademy import ( - MicrosoftVirtualAcademyIE, - MicrosoftVirtualAcademyCourseIE, -) -from .mildom import ( - MildomIE, - MildomVodIE, - MildomClipIE, - MildomUserVodIE, -) -from .minds import ( - MindsIE, - MindsChannelIE, - MindsGroupIE, -) -from .ministrygrid import MinistryGridIE -from .minoto import MinotoIE -from .miomio import MioMioIE -from .mirrativ import ( - MirrativIE, - MirrativUserIE, -) -from .mit import TechTVMITIE, OCWMITIE -from .mitele import MiTeleIE -from .mixch import ( - MixchIE, - MixchArchiveIE, -) -from .mixcloud import ( - MixcloudIE, - MixcloudUserIE, - MixcloudPlaylistIE, -) -from .mlb import ( - MLBIE, - MLBVideoIE, -) -from .mlssoccer import MLSSoccerIE -from .mnet import MnetIE -from .moevideo import MoeVideoIE -from .mofosex import ( - MofosexIE, - MofosexEmbedIE, -) 
-from .mojvideo import MojvideoIE -from .morningstar import MorningstarIE -from .motherless import ( - MotherlessIE, - MotherlessGroupIE -) -from .motorsport import MotorsportIE -from .movieclips import MovieClipsIE -from .moviezine import MoviezineIE -from .movingimage import MovingImageIE -from .msn import MSNIE -from .mtv import ( - MTVIE, - MTVVideoIE, - MTVServicesEmbeddedIE, - MTVDEIE, - MTVJapanIE, - MTVItaliaIE, - MTVItaliaProgrammaIE, -) -from .muenchentv import MuenchenTVIE -from .murrtube import MurrtubeIE, MurrtubeUserIE -from .musescore import MuseScoreIE -from .musicdex import ( - MusicdexSongIE, - MusicdexAlbumIE, - MusicdexArtistIE, - MusicdexPlaylistIE, -) -from .mwave import MwaveIE, MwaveMeetGreetIE -from .mxplayer import ( - MxplayerIE, - MxplayerShowIE, -) -from .mychannels import MyChannelsIE -from .myspace import MySpaceIE, MySpaceAlbumIE -from .myspass import MySpassIE -from .myvi import ( - MyviIE, - MyviEmbedIE, -) -from .myvideoge import MyVideoGeIE -from .myvidster import MyVidsterIE -from .n1 import ( - N1InfoAssetIE, - N1InfoIIE, -) -from .nate import ( - NateIE, - NateProgramIE, -) -from .nationalgeographic import ( - NationalGeographicVideoIE, - NationalGeographicTVIE, -) -from .naver import ( - NaverIE, - NaverLiveIE, -) -from .nba import ( - NBAWatchEmbedIE, - NBAWatchIE, - NBAWatchCollectionIE, - NBAEmbedIE, - NBAIE, - NBAChannelIE, -) -from .nbc import ( - NBCIE, - NBCNewsIE, - NBCOlympicsIE, - NBCOlympicsStreamIE, - NBCSportsIE, - NBCSportsStreamIE, - NBCSportsVPlayerIE, -) -from .ndr import ( - NDRIE, - NJoyIE, - NDREmbedBaseIE, - NDREmbedIE, - NJoyEmbedIE, -) -from .ndtv import NDTVIE -from .nebula import ( - NebulaIE, - NebulaCollectionIE, -) -from .nerdcubed import NerdCubedFeedIE -from .netzkino import NetzkinoIE -from .neteasemusic import ( - NetEaseMusicIE, - NetEaseMusicAlbumIE, - NetEaseMusicSingerIE, - NetEaseMusicListIE, - NetEaseMusicMvIE, - NetEaseMusicProgramIE, - NetEaseMusicDjRadioIE, -) -from .newgrounds import 
( - NewgroundsIE, - NewgroundsPlaylistIE, - NewgroundsUserIE, -) -from .newstube import NewstubeIE -from .newsy import NewsyIE -from .nextmedia import ( - NextMediaIE, - NextMediaActionNewsIE, - AppleDailyIE, - NextTVIE, -) -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .nfb import NFBIE -from .nfhsnetwork import NFHSNetworkIE -from .nfl import ( - NFLIE, - NFLArticleIE, -) -from .nhk import ( - NhkVodIE, - NhkVodProgramIE, - NhkForSchoolBangumiIE, - NhkForSchoolSubjectIE, - NhkForSchoolProgramListIE, -) -from .nhl import NHLIE -from .nick import ( - NickIE, - NickBrIE, - NickDeIE, - NickNightIE, - NickRuIE, -) -from .niconico import ( - NiconicoIE, - NiconicoPlaylistIE, - NiconicoUserIE, - NiconicoSeriesIE, - NiconicoHistoryIE, - NicovideoSearchDateIE, - NicovideoSearchIE, - NicovideoSearchURLIE, - NicovideoTagURLIE, -) -from .ninecninemedia import ( - NineCNineMediaIE, - CPTwentyFourIE, -) -from .ninegag import NineGagIE -from .ninenow import NineNowIE -from .nintendo import NintendoIE -from .nitter import NitterIE -from .njpwworld import NJPWWorldIE -from .nobelprize import NobelPrizeIE -from .nonktube import NonkTubeIE -from .noodlemagazine import NoodleMagazineIE -from .noovo import NoovoIE -from .normalboots import NormalbootsIE -from .nosvideo import NosVideoIE -from .nova import ( - NovaEmbedIE, - NovaIE, -) -from .novaplay import NovaPlayIE -from .nowness import ( - NownessIE, - NownessPlaylistIE, - NownessSeriesIE, -) -from .noz import NozIE -from .npo import ( - AndereTijdenIE, - NPOIE, - NPOLiveIE, - NPORadioIE, - NPORadioFragmentIE, - SchoolTVIE, - HetKlokhuisIE, - VPROIE, - WNLIE, -) -from .npr import NprIE -from .nrk import ( - NRKIE, - NRKPlaylistIE, - NRKSkoleIE, - NRKTVIE, - NRKTVDirekteIE, - NRKRadioPodkastIE, - NRKTVEpisodeIE, - NRKTVEpisodesIE, - NRKTVSeasonIE, - NRKTVSeriesIE, -) -from .nrl import NRLTVIE -from .ntvcojp import NTVCoJpCUIE -from .ntvde import NTVDeIE -from .ntvru import NTVRuIE -from .nytimes import ( - NYTimesIE, - 
NYTimesArticleIE, - NYTimesCookingIE, -) -from .nuvid import NuvidIE -from .nzherald import NZHeraldIE -from .nzz import NZZIE -from .odatv import OdaTVIE -from .odnoklassniki import OdnoklassnikiIE -from .oktoberfesttv import OktoberfestTVIE -from .olympics import OlympicsReplayIE -from .on24 import On24IE -from .ondemandkorea import OnDemandKoreaIE -from .onefootball import OneFootballIE -from .onet import ( - OnetIE, - OnetChannelIE, - OnetMVPIE, - OnetPlIE, -) -from .onionstudios import OnionStudiosIE -from .ooyala import ( - OoyalaIE, - OoyalaExternalIE, -) -from .opencast import ( - OpencastIE, - OpencastPlaylistIE, -) -from .openrec import ( - OpenRecIE, - OpenRecCaptureIE, - OpenRecMovieIE, -) -from .ora import OraTVIE -from .orf import ( - ORFTVthekIE, - ORFFM4IE, - ORFFM4StoryIE, - ORFOE1IE, - ORFOE3IE, - ORFNOEIE, - ORFWIEIE, - ORFBGLIE, - ORFOOEIE, - ORFSTMIE, - ORFKTNIE, - ORFSBGIE, - ORFTIRIE, - ORFVBGIE, - ORFIPTVIE, -) -from .outsidetv import OutsideTVIE -from .packtpub import ( - PacktPubIE, - PacktPubCourseIE, -) -from .palcomp3 import ( - PalcoMP3IE, - PalcoMP3ArtistIE, - PalcoMP3VideoIE, -) -from .pandoratv import PandoraTVIE -from .panopto import ( - PanoptoIE, - PanoptoListIE, - PanoptoPlaylistIE -) -from .paramountplus import ( - ParamountPlusIE, - ParamountPlusSeriesIE, -) -from .parliamentliveuk import ParliamentLiveUKIE -from .parlview import ParlviewIE -from .patreon import ( - PatreonIE, - PatreonUserIE -) -from .pbs import PBSIE -from .pearvideo import PearVideoIE -from .peekvids import PeekVidsIE, PlayVidsIE -from .peertube import ( - PeerTubeIE, - PeerTubePlaylistIE, -) -from .peertv import PeerTVIE -from .peloton import ( - PelotonIE, - PelotonLiveIE -) -from .people import PeopleIE -from .performgroup import PerformGroupIE -from .periscope import ( - PeriscopeIE, - PeriscopeUserIE, -) -from .philharmoniedeparis import PhilharmonieDeParisIE -from .phoenix import PhoenixIE -from .photobucket import PhotobucketIE -from .piapro import 
PiaproIE -from .picarto import ( - PicartoIE, - PicartoVodIE, -) -from .piksel import PikselIE -from .pinkbike import PinkbikeIE -from .pinterest import ( - PinterestIE, - PinterestCollectionIE, -) -from .pixivsketch import ( - PixivSketchIE, - PixivSketchUserIE, -) -from .pladform import PladformIE -from .planetmarathi import PlanetMarathiIE -from .platzi import ( - PlatziIE, - PlatziCourseIE, -) -from .playfm import PlayFMIE -from .playplustv import PlayPlusTVIE -from .plays import PlaysTVIE -from .playstuff import PlayStuffIE -from .playtvak import PlaytvakIE -from .playvid import PlayvidIE -from .playwire import PlaywireIE -from .plutotv import PlutoTVIE -from .pluralsight import ( - PluralsightIE, - PluralsightCourseIE, -) -from .podomatic import PodomaticIE -from .pokemon import ( - PokemonIE, - PokemonWatchIE, - PokemonSoundLibraryIE, -) -from .pokergo import ( - PokerGoIE, - PokerGoCollectionIE, -) -from .polsatgo import PolsatGoIE -from .polskieradio import ( - PolskieRadioIE, - PolskieRadioCategoryIE, - PolskieRadioPlayerIE, - PolskieRadioPodcastIE, - PolskieRadioPodcastListIE, - PolskieRadioRadioKierowcowIE, -) -from .popcorntimes import PopcorntimesIE -from .popcorntv import PopcornTVIE -from .porn91 import Porn91IE -from .porncom import PornComIE -from .pornflip import PornFlipIE -from .pornhd import PornHdIE -from .pornhub import ( - PornHubIE, - PornHubUserIE, - PornHubPlaylistIE, - PornHubPagedVideoListIE, - PornHubUserVideosUploadIE, -) -from .pornotube import PornotubeIE -from .pornovoisines import PornoVoisinesIE -from .pornoxo import PornoXOIE -from .pornez import PornezIE -from .puhutv import ( - PuhuTVIE, - PuhuTVSerieIE, -) -from .presstv import PressTVIE -from .projectveritas import ProjectVeritasIE -from .prosiebensat1 import ProSiebenSat1IE -from .prx import ( - PRXStoryIE, - PRXSeriesIE, - PRXAccountIE, - PRXStoriesSearchIE, - PRXSeriesSearchIE -) -from .puls4 import Puls4IE -from .pyvideo import PyvideoIE -from .qqmusic import ( - 
QQMusicIE, - QQMusicSingerIE, - QQMusicAlbumIE, - QQMusicToplistIE, - QQMusicPlaylistIE, -) -from .r7 import ( - R7IE, - R7ArticleIE, -) -from .radiko import RadikoIE, RadikoRadioIE -from .radiocanada import ( - RadioCanadaIE, - RadioCanadaAudioVideoIE, -) -from .radiode import RadioDeIE -from .radiojavan import RadioJavanIE -from .radiobremen import RadioBremenIE -from .radiofrance import RadioFranceIE -from .radiozet import RadioZetPodcastIE -from .radiokapital import ( - RadioKapitalIE, - RadioKapitalShowIE, -) -from .radlive import ( - RadLiveIE, - RadLiveChannelIE, - RadLiveSeasonIE, -) -from .rai import ( - RaiPlayIE, - RaiPlayLiveIE, - RaiPlayPlaylistIE, - RaiPlaySoundIE, - RaiPlaySoundLiveIE, - RaiPlaySoundPlaylistIE, - RaiIE, -) -from .raywenderlich import ( - RayWenderlichIE, - RayWenderlichCourseIE, -) -from .rbmaradio import RBMARadioIE -from .rcs import ( - RCSIE, - RCSEmbedsIE, - RCSVariousIE, -) -from .rcti import ( - RCTIPlusIE, - RCTIPlusSeriesIE, - RCTIPlusTVIE, -) -from .rds import RDSIE -from .redbulltv import ( - RedBullTVIE, - RedBullEmbedIE, - RedBullTVRrnContentIE, - RedBullIE, -) -from .reddit import RedditIE -from .redgifs import ( - RedGifsIE, - RedGifsSearchIE, - RedGifsUserIE, -) -from .redtube import RedTubeIE -from .regiotv import RegioTVIE -from .rentv import ( - RENTVIE, - RENTVArticleIE, -) -from .restudy import RestudyIE -from .reuters import ReutersIE -from .reverbnation import ReverbNationIE -from .rice import RICEIE -from .rmcdecouverte import RMCDecouverteIE -from .rockstargames import RockstarGamesIE -from .rokfin import ( - RokfinIE, - RokfinStackIE, - RokfinChannelIE, -) -from .roosterteeth import RoosterTeethIE, RoosterTeethSeriesIE -from .rottentomatoes import RottenTomatoesIE -from .rozhlas import RozhlasIE -from .rtbf import RTBFIE -from .rte import RteIE, RteRadioIE -from .rtlnl import RtlNlIE -from .rtl2 import ( - RTL2IE, - RTL2YouIE, - RTL2YouSeriesIE, -) -from .rtnews import ( - RTNewsIE, - RTDocumentryIE, - 
RTDocumentryPlaylistIE, - RuptlyIE, -) -from .rtp import RTPIE -from .rtrfm import RTRFMIE -from .rts import RTSIE -from .rtve import ( - RTVEALaCartaIE, - RTVEAudioIE, - RTVELiveIE, - RTVEInfantilIE, - RTVETelevisionIE, -) -from .rtvnh import RTVNHIE -from .rtvs import RTVSIE -from .ruhd import RUHDIE -from .rule34video import Rule34VideoIE -from .rumble import ( - RumbleEmbedIE, - RumbleChannelIE, -) -from .rutube import ( - RutubeIE, - RutubeChannelIE, - RutubeEmbedIE, - RutubeMovieIE, - RutubePersonIE, - RutubePlaylistIE, - RutubeTagsIE, -) -from .glomex import ( - GlomexIE, - GlomexEmbedIE, -) -from .megatvcom import ( - MegaTVComIE, - MegaTVComEmbedIE, -) -from .ant1newsgr import ( - Ant1NewsGrWatchIE, - Ant1NewsGrArticleIE, - Ant1NewsGrEmbedIE, -) -from .rutv import RUTVIE -from .ruutu import RuutuIE -from .ruv import ( - RuvIE, - RuvSpilaIE -) -from .safari import ( - SafariIE, - SafariApiIE, - SafariCourseIE, -) -from .saitosan import SaitosanIE -from .samplefocus import SampleFocusIE -from .sapo import SapoIE -from .savefrom import SaveFromIE -from .sbs import SBSIE -from .screencast import ScreencastIE -from .screencastomatic import ScreencastOMaticIE -from .scrippsnetworks import ( - ScrippsNetworksWatchIE, - ScrippsNetworksIE, -) -from .scte import ( - SCTEIE, - SCTECourseIE, -) -from .seeker import SeekerIE -from .senategov import SenateISVPIE, SenateGovIE -from .sendtonews import SendtoNewsIE -from .servus import ServusIE -from .sevenplus import SevenPlusIE -from .sexu import SexuIE -from .seznamzpravy import ( - SeznamZpravyIE, - SeznamZpravyArticleIE, -) -from .shahid import ( - ShahidIE, - ShahidShowIE, -) -from .shared import ( - SharedIE, - VivoIE, -) -from .shemaroome import ShemarooMeIE -from .showroomlive import ShowRoomLiveIE -from .simplecast import ( - SimplecastIE, - SimplecastEpisodeIE, - SimplecastPodcastIE, -) -from .sina import SinaIE -from .sixplay import SixPlayIE -from .skeb import SkebIE -from .skyit import ( - SkyItPlayerIE, - 
SkyItVideoIE, - SkyItVideoLiveIE, - SkyItIE, - SkyItAcademyIE, - SkyItArteIE, - CieloTVItIE, - TV8ItIE, -) -from .skylinewebcams import SkylineWebcamsIE -from .skynewsarabia import ( - SkyNewsArabiaIE, - SkyNewsArabiaArticleIE, -) -from .skynewsau import SkyNewsAUIE -from .sky import ( - SkyNewsIE, - SkyNewsStoryIE, - SkySportsIE, - SkySportsNewsIE, -) -from .slideshare import SlideshareIE -from .slideslive import SlidesLiveIE -from .slutload import SlutloadIE -from .snotr import SnotrIE -from .sohu import SohuIE -from .sonyliv import ( - SonyLIVIE, - SonyLIVSeriesIE, -) -from .soundcloud import ( - SoundcloudEmbedIE, - SoundcloudIE, - SoundcloudSetIE, - SoundcloudRelatedIE, - SoundcloudUserIE, - SoundcloudTrackStationIE, - SoundcloudPlaylistIE, - SoundcloudSearchIE, -) -from .soundgasm import ( - SoundgasmIE, - SoundgasmProfileIE -) -from .southpark import ( - SouthParkIE, - SouthParkDeIE, - SouthParkDkIE, - SouthParkEsIE, - SouthParkNlIE -) -from .sovietscloset import ( - SovietsClosetIE, - SovietsClosetPlaylistIE -) -from .spankbang import ( - SpankBangIE, - SpankBangPlaylistIE, -) -from .spankwire import SpankwireIE -from .spiegel import SpiegelIE -from .spike import ( - BellatorIE, - ParamountNetworkIE, -) -from .stitcher import ( - StitcherIE, - StitcherShowIE, -) -from .sport5 import Sport5IE -from .sportbox import SportBoxIE -from .sportdeutschland import SportDeutschlandIE -from .spotify import ( - SpotifyIE, - SpotifyShowIE, -) -from .spreaker import ( - SpreakerIE, - SpreakerPageIE, - SpreakerShowIE, - SpreakerShowPageIE, -) -from .springboardplatform import SpringboardPlatformIE -from .sprout import SproutIE -from .srgssr import ( - SRGSSRIE, - SRGSSRPlayIE, -) -from .srmediathek import SRMediathekIE -from .stanfordoc import StanfordOpenClassroomIE -from .startv import StarTVIE -from .steam import SteamIE -from .storyfire import ( - StoryFireIE, - StoryFireUserIE, - StoryFireSeriesIE, -) -from .streamable import StreamableIE -from .streamanity import 
StreamanityIE -from .streamcloud import StreamcloudIE -from .streamcz import StreamCZIE -from .streamff import StreamFFIE -from .streetvoice import StreetVoiceIE -from .stretchinternet import StretchInternetIE -from .stripchat import StripchatIE -from .stv import STVPlayerIE -from .sunporno import SunPornoIE -from .sverigesradio import ( - SverigesRadioEpisodeIE, - SverigesRadioPublicationIE, -) -from .svt import ( - SVTIE, - SVTPageIE, - SVTPlayIE, - SVTSeriesIE, -) -from .swrmediathek import SWRMediathekIE -from .syfy import SyfyIE -from .sztvhu import SztvHuIE -from .tagesschau import TagesschauIE -from .tass import TassIE -from .tbs import TBSIE -from .tdslifeway import TDSLifewayIE -from .teachable import ( - TeachableIE, - TeachableCourseIE, -) -from .teachertube import ( - TeacherTubeIE, - TeacherTubeUserIE, -) -from .teachingchannel import TeachingChannelIE -from .teamcoco import TeamcocoIE -from .teamtreehouse import TeamTreeHouseIE -from .techtalks import TechTalksIE -from .ted import ( - TedEmbedIE, - TedPlaylistIE, - TedSeriesIE, - TedTalkIE, -) -from .tele5 import Tele5IE -from .tele13 import Tele13IE -from .telebruxelles import TeleBruxellesIE -from .telecinco import TelecincoIE -from .telegraaf import TelegraafIE -from .telegram import TelegramEmbedIE -from .telemb import TeleMBIE -from .telemundo import TelemundoIE -from .telequebec import ( - TeleQuebecIE, - TeleQuebecSquatIE, - TeleQuebecEmissionIE, - TeleQuebecLiveIE, - TeleQuebecVideoIE, -) -from .teletask import TeleTaskIE -from .telewebion import TelewebionIE -from .tennistv import TennisTVIE -from .tenplay import TenPlayIE -from .testurl import TestURLIE -from .tf1 import TF1IE -from .tfo import TFOIE -from .theintercept import TheInterceptIE -from .theplatform import ( - ThePlatformIE, - ThePlatformFeedIE, -) -from .thestar import TheStarIE -from .thesun import TheSunIE -from .theta import ( - ThetaVideoIE, - ThetaStreamIE, -) -from .theweatherchannel import TheWeatherChannelIE -from 
.thisamericanlife import ThisAmericanLifeIE -from .thisav import ThisAVIE -from .thisoldhouse import ThisOldHouseIE -from .threespeak import ( - ThreeSpeakIE, - ThreeSpeakUserIE, -) -from .threeqsdn import ThreeQSDNIE -from .tiktok import ( - TikTokIE, - TikTokUserIE, - TikTokSoundIE, - TikTokEffectIE, - TikTokTagIE, - TikTokVMIE, - DouyinIE, -) -from .tinypic import TinyPicIE -from .tmz import TMZIE -from .tnaflix import ( - TNAFlixNetworkEmbedIE, - TNAFlixIE, - EMPFlixIE, - MovieFapIE, -) -from .toggle import ( - ToggleIE, - MeWatchIE, -) -from .toggo import ( - ToggoIE, -) -from .tokentube import ( - TokentubeIE, - TokentubeChannelIE -) -from .tonline import TOnlineIE -from .toongoggles import ToonGogglesIE -from .toutv import TouTvIE -from .toypics import ToypicsUserIE, ToypicsIE -from .traileraddict import TrailerAddictIE -from .trilulilu import TriluliluIE -from .trovo import ( - TrovoIE, - TrovoVodIE, - TrovoChannelVodIE, - TrovoChannelClipIE, -) -from .trueid import TrueIDIE -from .trunews import TruNewsIE -from .trutv import TruTVIE -from .tube8 import Tube8IE -from .tubitv import ( - TubiTvIE, - TubiTvShowIE, -) -from .tumblr import TumblrIE -from .tunein import ( - TuneInClipIE, - TuneInStationIE, - TuneInProgramIE, - TuneInTopicIE, - TuneInShortenerIE, -) -from .tunepk import TunePkIE -from .turbo import TurboIE -from .tv2 import ( - TV2IE, - TV2ArticleIE, - KatsomoIE, - MTVUutisetArticleIE, -) -from .tv2dk import ( - TV2DKIE, - TV2DKBornholmPlayIE, -) -from .tv2hu import ( - TV2HuIE, - TV2HuSeriesIE, -) -from .tv4 import TV4IE -from .tv5mondeplus import TV5MondePlusIE -from .tv5unis import ( - TV5UnisVideoIE, - TV5UnisIE, -) -from .tva import ( - TVAIE, - QubIE, -) -from .tvanouvelles import ( - TVANouvellesIE, - TVANouvellesArticleIE, -) -from .tvc import ( - TVCIE, - TVCArticleIE, -) -from .tver import TVerIE -from .tvigle import TvigleIE -from .tvland import TVLandIE -from .tvn24 import TVN24IE -from .tvnet import TVNetIE -from .tvnoe import TVNoeIE 
-from .tvnow import ( - TVNowIE, - TVNowFilmIE, - TVNowNewIE, - TVNowSeasonIE, - TVNowAnnualIE, - TVNowShowIE, -) -from .tvopengr import ( - TVOpenGrWatchIE, - TVOpenGrEmbedIE, -) -from .tvp import ( - TVPEmbedIE, - TVPIE, - TVPStreamIE, - TVPWebsiteIE, -) -from .tvplay import ( - TVPlayIE, - ViafreeIE, - TVPlayHomeIE, -) -from .tvplayer import TVPlayerIE -from .tweakers import TweakersIE -from .twentyfourvideo import TwentyFourVideoIE -from .twentymin import TwentyMinutenIE -from .twentythreevideo import TwentyThreeVideoIE -from .twitcasting import ( - TwitCastingIE, - TwitCastingLiveIE, - TwitCastingUserIE, -) -from .twitch import ( - TwitchVodIE, - TwitchCollectionIE, - TwitchVideosIE, - TwitchVideosClipsIE, - TwitchVideosCollectionsIE, - TwitchStreamIE, - TwitchClipsIE, -) -from .twitter import ( - TwitterCardIE, - TwitterIE, - TwitterAmplifyIE, - TwitterBroadcastIE, - TwitterShortenerIE, -) -from .udemy import ( - UdemyIE, - UdemyCourseIE -) -from .udn import UDNEmbedIE -from .ufctv import ( - UFCTVIE, - UFCArabiaIE, -) -from .ukcolumn import UkColumnIE -from .uktvplay import UKTVPlayIE -from .digiteka import DigitekaIE -from .dlive import ( - DLiveVODIE, - DLiveStreamIE, -) -from .drooble import DroobleIE -from .umg import UMGDeIE -from .unistra import UnistraIE -from .unity import UnityIE -from .uol import UOLIE -from .uplynk import ( - UplynkIE, - UplynkPreplayIE, -) -from .urort import UrortIE -from .urplay import URPlayIE -from .usanetwork import USANetworkIE -from .usatoday import USATodayIE -from .ustream import UstreamIE, UstreamChannelIE -from .ustudio import ( - UstudioIE, - UstudioEmbedIE, -) -from .utreon import UtreonIE -from .varzesh3 import Varzesh3IE -from .vbox7 import Vbox7IE -from .veehd import VeeHDIE -from .veo import VeoIE -from .veoh import VeohIE -from .vesti import VestiIE -from .vevo import ( - VevoIE, - VevoPlaylistIE, -) -from .vgtv import ( - BTArticleIE, - BTVestlendingenIE, - VGTVIE, -) -from .vh1 import VH1IE -from .vice import 
( - ViceIE, - ViceArticleIE, - ViceShowIE, -) -from .vidbit import VidbitIE -from .viddler import ViddlerIE -from .videa import VideaIE -from .videocampus_sachsen import ( - VideocampusSachsenIE, - VideocampusSachsenEmbedIE, -) -from .videodetective import VideoDetectiveIE -from .videofyme import VideofyMeIE -from .videomore import ( - VideomoreIE, - VideomoreVideoIE, - VideomoreSeasonIE, -) -from .videopress import VideoPressIE -from .vidio import ( - VidioIE, - VidioPremierIE, - VidioLiveIE -) -from .vidlii import VidLiiIE -from .vier import VierIE, VierVideosIE -from .viewlift import ( - ViewLiftIE, - ViewLiftEmbedIE, -) -from .viidea import ViideaIE -from .vimeo import ( - VimeoIE, - VimeoAlbumIE, - VimeoChannelIE, - VimeoGroupsIE, - VimeoLikesIE, - VimeoOndemandIE, - VimeoReviewIE, - VimeoUserIE, - VimeoWatchLaterIE, - VHXEmbedIE, -) -from .vimm import ( - VimmIE, - VimmRecordingIE, -) -from .vimple import VimpleIE -from .vine import ( - VineIE, - VineUserIE, -) -from .viki import ( - VikiIE, - VikiChannelIE, -) -from .viqeo import ViqeoIE -from .viu import ( - ViuIE, - ViuPlaylistIE, - ViuOTTIE, -) -from .vk import ( - VKIE, - VKUserVideosIE, - VKWallPostIE, -) -from .vlive import ( - VLiveIE, - VLivePostIE, - VLiveChannelIE, -) -from .vodlocker import VodlockerIE -from .vodpl import VODPlIE -from .vodplatform import VODPlatformIE -from .voicerepublic import VoiceRepublicIE -from .voicy import ( - VoicyIE, - VoicyChannelIE, -) -from .voot import ( - VootIE, - VootSeriesIE, -) -from .voxmedia import ( - VoxMediaVolumeIE, - VoxMediaIE, -) -from .vrt import VRTIE -from .vrak import VrakIE -from .vrv import ( - VRVIE, - VRVSeriesIE, -) -from .vshare import VShareIE -from .vtm import VTMIE -from .medialaan import MedialaanIE -from .vuclip import VuClipIE -from .vupload import VuploadIE -from .vvvvid import ( - VVVVIDIE, - VVVVIDShowIE, -) -from .vyborymos import VyboryMosIE -from .vzaar import VzaarIE -from .wakanim import WakanimIE -from .walla import WallaIE 
-from .washingtonpost import ( - WashingtonPostIE, - WashingtonPostArticleIE, -) -from .wasdtv import ( - WASDTVStreamIE, - WASDTVRecordIE, - WASDTVClipIE, -) -from .wat import WatIE -from .watchbox import WatchBoxIE -from .watchindianporn import WatchIndianPornIE -from .wdr import ( - WDRIE, - WDRPageIE, - WDRElefantIE, - WDRMobileIE, -) -from .webcaster import ( - WebcasterIE, - WebcasterFeedIE, -) -from .webofstories import ( - WebOfStoriesIE, - WebOfStoriesPlaylistIE, -) -from .weibo import ( - WeiboIE, - WeiboMobileIE -) -from .weiqitv import WeiqiTVIE -from .willow import WillowIE -from .wimtv import WimTVIE -from .whowatch import WhoWatchIE -from .wistia import ( - WistiaIE, - WistiaPlaylistIE, -) -from .worldstarhiphop import WorldStarHipHopIE -from .wppilot import ( - WPPilotIE, - WPPilotChannelsIE, -) -from .wsj import ( - WSJIE, - WSJArticleIE, -) -from .wwe import WWEIE -from .xbef import XBefIE -from .xboxclips import XboxClipsIE -from .xfileshare import XFileShareIE -from .xhamster import ( - XHamsterIE, - XHamsterEmbedIE, - XHamsterUserIE, -) -from .xiami import ( - XiamiSongIE, - XiamiAlbumIE, - XiamiArtistIE, - XiamiCollectionIE -) -from .ximalaya import ( - XimalayaIE, - XimalayaAlbumIE -) -from .xinpianchang import XinpianchangIE -from .xminus import XMinusIE -from .xnxx import XNXXIE -from .xstream import XstreamIE -from .xtube import XTubeUserIE, XTubeIE -from .xuite import XuiteIE -from .xvideos import XVideosIE -from .xxxymovies import XXXYMoviesIE -from .yahoo import ( - YahooIE, - YahooSearchIE, - YahooGyaOPlayerIE, - YahooGyaOIE, - YahooJapanNewsIE, -) -from .yandexdisk import YandexDiskIE -from .yandexmusic import ( - YandexMusicTrackIE, - YandexMusicAlbumIE, - YandexMusicPlaylistIE, - YandexMusicArtistTracksIE, - YandexMusicArtistAlbumsIE, -) -from .yandexvideo import ( - YandexVideoIE, - YandexVideoPreviewIE, - ZenYandexIE, - ZenYandexChannelIE, -) -from .yapfiles import YapFilesIE -from .yesjapan import YesJapanIE -from .yinyuetai 
import YinYueTaiIE -from .ynet import YnetIE -from .youjizz import YouJizzIE -from .youku import ( - YoukuIE, - YoukuShowIE, -) -from .younow import ( - YouNowLiveIE, - YouNowChannelIE, - YouNowMomentIE, -) -from .youporn import YouPornIE -from .yourporn import YourPornIE -from .yourupload import YourUploadIE -from .youtube import ( - YoutubeIE, - YoutubeClipIE, - YoutubeFavouritesIE, - YoutubeHistoryIE, - YoutubeTabIE, - YoutubeLivestreamEmbedIE, - YoutubePlaylistIE, - YoutubeRecommendedIE, - YoutubeSearchDateIE, - YoutubeSearchIE, - YoutubeSearchURLIE, - YoutubeMusicSearchURLIE, - YoutubeSubscriptionsIE, - YoutubeTruncatedIDIE, - YoutubeTruncatedURLIE, - YoutubeYtBeIE, - YoutubeYtUserIE, - YoutubeWatchLaterIE, -) -from .zapiks import ZapiksIE -from .zattoo import ( - BBVTVIE, - EinsUndEinsTVIE, - EWETVIE, - GlattvisionTVIE, - MNetTVIE, - MyVisionTVIE, - NetPlusIE, - OsnatelTVIE, - QuantumTVIE, - QuicklineIE, - QuicklineLiveIE, - SaltTVIE, - SAKTVIE, - VTXTVIE, - WalyTVIE, - ZattooIE, - ZattooLiveIE, -) -from .zdf import ZDFIE, ZDFChannelIE -from .zee5 import ( - Zee5IE, - Zee5SeriesIE, -) -from .zhihu import ZhihuIE -from .zingmp3 import ( - ZingMp3IE, - ZingMp3AlbumIE, -) -from .zoom import ZoomIE -from .zype import ZypeIE +if not _LAZY_LOADER: + from ._extractors import * # noqa: F403 + _ALL_CLASSES = [ # noqa: F811 + klass + for name, klass in globals().items() + if name.endswith('IE') and name != 'GenericIE' + ] + _ALL_CLASSES.append(GenericIE) # noqa: F405 + +globals().update(_PLUGIN_CLASSES) +_ALL_CLASSES[:0] = _PLUGIN_CLASSES.values() diff --git a/hypervideo_dl/extractor/extremetube.py b/hypervideo_dl/extractor/extremetube.py index acd4090..2c19698 100644 --- a/hypervideo_dl/extractor/extremetube.py +++ b/hypervideo_dl/extractor/extremetube.py @@ -1,10 +1,8 @@ -from __future__ import unicode_literals - from ..utils import str_to_int from .keezmovies import KeezMoviesIE -class ExtremeTubeIE(KeezMoviesIE): +class ExtremeTubeIE(KeezMoviesIE): # XXX: Do not 
subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P<id>[^/#?&]+)' _TESTS = [{ 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', diff --git a/hypervideo_dl/extractor/eyedotv.py b/hypervideo_dl/extractor/eyedotv.py index f62ddeb..d8b068e 100644 --- a/hypervideo_dl/extractor/eyedotv.py +++ b/hypervideo_dl/extractor/eyedotv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( xpath_text, diff --git a/hypervideo_dl/extractor/facebook.py b/hypervideo_dl/extractor/facebook.py index 022ea85..a58d9c8 100644 --- a/hypervideo_dl/extractor/facebook.py +++ b/hypervideo_dl/extractor/facebook.py @@ -1,21 +1,18 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re +import urllib.parse from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, compat_str, compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, ) from ..utils import ( + ExtractorError, clean_html, determine_ext, error_to_compat_str, - ExtractorError, float_or_none, get_element_by_id, get_first, @@ -60,6 +57,13 @@ class FacebookIE(InfoExtractor): ) (?P<id>[0-9]+) ''' + _EMBED_REGEX = [ + r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1', + # Facebook API embed https://developers.facebook.com/docs/plugins/embedded-video-player + r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+ + data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', + ] _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1' _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1' _NETRC_MACHINE = 'facebook' @@ -314,21 +318,6 @@ class FacebookIE(InfoExtractor): 'graphURI': 
'/api/graphql/' } - @staticmethod - def _extract_urls(webpage): - urls = [] - for mobj in re.finditer( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1', - webpage): - urls.append(mobj.group('url')) - # Facebook API embed - # see https://developers.facebook.com/docs/plugins/embedded-video-player - for mobj in re.finditer(r'''(?x)<div[^>]+ - class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+ - data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage): - urls.append(mobj.group('url')) - return urls - def _perform_login(self, username, password): login_page_req = sanitized_Request(self._LOGIN_URL) self._set_cookie('facebook.com', 'locale', 'en_US') @@ -397,10 +386,8 @@ class FacebookIE(InfoExtractor): r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)] post = traverse_obj(post_data, ( ..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] - media = traverse_obj( - post, - (..., 'attachments', ..., 'media', lambda _, m: str(m['id']) == video_id and m['__typename'] == 'Video'), - expected_type=dict) + media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: ( + k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict) title = get_first(media, ('title', 'text')) description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text')) uploader_data = get_first(media, 'owner') or get_first(post, ('node', 'actors', ...)) or {} @@ -472,15 +459,14 @@ class FacebookIE(InfoExtractor): dash_manifest = video.get('dash_manifest') if dash_manifest: formats.extend(self._parse_mpd_formats( - compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest)))) + compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)))) - def process_formats(formats): + def process_formats(info): # Downloads with browser's User-Agent are 
rate limited. Working around # with non-browser User-Agent. - for f in formats: + for f in info['formats']: f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' - - self._sort_formats(formats, ('res', 'quality')) + info['_format_sort_fields'] = ('res', 'quality') def extract_relay_data(_filter): return self._parse_json(self._search_regex( @@ -523,16 +509,17 @@ class FacebookIE(InfoExtractor): 'url': playable_url, }) extract_dash_manifest(video, formats) - process_formats(formats) v_id = video.get('videoId') or video.get('id') or video_id info = { 'id': v_id, 'formats': formats, - 'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']), + 'thumbnail': traverse_obj( + video, ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri')), 'uploader_id': try_get(video, lambda x: x['owner']['id']), 'timestamp': int_or_none(video.get('publish_time')), 'duration': float_or_none(video.get('playable_duration_in_ms'), 1000), } + process_formats(info) description = try_get(video, lambda x: x['savable_description']['text']) title = video.get('name') if title: @@ -699,13 +686,12 @@ class FacebookIE(InfoExtractor): if subtitles_src: subtitles.setdefault('en', []).append({'url': subtitles_src}) - process_formats(formats) - info_dict = { 'id': video_id, 'formats': formats, 'subtitles': subtitles, } + process_formats(info_dict) info_dict.update(extract_metadata(webpage)) return info_dict @@ -784,3 +770,30 @@ class FacebookRedirectURLIE(InfoExtractor): if not redirect_url: raise ExtractorError('Invalid facebook redirect URL', expected=True) return self.url_result(redirect_url) + + +class FacebookReelIE(InfoExtractor): + _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/reel/(?P<id>\d+)' + IE_NAME = 'facebook:reel' + + _TESTS = [{ + 'url': 'https://www.facebook.com/reel/1195289147628387', + 'md5': 'c4ff9a7182ff9ff7d6f7a83603bae831', + 'info_dict': { + 'id': '1195289147628387', + 'ext': 'mp4', + 'title': 'md5:9f5b142921b2dc57004fa13f76005f87', + 
'description': 'md5:24ea7ef062215d295bdde64e778f5474', + 'uploader': 'Beast Camp Training', + 'uploader_id': '1738535909799870', + 'duration': 9.536, + 'thumbnail': r're:^https?://.*', + 'upload_date': '20211121', + 'timestamp': 1637502604, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + f'https://m.facebook.com/watch/?v={video_id}&_rdr', FacebookIE, video_id) diff --git a/hypervideo_dl/extractor/fancode.py b/hypervideo_dl/extractor/fancode.py index 7ea16c6..1b5db81 100644 --- a/hypervideo_dl/extractor/fancode.py +++ b/hypervideo_dl/extractor/fancode.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str @@ -128,7 +125,7 @@ class FancodeVodIE(InfoExtractor): } -class FancodeLiveIE(FancodeVodIE): +class FancodeLiveIE(FancodeVodIE): # XXX: Do not subclass from concrete IE IE_NAME = 'fancode:live' _VALID_URL = r'https?://(www\.)?fancode\.com/match/(?P<id>[0-9]+).+' diff --git a/hypervideo_dl/extractor/faz.py b/hypervideo_dl/extractor/faz.py index 312ee2a..bca62ad 100644 --- a/hypervideo_dl/extractor/faz.py +++ b/hypervideo_dl/extractor/faz.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -81,7 +78,6 @@ class FazIE(InfoExtractor): 'tbr': tbr or int(mobj.group(3)), }) formats.append(f) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/fc2.py b/hypervideo_dl/extractor/fc2.py index 54a83aa..dd5e088 100644 --- a/hypervideo_dl/extractor/fc2.py +++ b/hypervideo_dl/extractor/fc2.py @@ -1,19 +1,13 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, -) +from ..compat import compat_parse_qs +from ..dependencies import websockets from ..utils import ( ExtractorError, WebSocketsWrapper, - has_websockets, js_to_json, 
sanitized_Request, - std_headers, traverse_obj, update_url_query, urlencode_postdata, @@ -84,7 +78,7 @@ class FC2IE(InfoExtractor): webpage = None if not url.startswith('fc2:'): webpage = self._download_webpage(url, video_id) - self._downloader.cookiejar.clear_session_cookies() # must clear + self.cookiejar.clear_session_cookies() # must clear self._login() title, thumbnail, description = None, None, None @@ -173,7 +167,7 @@ class FC2LiveIE(InfoExtractor): }] def _real_extract(self, url): - if not has_websockets: + if not websockets: raise ExtractorError('websockets library is not available. Please install it.', expected=True) video_id = self._match_id(url) webpage = self._download_webpage('https://live.fc2.com/%s/' % video_id, video_id) @@ -210,10 +204,10 @@ class FC2LiveIE(InfoExtractor): 'Cookie': str(self._get_cookies('https://live.fc2.com/'))[12:], 'Origin': 'https://live.fc2.com', 'Accept': '*/*', - 'User-Agent': std_headers['User-Agent'], + 'User-Agent': self.get_param('http_headers')['User-Agent'], }) - self.write_debug('[debug] Sending HLS server request') + self.write_debug('Sending HLS server request') while True: recv = ws.recv() @@ -235,13 +229,10 @@ class FC2LiveIE(InfoExtractor): if not data or not isinstance(data, dict): continue if data.get('name') == '_response_' and data.get('id') == 1: - self.write_debug('[debug] Goodbye.') + self.write_debug('Goodbye') playlist_data = data break - elif self._downloader.params.get('verbose', False): - if len(recv) > 100: - recv = recv[:100] + '...' - self.to_screen('[debug] Server said: %s' % recv) + self.write_debug('Server said: %s%s' % (recv[:100], '...' 
if len(recv) > 100 else '')) if not playlist_data: raise ExtractorError('Unable to fetch HLS playlist info via WebSocket') @@ -259,7 +250,6 @@ class FC2LiveIE(InfoExtractor): 'Referer': url, })) - self._sort_formats(formats) for fmt in formats: fmt.update({ 'protocol': 'fc2_live', diff --git a/hypervideo_dl/extractor/fczenit.py b/hypervideo_dl/extractor/fczenit.py index 8db7c59..8175b6b 100644 --- a/hypervideo_dl/extractor/fczenit.py +++ b/hypervideo_dl/extractor/fczenit.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -41,8 +38,6 @@ class FczenitIE(InfoExtractor): 'height': int_or_none(q.get('label')), } for q in msi_data['qualities'] if q.get('url')] - self._sort_formats(formats) - tags = [tag['label'] for tag in msi_data.get('tags', []) if tag.get('label')] return { diff --git a/hypervideo_dl/extractor/fifa.py b/hypervideo_dl/extractor/fifa.py new file mode 100644 index 0000000..dc00edc --- /dev/null +++ b/hypervideo_dl/extractor/fifa.py @@ -0,0 +1,94 @@ +from .common import InfoExtractor + +from ..utils import ( + int_or_none, + traverse_obj, + unified_timestamp, +) + + +class FifaIE(InfoExtractor): + _VALID_URL = r'https?://www.fifa.com/fifaplus/(?P<locale>\w{2})/watch/([^#?]+/)?(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://www.fifa.com/fifaplus/en/watch/7on10qPcnyLajDDU3ntg6y', + 'info_dict': { + 'id': '7on10qPcnyLajDDU3ntg6y', + 'title': 'Italy v France | Final | 2006 FIFA World Cup Germany™ | Full Match Replay', + 'description': 'md5:f4520d0ee80529c8ba4134a7d692ff8b', + 'ext': 'mp4', + 'categories': ['FIFA Tournaments'], + 'thumbnail': 'https://digitalhub.fifa.com/transform/fa6f0b3e-a2e9-4cf7-9f32-53c57bcb7360/2006_Final_ITA_FRA', + 'duration': 8165, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.fifa.com/fifaplus/pt/watch/1cg5r5Qt6Qt12ilkDgb1sV', + 'info_dict': { + 'id': '1cg5r5Qt6Qt12ilkDgb1sV', + 'title': 'Brazil v Germany | 
Semi-finals | 2014 FIFA World Cup Brazil™ | Extended Highlights', + 'description': 'md5:d908c74ee66322b804ae2e521b02a855', + 'ext': 'mp4', + 'categories': ['FIFA Tournaments', 'Highlights'], + 'thumbnail': 'https://digitalhub.fifa.com/transform/d8fe6f61-276d-4a73-a7fe-6878a35fd082/FIFAPLS_100EXTHL_2014BRAvGER_TMB', + 'duration': 902, + 'release_timestamp': 1404777600, + 'release_date': '20140708', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.fifa.com/fifaplus/fr/watch/3C6gQH9C2DLwzNx7BMRQdp', + 'info_dict': { + 'id': '3C6gQH9C2DLwzNx7BMRQdp', + 'title': 'Josimar goal against Northern Ireland | Classic Goals', + 'description': 'md5:cbe7e7bb52f603c9f1fe9a4780fe983b', + 'ext': 'mp4', + 'categories': ['FIFA Tournaments', 'Goal'], + 'duration': 28, + 'thumbnail': 'https://digitalhub.fifa.com/transform/f9301391-f8d9-48b5-823e-c093ac5e3e11/CG_MEN_1986_JOSIMAR', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id, locale = self._match_valid_url(url).group('id', 'locale') + webpage = self._download_webpage(url, video_id) + + preconnect_link = self._search_regex( + r'<link[^>]+rel\s*=\s*"preconnect"[^>]+href\s*=\s*"([^"]+)"', webpage, 'Preconnect Link') + + video_details = self._download_json( + f'{preconnect_link}/sections/videoDetails/{video_id}', video_id, 'Downloading Video Details', fatal=False) + + preplay_parameters = self._download_json( + f'{preconnect_link}/videoPlayerData/{video_id}', video_id, 'Downloading Preplay Parameters')['preplayParameters'] + + cid = preplay_parameters['contentId'] + content_data = self._download_json( + f'https://content.uplynk.com/preplay/{cid}/multiple.json', video_id, 'Downloading Content Data', query={ + 'v': preplay_parameters['preplayAPIVersion'], + 'tc': preplay_parameters['tokenCheckAlgorithmVersion'], + 'rn': preplay_parameters['randomNumber'], + 'exp': preplay_parameters['tokenExpirationDate'], + 'ct': preplay_parameters['contentType'], + 'cid': cid, + 
'mbtracks': preplay_parameters['tracksAssetNumber'], + 'ad': preplay_parameters['adConfiguration'], + 'ad.preroll': int(preplay_parameters['adPreroll']), + 'ad.cmsid': preplay_parameters['adCMSSourceId'], + 'ad.vid': preplay_parameters['adSourceVideoID'], + 'sig': preplay_parameters['signature'], + }) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(content_data['playURL'], video_id) + + return { + 'id': video_id, + 'title': video_details.get('title'), + 'description': video_details.get('description'), + 'duration': int_or_none(video_details.get('duration')), + 'release_timestamp': unified_timestamp(video_details.get('dateOfRelease')), + 'categories': traverse_obj(video_details, (('videoCategory', 'videoSubcategory'),)), + 'thumbnail': traverse_obj(video_details, ('backgroundImage', 'src')), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/filmmodu.py b/hypervideo_dl/extractor/filmmodu.py index 2746876..9eb550e 100644 --- a/hypervideo_dl/extractor/filmmodu.py +++ b/hypervideo_dl/extractor/filmmodu.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import int_or_none @@ -54,8 +51,6 @@ class FilmmoduIE(InfoExtractor): 'protocol': 'm3u8_native', } for source in data['sources']] - self._sort_formats(formats) - subtitles = {} if data.get('subtitle'): diff --git a/hypervideo_dl/extractor/filmon.py b/hypervideo_dl/extractor/filmon.py index 7b43ecc..9a93cb9 100644 --- a/hypervideo_dl/extractor/filmon.py +++ b/hypervideo_dl/extractor/filmon.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_str, @@ -68,7 +65,6 @@ class FilmOnIE(InfoExtractor): 'quality': QUALITY(stream.get('quality')), 'protocol': 'm3u8_native', }) - self._sort_formats(formats) thumbnails = [] poster = response.get('poster', {}) @@ -156,7 +152,6 @@ class FilmOnChannelIE(InfoExtractor): 
'ext': 'mp4', 'quality': QUALITY(quality), }) - self._sort_formats(formats) thumbnails = [] for name, width, height in self._THUMBNAIL_RES: diff --git a/hypervideo_dl/extractor/filmweb.py b/hypervideo_dl/extractor/filmweb.py index 5e323b4..cfea1f2 100644 --- a/hypervideo_dl/extractor/filmweb.py +++ b/hypervideo_dl/extractor/filmweb.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/firsttv.py b/hypervideo_dl/extractor/firsttv.py index ccad173..f74bd13 100644 --- a/hypervideo_dl/extractor/firsttv.py +++ b/hypervideo_dl/extractor/firsttv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_str, @@ -126,7 +123,6 @@ class FirstTVIE(InfoExtractor): % (path, m3u8_path), display_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) thumbnail = item.get('poster') or self._og_search_thumbnail(webpage) duration = int_or_none(item.get('duration') or self._html_search_meta( diff --git a/hypervideo_dl/extractor/fivemin.py b/hypervideo_dl/extractor/fivemin.py deleted file mode 100644 index f3f876e..0000000 --- a/hypervideo_dl/extractor/fivemin.py +++ /dev/null @@ -1,54 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class FiveMinIE(InfoExtractor): - IE_NAME = '5min' - _VALID_URL = r'(?:5min:|https?://(?:[^/]*?5min\.com/|delivery\.vidible\.tv/aol)(?:(?:Scripts/PlayerSeed\.js|playerseed/?)?\?.*?playList=)?)(?P<id>\d+)' - - _TESTS = [ - { - # From http://www.engadget.com/2013/11/15/ipad-mini-retina-display-review/ - 'url': 'http://pshared.5min.com/Scripts/PlayerSeed.js?sid=281&width=560&height=345&playList=518013791', - 'md5': '4f7b0b79bf1a470e5004f7112385941d', - 'info_dict': { - 'id': '518013791', - 'ext': 'mp4', - 'title': 'iPad Mini with Retina Display Review', - 'description': 'iPad mini with Retina Display review', - 
'duration': 177, - 'uploader': 'engadget', - 'upload_date': '20131115', - 'timestamp': 1384515288, - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, - { - # From http://on.aol.com/video/how-to-make-a-next-level-fruit-salad-518086247 - 'url': '5min:518086247', - 'md5': 'e539a9dd682c288ef5a498898009f69e', - 'info_dict': { - 'id': '518086247', - 'ext': 'mp4', - 'title': 'How to Make a Next-Level Fruit Salad', - 'duration': 184, - }, - 'skip': 'no longer available', - }, - { - 'url': 'http://embed.5min.com/518726732/', - 'only_matching': True, - }, - { - 'url': 'http://delivery.vidible.tv/aol?playList=518013791', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result('aol-video:%s' % video_id) diff --git a/hypervideo_dl/extractor/fivetv.py b/hypervideo_dl/extractor/fivetv.py index d6bebd1..1f48cfd 100644 --- a/hypervideo_dl/extractor/fivetv.py +++ b/hypervideo_dl/extractor/fivetv.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import int_or_none @@ -75,7 +71,7 @@ class FiveTVIE(InfoExtractor): r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'], webpage, 'video url') - title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage) + title = self._generic_title('', webpage) duration = int_or_none(self._og_search_property( 'video:duration', webpage, 'duration', default=None)) diff --git a/hypervideo_dl/extractor/flickr.py b/hypervideo_dl/extractor/flickr.py index 2ed6c2b..89a40d7 100644 --- a/hypervideo_dl/extractor/flickr.py +++ b/hypervideo_dl/extractor/flickr.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_str, @@ -91,12 +89,11 @@ class FlickrIE(InfoExtractor): 'url': stream['_content'], 'quality': preference(stream_type), }) - self._sort_formats(formats) owner = video_info.get('owner', {}) 
uploader_id = owner.get('nsid') uploader_path = owner.get('path_alias') or uploader_id - uploader_url = format_field(uploader_path, template='https://www.flickr.com/photos/%s/') + uploader_url = format_field(uploader_path, None, 'https://www.flickr.com/photos/%s/') return { 'id': video_id, diff --git a/hypervideo_dl/extractor/folketinget.py b/hypervideo_dl/extractor/folketinget.py index b3df93f..55a11e5 100644 --- a/hypervideo_dl/extractor/folketinget.py +++ b/hypervideo_dl/extractor/folketinget.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_parse_qs from ..utils import ( @@ -62,7 +59,6 @@ class FolketingetIE(InfoExtractor): 'url': xpath_text(n, './url', fatal=True), 'tbr': int_or_none(n.attrib['bitrate']), } for n in doc.findall('.//streams/stream')] - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/footyroom.py b/hypervideo_dl/extractor/footyroom.py index 118325b..4a1316b 100644 --- a/hypervideo_dl/extractor/footyroom.py +++ b/hypervideo_dl/extractor/footyroom.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from .streamable import StreamableIE diff --git a/hypervideo_dl/extractor/formula1.py b/hypervideo_dl/extractor/formula1.py index 67662e6..0a8ef85 100644 --- a/hypervideo_dl/extractor/formula1.py +++ b/hypervideo_dl/extractor/formula1.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/fourtube.py b/hypervideo_dl/extractor/fourtube.py index d4d955b..b6368b8 100644 --- a/hypervideo_dl/extractor/fourtube.py +++ b/hypervideo_dl/extractor/fourtube.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -37,7 +35,6 @@ class FourTubeBaseIE(InfoExtractor): 'resolution': format + 'p', 'quality': int(format), } for format 
in sources] - self._sort_formats(formats) return formats def _real_extract(self, url): diff --git a/hypervideo_dl/extractor/fourzerostudio.py b/hypervideo_dl/extractor/fourzerostudio.py new file mode 100644 index 0000000..c388a3a --- /dev/null +++ b/hypervideo_dl/extractor/fourzerostudio.py @@ -0,0 +1,106 @@ +from .common import InfoExtractor +from ..utils import traverse_obj, unified_timestamp + + +class FourZeroStudioArchiveIE(InfoExtractor): + _VALID_URL = r'https?://0000\.studio/(?P<uploader_id>[^/]+)/broadcasts/(?P<id>[^/]+)/archive' + IE_NAME = '0000studio:archive' + _TESTS = [{ + 'url': 'https://0000.studio/mumeijiten/broadcasts/1290f433-fce0-4909-a24a-5f7df09665dc/archive', + 'info_dict': { + 'id': '1290f433-fce0-4909-a24a-5f7df09665dc', + 'title': 'noteで『canape』様へのファンレターを執筆します。(数秘術その2)', + 'timestamp': 1653802534, + 'release_timestamp': 1653796604, + 'thumbnails': 'count:1', + 'comments': 'count:7', + 'uploader': '『中崎雄心』の執務室。', + 'uploader_id': 'mumeijiten', + } + }] + + def _real_extract(self, url): + video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') + webpage = self._download_webpage(url, video_id) + nuxt_data = self._search_nuxt_data(webpage, video_id, traverse=None) + + pcb = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorBroadcast'), get_all=False) + uploader_internal_id = traverse_obj(nuxt_data, ( + 'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'id'), get_all=False) + + formats, subs = self._extract_m3u8_formats_and_subtitles(pcb['archiveUrl'], video_id, ext='mp4') + + return { + 'id': video_id, + 'title': pcb.get('title'), + 'age_limit': 18 if pcb.get('isAdult') else None, + 'timestamp': unified_timestamp(pcb.get('finishTime')), + 'release_timestamp': unified_timestamp(pcb.get('createdAt')), + 'thumbnails': [{ + 'url': pcb['thumbnailUrl'], + 'ext': 'png', + }] if pcb.get('thumbnailUrl') else None, + 'formats': formats, + 'subtitles': subs, + 'comments': [{ + 'author': 
c.get('username'), + 'author_id': c.get('postedUserId'), + 'author_thumbnail': c.get('userThumbnailUrl'), + 'id': c.get('id'), + 'text': c.get('body'), + 'timestamp': unified_timestamp(c.get('createdAt')), + 'like_count': c.get('likeCount'), + 'is_favorited': c.get('isLikedByOwner'), + 'author_is_uploader': c.get('postedUserId') == uploader_internal_id, + } for c in traverse_obj(nuxt_data, ( + 'ssrRefs', ..., lambda _, v: v['__typename'] == 'PublicCreatorBroadcastComment')) or []], + 'uploader_id': uploader_id, + 'uploader': traverse_obj(nuxt_data, ( + 'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'username'), get_all=False), + } + + +class FourZeroStudioClipIE(InfoExtractor): + _VALID_URL = r'https?://0000\.studio/(?P<uploader_id>[^/]+)/archive-clip/(?P<id>[^/]+)' + IE_NAME = '0000studio:clip' + _TESTS = [{ + 'url': 'https://0000.studio/soeji/archive-clip/e46b0278-24cd-40a8-92e1-b8fc2b21f34f', + 'info_dict': { + 'id': 'e46b0278-24cd-40a8-92e1-b8fc2b21f34f', + 'title': 'わたベーさんからイラスト差し入れいただきました。ありがとうございました!', + 'timestamp': 1652109105, + 'like_count': 1, + 'uploader': 'ソエジマケイタ', + 'uploader_id': 'soeji', + } + }] + + def _real_extract(self, url): + video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') + webpage = self._download_webpage(url, video_id) + nuxt_data = self._search_nuxt_data(webpage, video_id, traverse=None) + + clip_info = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorArchivedClip'), get_all=False) + + info = next(( + m for m in self._parse_html5_media_entries(url, webpage, video_id) + if 'mp4' in traverse_obj(m, ('formats', ..., 'ext')) + ), None) + if not info: + self.report_warning('Failed to find a desired media element. 
Falling back to using NUXT data.') + info = { + 'formats': [{ + 'ext': 'mp4', + 'url': url, + } for url in clip_info.get('mediaFiles') or [] if url], + } + return { + **info, + 'id': video_id, + 'title': clip_info.get('clipComment'), + 'timestamp': unified_timestamp(clip_info.get('createdAt')), + 'like_count': clip_info.get('likeCount'), + 'uploader_id': uploader_id, + 'uploader': traverse_obj(nuxt_data, ( + 'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'username'), get_all=False), + } diff --git a/hypervideo_dl/extractor/fox.py b/hypervideo_dl/extractor/fox.py index 4c52b9a..15c0c48 100644 --- a/hypervideo_dl/extractor/fox.py +++ b/hypervideo_dl/extractor/fox.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import uuid @@ -15,8 +12,10 @@ from ..utils import ( int_or_none, parse_age_limit, parse_duration, + traverse_obj, try_get, unified_timestamp, + url_or_none, ) @@ -37,7 +36,8 @@ class FOXIE(InfoExtractor): 'creator': 'FOX', 'series': 'Gotham', 'age_limit': 14, - 'episode': 'Aftermath: Bruce Wayne Develops Into The Dark Knight' + 'episode': 'Aftermath: Bruce Wayne Develops Into The Dark Knight', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { 'skip_download': True, @@ -132,7 +132,6 @@ class FOXIE(InfoExtractor): formats = self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') - self._sort_formats(formats) data = try_get( video, lambda x: x['trackingData']['properties'], dict) or {} @@ -168,6 +167,7 @@ class FOXIE(InfoExtractor): 'season_number': int_or_none(video.get('seasonNumber')), 'episode': video.get('name'), 'episode_number': int_or_none(video.get('episodeNumber')), + 'thumbnail': traverse_obj(video, ('images', 'still', 'raw'), expected_type=url_or_none), 'release_year': int_or_none(video.get('releaseYear')), 'subtitles': subtitles, } diff --git a/hypervideo_dl/extractor/fox9.py b/hypervideo_dl/extractor/fox9.py index 91f8f7b..dfbafa7 100644 --- 
a/hypervideo_dl/extractor/fox9.py +++ b/hypervideo_dl/extractor/fox9.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/foxgay.py b/hypervideo_dl/extractor/foxgay.py index 1c53e06..f4f29c6 100644 --- a/hypervideo_dl/extractor/foxgay.py +++ b/hypervideo_dl/extractor/foxgay.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import itertools from .common import InfoExtractor @@ -33,7 +31,7 @@ class FoxgayIE(InfoExtractor): description = get_element_by_id('inf_tit', webpage) # The default user-agent with foxgay cookies leads to pages without videos - self._downloader.cookiejar.clear('.foxgay.com') + self.cookiejar.clear('.foxgay.com') # Find the URL for the iFrame which contains the actual video. iframe_url = self._html_search_regex( r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1', webpage, @@ -50,8 +48,6 @@ class FoxgayIE(InfoExtractor): } for source, resolution in zip( video_data['sources'], video_data.get('resolutions', itertools.repeat(None)))] - self._sort_formats(formats) - return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/foxnews.py b/hypervideo_dl/extractor/foxnews.py index 18fa0a5..52172aa 100644 --- a/hypervideo_dl/extractor/foxnews.py +++ b/hypervideo_dl/extractor/foxnews.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .amp import AMPIE @@ -58,13 +56,15 @@ class FoxNewsIE(AMPIE): }, ] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<(?:amp-)?iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.foxnews\.com/v/video-embed\.html?.*?\bvideo_id=\d+.*?)\1', - webpage)] + @classmethod + def _extract_embed_urls(cls, url, webpage): + for mobj in re.finditer( + r'''(?x) + <(?:script|(?:amp-)?iframe)[^>]+\bsrc=["\'] + (?:https?:)?//video\.foxnews\.com/v/(?:video-embed\.html|embed\.js)\? 
+ (?:[^>"\']+&)?(?:video_)?id=(?P<video_id>\d+) + ''', webpage): + yield f'https://video.foxnews.com/v/video-embed.html?video_id={mobj.group("video_id")}' def _real_extract(self, url): host, video_id = self._match_valid_url(url).groups() @@ -75,6 +75,29 @@ class FoxNewsIE(AMPIE): return info +class FoxNewsVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?foxnews\.com/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.foxnews.com/video/6313058664112', + 'info_dict': { + 'id': '6313058664112', + 'ext': 'mp4', + 'thumbnail': r're:https://.+/1280x720/match/image\.jpg', + 'upload_date': '20220930', + 'description': 'New York City, Kids Therapy, Biden', + 'duration': 2415, + 'title': 'Gutfeld! - Thursday, September 29', + 'timestamp': 1664527538, + }, + 'expected_warnings': ['Ignoring subtitle tracks'], + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result(f'https://video.foxnews.com/v/{video_id}', FoxNewsIE, video_id) + + class FoxNewsArticleIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:insider\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' IE_NAME = 'foxnews:article' @@ -124,4 +147,4 @@ class FoxNewsArticleIE(InfoExtractor): 'http://video.foxnews.com/v/' + video_id, FoxNewsIE.ie_key()) return self.url_result( - FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key()) + next(FoxNewsIE._extract_embed_urls(url, webpage)), FoxNewsIE.ie_key()) diff --git a/hypervideo_dl/extractor/foxsports.py b/hypervideo_dl/extractor/foxsports.py index 2b2cb6c..f9d7fe5 100644 --- a/hypervideo_dl/extractor/foxsports.py +++ b/hypervideo_dl/extractor/foxsports.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/fptplay.py b/hypervideo_dl/extractor/fptplay.py index a34e90b..85613ba 100644 --- a/hypervideo_dl/extractor/fptplay.py +++ b/hypervideo_dl/extractor/fptplay.py @@ -1,18 +1,17 @@ -# coding: utf-8 -from 
__future__ import unicode_literals - import hashlib import time import urllib.parse from .common import InfoExtractor from ..utils import ( + clean_html, join_nonempty, + strip_or_none, ) class FptplayIE(InfoExtractor): - _VALID_URL = r'https?://fptplay\.vn/(?P<type>xem-video)/[^/]+\-(?P<id>\w+)(?:/tap-(?P<episode>[^/]+)?/?(?:[?#]|$)|)' + _VALID_URL = r'https?://fptplay\.vn/xem-video/[^/]+\-(?P<id>\w+)(?:/tap-(?P<episode>\d+)?/?(?:[?#]|$)|)' _GEO_COUNTRIES = ['VN'] IE_NAME = 'fptplay' IE_DESC = 'fptplay.vn' @@ -22,7 +21,7 @@ class FptplayIE(InfoExtractor): 'info_dict': { 'id': '621a123016f369ebbde55945', 'ext': 'mp4', - 'title': 'Nhân Duyên Đại Nhân Xin Dừng Bước - Ms. Cupid In Love', + 'title': 'Nhân Duyên Đại Nhân Xin Dừng Bước - Tập 1A', 'description': 'md5:23cf7d1ce0ade8e21e76ae482e6a8c6c', }, }, { @@ -31,25 +30,41 @@ class FptplayIE(InfoExtractor): 'info_dict': { 'id': '61f3aa8a6b3b1d2e73c60eb5', 'ext': 'mp4', - 'title': 'Má Tôi Là Đại Gia - 3', + 'title': 'Má Tôi Là Đại Gia - Tập 3', 'description': 'md5:ff8ba62fb6e98ef8875c42edff641d1c', }, + }, { + 'url': 'https://fptplay.vn/xem-video/lap-toi-do-giam-under-the-skin-6222d9684ec7230fa6e627a2/tap-4', + 'md5': 'bcb06c55ec14786d7d4eda07fa1ccbb9', + 'info_dict': { + 'id': '6222d9684ec7230fa6e627a2', + 'ext': 'mp4', + 'title': 'Lạp Tội Đồ Giám - Tập 2B', + 'description': 'md5:e5a47e9d35fbf7e9479ca8a77204908b', + }, }, { 'url': 'https://fptplay.vn/xem-video/nha-co-chuyen-hi-alls-well-ends-well-1997-6218995f6af792ee370459f0', 'only_matching': True, }] def _real_extract(self, url): - type_url, video_id, episode = self._match_valid_url(url).group('type', 'id', 'episode') - webpage = self._download_webpage(url, video_id=video_id, fatal=False) - info = self._download_json(self.get_api_with_st_token(video_id, episode or 0), video_id) + video_id, slug_episode = self._match_valid_url(url).group('id', 'episode') + webpage = self._download_webpage(url, video_id=video_id, fatal=False) or '' + title = self._search_regex( + 
r'(?s)<h4\s+class="mb-1 text-2xl text-white"[^>]*>(.+)</h4>', webpage, 'title', fatal=False) + real_episode = slug_episode if not title else self._search_regex( + r'<p.+title="(?P<episode>[^">]+)"\s+class="epi-title active"', webpage, 'episode', fatal=False) + title = strip_or_none(title) or self._html_search_meta(('og:title', 'twitter:title'), webpage) + + info = self._download_json( + self.get_api_with_st_token(video_id, int(slug_episode) - 1 if slug_episode else 0), video_id) formats, subtitles = self._extract_m3u8_formats_and_subtitles(info['data']['url'], video_id, 'mp4') - self._sort_formats(formats) return { 'id': video_id, - 'title': join_nonempty( - self._html_search_meta(('og:title', 'twitter:title'), webpage), episode, delim=' - '), - 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), + 'title': join_nonempty(title, real_episode, delim=' - '), + 'description': ( + clean_html(self._search_regex(r'<p\s+class="overflow-hidden"[^>]*>(.+)</p>', webpage, 'description')) + or self._html_search_meta(('og:description', 'twitter:description'), webpage)), 'formats': formats, 'subtitles': subtitles, } diff --git a/hypervideo_dl/extractor/franceculture.py b/hypervideo_dl/extractor/franceculture.py deleted file mode 100644 index 9dc28d8..0000000 --- a/hypervideo_dl/extractor/franceculture.py +++ /dev/null @@ -1,128 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -from .common import InfoExtractor -from ..utils import ( - determine_ext, - extract_attributes, - int_or_none, - traverse_obj, - unified_strdate, -) - - -class FranceCultureIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [{ - # playlist - 'url': 'https://www.franceculture.fr/emissions/serie/hasta-dente', - 'playlist_count': 12, - 'info_dict': { - 'id': 'hasta-dente', - 'title': 'Hasta Dente', - 'description': 'md5:57479af50648d14e9bb649e6b1f8f911', - 'thumbnail': 
r're:^https?://.*\.jpg$', - 'upload_date': '20201024', - }, - 'playlist': [{ - 'info_dict': { - 'id': '3c1c2e55-41a0-11e5-9fe0-005056a87c89', - 'ext': 'mp3', - 'title': 'Jeudi, vous avez dit bizarre ?', - 'description': 'md5:47cf1e00cc21c86b0210279996a812c6', - 'duration': 604, - 'upload_date': '20201024', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1603576680 - }, - }, - ], - }, { - 'url': 'https://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', - 'info_dict': { - 'id': 'rendez-vous-au-pays-des-geeks', - 'display_id': 'rendez-vous-au-pays-des-geeks', - 'ext': 'mp3', - 'title': 'Rendez-vous au pays des geeks', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20140301', - 'vcodec': 'none', - 'duration': 3569, - }, - }, { - # no thumbnail - 'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - info = { - 'id': display_id, - 'title': self._html_search_regex( - r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>', - webpage, 'title', default=self._og_search_title(webpage)), - 'description': self._html_search_regex( - r'(?s)<div[^>]+class="excerpt"[^>]*>(.*?)</div>', webpage, 'description', default=None), - 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader': self._html_search_regex( - r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None), - 'upload_date': unified_strdate(self._html_search_regex( - r'(?s)class="teaser-text-date".*?(\d{2}/\d{2}/\d{4})', webpage, 'date', default=None)), - } - - playlist_data = self._search_regex( - r'''(?sx) - <section[^>]+data-xiti-place="[^"]*?liste_episodes[^"?]*?"[^>]*> - (.*?) 
- </section> - ''', - webpage, 'playlist data', fatal=False, default=None) - - if playlist_data: - entries = [] - for item, item_description in re.findall( - r'(?s)(<button[^<]*class="[^"]*replay-button[^>]*>).*?<p[^>]*class="[^"]*teaser-text-chapo[^>]*>(.*?)</p>', - playlist_data): - - item_attributes = extract_attributes(item) - entries.append({ - 'id': item_attributes.get('data-emission-uuid'), - 'url': item_attributes.get('data-url'), - 'title': item_attributes.get('data-diffusion-title'), - 'duration': int_or_none(traverse_obj(item_attributes, 'data-duration-seconds', 'data-duration-seconds')), - 'description': item_description, - 'timestamp': int_or_none(item_attributes.get('data-start-time')), - 'thumbnail': info['thumbnail'], - 'uploader': info['uploader'], - }) - - return { - '_type': 'playlist', - 'entries': entries, - **info - } - - video_data = extract_attributes(self._search_regex( - r'''(?sx) - (?: - </h1>| - <div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*> - ).*? 
- (<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>) - ''', - webpage, 'video data')) - video_url = traverse_obj(video_data, 'data-url', 'data-asset-source') - ext = determine_ext(video_url.lower()) - - return { - 'display_id': display_id, - 'url': video_url, - 'ext': ext, - 'vcodec': 'none' if ext == 'mp3' else None, - 'duration': int_or_none(video_data.get('data-duration')), - **info - } diff --git a/hypervideo_dl/extractor/franceinter.py b/hypervideo_dl/extractor/franceinter.py index ae822a5..779249b 100644 --- a/hypervideo_dl/extractor/franceinter.py +++ b/hypervideo_dl/extractor/franceinter.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import month_by_name diff --git a/hypervideo_dl/extractor/francetv.py b/hypervideo_dl/extractor/francetv.py index 347a766..0523172 100644 --- a/hypervideo_dl/extractor/francetv.py +++ b/hypervideo_dl/extractor/francetv.py @@ -1,8 +1,3 @@ -# coding: utf-8 - -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -37,6 +32,7 @@ class FranceTVIE(InfoExtractor): (?P<id>[^@]+)(?:@(?P<catalog>.+))? 
) ''' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1'] _TESTS = [{ # without catalog @@ -195,8 +191,6 @@ class FranceTVIE(InfoExtractor): } for sheet in spritesheets] }) - self._sort_formats(formats) - if subtitle: title += ' - %s' % subtitle title = title.strip() @@ -375,7 +369,7 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): webpage = self._download_webpage(url, display_id) - dailymotion_urls = DailymotionIE._extract_urls(webpage) + dailymotion_urls = tuple(DailymotionIE._extract_embed_urls(url, webpage)) if dailymotion_urls: return self.playlist_result([ self.url_result(dailymotion_url, DailymotionIE.ie_key()) diff --git a/hypervideo_dl/extractor/freesound.py b/hypervideo_dl/extractor/freesound.py index 138b6bc..8b5f227 100644 --- a/hypervideo_dl/extractor/freesound.py +++ b/hypervideo_dl/extractor/freesound.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -65,7 +63,6 @@ class FreesoundIE(InfoExtractor): 'format_note': channels, 'quality': quality, } for quality, format_url in enumerate(audio_urls)] - self._sort_formats(formats) return { 'id': audio_id, diff --git a/hypervideo_dl/extractor/freespeech.py b/hypervideo_dl/extractor/freespeech.py index ea9c3e3..aea5513 100644 --- a/hypervideo_dl/extractor/freespeech.py +++ b/hypervideo_dl/extractor/freespeech.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from .youtube import YoutubeIE diff --git a/hypervideo_dl/extractor/freetv.py b/hypervideo_dl/extractor/freetv.py new file mode 100644 index 0000000..757a10d --- /dev/null +++ b/hypervideo_dl/extractor/freetv.py @@ -0,0 +1,139 @@ +import itertools +import re + +from .common import InfoExtractor +from ..utils import int_or_none, traverse_obj, urlencode_postdata + + +class FreeTvBaseIE(InfoExtractor): + def _get_api_response(self, content_id, resource_type, postdata): + return self._download_json( + 
'https://www.freetv.com/wordpress/wp-admin/admin-ajax.php', + content_id, data=urlencode_postdata(postdata), + note=f'Downloading {content_id} {resource_type} JSON')['data'] + + +class FreeTvMoviesIE(FreeTvBaseIE): + _VALID_URL = r'https?://(?:www\.)?freetv\.com/peliculas/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://www.freetv.com/peliculas/atrapame-si-puedes/', + 'md5': 'dc62d5abf0514726640077cd1591aa92', + 'info_dict': { + 'id': '428021', + 'title': 'Atrápame Si Puedes', + 'description': 'md5:ca63bc00898aeb2f64ec87c6d3a5b982', + 'ext': 'mp4', + } + }, { + 'url': 'https://www.freetv.com/peliculas/monstruoso/', + 'md5': '509c15c68de41cb708d1f92d071f20aa', + 'info_dict': { + 'id': '377652', + 'title': 'Monstruoso', + 'description': 'md5:333fc19ee327b457b980e54a911ea4a3', + 'ext': 'mp4', + } + }] + + def _extract_video(self, content_id, action='olyott_video_play'): + api_response = self._get_api_response(content_id, 'video', { + 'action': action, + 'contentID': content_id, + }) + + video_id, video_url = api_response['displayMeta']['contentID'], api_response['displayMeta']['streamURLVideo'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4') + + return { + 'id': video_id, + 'title': traverse_obj(api_response, ('displayMeta', 'title')), + 'description': traverse_obj(api_response, ('displayMeta', 'desc')), + 'formats': formats, + 'subtitles': subtitles, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + return self._extract_video( + self._search_regex(( + r'class=["\'][^>]+postid-(?P<video_id>\d+)', + r'<link[^>]+freetv.com/\?p=(?P<video_id>\d+)', + r'<div[^>]+data-params=["\'][^>]+post_id=(?P<video_id>\d+)', + ), webpage, 'video id', group='video_id')) + + +class FreeTvIE(FreeTvBaseIE): + IE_NAME = 'freetv:series' + _VALID_URL = r'https?://(?:www\.)?freetv\.com/series/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 
'https://www.freetv.com/series/el-detective-l/', + 'info_dict': { + 'id': 'el-detective-l', + 'title': 'El Detective L', + 'description': 'md5:f9f1143bc33e9856ecbfcbfb97a759be' + }, + 'playlist_count': 24, + }, { + 'url': 'https://www.freetv.com/series/esmeraldas/', + 'info_dict': { + 'id': 'esmeraldas', + 'title': 'Esmeraldas', + 'description': 'md5:43d7ec45bd931d8268a4f5afaf4c77bf' + }, + 'playlist_count': 62, + }, { + 'url': 'https://www.freetv.com/series/las-aventuras-de-leonardo/', + 'info_dict': { + 'id': 'las-aventuras-de-leonardo', + 'title': 'Las Aventuras de Leonardo', + 'description': 'md5:0c47130846c141120a382aca059288f6' + }, + 'playlist_count': 13, + }, + ] + + def _extract_series_season(self, season_id, series_title): + episodes = self._get_api_response(season_id, 'series', { + 'contentID': season_id, + 'action': 'olyott_get_dynamic_series_content', + 'type': 'list', + 'perPage': '1000', + })['1'] + + for episode in episodes: + video_id = str(episode['contentID']) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(episode['streamURL'], video_id, 'mp4') + + yield { + 'id': video_id, + 'title': episode.get('fullTitle'), + 'description': episode.get('description'), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': episode.get('thumbnail'), + 'series': series_title, + 'series_id': traverse_obj(episode, ('contentMeta', 'displayMeta', 'seriesID')), + 'season_id': traverse_obj(episode, ('contentMeta', 'displayMeta', 'seasonID')), + 'season_number': traverse_obj( + episode, ('contentMeta', 'displayMeta', 'seasonNum'), expected_type=int_or_none), + 'episode_number': traverse_obj( + episode, ('contentMeta', 'displayMeta', 'episodeNum'), expected_type=int_or_none), + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + title = self._html_search_regex( + r'<h1[^>]+class=["\']synopis[^>]>(?P<title>[^<]+)', webpage, 'title', group='title', fatal=False) + description 
= self._html_search_regex( + r'<div[^>]+class=["\']+synopis content[^>]><p>(?P<description>[^<]+)', + webpage, 'description', group='description', fatal=False) + + return self.playlist_result( + itertools.chain.from_iterable( + self._extract_series_season(season_id, title) + for season_id in re.findall(r'<option[^>]+value=["\'](\d+)["\']', webpage)), + display_id, title, description) diff --git a/hypervideo_dl/extractor/freshlive.py b/hypervideo_dl/extractor/freshlive.py deleted file mode 100644 index 72a8459..0000000 --- a/hypervideo_dl/extractor/freshlive.py +++ /dev/null @@ -1,83 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - int_or_none, - try_get, - unified_timestamp, -) - - -class FreshLiveIE(InfoExtractor): - _VALID_URL = r'https?://freshlive\.tv/[^/]+/(?P<id>\d+)' - _TEST = { - 'url': 'https://freshlive.tv/satotv/74712', - 'md5': '9f0cf5516979c4454ce982df3d97f352', - 'info_dict': { - 'id': '74712', - 'ext': 'mp4', - 'title': 'テスト', - 'description': 'テスト', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1511, - 'timestamp': 1483619655, - 'upload_date': '20170105', - 'uploader': 'サトTV', - 'uploader_id': 'satotv', - 'view_count': int, - 'comment_count': int, - 'is_live': False, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - options = self._parse_json( - self._search_regex( - r'window\.__CONTEXT__\s*=\s*({.+?});\s*</script>', - webpage, 'initial context'), - video_id) - - info = options['context']['dispatcher']['stores']['ProgramStore']['programs'][video_id] - - title = info['title'] - - if info.get('status') == 'upcoming': - raise ExtractorError('Stream %s is upcoming' % video_id, expected=True) - - stream_url = info.get('liveStreamUrl') or info['archiveStreamUrl'] - - is_live = info.get('liveStreamUrl') is not None - - formats = 
self._extract_m3u8_formats( - stream_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls') - - if is_live: - title = self._live_title(title) - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': info.get('description'), - 'thumbnail': info.get('thumbnailUrl'), - 'duration': int_or_none(info.get('airTime')), - 'timestamp': unified_timestamp(info.get('createdAt')), - 'uploader': try_get( - info, lambda x: x['channel']['title'], compat_str), - 'uploader_id': try_get( - info, lambda x: x['channel']['code'], compat_str), - 'uploader_url': try_get( - info, lambda x: x['channel']['permalink'], compat_str), - 'view_count': int_or_none(info.get('viewCount')), - 'comment_count': int_or_none(info.get('commentCount')), - 'tags': info.get('tags', []), - 'is_live': is_live, - } diff --git a/hypervideo_dl/extractor/frontendmasters.py b/hypervideo_dl/extractor/frontendmasters.py index fc67a84..3bae8ad 100644 --- a/hypervideo_dl/extractor/frontendmasters.py +++ b/hypervideo_dl/extractor/frontendmasters.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -163,7 +160,6 @@ class FrontendMastersIE(FrontendMastersBaseIE): 'format_id': format_id, }) formats.append(f) - self._sort_formats(formats) subtitles = { 'en': [{ diff --git a/hypervideo_dl/extractor/fujitv.py b/hypervideo_dl/extractor/fujitv.py index 4fdfe12..668bb27 100644 --- a/hypervideo_dl/extractor/fujitv.py +++ b/hypervideo_dl/extractor/fujitv.py @@ -1,5 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals from ..utils import HEADRequest from .common import InfoExtractor @@ -19,7 +17,7 @@ class FujiTVFODPlus7IE(InfoExtractor): 'url': 'https://fod.fujitv.co.jp/title/5d40/5d40110076', 'info_dict': { 'id': '5d40110076', - 'ext': 'mp4', + 'ext': 'ts', 'title': '#1318 『まる子、まぼろしの洋館を見る』の巻', 'series': 'ちびまる子ちゃん', 'series_id': '5d40', @@ -30,7 +28,7 @@ class FujiTVFODPlus7IE(InfoExtractor): 'url': 
'https://fod.fujitv.co.jp/title/5d40/5d40810083', 'info_dict': { 'id': '5d40810083', - 'ext': 'mp4', + 'ext': 'ts', 'title': '#1324 『まる子とオニの子』の巻/『結成!2月をムダにしない会』の巻', 'description': 'md5:3972d900b896adc8ab1849e310507efa', 'series': 'ちびまる子ちゃん', @@ -47,19 +45,18 @@ class FujiTVFODPlus7IE(InfoExtractor): if token: json_info = self._download_json('https://fod-sp.fujitv.co.jp/apps/api/episode/detail/?ep_id=%s&is_premium=false' % video_id, video_id, headers={'x-authorization': f'Bearer {token.value}'}, fatal=False) else: - self.report_warning(f'The token cookie is needed to extract video metadata. {self._LOGIN_HINTS["cookies"]}') + self.report_warning(f'The token cookie is needed to extract video metadata. {self._login_hint("cookies")}') formats, subtitles = [], {} src_json = self._download_json(f'{self._BASE_URL}abrjson_v2/tv_android/{video_id}', video_id) for src in src_json['video_selector']: if not src.get('url'): continue - fmt, subs = self._extract_m3u8_formats_and_subtitles(src['url'], video_id, 'mp4') + fmt, subs = self._extract_m3u8_formats_and_subtitles(src['url'], video_id, 'ts') for f in fmt: f.update(dict(zip(('height', 'width'), self._BITRATE_MAP.get(f.get('tbr'), ())))) formats.extend(fmt) subtitles = self._merge_subtitles(subtitles, subs) - self._sort_formats(formats, ['tbr']) return { 'id': video_id, @@ -70,4 +67,5 @@ class FujiTVFODPlus7IE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, 'thumbnail': f'{self._BASE_URL}img/program/{series_id}/episode/{video_id}_a.jpg', + '_format_sort_fields': ('tbr', ) } diff --git a/hypervideo_dl/extractor/funimation.py b/hypervideo_dl/extractor/funimation.py index 6aa9bc9..18363c1 100644 --- a/hypervideo_dl/extractor/funimation.py +++ b/hypervideo_dl/extractor/funimation.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import random import re import string @@ -8,17 +5,18 @@ import string from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( 
+ ExtractorError, determine_ext, int_or_none, join_nonempty, js_to_json, + make_archive_id, orderedSet, qualities, str_or_none, traverse_obj, try_get, urlencode_postdata, - ExtractorError, ) @@ -245,11 +243,14 @@ class FunimationIE(FunimationBaseIE): 'language_preference': language_preference(lang.lower()), }) formats.extend(current_formats) + if not formats and (requested_languages or requested_versions): + self.raise_no_formats( + 'There are no video formats matching the requested languages/versions', expected=True, video_id=display_id) self._remove_duplicate_formats(formats) - self._sort_formats(formats, ('lang', 'source')) return { - 'id': initial_experience_id if only_initial_experience else episode_id, + 'id': episode_id, + '_old_archive_ids': [make_archive_id(self, initial_experience_id)], 'display_id': display_id, 'duration': duration, 'title': episode['episodeTitle'], @@ -264,6 +265,7 @@ class FunimationIE(FunimationBaseIE): 'formats': formats, 'thumbnails': thumbnails, 'subtitles': subtitles, + '_format_sort_fields': ('lang', 'source'), } def _get_subtitles(self, subtitles, experience_id, episode, display_id, format_name): diff --git a/hypervideo_dl/extractor/funk.py b/hypervideo_dl/extractor/funk.py index 2c5cfe8..539d719 100644 --- a/hypervideo_dl/extractor/funk.py +++ b/hypervideo_dl/extractor/funk.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from .nexx import NexxIE from ..utils import ( diff --git a/hypervideo_dl/extractor/fusion.py b/hypervideo_dl/extractor/fusion.py index a3f44b8..689422f 100644 --- a/hypervideo_dl/extractor/fusion.py +++ b/hypervideo_dl/extractor/fusion.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -72,7 +70,6 @@ class FusionIE(InfoExtractor): 'protocol': 'm3u8_native' if ext == 'm3u8' else 'https', }) if formats: - self._sort_formats(formats) info['formats'] = formats 
else: info.update({ diff --git a/hypervideo_dl/extractor/fuyintv.py b/hypervideo_dl/extractor/fuyintv.py new file mode 100644 index 0000000..197901d --- /dev/null +++ b/hypervideo_dl/extractor/fuyintv.py @@ -0,0 +1,30 @@ +from .common import InfoExtractor +from ..utils import traverse_obj + + +class FuyinTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fuyin\.tv/html/(?:\d+)/(?P<id>\d+)\.html' + _TESTS = [{ + 'url': 'https://www.fuyin.tv/html/2733/44129.html', + 'info_dict': { + 'id': '44129', + 'ext': 'mp4', + 'title': '第1集', + 'description': 'md5:21a3d238dc8d49608e1308e85044b9c3', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json( + 'https://www.fuyin.tv/api/api/tv.movie/url', + video_id, query={'urlid': f'{video_id}'}) + webpage = self._download_webpage(url, video_id, fatal=False) + + return { + 'id': video_id, + 'title': traverse_obj(json_data, ('data', 'title')), + 'url': json_data['data']['url'], + 'ext': 'mp4', + 'description': self._html_search_meta('description', webpage), + } diff --git a/hypervideo_dl/extractor/fxnetworks.py b/hypervideo_dl/extractor/fxnetworks.py deleted file mode 100644 index 00e6742..0000000 --- a/hypervideo_dl/extractor/fxnetworks.py +++ /dev/null @@ -1,77 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .adobepass import AdobePassIE -from ..utils import ( - extract_attributes, - int_or_none, - parse_age_limit, - smuggle_url, - update_url_query, -) - - -class FXNetworksIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.fxnetworks.com/video/1032565827847', - 'md5': '8d99b97b4aa7a202f55b6ed47ea7e703', - 'info_dict': { - 'id': 'dRzwHC_MMqIv', - 'ext': 'mp4', - 'title': 'First Look: Better Things - Season 2', - 'description': 'Because real life is like a fart. 
Watch this FIRST LOOK to see what inspired the new season of Better Things.', - 'age_limit': 14, - 'uploader': 'NEWA-FNG-FX', - 'upload_date': '20170825', - 'timestamp': 1503686274, - 'episode_number': 0, - 'season_number': 2, - 'series': 'Better Things', - }, - 'add_ie': ['ThePlatform'], - }, { - 'url': 'http://www.simpsonsworld.com/video/716094019682', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - if 'The content you are trying to access is not available in your region.' in webpage: - self.raise_geo_restricted() - video_data = extract_attributes(self._search_regex( - r'(<a.+?rel="https?://link\.theplatform\.com/s/.+?</a>)', webpage, 'video data')) - player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None) - release_url = video_data['rel'] - title = video_data['data-title'] - rating = video_data.get('data-rating') - query = { - 'mbr': 'true', - } - if player_type == 'movies': - query.update({ - 'manifest': 'm3u', - }) - else: - query.update({ - 'switch': 'http', - }) - if video_data.get('data-req-auth') == '1': - resource = self._get_mvpd_resource( - video_data['data-channel'], title, - video_data.get('data-guid'), rating) - query['auth'] = self._extract_mvpd_auth(url, video_id, 'fx', resource) - - return { - '_type': 'url_transparent', - 'id': video_id, - 'title': title, - 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), - 'series': video_data.get('data-show-title'), - 'episode_number': int_or_none(video_data.get('data-episode')), - 'season_number': int_or_none(video_data.get('data-season')), - 'thumbnail': video_data.get('data-large-thumb'), - 'age_limit': parse_age_limit(rating), - 'ie_key': 'ThePlatform', - } diff --git a/hypervideo_dl/extractor/gab.py b/hypervideo_dl/extractor/gab.py index 9ba0b1c..5016e2f 100644 --- a/hypervideo_dl/extractor/gab.py +++ 
b/hypervideo_dl/extractor/gab.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -57,7 +54,6 @@ class GabTVIE(InfoExtractor): else: frmt['height'] = str_to_int(resolution.replace('p', '')) formats.append(frmt) - self._sort_formats(formats) return { 'id': id, @@ -123,8 +119,6 @@ class GabIE(InfoExtractor): } for url, f in ((media.get('url'), metadata.get('original') or {}), (media.get('source_mp4'), metadata.get('playable') or {})) if url] - self._sort_formats(formats) - author = json_data.get('account') or {} entries.append({ 'id': f'{post_id}-{idx}', diff --git a/hypervideo_dl/extractor/gaia.py b/hypervideo_dl/extractor/gaia.py index 5b0195c..c84386f 100644 --- a/hypervideo_dl/extractor/gaia.py +++ b/hypervideo_dl/extractor/gaia.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import ( compat_str, @@ -92,7 +88,6 @@ class GaiaIE(InfoExtractor): media_id, headers=headers) formats = self._extract_m3u8_formats( media['mediaUrls']['bcHLS'], media_id, 'mp4') - self._sort_formats(formats) subtitles = {} text_tracks = media.get('textTracks', {}) diff --git a/hypervideo_dl/extractor/gameinformer.py b/hypervideo_dl/extractor/gameinformer.py index f1b96c1..2664edb 100644 --- a/hypervideo_dl/extractor/gameinformer.py +++ b/hypervideo_dl/extractor/gameinformer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .brightcove import BrightcoveNewIE from .common import InfoExtractor from ..utils import ( diff --git a/hypervideo_dl/extractor/gamejolt.py b/hypervideo_dl/extractor/gamejolt.py index a13e528..440b832 100644 --- a/hypervideo_dl/extractor/gamejolt.py +++ b/hypervideo_dl/extractor/gamejolt.py @@ -1,4 +1,3 @@ -# coding: utf-8 import itertools import json import math diff --git a/hypervideo_dl/extractor/gamespot.py b/hypervideo_dl/extractor/gamespot.py index 7a1beae..8dec252 100644 --- 
a/hypervideo_dl/extractor/gamespot.py +++ b/hypervideo_dl/extractor/gamespot.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .once import OnceIE from ..compat import compat_urllib_parse_unquote @@ -67,8 +65,6 @@ class GameSpotIE(OnceIE): formats.extend(self._extract_mpd_formats( mpd_url, page_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) - return { 'id': data_video.get('guid') or page_id, 'display_id': page_id, diff --git a/hypervideo_dl/extractor/gamestar.py b/hypervideo_dl/extractor/gamestar.py index e882fa6..e9966f5 100644 --- a/hypervideo_dl/extractor/gamestar.py +++ b/hypervideo_dl/extractor/gamestar.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( int_or_none, diff --git a/hypervideo_dl/extractor/gaskrank.py b/hypervideo_dl/extractor/gaskrank.py index 03acd2a..e0bbdae 100644 --- a/hypervideo_dl/extractor/gaskrank.py +++ b/hypervideo_dl/extractor/gaskrank.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor from ..utils import ( @@ -96,6 +93,5 @@ class GaskrankIE(InfoExtractor): 'view_count': view_count, 'average_rating': average_rating, }) - self._sort_formats(entry['formats']) return entry diff --git a/hypervideo_dl/extractor/gazeta.py b/hypervideo_dl/extractor/gazeta.py index 3671870..c6868a6 100644 --- a/hypervideo_dl/extractor/gazeta.py +++ b/hypervideo_dl/extractor/gazeta.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/gdcvault.py b/hypervideo_dl/extractor/gdcvault.py index c3ad6b4..2878bbd 100644 --- a/hypervideo_dl/extractor/gdcvault.py +++ b/hypervideo_dl/extractor/gdcvault.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/gedidigital.py 
b/hypervideo_dl/extractor/gedidigital.py index ec386c2..1878d63 100644 --- a/hypervideo_dl/extractor/gedidigital.py +++ b/hypervideo_dl/extractor/gedidigital.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -14,7 +11,7 @@ from ..utils import ( class GediDigitalIE(InfoExtractor): - _VALID_URL = r'''(?x)(?P<url>(?:https?:)//video\. + _VALID_URL = r'''(?x:(?P<base_url>(?:https?:)//video\. (?: (?: (?:espresso\.)?repubblica @@ -36,7 +33,13 @@ class GediDigitalIE(InfoExtractor): |corrierealpi |lasentinella )\.gelocal - )\.it(?:/[^/]+){2,4}/(?P<id>\d+))(?:$|[?&].*)''' + )\.it(?:/[^/]+){2,4}/(?P<id>\d+))(?:$|[?&].*))''' + _EMBED_REGEX = [rf'''(?x) + (?: + data-frame-src=| + <iframe[^\n]+src= + ) + (["'])(?P<url>{_VALID_URL})\1'''] _TESTS = [{ 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', 'md5': '84658d7fb9e55a6e57ecc77b73137494', @@ -112,22 +115,9 @@ class GediDigitalIE(InfoExtractor): urls[i] = urljoin(base_url(e), url_basename(e)) return urls - @staticmethod - def _extract_urls(webpage): - entries = [ - mobj.group('eurl') - for mobj in re.finditer(r'''(?x) - (?: - data-frame-src=| - <iframe[^\n]+src= - ) - (["'])(?P<eurl>%s)\1''' % GediDigitalIE._VALID_URL, webpage)] - return GediDigitalIE._sanitize_urls(entries) - - @staticmethod - def _extract_url(webpage): - urls = GediDigitalIE._extract_urls(webpage) - return urls[0] if urls else None + @classmethod + def _extract_embed_urls(cls, url, webpage): + return cls._sanitize_urls(tuple(super()._extract_embed_urls(url, webpage))) @staticmethod def _clean_formats(formats): @@ -142,8 +132,7 @@ class GediDigitalIE(InfoExtractor): formats[:] = clean_formats def _real_extract(self, url): - video_id = self._match_id(url) - url = self._match_valid_url(url).group('url') + video_id, url = self._match_valid_url(url).group('id', 'base_url') webpage = self._download_webpage(url, 
video_id) title = self._html_search_meta( ['twitter:title', 'og:title'], webpage, fatal=True) @@ -197,7 +186,6 @@ class GediDigitalIE(InfoExtractor): duration = int_or_none(v) self._clean_formats(formats) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/generic.py b/hypervideo_dl/extractor/generic.py index 03e6eb2..f28a77e 100644 --- a/hypervideo_dl/extractor/generic.py +++ b/hypervideo_dl/extractor/generic.py @@ -1,162 +1,49 @@ -# coding: utf-8 - -from __future__ import unicode_literals - import os import re -import sys +import types +import urllib.parse +import xml.etree.ElementTree -from .common import InfoExtractor +from .common import InfoExtractor # isort: split +from .commonprotocols import RtmpIE from .youtube import YoutubeIE -from ..compat import ( - compat_etree_fromstring, - compat_str, - compat_urllib_parse_unquote, - compat_urlparse, - compat_xml_parse_error, -) +from ..compat import compat_etree_fromstring from ..utils import ( + KNOWN_EXTENSIONS, + MEDIA_EXTENSIONS, + ExtractorError, + UnsupportedError, determine_ext, dict_get, - ExtractorError, - float_or_none, - HEADRequest, + format_field, int_or_none, is_html, js_to_json, - KNOWN_EXTENSIONS, merge_dicts, mimetype2ext, orderedSet, parse_duration, parse_resolution, - sanitized_Request, smuggle_url, str_or_none, + traverse_obj, + try_call, unescapeHTML, unified_timestamp, unsmuggle_url, - UnsupportedError, url_or_none, + variadic, xpath_attr, xpath_text, xpath_with_ns, ) -from .commonprotocols import RtmpIE -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .nbc import NBCSportsVPlayerIE -from .ooyala import OoyalaIE -from .rutv import RUTVIE -from .tvc import TVCIE -from .sportbox import SportBoxIE -from .myvi import MyviIE -from .condenast import CondeNastIE -from .udn import UDNEmbedIE -from .senategov import SenateISVPIE -from .svt import SVTIE -from .pornhub import PornHubIE 
-from .xhamster import XHamsterEmbedIE -from .tnaflix import TNAFlixNetworkEmbedIE -from .drtuber import DrTuberIE -from .redtube import RedTubeIE -from .tube8 import Tube8IE -from .mofosex import MofosexEmbedIE -from .spankwire import SpankwireIE -from .youporn import YouPornIE -from .vimeo import ( - VimeoIE, - VHXEmbedIE, -) -from .dailymotion import DailymotionIE -from .dailymail import DailyMailIE -from .onionstudios import OnionStudiosIE -from .viewlift import ViewLiftEmbedIE -from .mtv import MTVServicesEmbeddedIE -from .pladform import PladformIE -from .videomore import VideomoreIE -from .webcaster import WebcasterFeedIE -from .googledrive import GoogleDriveIE -from .jwplatform import JWPlatformIE -from .digiteka import DigitekaIE -from .arkena import ArkenaIE -from .instagram import InstagramIE -from .threeqsdn import ThreeQSDNIE -from .theplatform import ThePlatformIE -from .kaltura import KalturaIE -from .eagleplatform import EaglePlatformIE -from .facebook import FacebookIE -from .soundcloud import SoundcloudEmbedIE -from .tunein import TuneInBaseIE -from .vbox7 import Vbox7IE -from .dbtv import DBTVIE -from .piksel import PikselIE -from .videa import VideaIE -from .twentymin import TwentyMinutenIE -from .ustream import UstreamIE -from .arte import ArteTVEmbedIE -from .videopress import VideoPressIE -from .rutube import RutubeIE -from .glomex import GlomexEmbedIE -from .megatvcom import MegaTVComEmbedIE -from .ant1newsgr import Ant1NewsGrEmbedIE -from .limelight import LimelightBaseIE -from .anvato import AnvatoIE -from .washingtonpost import WashingtonPostIE -from .wistia import WistiaIE -from .mediaset import MediasetIE -from .joj import JojIE -from .megaphone import MegaphoneIE -from .vzaar import VzaarIE -from .channel9 import Channel9IE -from .vshare import VShareIE -from .mediasite import MediasiteIE -from .springboardplatform import SpringboardPlatformIE -from .ted import TedEmbedIE -from .yapfiles import YapFilesIE -from .vice import ViceIE 
-from .xfileshare import XFileShareIE -from .cloudflarestream import CloudflareStreamIE -from .peertube import PeerTubeIE -from .teachable import TeachableIE -from .indavideo import IndavideoEmbedIE -from .apa import APAIE -from .foxnews import FoxNewsIE -from .viqeo import ViqeoIE -from .expressen import ExpressenIE -from .zype import ZypeIE -from .odnoklassniki import OdnoklassnikiIE -from .vk import VKIE -from .kinja import KinjaEmbedIE -from .gedidigital import GediDigitalIE -from .rcs import RCSEmbedsIE -from .bitchute import BitChuteIE -from .rumble import RumbleEmbedIE -from .arcpublishing import ArcPublishingIE -from .medialaan import MedialaanIE -from .simplecast import SimplecastIE -from .wimtv import WimTVIE -from .tvopengr import TVOpenGrEmbedIE -from .ertgr import ERTWebtvEmbedIE -from .tvp import TVPEmbedIE -from .blogger import BloggerIE -from .mainstreaming import MainStreamingIE -from .gfycat import GfycatIE -from .panopto import PanoptoBaseIE -from .ruutu import RuutuIE class GenericIE(InfoExtractor): IE_DESC = 'Generic downloader that works on some sites' _VALID_URL = r'.*' IE_NAME = 'generic' - _NETRC_MACHINE = False # Supress username warning + _NETRC_MACHINE = False # Suppress username warning _TESTS = [ # Direct link to a video { @@ -474,188 +361,6 @@ class GenericIE(InfoExtractor): }, 'skip': 'There is a limit of 200 free downloads / month for the test song', }, - { - # embedded brightcove video - # it also tests brightcove videos that need to set the 'Referer' - # in the http requests - 'add_ie': ['BrightcoveLegacy'], - 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', - 'info_dict': { - 'id': '2765128793001', - 'ext': 'mp4', - 'title': 'Le cours de bourse : l’analyse technique', - 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9', - 'uploader': 'BFM BUSINESS', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # embedded with itemprop embedURL and video id spelled as 
`idVideo` - 'add_id': ['BrightcoveLegacy'], - 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/', - 'info_dict': { - 'id': '5255628253001', - 'ext': 'mp4', - 'title': 'md5:37c519b1128915607601e75a87995fc0', - 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26', - 'uploader': 'BFM BUSINESS', - 'uploader_id': '876450612001', - 'timestamp': 1482255315, - 'upload_date': '20161220', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # https://github.com/ytdl-org/youtube-dl/issues/2253 - 'url': 'http://bcove.me/i6nfkrc3', - 'md5': '0ba9446db037002366bab3b3eb30c88c', - 'info_dict': { - 'id': '3101154703001', - 'ext': 'mp4', - 'title': 'Still no power', - 'uploader': 'thestar.com', - 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', - }, - 'add_ie': ['BrightcoveLegacy'], - 'skip': 'video gone', - }, - { - 'url': 'http://www.championat.com/video/football/v/87/87499.html', - 'md5': 'fb973ecf6e4a78a67453647444222983', - 'info_dict': { - 'id': '3414141473001', - 'ext': 'mp4', - 'title': 'Видео. 
Удаление Дзагоева (ЦСКА)', - 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"', - 'uploader': 'Championat', - }, - }, - { - # https://github.com/ytdl-org/youtube-dl/issues/3541 - 'add_ie': ['BrightcoveLegacy'], - 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', - 'info_dict': { - 'id': '3866516442001', - 'ext': 'mp4', - 'title': 'Leer mij vrouwen kennen: Aflevering 1', - 'description': 'Leer mij vrouwen kennen: Aflevering 1', - 'uploader': 'SBS Broadcasting', - }, - 'skip': 'Restricted to Netherlands', - 'params': { - 'skip_download': True, # m3u8 download - }, - }, - { - # Brightcove video in <iframe> - 'url': 'http://www.un.org/chinese/News/story.asp?NewsID=27724', - 'md5': '36d74ef5e37c8b4a2ce92880d208b968', - 'info_dict': { - 'id': '5360463607001', - 'ext': 'mp4', - 'title': '叙利亚失明儿童在废墟上演唱《心跳》 呼吁获得正常童年生活', - 'description': '联合国儿童基金会中东和北非区域大使、作曲家扎德·迪拉尼(Zade Dirani)在3月15日叙利亚冲突爆发7周年纪念日之际发布了为叙利亚谱写的歌曲《心跳》(HEARTBEAT),为受到六年冲突影响的叙利亚儿童发出强烈呐喊,呼吁世界做出共同努力,使叙利亚儿童重新获得享有正常童年生活的权利。', - 'uploader': 'United Nations', - 'uploader_id': '1362235914001', - 'timestamp': 1489593889, - 'upload_date': '20170315', - }, - 'add_ie': ['BrightcoveLegacy'], - }, - { - # Brightcove with alternative playerID key - 'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html', - 'info_dict': { - 'id': 'nmeth.2062_SV1', - 'title': 'Simultaneous multiview imaging of the Drosophila syncytial blastoderm : Quantitative high-speed imaging of entire developing embryos with simultaneous multiview light-sheet microscopy : Nature Methods : Nature Research', - }, - 'playlist': [{ - 'info_dict': { - 'id': '2228375078001', - 'ext': 'mp4', - 'title': 'nmeth.2062-sv1', - 'description': 'nmeth.2062-sv1', - 'timestamp': 1363357591, - 'upload_date': '20130315', - 'uploader': 'Nature Publishing Group', - 'uploader_id': '1964492299001', - }, - }], - }, - { - # Brightcove with UUID in videoPlayer - 'url': 'http://www8.hp.com/cn/zh/home.html', - 
'info_dict': { - 'id': '5255815316001', - 'ext': 'mp4', - 'title': 'Sprocket Video - China', - 'description': 'Sprocket Video - China', - 'uploader': 'HP-Video Gallery', - 'timestamp': 1482263210, - 'upload_date': '20161220', - 'uploader_id': '1107601872001', - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - 'skip': 'video rotates...weekly?', - }, - { - # Brightcove:new type [2]. - 'url': 'http://www.delawaresportszone.com/video-st-thomas-more-earns-first-trip-to-basketball-semis', - 'md5': '2b35148fcf48da41c9fb4591650784f3', - 'info_dict': { - 'id': '5348741021001', - 'ext': 'mp4', - 'upload_date': '20170306', - 'uploader_id': '4191638492001', - 'timestamp': 1488769918, - 'title': 'VIDEO: St. Thomas More earns first trip to basketball semis', - - }, - }, - { - # Alternative brightcove <video> attributes - 'url': 'http://www.programme-tv.net/videos/extraits/81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche/', - 'info_dict': { - 'id': '81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche', - 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche, Extraits : toutes les vidéos avec Télé-Loisirs", - }, - 'playlist': [{ - 'md5': '732d22ba3d33f2f3fc253c39f8f36523', - 'info_dict': { - 'id': '5311302538001', - 'ext': 'mp4', - 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche", - 'description': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche (France 2, 5 février 2017)", - 'timestamp': 1486321708, - 'upload_date': '20170205', - 'uploader_id': '800000640001', - }, - 'only_matching': True, - }], - }, - { - # Brightcove with UUID in videoPlayer - 'url': 'http://www8.hp.com/cn/zh/home.html', - 'info_dict': { - 'id': '5255815316001', - 'ext': 'mp4', - 'title': 
'Sprocket Video - China', - 'description': 'Sprocket Video - China', - 'uploader': 'HP-Video Gallery', - 'timestamp': 1482263210, - 'upload_date': '20161220', - 'uploader_id': '1107601872001', - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - }, # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', @@ -947,45 +652,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, - # YouTube <object> embed - { - 'url': 'http://www.improbable.com/2017/04/03/untrained-modern-youths-and-ancient-masters-in-selfie-portraits/', - 'md5': '516718101ec834f74318df76259fb3cc', - 'info_dict': { - 'id': 'msN87y-iEx0', - 'ext': 'webm', - 'title': 'Feynman: Mirrors FUN TO IMAGINE 6', - 'upload_date': '20080526', - 'description': 'md5:0ffc78ea3f01b2e2c247d5f8d1d3c18d', - 'uploader': 'Christopher Sykes', - 'uploader_id': 'ChristopherJSykes', - }, - 'add_ie': ['Youtube'], - }, - # Camtasia studio - { - 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/', - 'playlist': [{ - 'md5': '0c5e352edabf715d762b0ad4e6d9ee67', - 'info_dict': { - 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', - 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1', - 'ext': 'flv', - 'duration': 2235.90, - } - }, { - 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63', - 'info_dict': { - 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP', - 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip', - 'ext': 'flv', - 'duration': 2235.93, - } - }], - 'info_dict': { - 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', - } - }, # Flowplayer { 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html', @@ -998,20 +664,6 @@ class GenericIE(InfoExtractor): 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com', } }, - # Multiple brightcove videos - # https://github.com/ytdl-org/youtube-dl/issues/2283 - { - 'url': 
'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html', - 'info_dict': { - 'id': 'always-never', - 'title': 'Always / Never - The New Yorker', - }, - 'playlist_count': 3, - 'params': { - 'extract_flat': False, - 'skip_download': True, - } - }, # MLB embed { 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/', @@ -1027,36 +679,6 @@ class GenericIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', }, }, - # Wistia embed - { - 'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', - 'md5': '1953f3a698ab51cfc948ed3992a0b7ff', - 'info_dict': { - 'id': '6e2wtrbdaf', - 'ext': 'mov', - 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england', - 'description': 'a Paywall Videos video from Remilon', - 'duration': 644.072, - 'uploader': 'study.com', - 'timestamp': 1459678540, - 'upload_date': '20160403', - 'filesize': 24687186, - }, - }, - { - 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz', - 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4', - 'info_dict': { - 'id': 'uxjb0lwrcz', - 'ext': 'mp4', - 'title': 'Conversation about Hexagonal Rails Part 1', - 'description': 'a Martin Fowler video from ThoughtWorks', - 'duration': 1715.0, - 'uploader': 'thoughtworks.wistia.com', - 'timestamp': 1401832161, - 'upload_date': '20140603', - }, - }, # Wistia standard embed (async) { 'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/', @@ -1071,7 +693,8 @@ class GenericIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'webpage 404 not found', }, # Soundcloud embed { @@ -1254,18 +877,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, - { - # JWPlatform iframe - 'url': 'https://www.covermagazine.co.uk/feature/2465255/business-protection-involved', - 'info_dict': { - 'id': 'AG26UQXM', - 'ext': 'mp4', - 'upload_date': '20160719', - 'timestamp': 468923808, - 
'title': '2016_05_18 Cover L&G Business Protection V1 FINAL.mp4', - }, - 'add_ie': [JWPlatformIE.ie_key()], - }, { # Video.js embed, multiple formats 'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html', @@ -1545,21 +1156,6 @@ class GenericIE(InfoExtractor): }, 'expected_warnings': ['Failed to parse JSON Expecting value'], }, - # Brightcove URL in single quotes - { - 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/', - 'md5': '4ae374f1f8b91c889c4b9203c8c752af', - 'info_dict': { - 'id': '4255764656001', - 'ext': 'mp4', - 'title': 'SN Presents: Russell Martin, World Citizen', - 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.', - 'uploader': 'Rogers Sportsnet', - 'uploader_id': '1704050871', - 'upload_date': '20150525', - 'timestamp': 1432570283, - }, - }, # Kinja embed { 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', @@ -1595,52 +1191,6 @@ class GenericIE(InfoExtractor): 'duration': 248.667, }, }, - # BrightcoveInPageEmbed embed - { - 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', - 'info_dict': { - 'id': '4238694884001', - 'ext': 'flv', - 'title': 'Tabletop: Dread, Last Thoughts', - 'description': 'Tabletop: Dread, Last Thoughts', - 'duration': 51690, - }, - }, - # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions' - # This video can't be played in browsers if Flash disabled and UA set to iPhone, which is actually a false alarm - { - 'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html', - 'info_dict': { - 'id': '4785848093001', - 'ext': 'mp4', - 'title': 'The Cardinal Pell Interview', - 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence 
before the Royal Commission into Child Abuse. ', - 'uploader': 'GlobeCast Australia - GlobeStream', - 'uploader_id': '2733773828001', - 'upload_date': '20160304', - 'timestamp': 1457083087, - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - }, - { - # Brightcove embed with whitespace around attribute names - 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill', - 'info_dict': { - 'id': '3167554373001', - 'ext': 'mp4', - 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill", - 'description': 'md5:57bacb0e0f29349de4972bfda3191713', - 'uploader_id': '1079349493', - 'upload_date': '20140207', - 'timestamp': 1391810548, - }, - 'params': { - 'skip_download': True, - }, - }, # Another form of arte.tv embed { 'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html', @@ -1691,7 +1241,7 @@ class GenericIE(InfoExtractor): 'timestamp': 1464107587, 'uploader': 'TheAtlantic', }, - 'add_ie': ['BrightcoveLegacy'], + 'skip': 'Private Youtube video', }, # Facebook <iframe> embed { @@ -1800,7 +1350,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [ArkenaIE.ie_key()], + 'add_ie': ['Arkena'], }, { 'url': 'http://nova.bg/news/view/2016/08/16/156543/%D0%BD%D0%B0-%D0%BA%D0%BE%D1%81%D1%8A%D0%BC-%D0%BE%D1%82-%D0%B2%D0%B7%D1%80%D0%B8%D0%B2-%D0%BE%D1%82%D1%86%D0%B5%D0%BF%D0%B8%D1%85%D0%B0-%D1%86%D1%8F%D0%BB-%D0%BA%D0%B2%D0%B0%D1%80%D1%82%D0%B0%D0%BB-%D0%B7%D0%B0%D1%80%D0%B0%D0%B4%D0%B8-%D0%B8%D0%B7%D1%82%D0%B8%D1%87%D0%B0%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%B3%D0%B0%D0%B7-%D0%B2-%D0%BF%D0%BB%D0%BE%D0%B2%D0%B4%D0%B8%D0%B2/', @@ -1812,7 +1362,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [Vbox7IE.ie_key()], + 'add_ie': ['Vbox7'], }, { # DBTV embeds @@ -1844,7 +1394,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': 
[TwentyMinutenIE.ie_key()], + 'add_ie': ['TwentyMinuten'], }, { # VideoPress embed @@ -1859,7 +1409,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [VideoPressIE.ie_key()], + 'add_ie': ['VideoPress'], }, { # Rutube embed @@ -1876,7 +1426,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [RutubeIE.ie_key()], + 'add_ie': ['Rutube'], }, { # glomex:embed @@ -1948,7 +1498,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Integrated Senate Video Player', }, - 'add_ie': [SenateISVPIE.ie_key()], + 'add_ie': ['SenateISVP'], }, { # Limelight embeds (1 channel embed + 4 media embeds) @@ -1995,7 +1545,7 @@ class GenericIE(InfoExtractor): 'uploader': 'The Washington Post', 'upload_date': '20160211', }, - 'add_ie': [WashingtonPostIE.ie_key()], + 'add_ie': ['WashingtonPost'], }, { # Mediaset embed @@ -2008,7 +1558,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [MediasetIE.ie_key()], + 'add_ie': ['Mediaset'], }, { # JOJ.sk embeds @@ -2018,7 +1568,7 @@ class GenericIE(InfoExtractor): 'title': 'Slovenskom sa prehnala vlna silných búrok', }, 'playlist_mincount': 5, - 'add_ie': [JojIE.ie_key()], + 'add_ie': ['Joj'], }, { # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video) @@ -2084,7 +1634,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [SpringboardPlatformIE.ie_key()], + 'add_ie': ['SpringboardPlatform'], }, { 'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html', @@ -2093,7 +1643,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Котята', }, - 'add_ie': [YapFilesIE.ie_key()], + 'add_ie': ['YapFiles'], 'params': { 'skip_download': True, }, @@ -2106,7 +1656,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': '31c9291ab41fac05471db4e73aa11717', }, - 'add_ie': [CloudflareStreamIE.ie_key()], + 'add_ie': ['CloudflareStream'], 'params': { 
'skip_download': True, }, @@ -2133,7 +1683,7 @@ class GenericIE(InfoExtractor): 'uploader': 'StreetKitchen', 'uploader_id': '546363', }, - 'add_ie': [IndavideoEmbedIE.ie_key()], + 'add_ie': ['IndavideoEmbed'], 'params': { 'skip_download': True, }, @@ -2174,22 +1724,6 @@ class GenericIE(InfoExtractor): }, 'playlist_count': 6, }, - { - # Squarespace video embed, 2019-08-28 - 'url': 'http://ootboxford.com', - 'info_dict': { - 'id': 'Tc7b_JGdZfw', - 'title': 'Out of the Blue, at Childish Things 10', - 'ext': 'mp4', - 'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f', - 'uploader_id': 'helendouglashouse', - 'uploader': 'Helen & Douglas House', - 'upload_date': '20140328', - }, - 'params': { - 'skip_download': True, - }, - }, # { # # Zype embed # 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', @@ -2508,10 +2042,10 @@ class GenericIE(InfoExtractor): # Panopto embeds 'url': 'https://www.monash.edu/learning-teaching/teachhq/learning-technologies/panopto/how-to/insert-a-quiz-into-a-panopto-video', 'info_dict': { - 'title': 'Insert a quiz into a Panopto video', - 'id': 'insert-a-quiz-into-a-panopto-video' + 'ext': 'mp4', + 'id': '0bd3f16c-824a-436a-8486-ac5900693aef', + 'title': 'Quizzes in Panopto', }, - 'playlist_count': 1 }, { # Ruutu embed @@ -2530,114 +2064,178 @@ class GenericIE(InfoExtractor): 'upload_date': '20220308', }, }, + { + # Multiple Ruutu embeds + 'url': 'https://www.hs.fi/kotimaa/art-2000008762560.html', + 'info_dict': { + 'title': 'Koronavirus | Epidemiahuippu voi olla Suomessa ohi, mutta koronaviruksen poistamista yleisvaarallisten tautien joukosta harkitaan vasta syksyllä', + 'id': 'art-2000008762560' + }, + 'playlist_count': 3 + }, + { + # Ruutu embed in hs.fi with a single video + 'url': 'https://www.hs.fi/kotimaa/art-2000008793421.html', + 'md5': 'f8964e65d8fada6e8a562389bf366bb4', + 'info_dict': { + 'id': '4081841', + 'ext': 'mp4', + 'title': 'Puolustusvoimat siirsi panssariajoneuvoja harjoituksiin Niinisaloon 
2.5.2022', + 'thumbnail': r're:^https?://.+\.jpg$', + 'duration': 138, + 'age_limit': 0, + 'upload_date': '20220504', + }, + }, + { + # Webpage contains double BOM + 'url': 'https://www.filmarkivet.se/movies/paris-d-moll/', + 'md5': 'df02cadc719dcc63d43288366f037754', + 'info_dict': { + 'id': 'paris-d-moll', + 'ext': 'mp4', + 'upload_date': '20220518', + 'title': 'Paris d-moll', + 'description': 'md5:319e37ea5542293db37e1e13072fe330', + 'thumbnail': 'https://www.filmarkivet.se/wp-content/uploads/parisdmoll2.jpg', + 'timestamp': 1652833414, + 'age_limit': 0, + } + }, + { + 'url': 'https://www.mollymovieclub.com/p/interstellar?s=r#details', + 'md5': '198bde8bed23d0b23c70725c83c9b6d9', + 'info_dict': { + 'id': '53602801', + 'ext': 'mpga', + 'title': 'Interstellar', + 'description': 'Listen now | Episode One', + 'thumbnail': 'md5:c30d9c83f738e16d8551d7219d321538', + 'uploader': 'Molly Movie Club', + 'uploader_id': '839621', + }, + }, + { + 'url': 'https://www.blockedandreported.org/p/episode-117-lets-talk-about-depp?s=r', + 'md5': 'c0cc44ee7415daeed13c26e5b56d6aa0', + 'info_dict': { + 'id': '57962052', + 'ext': 'mpga', + 'title': 'md5:855b2756f0ee10f6723fa00b16266f8d', + 'description': 'md5:fe512a5e94136ad260c80bde00ea4eef', + 'thumbnail': 'md5:2218f27dfe517bb5ac16c47d0aebac59', + 'uploader': 'Blocked and Reported', + 'uploader_id': '500230', + }, + }, + { + 'url': 'https://www.skimag.com/video/ski-people-1980/', + 'md5': '022a7e31c70620ebec18deeab376ee03', + 'info_dict': { + 'id': 'YTmgRiNU', + 'ext': 'mp4', + 'title': '1980 Ski People', + 'timestamp': 1610407738, + 'description': 'md5:cf9c3d101452c91e141f292b19fe4843', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/YTmgRiNU/poster.jpg?width=720', + 'duration': 5688.0, + 'upload_date': '20210111', + } + }, + { + 'note': 'JSON LD with multiple @type', + 'url': 'https://www.nu.nl/280161/video/hoe-een-bladvlo-dit-verwoestende-japanse-onkruid-moet-vernietigen.html', + 'md5': 'c7949f34f57273013fb7ccb1156393db', + 
'info_dict': { + 'id': 'ipy2AcGL', + 'ext': 'mp4', + 'description': 'md5:6a9d644bab0dc2dc06849c2505d8383d', + 'thumbnail': r're:https://media\.nu\.nl/m/.+\.jpg', + 'title': 'Hoe een bladvlo dit verwoestende Japanse onkruid moet vernietigen', + 'timestamp': 1586577474, + 'upload_date': '20200411', + 'age_limit': 0, + 'duration': 111.0, + } + }, + { + 'note': 'JSON LD with unexpected data type', + 'url': 'https://www.autoweek.nl/autotests/artikel/porsche-911-gt3-rs-rij-impressie-2/', + 'info_dict': { + 'id': 'porsche-911-gt3-rs-rij-impressie-2', + 'ext': 'mp4', + 'title': 'Test: Porsche 911 GT3 RS', + 'description': 'Je ziet het niet, maar het is er wel. Downforce, hebben we het dan over. En in de nieuwe Porsche 911 GT3 RS is er zelfs heel veel downforce.', + 'timestamp': 1664920902, + 'upload_date': '20221004', + 'thumbnail': r're:^https://media.autoweek.nl/m/.+\.jpg$', + 'age_limit': 0, + 'direct': True, + } + } ] def report_following_redirect(self, new_url): """Report information extraction.""" self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) - def report_detected(self, name): - self._downloader.write_debug(f'Identified a {name}') + def report_detected(self, name, num=1, note=None): + if num > 1: + name += 's' + elif not num: + return + else: + num = 'a' - def _extract_rss(self, url, video_id, doc): - playlist_title = doc.find('./channel/title').text - playlist_desc_el = doc.find('./channel/description') - playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text + self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}') + def _fragment_query(self, url): + if self._configuration_arg('fragment_query'): + query_string = urllib.parse.urlparse(url).query + if query_string: + return {'extra_param_to_segment_url': query_string} + return {} + + def _extract_rss(self, url, video_id, doc): NS_MAP = { 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', } entries = [] for it in 
doc.findall('./channel/item'): - next_url = None - enclosure_nodes = it.findall('./enclosure') - for e in enclosure_nodes: - next_url = e.attrib.get('url') - if next_url: - break - - if not next_url: - next_url = xpath_text(it, 'link', fatal=False) - + next_url = next( + (e.attrib.get('url') for e in it.findall('./enclosure')), + xpath_text(it, 'link', fatal=False)) if not next_url: continue - if it.find('guid').text is not None: - next_url = smuggle_url(next_url, {'force_videoid': it.find('guid').text}) + guid = try_call(lambda: it.find('guid').text) + if guid: + next_url = smuggle_url(next_url, {'force_videoid': guid}) def itunes(key): - return xpath_text( - it, xpath_with_ns('./itunes:%s' % key, NS_MAP), - default=None) - - duration = itunes('duration') - explicit = (itunes('explicit') or '').lower() - if explicit in ('true', 'yes'): - age_limit = 18 - elif explicit in ('false', 'no'): - age_limit = 0 - else: - age_limit = None + return xpath_text(it, xpath_with_ns(f'./itunes:{key}', NS_MAP), default=None) entries.append({ '_type': 'url_transparent', 'url': next_url, - 'title': it.find('title').text, + 'title': try_call(lambda: it.find('title').text), 'description': xpath_text(it, 'description', default=None), - 'timestamp': unified_timestamp( - xpath_text(it, 'pubDate', default=None)), - 'duration': int_or_none(duration) or parse_duration(duration), + 'timestamp': unified_timestamp(xpath_text(it, 'pubDate', default=None)), + 'duration': parse_duration(itunes('duration')), 'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')), 'episode': itunes('title'), 'episode_number': int_or_none(itunes('episode')), 'season_number': int_or_none(itunes('season')), - 'age_limit': age_limit, + 'age_limit': {'true': 18, 'yes': 18, 'false': 0, 'no': 0}.get((itunes('explicit') or '').lower()), }) return { '_type': 'playlist', 'id': url, - 'title': playlist_title, - 'description': playlist_desc, - 'entries': entries, - } - - def 
_extract_camtasia(self, url, video_id, webpage): - """ Returns None if no camtasia video can be found. """ - - camtasia_cfg = self._search_regex( - r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);', - webpage, 'camtasia configuration file', default=None) - if camtasia_cfg is None: - return None - - title = self._html_search_meta('DC.title', webpage, fatal=True) - - camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg) - camtasia_cfg = self._download_xml( - camtasia_url, video_id, - note='Downloading camtasia configuration', - errnote='Failed to download camtasia configuration') - fileset_node = camtasia_cfg.find('./playlist/array/fileset') - - entries = [] - for n in fileset_node.getchildren(): - url_n = n.find('./uri') - if url_n is None: - continue - - entries.append({ - 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0], - 'title': '%s - %s' % (title, n.tag), - 'url': compat_urlparse.urljoin(url, url_n.text), - 'duration': float_or_none(n.find('./duration').text), - }) - - return { - '_type': 'playlist', + 'title': try_call(lambda: doc.find('./channel/title').text), + 'description': try_call(lambda: doc.find('./channel/description').text), 'entries': entries, - 'title': title, } def _kvs_getrealurl(self, video_url, license_code): @@ -2651,7 +2249,7 @@ class GenericIE(InfoExtractor): for o in range(len(newmagic) - 1, -1, -1): new = '' - l = (o + sum([int(n) for n in license[o:]])) % 32 + l = (o + sum(int(n) for n in license[o:])) % 32 for i in range(0, len(newmagic)): if i == o: @@ -2682,7 +2280,7 @@ class GenericIE(InfoExtractor): if url.startswith('//'): return self.url_result(self.http_scheme() + url) - parsed_url = compat_urlparse.urlparse(url) + parsed_url = urllib.parse.urlparse(url) if not parsed_url.scheme: default_search = self.get_param('default_search') if default_search is None: @@ -2713,59 +2311,59 @@ class GenericIE(InfoExtractor): default_search += ':' return self.url_result(default_search + url) - url, smuggled_data = 
unsmuggle_url(url) + original_url = url + url, smuggled_data = unsmuggle_url(url, {}) force_videoid = None - is_intentional = smuggled_data and smuggled_data.get('to_generic') - if smuggled_data and 'force_videoid' in smuggled_data: + is_intentional = smuggled_data.get('to_generic') + if 'force_videoid' in smuggled_data: force_videoid = smuggled_data['force_videoid'] video_id = force_videoid else: video_id = self._generic_id(url) - self.to_screen('%s: Requesting header' % video_id) - - head_req = HEADRequest(url) - head_response = self._request_webpage( - head_req, video_id, - note=False, errnote='Could not send HEAD request to %s' % url, - fatal=False) - - if head_response is not False: - # Check for redirect - new_url = head_response.geturl() - if url != new_url: - self.report_following_redirect(new_url) - if force_videoid: - new_url = smuggle_url( - new_url, {'force_videoid': force_videoid}) - return self.url_result(new_url) - - full_response = None - if head_response is False: - request = sanitized_Request(url) - request.add_header('Accept-Encoding', '*') - full_response = self._request_webpage(request, video_id) - head_response = full_response + # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) + # making it impossible to download only chunk of the file (yet we need only 512kB to + # test whether it's HTML or not). According to hypervideo default Accept-Encoding + # that will always result in downloading the whole file that is not desirable. + # Therefore for extraction pass we have to override Accept-Encoding to any in order + # to accept raw bytes and being able to download only a chunk. + # It may probably better to solve this by checking Content-Type for application/octet-stream + # after a HEAD request, but not sure if we can rely on this. 
+ full_response = self._request_webpage(url, video_id, headers={ + 'Accept-Encoding': '*', + **smuggled_data.get('http_headers', {}) + }) + new_url = full_response.geturl() + if new_url == urllib.parse.urlparse(url)._replace(scheme='https').geturl(): + url = new_url + elif url != new_url: + self.report_following_redirect(new_url) + if force_videoid: + new_url = smuggle_url(new_url, {'force_videoid': force_videoid}) + return self.url_result(new_url) info_dict = { 'id': video_id, 'title': self._generic_title(url), - 'timestamp': unified_timestamp(head_response.headers.get('Last-Modified')) + 'timestamp': unified_timestamp(full_response.headers.get('Last-Modified')) } # Check for direct link to a video - content_type = head_response.headers.get('Content-Type', '').lower() + content_type = full_response.headers.get('Content-Type', '').lower() m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: self.report_detected('direct video link') - format_id = compat_str(m.group('format_id')) + headers = smuggled_data.get('http_headers', {}) + format_id = str(m.group('format_id')) subtitles = {} if format_id.endswith('mpegurl'): - formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers) + info_dict.update(self._fragment_query(url)) elif format_id.endswith('mpd') or format_id.endswith('dash+xml'): - formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id) + formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers) + info_dict.update(self._fragment_query(url)) elif format_id == 'f4m': - formats = self._extract_f4m_formats(url, video_id) + formats = self._extract_f4m_formats(url, video_id, headers=headers) else: formats = [{ 'format_id': format_id, @@ -2773,28 +2371,16 @@ class GenericIE(InfoExtractor): 'vcodec': 'none' 
if m.group('type') == 'audio' else None }] info_dict['direct'] = True - self._sort_formats(formats) - info_dict['formats'] = formats - info_dict['subtitles'] = subtitles + info_dict.update({ + 'formats': formats, + 'subtitles': subtitles, + 'http_headers': headers, + }) return info_dict if not self.get_param('test', False) and not is_intentional: force = self.get_param('force_generic_extractor', False) - self.report_warning( - '%s on generic information extractor.' % ('Forcing' if force else 'Falling back')) - - if not full_response: - request = sanitized_Request(url) - # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) - # making it impossible to download only chunk of the file (yet we need only 512kB to - # test whether it's HTML or not). According to hypervideo default Accept-Encoding - # that will always result in downloading the whole file that is not desirable. - # Therefore for extraction pass we have to override Accept-Encoding to any in order - # to accept raw bytes and being able to download only a chunk. - # It may probably better to solve this by checking Content-Type for application/octet-stream - # after HEAD request finishes, but not sure if we can rely on this. - request.add_header('Accept-Encoding', '*') - full_response = self._request_webpage(request, video_id) + self.report_warning('%s generic information extractor' % ('Forcing' if force else 'Falling back on')) first_bytes = full_response.read(512) @@ -2802,7 +2388,7 @@ class GenericIE(InfoExtractor): if first_bytes.startswith(b'#EXTM3U'): self.report_detected('M3U playlist') info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') - self._sort_formats(info_dict['formats']) + info_dict.update(self._fragment_query(url)) return info_dict # Maybe it's a direct link to a video? 
@@ -2828,7 +2414,7 @@ class GenericIE(InfoExtractor): try: try: doc = compat_etree_fromstring(webpage) - except compat_xml_parse_error: + except xml.etree.ElementTree.ParseError: doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': self.report_detected('RSS feed') @@ -2836,12 +2422,10 @@ class GenericIE(InfoExtractor): elif doc.tag == 'SmoothStreamingMedia': info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url) self.report_detected('ISM manifest') - self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): smil = self._parse_smil(doc, url, video_id) self.report_detected('SMIL file') - self._sort_formats(smil['formats']) return smil elif doc.tag == '{http://xspf.org/ns/0/}playlist': self.report_detected('XSPF playlist') @@ -2855,947 +2439,83 @@ class GenericIE(InfoExtractor): doc, mpd_base_url=full_response.geturl().rpartition('/')[0], mpd_url=url) + info_dict.update(self._fragment_query(url)) self.report_detected('DASH manifest') - self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id) self.report_detected('F4M manifest') - self._sort_formats(info_dict['formats']) return info_dict - except compat_xml_parse_error: + except xml.etree.ElementTree.ParseError: pass - # Is it a Camtasia project? 
- camtasia_res = self._extract_camtasia(url, video_id, webpage) - if camtasia_res is not None: - self.report_detected('Camtasia video') - return camtasia_res + info_dict.update({ + # it's tempting to parse this further, but you would + # have to take into account all the variations like + # Video Title - Site Name + # Site Name | Video Title + # Video Title - Tagline | Site Name + # and so on and so forth; it's just not practical + 'title': self._generic_title('', webpage, default='video'), + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'age_limit': self._rta_search(webpage), + }) + + self._downloader.write_debug('Looking for embeds') + embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict)) + if len(embeds) == 1: + return {**info_dict, **embeds[0]} + elif embeds: + return self.playlist_result(embeds, **info_dict) + raise UnsupportedError(url) + + def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): + """Returns an iterator of video entries""" + info_dict = types.MappingProxyType(info_dict) # Prevents accidental mutation + video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url) + url, smuggled_data = unsmuggle_url(url, {}) + actual_url = urlh.geturl() if urlh else url # Sometimes embedded video player is hidden behind percent encoding # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448) # Unescaping the whole page allows to handle those cases in a generic way # FIXME: unescaping the whole page may break URLs, commenting out for now. # There probably should be a second run of generic extractor on unescaped webpage. 
- # webpage = compat_urllib_parse_unquote(webpage) - - # Unescape squarespace embeds to be detected by generic extractor, - # see https://github.com/ytdl-org/youtube-dl/issues/21294 - webpage = re.sub( - r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>', - lambda x: unescapeHTML(x.group(0)), webpage) - - # it's tempting to parse this further, but you would - # have to take into account all the variations like - # Video Title - Site Name - # Site Name | Video Title - # Video Title - Tagline | Site Name - # and so on and so forth; it's just not practical - video_title = (self._og_search_title(webpage, default=None) - or self._html_extract_title(webpage, 'video title', default='video')) - - # Try to detect age limit automatically - age_limit = self._rta_search(webpage) - # And then there are the jokers who advertise that they use RTA, - # but actually don't. - AGE_LIMIT_MARKERS = [ - r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', - ] - if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS): - age_limit = 18 - - # video uploader is domain name - video_uploader = self._search_regex( - r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') - - video_description = self._og_search_description(webpage, default=None) - video_thumbnail = self._og_search_thumbnail(webpage, default=None) - - info_dict.update({ - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'age_limit': age_limit, - }) - - self._downloader.write_debug('Looking for video embeds') - - # Look for Brightcove Legacy Studio embeds - bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) - if bc_urls: - entries = [{ - '_type': 'url', - 'url': smuggle_url(bc_url, {'Referer': url}), - 'ie_key': 'BrightcoveLegacy' - } for bc_url in bc_urls] - - return { - '_type': 'playlist', - 'title': video_title, - 'id': video_id, - 'entries': entries, - } - - # Look for Brightcove New Studio embeds - bc_urls = 
BrightcoveNewIE._extract_urls(self, webpage) - if bc_urls: - return self.playlist_from_matches( - bc_urls, video_id, video_title, - getter=lambda x: smuggle_url(x, {'referrer': url}), - ie='BrightcoveNew') - - # Look for Nexx embeds - nexx_urls = NexxIE._extract_urls(webpage) - if nexx_urls: - return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key()) - - # Look for Nexx iFrame embeds - nexx_embed_urls = NexxEmbedIE._extract_urls(webpage) - if nexx_embed_urls: - return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key()) - - # Look for ThePlatform embeds - tp_urls = ThePlatformIE._extract_urls(webpage) - if tp_urls: - return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') - - arc_urls = ArcPublishingIE._extract_urls(webpage) - if arc_urls: - return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key()) - - mychannels_urls = MedialaanIE._extract_urls(webpage) - if mychannels_urls: - return self.playlist_from_matches( - mychannels_urls, video_id, video_title, ie=MedialaanIE.ie_key()) - - # Look for embedded rtl.nl player - matches = re.findall( - r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', - webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') - - vimeo_urls = VimeoIE._extract_urls(url, webpage) - if vimeo_urls: - return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) - - vhx_url = VHXEmbedIE._extract_url(webpage) - if vhx_url: - return self.url_result(vhx_url, VHXEmbedIE.ie_key()) - - # Invidious Instances - # https://github.com/hypervideo/hypervideo/issues/195 - # https://github.com/iv-org/invidious/pull/1730 - youtube_url = self._search_regex( - r'<link rel="alternate" href="(https://www\.youtube\.com/watch\?v=[0-9A-Za-z_-]{11})"', - webpage, 'youtube link', default=None) - if 
youtube_url: - return self.url_result(youtube_url, YoutubeIE.ie_key()) - - # Look for YouTube embeds - youtube_urls = YoutubeIE._extract_urls(webpage) - if youtube_urls: - return self.playlist_from_matches( - youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key()) - - matches = DailymotionIE._extract_urls(webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title) - - # Look for embedded Dailymotion playlist player (#3822) - m = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage) - if m: - playlists = re.findall( - r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) - if playlists: - return self.playlist_from_matches( - playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) - - # Look for DailyMail embeds - dailymail_urls = DailyMailIE._extract_urls(webpage) - if dailymail_urls: - return self.playlist_from_matches( - dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key()) - - # Look for Teachable embeds, must be before Wistia - teachable_url = TeachableIE._extract_url(webpage, url) - if teachable_url: - return self.url_result(teachable_url) - - # Look for embedded Wistia player - wistia_urls = WistiaIE._extract_urls(webpage) - if wistia_urls: - playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key()) - for entry in playlist['entries']: - entry.update({ - '_type': 'url_transparent', - 'uploader': video_uploader, - }) - return playlist - - # Look for SVT player - svt_url = SVTIE._extract_url(webpage) - if svt_url: - return self.url_result(svt_url, 'SVT') - - # Look for Bandcamp pages with custom domain - mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) - if mobj is not None: - burl = unescapeHTML(mobj.group(1)) - # Don't set the extractor because it can be a track url or an album - return self.url_result(burl) - - # Look for embedded 
Vevo player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for embedded Viddler player - mobj = re.search( - r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for NYTimes player - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Libsyn player - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Ooyala videos - mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) - or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) - or re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) - or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) - or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) - if mobj is not None: - embed_token = self._search_regex( - r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)', - webpage, 'ooyala embed token', default=None) - return OoyalaIE._build_url_result(smuggle_url( - mobj.group('ec'), { - 'domain': url, - 'embed_token': embed_token, - })) - - # Look for multiple Ooyala embeds on SBN network websites - mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) - if mobj is not None: - embeds = self._parse_json(mobj.group(1), video_id, fatal=False) - if embeds: - return self.playlist_from_matches( - embeds, video_id, video_title, 
- getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') - - # Look for Aparat videos - mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) - if mobj is not None: - return self.url_result(mobj.group(1), 'Aparat') - - # Look for MPORA videos - mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage) - if mobj is not None: - return self.url_result(mobj.group(1), 'Mpora') - - # Look for embedded Facebook player - facebook_urls = FacebookIE._extract_urls(webpage) - if facebook_urls: - return self.playlist_from_matches(facebook_urls, video_id, video_title) - - # Look for embedded VK player - mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'VK') + # webpage = urllib.parse.unquote(webpage) - # Look for embedded Odnoklassniki player - odnoklassniki_url = OdnoklassnikiIE._extract_url(webpage) - if odnoklassniki_url: - return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) - - # Look for sibnet embedded player - sibnet_urls = VKIE._extract_sibnet_urls(webpage) - if sibnet_urls: - return self.playlist_from_matches(sibnet_urls, video_id, video_title) - - # Look for embedded ivi player - mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Ivi') - - # Look for embedded Huffington Post player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'HuffPost') - - # Look for embed.ly - mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - mobj = 
re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage) - if mobj is not None: - return self.url_result(compat_urllib_parse_unquote(mobj.group('url'))) - - # Look for funnyordie embed - matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage) - if matches: - return self.playlist_from_matches( - matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie') - - # Look for Simplecast embeds - simplecast_urls = SimplecastIE._extract_urls(webpage) - if simplecast_urls: - return self.playlist_from_matches( - simplecast_urls, video_id, video_title) - - # Look for BBC iPlayer embed - matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk') - - # Look for embedded RUTV player - rutv_url = RUTVIE._extract_url(webpage) - if rutv_url: - return self.url_result(rutv_url, 'RUTV') - - # Look for embedded TVC player - tvc_url = TVCIE._extract_url(webpage) - if tvc_url: - return self.url_result(tvc_url, 'TVC') - - # Look for embedded SportBox player - sportbox_urls = SportBoxIE._extract_urls(webpage) - if sportbox_urls: - return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key()) - - # Look for embedded XHamster player - xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) - if xhamster_urls: - return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed') - - # Look for embedded TNAFlixNetwork player - tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage) - if tnaflix_urls: - return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key()) - - # Look for embedded PornHub player - pornhub_urls = PornHubIE._extract_urls(webpage) - if pornhub_urls: - return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key()) - 
- # Look for embedded DrTuber player - drtuber_urls = DrTuberIE._extract_urls(webpage) - if drtuber_urls: - return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key()) - - # Look for embedded RedTube player - redtube_urls = RedTubeIE._extract_urls(webpage) - if redtube_urls: - return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key()) - - # Look for embedded Tube8 player - tube8_urls = Tube8IE._extract_urls(webpage) - if tube8_urls: - return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key()) - - # Look for embedded Mofosex player - mofosex_urls = MofosexEmbedIE._extract_urls(webpage) - if mofosex_urls: - return self.playlist_from_matches(mofosex_urls, video_id, video_title, ie=MofosexEmbedIE.ie_key()) - - # Look for embedded Spankwire player - spankwire_urls = SpankwireIE._extract_urls(webpage) - if spankwire_urls: - return self.playlist_from_matches(spankwire_urls, video_id, video_title, ie=SpankwireIE.ie_key()) - - # Look for embedded YouPorn player - youporn_urls = YouPornIE._extract_urls(webpage) - if youporn_urls: - return self.playlist_from_matches(youporn_urls, video_id, video_title, ie=YouPornIE.ie_key()) - - # Look for embedded Tvigle player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Tvigle') - - # Look for embedded TED player - ted_urls = TedEmbedIE._extract_urls(webpage) - if ted_urls: - return self.playlist_from_matches(ted_urls, video_id, video_title, ie=TedEmbedIE.ie_key()) - - # Look for embedded Ustream videos - ustream_url = UstreamIE._extract_url(webpage) - if ustream_url: - return self.url_result(ustream_url, UstreamIE.ie_key()) - - # Look for embedded arte.tv player - arte_urls = ArteTVEmbedIE._extract_urls(webpage) - if arte_urls: - return self.playlist_from_matches(arte_urls, video_id, video_title) - - # Look 
for embedded francetv player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for embedded Myvi.ru player - myvi_url = MyviIE._extract_url(webpage) - if myvi_url: - return self.url_result(myvi_url) - - # Look for embedded soundcloud player - soundcloud_urls = SoundcloudEmbedIE._extract_urls(webpage) - if soundcloud_urls: - return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML) - - # Look for tunein player - tunein_urls = TuneInBaseIE._extract_urls(webpage) - if tunein_urls: - return self.playlist_from_matches(tunein_urls, video_id, video_title) - - # Look for embedded mtvservices player - mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) - if mtvservices_url: - return self.url_result(mtvservices_url, ie='MTVServicesEmbedded') - - # Look for embedded yahoo player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Yahoo') - - # Look for embedded sbs.com.au player - mobj = re.search( - r'''(?x) - (?: - <meta\s+property="og:video"\s+content=| - <iframe[^>]+?src= - ) - (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'SBS') - - # Look for embedded Cinchcast player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Cinchcast') - - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1', - webpage) - if not mobj: - mobj = re.search( - r'data-video-link=["\'](?P<url>http://m\.mlb\.com/video/[^"\']+)', - webpage) - if mobj is not None: - return 
self.url_result(mobj.group('url'), 'MLB') - - mobj = re.search( - r'<(?:iframe|script)[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL, - webpage) - if mobj is not None: - return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast') - - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Livestream') - - # Look for Zapiks embed - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Zapiks') - - # Look for Kaltura embeds - kaltura_urls = KalturaIE._extract_urls(webpage) - if kaltura_urls: - return self.playlist_from_matches( - kaltura_urls, video_id, video_title, - getter=lambda x: smuggle_url(x, {'source_url': url}), - ie=KalturaIE.ie_key()) - - # Look for EaglePlatform embeds - eagleplatform_url = EaglePlatformIE._extract_url(webpage) - if eagleplatform_url: - return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key()) - - # Look for ClipYou (uses EaglePlatform) embeds - mobj = re.search( - r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) - if mobj is not None: - return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform') - - # Look for Pladform embeds - pladform_url = PladformIE._extract_url(webpage) - if pladform_url: - return self.url_result(pladform_url) - - # Look for Videomore embeds - videomore_url = VideomoreIE._extract_url(webpage) - if videomore_url: - return self.url_result(videomore_url) - - # Look for Webcaster embeds - webcaster_url = WebcasterFeedIE._extract_url(self, webpage) - if webcaster_url: - return self.url_result(webcaster_url, ie=WebcasterFeedIE.ie_key()) - - # Look for Playwire embeds - mobj = re.search( - 
r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Crooks and Liars embeds - mobj = re.search( - r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for NBC Sports VPlayer embeds - nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) - if nbc_sports_url: - return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') - - # Look for NBC News embeds - nbc_news_embed_url = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1', webpage) - if nbc_news_embed_url: - return self.url_result(nbc_news_embed_url.group('url'), 'NBCNews') - - # Look for Google Drive embeds - google_drive_url = GoogleDriveIE._extract_url(webpage) - if google_drive_url: - return self.url_result(google_drive_url, 'GoogleDrive') - - # Look for UDN embeds - mobj = re.search( - r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) - if mobj is not None: - return self.url_result( - compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed') - - # Look for Senate ISVP iframe - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) - if senate_isvp_url: - return self.url_result(senate_isvp_url, 'SenateISVP') - - # Look for Kinja embeds - kinja_embed_urls = KinjaEmbedIE._extract_urls(webpage, url) - if kinja_embed_urls: - return self.playlist_from_matches( - kinja_embed_urls, video_id, video_title) - - # Look for OnionStudios embeds - onionstudios_url = OnionStudiosIE._extract_url(webpage) - if onionstudios_url: - return self.url_result(onionstudios_url) - - # Look for Blogger embeds - blogger_urls = BloggerIE._extract_urls(webpage) - if blogger_urls: - return self.playlist_from_matches(blogger_urls, video_id, video_title, 
ie=BloggerIE.ie_key()) - - # Look for ViewLift embeds - viewlift_url = ViewLiftEmbedIE._extract_url(webpage) - if viewlift_url: - return self.url_result(viewlift_url) - - # Look for JWPlatform embeds - jwplatform_urls = JWPlatformIE._extract_urls(webpage) - if jwplatform_urls: - return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key()) - - # Look for Digiteka embeds - digiteka_url = DigitekaIE._extract_url(webpage) - if digiteka_url: - return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) - - # Look for Arkena embeds - arkena_url = ArkenaIE._extract_url(webpage) - if arkena_url: - return self.url_result(arkena_url, ArkenaIE.ie_key()) - - # Look for Piksel embeds - piksel_url = PikselIE._extract_url(webpage) - if piksel_url: - return self.url_result(piksel_url, PikselIE.ie_key()) - - # Look for Limelight embeds - limelight_urls = LimelightBaseIE._extract_urls(webpage, url) - if limelight_urls: - return self.playlist_result( - limelight_urls, video_id, video_title, video_description) - - # Look for Anvato embeds - anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id) - if anvato_urls: - return self.playlist_result( - anvato_urls, video_id, video_title, video_description) - - # Look for AdobeTVVideo embeds - mobj = re.search( - r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group(1))), - 'AdobeTVVideo') - - # Look for Vine embeds - mobj = re.search( - r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') - - # Look for VODPlatform embeds - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:(?:www\.)?vod-platform\.net|embed\.kwikmotion\.com)/[eE]mbed/.+?)\1', - webpage) - if 
mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group('url'))), 'VODPlatform') - - # Look for Mangomolo embeds - mobj = re.search( - r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?// - (?: - admin\.mangomolo\.com/analytics/index\.php/customers/embed| - player\.mangomolo\.com/v1 - )/ - (?: - video\?.*?\bid=(?P<video_id>\d+)| - (?:index|live)\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+) - ).+?)\1''', webpage) - if mobj is not None: - info = { - '_type': 'url_transparent', - 'url': self._proto_relative_url(unescapeHTML(mobj.group('url'))), - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'uploader': video_uploader, - } - video_id = mobj.group('video_id') - if video_id: - info.update({ - 'ie_key': 'MangomoloVideo', - 'id': video_id, - }) - else: - info.update({ - 'ie_key': 'MangomoloLive', - 'id': mobj.group('channel_id'), - }) - return info - - # Look for Instagram embeds - instagram_embed_url = InstagramIE._extract_embed_url(webpage) - if instagram_embed_url is not None: - return self.url_result( - self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) - - # Look for 3Q SDN embeds - threeqsdn_url = ThreeQSDNIE._extract_url(webpage) - if threeqsdn_url: - return { - '_type': 'url_transparent', - 'ie_key': ThreeQSDNIE.ie_key(), - 'url': self._proto_relative_url(threeqsdn_url), - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'uploader': video_uploader, - } - - # Look for VBOX7 embeds - vbox7_url = Vbox7IE._extract_url(webpage) - if vbox7_url: - return self.url_result(vbox7_url, Vbox7IE.ie_key()) - - # Look for DBTV embeds - dbtv_urls = DBTVIE._extract_urls(webpage) - if dbtv_urls: - return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key()) - - # Look for Videa embeds - videa_urls = VideaIE._extract_urls(webpage) - if videa_urls: - return self.playlist_from_matches(videa_urls, 
video_id, video_title, ie=VideaIE.ie_key()) - - # Look for 20 minuten embeds - twentymin_urls = TwentyMinutenIE._extract_urls(webpage) - if twentymin_urls: - return self.playlist_from_matches( - twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key()) - - # Look for VideoPress embeds - videopress_urls = VideoPressIE._extract_urls(webpage) - if videopress_urls: - return self.playlist_from_matches( - videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key()) - - # Look for Rutube embeds - rutube_urls = RutubeIE._extract_urls(webpage) - if rutube_urls: - return self.playlist_from_matches( - rutube_urls, video_id, video_title, ie=RutubeIE.ie_key()) - - # Look for Glomex embeds - glomex_urls = list(GlomexEmbedIE._extract_urls(webpage, url)) - if glomex_urls: - return self.playlist_from_matches( - glomex_urls, video_id, video_title, ie=GlomexEmbedIE.ie_key()) - - # Look for megatv.com embeds - megatvcom_urls = list(MegaTVComEmbedIE._extract_urls(webpage)) - if megatvcom_urls: - return self.playlist_from_matches( - megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key()) - - # Look for ant1news.gr embeds - ant1newsgr_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage)) - if ant1newsgr_urls: - return self.playlist_from_matches( - ant1newsgr_urls, video_id, video_title, ie=Ant1NewsGrEmbedIE.ie_key()) - - # Look for WashingtonPost embeds - wapo_urls = WashingtonPostIE._extract_urls(webpage) - if wapo_urls: - return self.playlist_from_matches( - wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) - - # Look for Mediaset embeds - mediaset_urls = MediasetIE._extract_urls(self, webpage) - if mediaset_urls: - return self.playlist_from_matches( - mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) - - # Look for JOJ.sk embeds - joj_urls = JojIE._extract_urls(webpage) - if joj_urls: - return self.playlist_from_matches( - joj_urls, video_id, video_title, ie=JojIE.ie_key()) - - # Look for megaphone.fm embeds - mpfn_urls = 
MegaphoneIE._extract_urls(webpage) - if mpfn_urls: - return self.playlist_from_matches( - mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key()) - - # Look for vzaar embeds - vzaar_urls = VzaarIE._extract_urls(webpage) - if vzaar_urls: - return self.playlist_from_matches( - vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key()) - - channel9_urls = Channel9IE._extract_urls(webpage) - if channel9_urls: - return self.playlist_from_matches( - channel9_urls, video_id, video_title, ie=Channel9IE.ie_key()) - - vshare_urls = VShareIE._extract_urls(webpage) - if vshare_urls: - return self.playlist_from_matches( - vshare_urls, video_id, video_title, ie=VShareIE.ie_key()) - - # Look for Mediasite embeds - mediasite_urls = MediasiteIE._extract_urls(webpage) - if mediasite_urls: - entries = [ - self.url_result(smuggle_url( - compat_urlparse.urljoin(url, mediasite_url), - {'UrlReferrer': url}), ie=MediasiteIE.ie_key()) - for mediasite_url in mediasite_urls] - return self.playlist_result(entries, video_id, video_title) - - springboardplatform_urls = SpringboardPlatformIE._extract_urls(webpage) - if springboardplatform_urls: - return self.playlist_from_matches( - springboardplatform_urls, video_id, video_title, - ie=SpringboardPlatformIE.ie_key()) - - yapfiles_urls = YapFilesIE._extract_urls(webpage) - if yapfiles_urls: - return self.playlist_from_matches( - yapfiles_urls, video_id, video_title, ie=YapFilesIE.ie_key()) - - vice_urls = ViceIE._extract_urls(webpage) - if vice_urls: - return self.playlist_from_matches( - vice_urls, video_id, video_title, ie=ViceIE.ie_key()) - - xfileshare_urls = XFileShareIE._extract_urls(webpage) - if xfileshare_urls: - return self.playlist_from_matches( - xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key()) - - cloudflarestream_urls = CloudflareStreamIE._extract_urls(webpage) - if cloudflarestream_urls: - return self.playlist_from_matches( - cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) - - 
peertube_urls = PeerTubeIE._extract_urls(webpage, url) - if peertube_urls: - return self.playlist_from_matches( - peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) - - indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) - if indavideo_urls: - return self.playlist_from_matches( - indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key()) - - apa_urls = APAIE._extract_urls(webpage) - if apa_urls: - return self.playlist_from_matches( - apa_urls, video_id, video_title, ie=APAIE.ie_key()) - - foxnews_urls = FoxNewsIE._extract_urls(webpage) - if foxnews_urls: - return self.playlist_from_matches( - foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key()) - - sharevideos_urls = [sharevideos_mobj.group('url') for sharevideos_mobj in re.finditer( - r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', - webpage)] - if sharevideos_urls: - return self.playlist_from_matches( - sharevideos_urls, video_id, video_title) - - viqeo_urls = ViqeoIE._extract_urls(webpage) - if viqeo_urls: - return self.playlist_from_matches( - viqeo_urls, video_id, video_title, ie=ViqeoIE.ie_key()) - - expressen_urls = ExpressenIE._extract_urls(webpage) - if expressen_urls: - return self.playlist_from_matches( - expressen_urls, video_id, video_title, ie=ExpressenIE.ie_key()) - - zype_urls = ZypeIE._extract_urls(webpage) - if zype_urls: - return self.playlist_from_matches( - zype_urls, video_id, video_title, ie=ZypeIE.ie_key()) - - gedi_urls = GediDigitalIE._extract_urls(webpage) - if gedi_urls: - return self.playlist_from_matches( - gedi_urls, video_id, video_title, ie=GediDigitalIE.ie_key()) - - # Look for RCS media group embeds - rcs_urls = RCSEmbedsIE._extract_urls(webpage) - if rcs_urls: - return self.playlist_from_matches( - rcs_urls, video_id, video_title, ie=RCSEmbedsIE.ie_key()) - - wimtv_urls = WimTVIE._extract_urls(webpage) - if wimtv_urls: - return self.playlist_from_matches( - wimtv_urls, video_id, 
video_title, ie=WimTVIE.ie_key()) - - bitchute_urls = BitChuteIE._extract_urls(webpage) - if bitchute_urls: - return self.playlist_from_matches( - bitchute_urls, video_id, video_title, ie=BitChuteIE.ie_key()) - - rumble_urls = RumbleEmbedIE._extract_urls(webpage) - if len(rumble_urls) == 1: - return self.url_result(rumble_urls[0], RumbleEmbedIE.ie_key()) - if rumble_urls: - return self.playlist_from_matches( - rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key()) - - # Look for (tvopen|ethnos).gr embeds - tvopengr_urls = list(TVOpenGrEmbedIE._extract_urls(webpage)) - if tvopengr_urls: - return self.playlist_from_matches(tvopengr_urls, video_id, video_title, ie=TVOpenGrEmbedIE.ie_key()) - - # Look for ert.gr webtv embeds - ertwebtv_urls = list(ERTWebtvEmbedIE._extract_urls(webpage)) - if len(ertwebtv_urls) == 1: - return self.url_result(self._proto_relative_url(ertwebtv_urls[0]), video_title=video_title, url_transparent=True) - elif ertwebtv_urls: - return self.playlist_from_matches(ertwebtv_urls, video_id, video_title, ie=ERTWebtvEmbedIE.ie_key()) - - tvp_urls = TVPEmbedIE._extract_urls(webpage) - if tvp_urls: - return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key()) - - # Look for MainStreaming embeds - mainstreaming_urls = MainStreamingIE._extract_urls(webpage) - if mainstreaming_urls: - return self.playlist_from_matches(mainstreaming_urls, video_id, video_title, ie=MainStreamingIE.ie_key()) - - # Look for Gfycat Embeds - gfycat_urls = GfycatIE._extract_urls(webpage) - if gfycat_urls: - return self.playlist_from_matches(gfycat_urls, video_id, video_title, ie=GfycatIE.ie_key()) - - panopto_urls = PanoptoBaseIE._extract_urls(webpage) - if panopto_urls: - return self.playlist_from_matches(panopto_urls, video_id, video_title) - - # Look for Ruutu embeds - ruutu_url = RuutuIE._extract_url(webpage) - if ruutu_url: - return self.url_result(ruutu_url, RuutuIE) - - # Look for HTML5 media - entries = 
self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') - if entries: - self.report_detected('HTML5 media') - if len(entries) == 1: - entries[0].update({ - 'id': video_id, - 'title': video_title, - }) - else: - for num, entry in enumerate(entries, start=1): - entry.update({ - 'id': '%s-%s' % (video_id, num), - 'title': '%s (%d)' % (video_title, num), - }) - for entry in entries: - self._sort_formats(entry['formats']) - return self.playlist_result(entries, video_id, video_title) + embeds = [] + for ie in self._downloader._ies.values(): + if ie.ie_key() in smuggled_data.get('block_ies', []): + continue + gen = ie.extract_from_webpage(self._downloader, url, webpage) + current_embeds = [] + try: + while True: + current_embeds.append(next(gen)) + except self.StopExtraction: + self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds), + embeds and 'discarding other embeds') + return current_embeds + except StopIteration: + self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds)) + embeds.extend(current_embeds) + + if embeds: + return embeds jwplayer_data = self._find_jwplayer_data( webpage, video_id, transform_source=js_to_json) if jwplayer_data: if isinstance(jwplayer_data.get('playlist'), str): self.report_detected('JW Player playlist') - return { - **info_dict, - '_type': 'url', - 'ie_key': JWPlatformIE.ie_key(), - 'url': jwplayer_data['playlist'], - } + return [self.url_result(jwplayer_data['playlist'], 'JWPlatform')] try: info = self._parse_jwplayer_data( jwplayer_data, video_id, require_title=False, base_url=url) - self.report_detected('JW Player data') - return merge_dicts(info, info_dict) + if traverse_obj(info, 'formats', ('entries', ..., 'formats')): + self.report_detected('JW Player data') + return [info] except ExtractorError: # See https://github.com/ytdl-org/youtube-dl/pull/16735 pass @@ -3806,24 +2526,21 @@ class GenericIE(InfoExtractor): webpage) if mobj is not None: varname = mobj.group(1) - sources = 
self._parse_json( - mobj.group(2), video_id, transform_source=js_to_json, - fatal=False) or [] - if not isinstance(sources, list): - sources = [sources] + sources = variadic(self._parse_json( + mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or []) formats = [] subtitles = {} for source in sources: src = source.get('src') - if not src or not isinstance(src, compat_str): + if not src or not isinstance(src, str): continue - src = compat_urlparse.urljoin(url, src) + src = urllib.parse.urljoin(url, src) src_type = source.get('type') - if isinstance(src_type, compat_str): + if isinstance(src_type, str): src_type = src_type.lower() ext = determine_ext(src).lower() if src_type == 'video/youtube': - return self.url_result(src, YoutubeIE.ie_key()) + return [self.url_result(src, YoutubeIE.ie_key())] if src_type == 'application/dash+xml' or ext == 'mpd': fmts, subs = self._extract_mpd_formats_and_subtitles( src, video_id, mpd_id='dash', fatal=False) @@ -3835,13 +2552,16 @@ class GenericIE(InfoExtractor): m3u8_id='hls', fatal=False) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) - else: + for fmt in formats: + fmt.update(self._fragment_query(src)) + + if not formats: formats.append({ 'url': src, 'ext': (mimetype2ext(src_type) or ext if ext in KNOWN_EXTENSIONS else 'mp4'), 'http_headers': { - 'Referer': full_response.geturl(), + 'Referer': actual_url, }, }) # https://docs.videojs.com/player#addRemoteTextTrack @@ -3853,39 +2573,36 @@ class GenericIE(InfoExtractor): if not src: continue subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({ - 'url': compat_urlparse.urljoin(url, src), + 'url': urllib.parse.urljoin(url, src), 'name': sub.get('label'), 'http_headers': { - 'Referer': full_response.geturl(), + 'Referer': actual_url, }, }) if formats or subtitles: self.report_detected('video.js embed') - self._sort_formats(formats) - info_dict['formats'] = formats - info_dict['subtitles'] = subtitles - return info_dict 
+ return [{'formats': formats, 'subtitles': subtitles}] # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld(webpage, video_id, default={}) if json_ld.get('url') not in (url, None): self.report_detected('JSON LD') - if determine_ext(json_ld['url']) == 'm3u8': - json_ld['formats'], json_ld['subtitles'] = self._extract_m3u8_formats_and_subtitles( - json_ld['url'], video_id, 'mp4') - json_ld.pop('url') - self._sort_formats(json_ld['formats']) - else: - json_ld['_type'] = 'url_transparent' - json_ld['url'] = smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True}) - return merge_dicts(json_ld, info_dict) + is_direct = json_ld.get('ext') not in (None, *MEDIA_EXTENSIONS.manifests) + return [merge_dicts({ + '_type': 'video' if is_direct else 'url_transparent', + 'url': smuggle_url(json_ld['url'], { + 'force_videoid': video_id, + 'to_generic': True, + 'http_headers': {'Referer': url}, + }), + }, json_ld)] def check_video(vurl): if YoutubeIE.suitable(vurl): return True if RtmpIE.suitable(vurl): return True - vpath = compat_urlparse.urlparse(vurl).path + vpath = urllib.parse.urlparse(vurl).path vext = determine_ext(vpath, None) return vext not in (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml') @@ -3947,15 +2664,13 @@ class GenericIE(InfoExtractor): if not formats[-1].get('height'): formats[-1]['quality'] = 1 - self._sort_formats(formats) - - return { + return [{ 'id': flashvars['video_id'], 'display_id': display_id, 'title': title, 'thumbnail': thumbnail, 'formats': formats, - } + }] if not found: # Broaden the search a little bit found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) @@ -3994,7 +2709,7 @@ class GenericIE(InfoExtractor): self.report_detected('Twitter card') if not found: # We look for Open Graph info: - # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am) + # We have to match any number spaces between 
elements, some sites try to align them, e.g.: statigr.am m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage) # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: if m_video_type is not None: @@ -4009,20 +2724,14 @@ class GenericIE(InfoExtractor): webpage) if not found: # Look also in Refresh HTTP header - refresh_header = head_response.headers.get('Refresh') + refresh_header = urlh and urlh.headers.get('Refresh') if refresh_header: - # In python 2 response HTTP headers are bytestrings - if sys.version_info < (3, 0) and isinstance(refresh_header, str): - refresh_header = refresh_header.decode('iso-8859-1') found = re.search(REDIRECT_REGEX, refresh_header) if found: - new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) + new_url = urllib.parse.urljoin(url, unescapeHTML(found.group(1))) if new_url != url: self.report_following_redirect(new_url) - return { - '_type': 'url', - 'url': new_url, - } + return [self.url_result(new_url)] else: found = None @@ -4033,34 +2742,35 @@ class GenericIE(InfoExtractor): embed_url = self._html_search_meta('twitter:player', webpage, default=None) if embed_url and embed_url != url: self.report_detected('twitter:player iframe') - return self.url_result(embed_url) + return [self.url_result(embed_url)] if not found: - raise UnsupportedError(url) + return [] + + domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader', default=None) entries = [] for video_url in orderedSet(found): video_url = unescapeHTML(video_url) video_url = video_url.replace('\\/', '/') - video_url = compat_urlparse.urljoin(url, video_url) - video_id = compat_urllib_parse_unquote(os.path.basename(video_url)) + video_url = urllib.parse.urljoin(url, video_url) + video_id = urllib.parse.unquote(os.path.basename(video_url)) # Sometimes, jwplayer extraction will result in a YouTube URL if YoutubeIE.suitable(video_url): 
entries.append(self.url_result(video_url, 'Youtube')) continue - # here's a fun little line of code for you: video_id = os.path.splitext(video_id)[0] headers = { - 'referer': full_response.geturl() + 'referer': actual_url } entry_info_dict = { 'id': video_id, - 'uploader': video_uploader, - 'title': video_title, - 'age_limit': age_limit, + 'uploader': domain_name, + 'title': info_dict['title'], + 'age_limit': info_dict['age_limit'], 'http_headers': headers, } @@ -4077,11 +2787,13 @@ class GenericIE(InfoExtractor): if ext == 'smil': entry_info_dict = {**self._extract_smil_info(video_url, video_id), **entry_info_dict} elif ext == 'xspf': - return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id) + return [self._extract_xspf_playlist(video_url, video_id)] elif ext == 'm3u8': entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers) + entry_info_dict.update(self._fragment_query(video_url)) elif ext == 'mpd': entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers) + entry_info_dict.update(self._fragment_query(video_url)) elif ext == 'f4m': entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers) elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: @@ -4102,19 +2814,11 @@ class GenericIE(InfoExtractor): else: entry_info_dict['url'] = video_url - if entry_info_dict.get('formats'): - self._sort_formats(entry_info_dict['formats']) - entries.append(entry_info_dict) - if len(entries) == 1: - return entries[0] - else: + if len(entries) > 1: for num, e in enumerate(entries, start=1): # 'url' results don't have a title if e.get('title') is not None: e['title'] = '%s (%d)' % (e['title'], num) - return { - '_type': 'playlist', - 'entries': entries, - } + return entries diff --git 
a/hypervideo_dl/extractor/genericembeds.py b/hypervideo_dl/extractor/genericembeds.py new file mode 100644 index 0000000..9b4f14d --- /dev/null +++ b/hypervideo_dl/extractor/genericembeds.py @@ -0,0 +1,114 @@ +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import make_archive_id, unescapeHTML + + +class HTML5MediaEmbedIE(InfoExtractor): + _VALID_URL = False + IE_NAME = 'html5' + _WEBPAGE_TESTS = [ + { + 'url': 'https://html.com/media/', + 'info_dict': { + 'title': 'HTML5 Media', + 'description': 'md5:933b2d02ceffe7a7a0f3c8326d91cc2a', + }, + 'playlist_count': 2 + } + ] + + def _extract_from_webpage(self, url, webpage): + video_id, title = self._generic_id(url), self._generic_title(url, webpage) + entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') or [] + for num, entry in enumerate(entries, start=1): + entry.update({ + 'id': f'{video_id}-{num}', + 'title': f'{title} ({num})', + '_old_archive_ids': [ + make_archive_id('generic', f'{video_id}-{num}' if len(entries) > 1 else video_id), + ], + }) + yield entry + + +class QuotedHTMLIE(InfoExtractor): + """For common cases of quoted/escaped html parts in the webpage""" + _VALID_URL = False + IE_NAME = 'generic:quoted-html' + IE_DESC = False # Do not list + _WEBPAGE_TESTS = [{ + # 2 YouTube embeds in data-html + 'url': 'https://24tv.ua/bronetransporteri-ozbroyenni-zsu-shho-vidomo-pro-bronovik-wolfhound_n2167966', + 'info_dict': { + 'id': 'bronetransporteri-ozbroyenni-zsu-shho-vidomo-pro-bronovik-wolfhound_n2167966', + 'title': 'Броньовик Wolfhound: гігант, який допомагає ЗСУ знищувати окупантів на фронті', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'timestamp': float, + 'upload_date': str, + 'description': 'md5:6816e1e5a65304bd7898e4c7eb1b26f7', + 'age_limit': 0, + }, + 'playlist_count': 2 + }, { + # Generic iframe embed of TV24UAPlayerIE within data-html + 'url': 'https://24tv.ua/harkivyani-zgaduyut-misto-do-viyni-shhemlive-video_n1887584', + 'info_dict': 
{ + 'id': '1887584', + 'ext': 'mp4', + 'title': 'Харків\'яни згадують місто до війни: щемливе відео', + 'thumbnail': r're:^https?://.*\.jpe?g', + }, + 'params': {'skip_download': True} + }, { + # YouTube embeds on Squarespace (data-html): https://github.com/ytdl-org/youtube-dl/issues/21294 + 'url': 'https://www.harvardballetcompany.org/past-productions', + 'info_dict': { + 'id': 'past-productions', + 'title': 'Productions — Harvard Ballet Company', + 'age_limit': 0, + 'description': 'Past Productions', + }, + 'playlist_mincount': 26 + }, { + # Squarespace video embed, 2019-08-28, data-html + 'url': 'http://ootboxford.com', + 'info_dict': { + 'id': 'Tc7b_JGdZfw', + 'title': 'Out of the Blue, at Childish Things 10', + 'ext': 'mp4', + 'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f', + 'uploader_id': 'helendouglashouse', + 'uploader': 'Helen & Douglas House', + 'upload_date': '20140328', + 'availability': 'public', + 'view_count': int, + 'channel': 'Helen & Douglas House', + 'comment_count': int, + 'uploader_url': 'http://www.youtube.com/user/helendouglashouse', + 'duration': 253, + 'channel_url': 'https://www.youtube.com/channel/UCTChGezrZVmlYlpMlkmulPA', + 'playable_in_embed': True, + 'age_limit': 0, + 'channel_follower_count': int, + 'channel_id': 'UCTChGezrZVmlYlpMlkmulPA', + 'tags': 'count:6', + 'categories': ['Nonprofits & Activism'], + 'like_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/Tc7b_JGdZfw/hqdefault.jpg', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _extract_from_webpage(self, url, webpage): + combined = '' + for _, html in re.findall(r'(?s)\bdata-html=(["\'])((?:(?!\1).)+)\1', webpage): + # unescapeHTML can handle " etc., unquote can handle percent encoding + unquoted_html = unescapeHTML(urllib.parse.unquote(html)) + if unquoted_html != html: + combined += unquoted_html + if combined: + yield from self._extract_generic_embeds(url, combined) diff --git a/hypervideo_dl/extractor/genius.py b/hypervideo_dl/extractor/genius.py 
new file mode 100644 index 0000000..62f5a28 --- /dev/null +++ b/hypervideo_dl/extractor/genius.py @@ -0,0 +1,127 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + js_to_json, + smuggle_url, + str_or_none, + traverse_obj, + unescapeHTML, +) + + +class GeniusIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?genius\.com/videos/(?P<id>[^?/#]+)' + _TESTS = [{ + 'url': 'https://genius.com/videos/Vince-staples-breaks-down-the-meaning-of-when-sparks-fly', + 'md5': '64c2ad98cfafcfda23bfa0ad0c512f4c', + 'info_dict': { + 'id': '6313303597112', + 'ext': 'mp4', + 'title': 'Vince Staples Breaks Down The Meaning Of “When Sparks Fly”', + 'description': 'md5:bc15e00342c537c0039d414423ae5752', + 'tags': 'count:1', + 'uploader_id': '4863540648001', + 'duration': 388.416, + 'upload_date': '20221005', + 'timestamp': 1664982341, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, { + 'url': 'https://genius.com/videos/Breaking-down-drakes-certified-lover-boy-kanye-beef-way-2-sexy-cudi', + 'md5': 'b8ed87a5efd1473bd027c20a969d4060', + 'info_dict': { + 'id': '6271792014001', + 'ext': 'mp4', + 'title': 'md5:c6355f7fa8a70bc86492a3963919fc15', + 'description': 'md5:1774638c31548b31b037c09e9b821393', + 'tags': 'count:3', + 'uploader_id': '4863540648001', + 'duration': 2685.099, + 'upload_date': '20210909', + 'timestamp': 1631209167, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + metadata = self._search_json( + r'<meta content="', webpage, 'metadata', display_id, transform_source=unescapeHTML) + video_id = traverse_obj( + metadata, ('video', 'provider_id'), + ('dfp_kv', lambda _, x: x['name'] == 'brightcove_video_id', 'values', 0), get_all=False) + if not video_id: + raise ExtractorError('Brightcove video id not found in webpage') + + config = self._search_json(r'var\s*APP_CONFIG\s*=', webpage, 'config', video_id, default={}) + 
account_id = config.get('brightcove_account_id', '4863540648001') + player_id = traverse_obj( + config, 'brightcove_standard_web_player_id', 'brightcove_standard_no_autoplay_web_player_id', + 'brightcove_modal_web_player_id', 'brightcove_song_story_web_player_id', default='S1ZcmcOC1x') + + return self.url_result( + smuggle_url( + f'https://players.brightcove.net/{account_id}/{player_id}_default/index.html?videoId={video_id}', + {'referrer': url}), 'BrightcoveNew', video_id) + + +class GeniusLyricsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?genius\.com/(?P<id>[^?/#]+)-lyrics[?/#]?' + _TESTS = [{ + 'url': 'https://genius.com/Lil-baby-heyy-lyrics', + 'playlist_mincount': 2, + 'info_dict': { + 'id': '8454545', + 'title': 'Heyy', + 'description': 'Heyy by Lil Baby', + }, + }, { + 'url': 'https://genius.com/Outkast-two-dope-boyz-in-a-cadillac-lyrics', + 'playlist_mincount': 1, + 'info_dict': { + 'id': '36239', + 'title': 'Two Dope Boyz (In a Cadillac)', + 'description': 'Two Dope Boyz (In a Cadillac) by OutKast', + }, + }, { + 'url': 'https://genius.com/Playboi-carti-rip-lyrics', + 'playlist_mincount': 1, + 'info_dict': { + 'id': '3710582', + 'title': 'R.I.P.', + 'description': 'R.I.P. 
by Playboi Carti', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + json_string = self._search_json( + r'window\.__PRELOADED_STATE__\s*=\s*JSON\.parse\(', webpage, 'json string', + display_id, transform_source=js_to_json, contains_pattern=r'\'{(?s:.+)}\'') + song_info = self._parse_json(json_string, display_id) + song_id = str_or_none(traverse_obj(song_info, ('songPage', 'song'))) + if not song_id: + raise ExtractorError('Song id not found in webpage') + + title = traverse_obj( + song_info, ('songPage', 'trackingData', lambda _, x: x['key'] == 'Title', 'value'), + get_all=False, default='untitled') + artist = traverse_obj( + song_info, ('songPage', 'trackingData', lambda _, x: x['key'] == 'Primary Artist', 'value'), + get_all=False, default='unknown artist') + media = traverse_obj( + song_info, ('entities', 'songs', song_id, 'media'), expected_type=list, default=[]) + + entries = [] + for m in media: + if m.get('type') in ('video', 'audio') and m.get('url'): + if m.get('provider') == 'spotify': + self.to_screen(f'{song_id}: Skipping Spotify audio embed') + else: + entries.append(self.url_result(m['url'])) + + return self.playlist_result(entries, song_id, title, f'{title} by {artist}') diff --git a/hypervideo_dl/extractor/gettr.py b/hypervideo_dl/extractor/gettr.py index 327a4d0..7795dc5 100644 --- a/hypervideo_dl/extractor/gettr.py +++ b/hypervideo_dl/extractor/gettr.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( bool_or_none, @@ -124,8 +121,6 @@ class GettrIE(GettrBaseIE): 'height': int_or_none(post_data.get('vid_hgt')), }) - self._sort_formats(formats) - return { 'id': post_id, 'title': title, @@ -195,8 +190,6 @@ class GettrStreamingIE(GettrBaseIE): 'url': urljoin(self._MEDIA_BASE_URL, thumbnail), } for thumbnail in try_get(video_info, lambda x: x['postData']['imgs'], list) or []] - 
self._sort_formats(formats) - return { 'id': video_id, 'title': try_get(video_info, lambda x: x['postData']['ttl'], str), diff --git a/hypervideo_dl/extractor/gfycat.py b/hypervideo_dl/extractor/gfycat.py index 2ad03e2..edc2e56 100644 --- a/hypervideo_dl/extractor/gfycat.py +++ b/hypervideo_dl/extractor/gfycat.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -13,7 +8,8 @@ from ..utils import ( class GfycatIE(InfoExtractor): - _VALID_URL = r'(?i)https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\."\']+)' + _VALID_URL = r'https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?i:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\."\']+)' + _EMBED_REGEX = [rf'<(?:iframe|source)[^>]+\bsrc=["\'](?P<url>{_VALID_URL})'] _TESTS = [{ 'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher', 'info_dict': { @@ -85,14 +81,6 @@ class GfycatIE(InfoExtractor): 'only_matching': True }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<(?:iframe|source)[^>]+\bsrc=["\'](?P<url>%s)' % GfycatIE._VALID_URL, - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) @@ -139,7 +127,6 @@ class GfycatIE(InfoExtractor): 'filesize': filesize, 'quality': quality(format_id), }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/giantbomb.py b/hypervideo_dl/extractor/giantbomb.py index 1920923..1125723 100644 --- a/hypervideo_dl/extractor/giantbomb.py +++ b/hypervideo_dl/extractor/giantbomb.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -76,8 +74,6 @@ class GiantBombIE(InfoExtractor): if youtube_id: return self.url_result(youtube_id, 'Youtube') - self._sort_formats(formats) - return { 'id': video_id, 'display_id': display_id, diff --git a/hypervideo_dl/extractor/giga.py 
b/hypervideo_dl/extractor/giga.py index 5a9992a..b59c129 100644 --- a/hypervideo_dl/extractor/giga.py +++ b/hypervideo_dl/extractor/giga.py @@ -1,16 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools from .common import InfoExtractor -from ..utils import ( - qualities, - compat_str, - parse_duration, - parse_iso8601, - str_to_int, -) +from ..compat import compat_str +from ..utils import parse_duration, parse_iso8601, qualities, str_to_int class GigaIE(InfoExtractor): @@ -67,7 +59,6 @@ class GigaIE(InfoExtractor): 'format_id': '%s-%s' % (fmt['quality'], fmt['type'].split('/')[-1]), 'quality': quality(fmt['quality']), }) - self._sort_formats(formats) title = self._html_search_meta( 'title', webpage, 'title', fatal=True) diff --git a/hypervideo_dl/extractor/gigya.py b/hypervideo_dl/extractor/gigya.py index 4121784..c5bc86b 100644 --- a/hypervideo_dl/extractor/gigya.py +++ b/hypervideo_dl/extractor/gigya.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( diff --git a/hypervideo_dl/extractor/glide.py b/hypervideo_dl/extractor/glide.py index 12af859..d114f34 100644 --- a/hypervideo_dl/extractor/glide.py +++ b/hypervideo_dl/extractor/glide.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor @@ -23,7 +20,7 @@ class GlideIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = self._html_extract_title(webpage, default=None) or self._og_search_title(webpage) + title = self._generic_title('', webpage) video_url = self._proto_relative_url(self._search_regex( r'<source[^>]+src=(["\'])(?P<url>.+?)\1', webpage, 'video URL', default=None, diff --git a/hypervideo_dl/extractor/globo.py b/hypervideo_dl/extractor/globo.py index f6aaae1..a7be2cb 100644 --- a/hypervideo_dl/extractor/globo.py +++ b/hypervideo_dl/extractor/globo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - 
import base64 import hashlib import json @@ -142,7 +139,6 @@ class GloboIE(InfoExtractor): fmts, subtitles = self._extract_m3u8_formats_and_subtitles( signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) formats.extend(fmts) - self._sort_formats(formats) for resource in video['resources']: if resource.get('type') == 'subtitle': @@ -181,12 +177,12 @@ class GloboArticleIE(InfoExtractor): _VALID_URL = r'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/.]+)(?:\.html)?' _VIDEOID_REGEXES = [ - r'\bdata-video-id=["\'](\d{7,})', - r'\bdata-player-videosids=["\'](\d{7,})', + r'\bdata-video-id=["\'](\d{7,})["\']', + r'\bdata-player-videosids=["\'](\d{7,})["\']', r'\bvideosIDs\s*:\s*["\']?(\d{7,})', - r'\bdata-id=["\'](\d{7,})', - r'<div[^>]+\bid=["\'](\d{7,})', - r'<bs-player[^>]+\bvideoid=["\'](\d{8,})', + r'\bdata-id=["\'](\d{7,})["\']', + r'<div[^>]+\bid=["\'](\d{7,})["\']', + r'<bs-player[^>]+\bvideoid=["\'](\d{8,})["\']', ] _TESTS = [{ @@ -222,6 +218,14 @@ class GloboArticleIE(InfoExtractor): 'description': 'md5:2d089d036c4c9675117d3a56f8c61739', }, 'playlist_count': 1, + }, { + 'url': 'https://redeglobo.globo.com/rpc/meuparana/noticia/a-producao-de-chocolates-no-parana.ghtml', + 'info_dict': { + 'id': 'a-producao-de-chocolates-no-parana', + 'title': 'A produção de chocolates no Paraná', + 'description': 'md5:f2e3daf00ffd1dc0e9a8a6c7cfb0a89e', + }, + 'playlist_count': 2, }] @classmethod @@ -237,6 +241,6 @@ class GloboArticleIE(InfoExtractor): entries = [ self.url_result('globo:%s' % video_id, GloboIE.ie_key()) for video_id in orderedSet(video_ids)] - title = self._og_search_title(webpage) + title = self._og_search_title(webpage).strip() description = self._html_search_meta('description', webpage) return self.playlist_result(entries, display_id, title, description) diff --git a/hypervideo_dl/extractor/glomex.py b/hypervideo_dl/extractor/glomex.py index d9ef433..22aac0d 100644 --- a/hypervideo_dl/extractor/glomex.py +++ 
b/hypervideo_dl/extractor/glomex.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re import urllib.parse @@ -85,7 +82,6 @@ class GlomexBaseIE(InfoExtractor): if video.get('language'): for fmt in formats: fmt['language'] = video['language'] - self._sort_formats(formats) images = (video.get('images') or []) + [video.get('image') or {}] thumbnails = [{ @@ -177,7 +173,7 @@ class GlomexEmbedIE(GlomexBaseIE): return cls._smuggle_origin_url(f'https:{cls._BASE_PLAYER_URL}?{query_string}', origin_url) @classmethod - def _extract_urls(cls, webpage, origin_url): + def _extract_embed_urls(cls, url, webpage): # https://docs.glomex.com/publisher/video-player-integration/javascript-api/ quot_re = r'["\']' @@ -186,9 +182,9 @@ class GlomexEmbedIE(GlomexBaseIE): (?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=q)).)+ )(?P=q)''' for mobj in re.finditer(regex, webpage): - url = unescapeHTML(mobj.group('url')) - if cls.suitable(url): - yield cls._smuggle_origin_url(url, origin_url) + embed_url = unescapeHTML(mobj.group('url')) + if cls.suitable(embed_url): + yield cls._smuggle_origin_url(embed_url, url) regex = fr'''(?x) <glomex-player [^>]+?>| @@ -196,7 +192,7 @@ class GlomexEmbedIE(GlomexBaseIE): for mobj in re.finditer(regex, webpage): attrs = extract_attributes(mobj.group(0)) if attrs.get('data-integration-id') and attrs.get('data-playlist-id'): - yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], origin_url) + yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], url) # naive parsing of inline scripts for hard-coded integration parameters regex = fr'''(?x) @@ -209,7 +205,7 @@ class GlomexEmbedIE(GlomexBaseIE): continue playlist_id = re.search(regex % 'playlistId', script) if playlist_id: - yield cls.build_player_url(playlist_id, integration_id, origin_url) + yield cls.build_player_url(playlist_id, integration_id, url) def _real_extract(self, url): url, origin_url = 
self._unsmuggle_origin_url(url) diff --git a/hypervideo_dl/extractor/go.py b/hypervideo_dl/extractor/go.py index f92e166..b075a02 100644 --- a/hypervideo_dl/extractor/go.py +++ b/hypervideo_dl/extractor/go.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .adobepass import AdobePassIE @@ -14,6 +11,8 @@ from ..utils import ( try_get, urlencode_postdata, ExtractorError, + unified_timestamp, + traverse_obj, ) @@ -73,7 +72,7 @@ class GoIE(AdobePassIE): }, 'skip': 'This content is no longer available.', }, { - 'url': 'http://watchdisneyxd.go.com/doraemon', + 'url': 'https://disneynow.com/shows/big-hero-6-the-series', 'info_dict': { 'title': 'Doraemon', 'id': 'SH55574025', @@ -83,10 +82,19 @@ class GoIE(AdobePassIE): 'url': 'http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood', 'info_dict': { 'id': 'VDKA3609139', - 'ext': 'mp4', 'title': 'This Guilty Blood', 'description': 'md5:f18e79ad1c613798d95fdabfe96cd292', 'age_limit': 14, + 'episode': 'Episode 1', + 'upload_date': '20170102', + 'season': 'Season 2', + 'thumbnail': 'http://cdn1.edgedatg.com/aws/v2/abcf/Shadowhunters/video/201/ae5f75608d86bf88aa4f9f4aa76ab1b7/579x325-Q100_ae5f75608d86bf88aa4f9f4aa76ab1b7.jpg', + 'duration': 2544, + 'season_number': 2, + 'series': 'Shadowhunters', + 'episode_number': 1, + 'timestamp': 1483387200, + 'ext': 'mp4' }, 'params': { 'geo_bypass_ip_block': '3.244.239.0/24', @@ -94,13 +102,22 @@ class GoIE(AdobePassIE): 'skip_download': True, }, }, { - 'url': 'https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet', + 'url': 'https://abc.com/shows/the-rookie/episode-guide/season-04/12-the-knock', 'info_dict': { - 'id': 'VDKA13435179', - 'ext': 'mp4', - 'title': 'The Bet', - 'description': 'md5:c66de8ba2e92c6c5c113c3ade84ab404', + 'id': 'VDKA26050359', + 'title': 'The Knock', + 'description': 'md5:0c2947e3ada4c31f28296db7db14aa64', 'age_limit': 14, + 'ext': 'mp4', + 'thumbnail': 
'http://cdn1.edgedatg.com/aws/v2/abc/TheRookie/video/412/daf830d06e83b11eaf5c0a299d993ae3/1556x876-Q75_daf830d06e83b11eaf5c0a299d993ae3.jpg', + 'episode': 'Episode 12', + 'season_number': 4, + 'season': 'Season 4', + 'timestamp': 1642975200, + 'episode_number': 12, + 'upload_date': '20220123', + 'series': 'The Rookie', + 'duration': 2572, }, 'params': { 'geo_bypass_ip_block': '3.244.239.0/24', @@ -111,24 +128,18 @@ class GoIE(AdobePassIE): 'url': 'https://fxnow.fxnetworks.com/shows/better-things/video/vdka12782841', 'info_dict': { 'id': 'VDKA12782841', - 'ext': 'mp4', 'title': 'First Look: Better Things - Season 2', 'description': 'md5:fa73584a95761c605d9d54904e35b407', - }, - 'params': { - 'geo_bypass_ip_block': '3.244.239.0/24', - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot', - 'info_dict': { - 'id': 'VDKA22600213', 'ext': 'mp4', - 'title': 'Pilot', - 'description': 'md5:74306df917cfc199d76d061d66bebdb4', + 'age_limit': 14, + 'upload_date': '20170825', + 'duration': 161, + 'series': 'Better Things', + 'thumbnail': 'http://cdn1.edgedatg.com/aws/v2/fx/BetterThings/video/12782841/b6b05e58264121cc2c98811318e6d507/1556x876-Q75_b6b05e58264121cc2c98811318e6d507.jpg', + 'timestamp': 1503661074, }, 'params': { + 'geo_bypass_ip_block': '3.244.239.0/24', # m3u8 download 'skip_download': True, }, @@ -282,7 +293,6 @@ class GoIE(AdobePassIE): 'height': height, }) formats.append(f) - self._sort_formats(formats) for cc in video_data.get('closedcaption', {}).get('src', []): cc_url = cc.get('value') @@ -319,4 +329,5 @@ class GoIE(AdobePassIE): 'thumbnails': thumbnails, 'formats': formats, 'subtitles': subtitles, + 'timestamp': unified_timestamp(traverse_obj(video_data, ('airdates', 'airdate', 0))), } diff --git a/hypervideo_dl/extractor/godtube.py b/hypervideo_dl/extractor/godtube.py index 96e68b4..6975401 100644 --- a/hypervideo_dl/extractor/godtube.py +++ b/hypervideo_dl/extractor/godtube.py 
@@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( parse_duration, diff --git a/hypervideo_dl/extractor/gofile.py b/hypervideo_dl/extractor/gofile.py index 62d778c..ddbce2e 100644 --- a/hypervideo_dl/extractor/gofile.py +++ b/hypervideo_dl/extractor/gofile.py @@ -1,4 +1,5 @@ -# coding: utf-8 +import hashlib + from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -19,22 +20,34 @@ class GofileIE(InfoExtractor): 'id': 'de571ac1-5edc-42e2-8ec2-bdac83ad4a31', 'filesize': 928116, 'ext': 'mp4', - 'title': 'nuuh' + 'title': 'nuuh', + 'release_timestamp': 1638338704, + 'release_date': '20211201', } }] - }, { # URL to test mixed file types - 'url': 'https://gofile.io/d/avt34h', + }, { + 'url': 'https://gofile.io/d/is8lKr', 'info_dict': { - 'id': 'avt34h', - }, - 'playlist_mincount': 1, - }, { # URL to test no video/audio error - 'url': 'https://gofile.io/d/aB03lZ', - 'info_dict': { - 'id': 'aB03lZ', + 'id': 'TMjXd9', + 'ext': 'mp4', }, 'playlist_count': 0, 'skip': 'No video/audio found at provided URL.', + }, { + 'url': 'https://gofile.io/d/TMjXd9', + 'info_dict': { + 'id': 'TMjXd9', + }, + 'playlist_count': 1, + }, { + 'url': 'https://gofile.io/d/gqOtRf', + 'info_dict': { + 'id': 'gqOtRf', + }, + 'playlist_mincount': 1, + 'params': { + 'videopassword': 'password', + }, }] _TOKEN = None @@ -50,12 +63,22 @@ class GofileIE(InfoExtractor): self._set_cookie('gofile.io', 'accountToken', self._TOKEN) def _entries(self, file_id): + query_params = { + 'contentId': file_id, + 'token': self._TOKEN, + 'websiteToken': 12345, + } + password = self.get_param('videopassword') + if password: + query_params['password'] = hashlib.sha256(password.encode('utf-8')).hexdigest() files = self._download_json( - f'https://api.gofile.io/getContent?contentId={file_id}&token={self._TOKEN}&websiteToken=websiteToken&cache=true', - 'Gofile', note='Getting filelist') + 'https://api.gofile.io/getContent', file_id, 
note='Getting filelist', query=query_params) status = files['status'] - if status != 'ok': + if status == 'error-passwordRequired': + raise ExtractorError( + 'This video is protected by a password, use the --video-password option', expected=True) + elif status != 'ok': raise ExtractorError(f'{self.IE_NAME} said: status {status}', expected=True) found_files = False @@ -65,7 +88,7 @@ class GofileIE(InfoExtractor): continue found_files = True - file_url = file.get('directLink') + file_url = file.get('link') if file_url: yield { 'id': file['id'], diff --git a/hypervideo_dl/extractor/golem.py b/hypervideo_dl/extractor/golem.py index 47a068e..c33d950 100644 --- a/hypervideo_dl/extractor/golem.py +++ b/hypervideo_dl/extractor/golem.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_str, @@ -54,7 +51,6 @@ class GolemIE(InfoExtractor): 'filesize': self._int(e.findtext('filesize'), 'filesize'), 'ext': determine_ext(e.findtext('./filename')), }) - self._sort_formats(formats) info['formats'] = formats thumbnails = [] diff --git a/hypervideo_dl/extractor/goodgame.py b/hypervideo_dl/extractor/goodgame.py new file mode 100644 index 0000000..c17ad56 --- /dev/null +++ b/hypervideo_dl/extractor/goodgame.py @@ -0,0 +1,57 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + str_or_none, + traverse_obj, +) + + +class GoodGameIE(InfoExtractor): + IE_NAME = 'goodgame:stream' + _VALID_URL = r'https?://goodgame\.ru/channel/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://goodgame.ru/channel/Pomi/#autoplay', + 'info_dict': { + 'id': 'pomi', + 'ext': 'mp4', + 'title': r're:Reynor vs Special \(1/2,bo3\) Wardi Spring EU \- playoff \(финальный день\) \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'channel_id': '1644', + 'channel': 'Pomi', + 'channel_url': 'https://goodgame.ru/channel/Pomi/', + 'description': 'md5:4a87b775ee7b2b57bdccebe285bbe171', + 'thumbnail': 
r're:^https?://.*\.jpg$', + 'live_status': 'is_live', + 'view_count': int, + }, + 'params': {'skip_download': 'm3u8'}, + 'skip': 'May not be online', + }] + + def _real_extract(self, url): + channel_name = self._match_id(url) + response = self._download_json(f'https://api2.goodgame.ru/v2/streams/{channel_name}', channel_name) + player_id = response['channel']['gg_player_src'] + + formats, subtitles = [], {} + if response.get('status') == 'Live': + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'https://hls.goodgame.ru/manifest/{player_id}_master.m3u8', + channel_name, 'mp4', live=True) + else: + self.raise_no_formats('User is offline', expected=True, video_id=channel_name) + + return { + 'id': player_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': traverse_obj(response, ('channel', 'title')), + 'channel': channel_name, + 'channel_id': str_or_none(traverse_obj(response, ('channel', 'id'))), + 'channel_url': response.get('url'), + 'description': clean_html(traverse_obj(response, ('channel', 'description'))), + 'thumbnail': traverse_obj(response, ('channel', 'thumb')), + 'is_live': bool(formats), + 'view_count': int_or_none(response.get('viewers')), + 'age_limit': 18 if traverse_obj(response, ('channel', 'adult')) else None, + } diff --git a/hypervideo_dl/extractor/googledrive.py b/hypervideo_dl/extractor/googledrive.py index 7b5bf28..e027ea7 100644 --- a/hypervideo_dl/extractor/googledrive.py +++ b/hypervideo_dl/extractor/googledrive.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -79,13 +77,13 @@ class GoogleDriveIE(InfoExtractor): _caption_formats_ext = [] _captions_xml = None - @staticmethod - def _extract_url(webpage): + @classmethod + def _extract_embed_urls(cls, url, webpage): mobj = re.search( r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})', webpage) if mobj: - return 
'https://drive.google.com/file/d/%s' % mobj.group('id') + yield 'https://drive.google.com/file/d/%s' % mobj.group('id') def _download_subtitles_xml(self, video_id, subtitles_id, hl): if self._captions_xml: @@ -255,8 +253,6 @@ class GoogleDriveIE(InfoExtractor): if not formats and reason: self.raise_no_formats(reason, expected=True) - self._sort_formats(formats) - hl = get_value('hl') subtitles_id = None ttsurl = get_value('ttsurl') @@ -266,7 +262,7 @@ class GoogleDriveIE(InfoExtractor): subtitles_id = ttsurl.encode('utf-8').decode( 'unicode_escape').split('=')[-1] - self._downloader.cookiejar.clear(domain='.google.com', path='/', name='NID') + self.cookiejar.clear(domain='.google.com', path='/', name='NID') return { 'id': video_id, @@ -278,3 +274,59 @@ class GoogleDriveIE(InfoExtractor): 'automatic_captions': self.extract_automatic_captions( video_id, subtitles_id, hl), } + + +class GoogleDriveFolderIE(InfoExtractor): + IE_NAME = 'GoogleDrive:Folder' + _VALID_URL = r'https?://(?:docs|drive)\.google\.com/drive/folders/(?P<id>[\w-]{28,})' + _TESTS = [{ + 'url': 'https://drive.google.com/drive/folders/1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI', + 'info_dict': { + 'id': '1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI', + 'title': 'Forrest' + }, + 'playlist_count': 3, + }] + _BOUNDARY = '=====vc17a3rwnndj=====' + _REQUEST = 
"/drive/v2beta/files?openDrive=true&reason=102&syncType=0&errorRecovery=false&q=trashed%20%3D%20false%20and%20'{folder_id}'%20in%20parents&fields=kind%2CnextPageToken%2Citems(kind%2CmodifiedDate%2CmodifiedByMeDate%2ClastViewedByMeDate%2CfileSize%2Cowners(kind%2CpermissionId%2Cid)%2ClastModifyingUser(kind%2CpermissionId%2Cid)%2ChasThumbnail%2CthumbnailVersion%2Ctitle%2Cid%2CresourceKey%2Cshared%2CsharedWithMeDate%2CuserPermission(role)%2CexplicitlyTrashed%2CmimeType%2CquotaBytesUsed%2Ccopyable%2CfileExtension%2CsharingUser(kind%2CpermissionId%2Cid)%2Cspaces%2Cversion%2CteamDriveId%2ChasAugmentedPermissions%2CcreatedDate%2CtrashingUser(kind%2CpermissionId%2Cid)%2CtrashedDate%2Cparents(id)%2CshortcutDetails(targetId%2CtargetMimeType%2CtargetLookupStatus)%2Ccapabilities(canCopy%2CcanDownload%2CcanEdit%2CcanAddChildren%2CcanDelete%2CcanRemoveChildren%2CcanShare%2CcanTrash%2CcanRename%2CcanReadTeamDrive%2CcanMoveTeamDriveItem)%2Clabels(starred%2Ctrashed%2Crestricted%2Cviewed))%2CincompleteSearch&appDataFilter=NO_APP_DATA&spaces=drive&pageToken={page_token}&maxResults=50&supportsTeamDrives=true&includeItemsFromAllDrives=true&corpora=default&orderBy=folder%2Ctitle_natural%20asc&retryCount=0&key={key} HTTP/1.1" + _DATA = f'''--{_BOUNDARY} +content-type: application/http +content-transfer-encoding: binary + +GET %s + +--{_BOUNDARY} +''' + + def _call_api(self, folder_id, key, data, **kwargs): + response = self._download_webpage( + 'https://clients6.google.com/batch/drive/v2beta', + folder_id, data=data.encode('utf-8'), + headers={ + 'Content-Type': 'text/plain;charset=UTF-8;', + 'Origin': 'https://drive.google.com', + }, query={ + '$ct': f'multipart/mixed; boundary="{self._BOUNDARY}"', + 'key': key + }, **kwargs) + return self._search_json('', response, 'api response', folder_id, **kwargs) or {} + + def _get_folder_items(self, folder_id, key): + page_token = '' + while page_token is not None: + request = self._REQUEST.format(folder_id=folder_id, page_token=page_token, 
key=key) + page = self._call_api(folder_id, key, self._DATA % request) + yield from page['items'] + page_token = page.get('nextPageToken') + + def _real_extract(self, url): + folder_id = self._match_id(url) + + webpage = self._download_webpage(url, folder_id) + key = self._search_regex(r'"(\w{39})"', webpage, 'key') + + folder_info = self._call_api(folder_id, key, self._DATA % f'/drive/v2beta/files/{folder_id} HTTP/1.1', fatal=False) + + return self.playlist_from_matches( + self._get_folder_items(folder_id, key), folder_id, folder_info.get('title'), + ie=GoogleDriveIE, getter=lambda item: f'https://drive.google.com/file/d/{item["id"]}') diff --git a/hypervideo_dl/extractor/googlepodcasts.py b/hypervideo_dl/extractor/googlepodcasts.py index 25631e2..8b2351b 100644 --- a/hypervideo_dl/extractor/googlepodcasts.py +++ b/hypervideo_dl/extractor/googlepodcasts.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/googlesearch.py b/hypervideo_dl/extractor/googlesearch.py index 4b8b1bc..67ca0e5 100644 --- a/hypervideo_dl/extractor/googlesearch.py +++ b/hypervideo_dl/extractor/googlesearch.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import itertools import re diff --git a/hypervideo_dl/extractor/goplay.py b/hypervideo_dl/extractor/goplay.py new file mode 100644 index 0000000..2882b49 --- /dev/null +++ b/hypervideo_dl/extractor/goplay.py @@ -0,0 +1,394 @@ +import base64 +import binascii +import datetime +import hashlib +import hmac +import json +import os + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + unescapeHTML, +) + + +class GoPlayIE(InfoExtractor): + _VALID_URL = r'https?://(www\.)?goplay\.be/video/([^/]+/[^/]+/|)(?P<display_id>[^/#]+)' + + _NETRC_MACHINE = 'goplay' + + _TESTS = [{ + 'url': 'https://www.goplay.be/video/de-container-cup/de-container-cup-s3/de-container-cup-s3-aflevering-2#autoplay', 
+ 'info_dict': { + 'id': '9c4214b8-e55d-4e4b-a446-f015f6c6f811', + 'ext': 'mp4', + 'title': 'S3 - Aflevering 2', + 'series': 'De Container Cup', + 'season': 'Season 3', + 'season_number': 3, + 'episode': 'Episode 2', + 'episode_number': 2, + }, + 'skip': 'This video is only available for registered users' + }, { + 'url': 'https://www.goplay.be/video/a-family-for-thr-holidays-s1-aflevering-1#autoplay', + 'info_dict': { + 'id': '74e3ed07-748c-49e4-85a0-393a93337dbf', + 'ext': 'mp4', + 'title': 'A Family for the Holidays', + }, + 'skip': 'This video is only available for registered users' + }] + + _id_token = None + + def _perform_login(self, username, password): + self.report_login() + aws = AwsIdp(ie=self, pool_id='eu-west-1_dViSsKM5Y', client_id='6s1h851s8uplco5h6mqh1jac8m') + self._id_token, _ = aws.authenticate(username=username, password=password) + + def _real_initialize(self): + if not self._id_token: + raise self.raise_login_required(method='password') + + def _real_extract(self, url): + url, display_id = self._match_valid_url(url).group(0, 'display_id') + webpage = self._download_webpage(url, display_id) + video_data_json = self._html_search_regex(r'<div\s+data-hero="([^"]+)"', webpage, 'video_data') + video_data = self._parse_json(unescapeHTML(video_data_json), display_id).get('data') + + movie = video_data.get('movie') + if movie: + video_id = movie['videoUuid'] + info_dict = { + 'title': movie.get('title') + } + else: + episode = traverse_obj(video_data, ('playlists', ..., 'episodes', lambda _, v: v['pageInfo']['url'] == url), get_all=False) + video_id = episode['videoUuid'] + info_dict = { + 'title': episode.get('episodeTitle'), + 'series': traverse_obj(episode, ('program', 'title')), + 'season_number': episode.get('seasonNumber'), + 'episode_number': episode.get('episodeNumber'), + } + + api = self._download_json( + f'https://api.viervijfzes.be/content/{video_id}', + video_id, headers={'Authorization': self._id_token}) + + formats, subs = 
self._extract_m3u8_formats_and_subtitles( + api['video']['S'], video_id, ext='mp4', m3u8_id='HLS') + + info_dict.update({ + 'id': video_id, + 'formats': formats, + }) + + return info_dict + + +# Taken from https://github.com/add-ons/plugin.video.viervijfzes/blob/master/resources/lib/viervijfzes/auth_awsidp.py +# Released into Public domain by https://github.com/michaelarnauts + +class InvalidLoginException(ExtractorError): + """ The login credentials are invalid """ + + +class AuthenticationException(ExtractorError): + """ Something went wrong while logging in """ + + +class AwsIdp: + """ AWS Identity Provider """ + + def __init__(self, ie, pool_id, client_id): + """ + :param InfoExtrator ie: The extractor that instantiated this class. + :param str pool_id: The AWS user pool to connect to (format: <region>_<poolid>). + E.g.: eu-west-1_aLkOfYN3T + :param str client_id: The client application ID (the ID of the application connecting) + """ + + self.ie = ie + + self.pool_id = pool_id + if "_" not in self.pool_id: + raise ValueError("Invalid pool_id format. 
Should be <region>_<poolid>.") + + self.client_id = client_id + self.region = self.pool_id.split("_")[0] + self.url = "https://cognito-idp.%s.amazonaws.com/" % (self.region,) + + # Initialize the values + # https://github.com/aws/amazon-cognito-identity-js/blob/master/src/AuthenticationHelper.js#L22 + self.n_hex = 'FFFFFFFFFFFFFFFFC90FDAA22168C234C4C6628B80DC1CD1' + \ + '29024E088A67CC74020BBEA63B139B22514A08798E3404DD' + \ + 'EF9519B3CD3A431B302B0A6DF25F14374FE1356D6D51C245' + \ + 'E485B576625E7EC6F44C42E9A637ED6B0BFF5CB6F406B7ED' + \ + 'EE386BFB5A899FA5AE9F24117C4B1FE649286651ECE45B3D' + \ + 'C2007CB8A163BF0598DA48361C55D39A69163FA8FD24CF5F' + \ + '83655D23DCA3AD961C62F356208552BB9ED529077096966D' + \ + '670C354E4ABC9804F1746C08CA18217C32905E462E36CE3B' + \ + 'E39E772C180E86039B2783A2EC07A28FB5C55DF06F4C52C9' + \ + 'DE2BCBF6955817183995497CEA956AE515D2261898FA0510' + \ + '15728E5A8AAAC42DAD33170D04507A33A85521ABDF1CBA64' + \ + 'ECFB850458DBEF0A8AEA71575D060C7DB3970F85A6E1E4C7' + \ + 'ABF5AE8CDB0933D71E8C94E04A25619DCEE3D2261AD2EE6B' + \ + 'F12FFA06D98A0864D87602733EC86A64521F2B18177B200C' + \ + 'BBE117577A615D6C770988C0BAD946E208E24FA074E5AB31' + \ + '43DB5BFCE0FD108E4B82D120A93AD2CAFFFFFFFFFFFFFFFF' + + # https://github.com/aws/amazon-cognito-identity-js/blob/master/src/AuthenticationHelper.js#L49 + self.g_hex = '2' + self.info_bits = bytearray('Caldera Derived Key', 'utf-8') + + self.big_n = self.__hex_to_long(self.n_hex) + self.g = self.__hex_to_long(self.g_hex) + self.k = self.__hex_to_long(self.__hex_hash('00' + self.n_hex + '0' + self.g_hex)) + self.small_a_value = self.__generate_random_small_a() + self.large_a_value = self.__calculate_a() + + def authenticate(self, username, password): + """ Authenticate with a username and password. 
""" + # Step 1: First initiate an authentication request + auth_data_dict = self.__get_authentication_request(username) + auth_data = json.dumps(auth_data_dict).encode("utf-8") + auth_headers = { + "X-Amz-Target": "AWSCognitoIdentityProviderService.InitiateAuth", + "Accept-Encoding": "identity", + "Content-Type": "application/x-amz-json-1.1" + } + auth_response_json = self.ie._download_json( + self.url, None, data=auth_data, headers=auth_headers, + note='Authenticating username', errnote='Invalid username') + challenge_parameters = auth_response_json.get("ChallengeParameters") + + if auth_response_json.get("ChallengeName") != "PASSWORD_VERIFIER": + raise AuthenticationException(auth_response_json["message"]) + + # Step 2: Respond to the Challenge with a valid ChallengeResponse + challenge_request = self.__get_challenge_response_request(challenge_parameters, password) + challenge_data = json.dumps(challenge_request).encode("utf-8") + challenge_headers = { + "X-Amz-Target": "AWSCognitoIdentityProviderService.RespondToAuthChallenge", + "Content-Type": "application/x-amz-json-1.1" + } + auth_response_json = self.ie._download_json( + self.url, None, data=challenge_data, headers=challenge_headers, + note='Authenticating password', errnote='Invalid password') + + if 'message' in auth_response_json: + raise InvalidLoginException(auth_response_json['message']) + return ( + auth_response_json['AuthenticationResult']['IdToken'], + auth_response_json['AuthenticationResult']['RefreshToken'] + ) + + def __get_authentication_request(self, username): + """ + + :param str username: The username to use + + :return: A full Authorization request. 
+ :rtype: dict + """ + auth_request = { + "AuthParameters": { + "USERNAME": username, + "SRP_A": self.__long_to_hex(self.large_a_value) + }, + "AuthFlow": "USER_SRP_AUTH", + "ClientId": self.client_id + } + return auth_request + + def __get_challenge_response_request(self, challenge_parameters, password): + """ Create a Challenge Response Request object. + + :param dict[str,str|imt] challenge_parameters: The parameters for the challenge. + :param str password: The password. + + :return: A valid and full request data object to use as a response for a challenge. + :rtype: dict + """ + user_id = challenge_parameters["USERNAME"] + user_id_for_srp = challenge_parameters["USER_ID_FOR_SRP"] + srp_b = challenge_parameters["SRP_B"] + salt = challenge_parameters["SALT"] + secret_block = challenge_parameters["SECRET_BLOCK"] + + timestamp = self.__get_current_timestamp() + + # Get a HKDF key for the password, SrpB and the Salt + hkdf = self.__get_hkdf_key_for_password( + user_id_for_srp, + password, + self.__hex_to_long(srp_b), + salt + ) + secret_block_bytes = base64.standard_b64decode(secret_block) + + # the message is a combo of the pool_id, provided SRP userId, the Secret and Timestamp + msg = \ + bytearray(self.pool_id.split('_')[1], 'utf-8') + \ + bytearray(user_id_for_srp, 'utf-8') + \ + bytearray(secret_block_bytes) + \ + bytearray(timestamp, 'utf-8') + hmac_obj = hmac.new(hkdf, msg, digestmod=hashlib.sha256) + signature_string = base64.standard_b64encode(hmac_obj.digest()).decode('utf-8') + challenge_request = { + "ChallengeResponses": { + "USERNAME": user_id, + "TIMESTAMP": timestamp, + "PASSWORD_CLAIM_SECRET_BLOCK": secret_block, + "PASSWORD_CLAIM_SIGNATURE": signature_string + }, + "ChallengeName": "PASSWORD_VERIFIER", + "ClientId": self.client_id + } + return challenge_request + + def __get_hkdf_key_for_password(self, username, password, server_b_value, salt): + """ Calculates the final hkdf based on computed S value, and computed U value and the key. 
+ + :param str username: Username. + :param str password: Password. + :param int server_b_value: Server B value. + :param int salt: Generated salt. + + :return Computed HKDF value. + :rtype: object + """ + + u_value = self.__calculate_u(self.large_a_value, server_b_value) + if u_value == 0: + raise ValueError('U cannot be zero.') + username_password = '%s%s:%s' % (self.pool_id.split('_')[1], username, password) + username_password_hash = self.__hash_sha256(username_password.encode('utf-8')) + + x_value = self.__hex_to_long(self.__hex_hash(self.__pad_hex(salt) + username_password_hash)) + g_mod_pow_xn = pow(self.g, x_value, self.big_n) + int_value2 = server_b_value - self.k * g_mod_pow_xn + s_value = pow(int_value2, self.small_a_value + u_value * x_value, self.big_n) + hkdf = self.__compute_hkdf( + bytearray.fromhex(self.__pad_hex(s_value)), + bytearray.fromhex(self.__pad_hex(self.__long_to_hex(u_value))) + ) + return hkdf + + def __compute_hkdf(self, ikm, salt): + """ Standard hkdf algorithm + + :param {Buffer} ikm Input key material. + :param {Buffer} salt Salt value. + :return {Buffer} Strong key material. + """ + + prk = hmac.new(salt, ikm, hashlib.sha256).digest() + info_bits_update = self.info_bits + bytearray(chr(1), 'utf-8') + hmac_hash = hmac.new(prk, info_bits_update, hashlib.sha256).digest() + return hmac_hash[:16] + + def __calculate_u(self, big_a, big_b): + """ Calculate the client's value U which is the hash of A and B + + :param int big_a: Large A value. + :param int big_b: Server B value. + + :return Computed U value. + :rtype: int + """ + + u_hex_hash = self.__hex_hash(self.__pad_hex(big_a) + self.__pad_hex(big_b)) + return self.__hex_to_long(u_hex_hash) + + def __generate_random_small_a(self): + """ Helper function to generate a random big integer + + :return a random value. 
+ :rtype: int + """ + random_long_int = self.__get_random(128) + return random_long_int % self.big_n + + def __calculate_a(self): + """ Calculate the client's public value A = g^a%N with the generated random number a + + :return Computed large A. + :rtype: int + """ + + big_a = pow(self.g, self.small_a_value, self.big_n) + # safety check + if (big_a % self.big_n) == 0: + raise ValueError('Safety check for A failed') + return big_a + + @staticmethod + def __long_to_hex(long_num): + return '%x' % long_num + + @staticmethod + def __hex_to_long(hex_string): + return int(hex_string, 16) + + @staticmethod + def __hex_hash(hex_string): + return AwsIdp.__hash_sha256(bytearray.fromhex(hex_string)) + + @staticmethod + def __hash_sha256(buf): + """AuthenticationHelper.hash""" + digest = hashlib.sha256(buf).hexdigest() + return (64 - len(digest)) * '0' + digest + + @staticmethod + def __pad_hex(long_int): + """ Converts a Long integer (or hex string) to hex format padded with zeroes for hashing + + :param int|str long_int: Number or string to pad. + + :return Padded hex string. + :rtype: str + """ + + if not isinstance(long_int, str): + hash_str = AwsIdp.__long_to_hex(long_int) + else: + hash_str = long_int + if len(hash_str) % 2 == 1: + hash_str = '0%s' % hash_str + elif hash_str[0] in '89ABCDEFabcdef': + hash_str = '00%s' % hash_str + return hash_str + + @staticmethod + def __get_random(nbytes): + random_hex = binascii.hexlify(os.urandom(nbytes)) + return AwsIdp.__hex_to_long(random_hex) + + @staticmethod + def __get_current_timestamp(): + """ Creates a timestamp with the correct English format. 
+ + :return: timestamp in format 'Sun Jan 27 19:00:04 UTC 2019' + :rtype: str + """ + + # We need US only data, so we cannot just do a strftime: + # Sun Jan 27 19:00:04 UTC 2019 + months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] + + time_now = datetime.datetime.utcnow() + format_string = "{} {} {} %H:%M:%S UTC %Y".format(days[time_now.weekday()], months[time_now.month], time_now.day) + time_string = datetime.datetime.utcnow().strftime(format_string) + return time_string + + def __str__(self): + return "AWS IDP Client for:\nRegion: %s\nPoolId: %s\nAppId: %s" % ( + self.region, self.pool_id.split("_")[1], self.client_id + ) diff --git a/hypervideo_dl/extractor/gopro.py b/hypervideo_dl/extractor/gopro.py index 10cc1ae..ae96537 100644 --- a/hypervideo_dl/extractor/gopro.py +++ b/hypervideo_dl/extractor/gopro.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -81,8 +78,6 @@ class GoProIE(InfoExtractor): 'height': int_or_none(fmt.get('height')), }) - self._sort_formats(formats) - title = str_or_none( try_get(metadata, lambda x: x['collection']['title']) or self._html_search_meta(['og:title', 'twitter:title'], webpage) diff --git a/hypervideo_dl/extractor/goshgay.py b/hypervideo_dl/extractor/goshgay.py index 377981d..9a1f32b 100644 --- a/hypervideo_dl/extractor/goshgay.py +++ b/hypervideo_dl/extractor/goshgay.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_parse_qs, diff --git a/hypervideo_dl/extractor/gotostage.py b/hypervideo_dl/extractor/gotostage.py index 6aa9610..112293b 100644 --- a/hypervideo_dl/extractor/gotostage.py +++ b/hypervideo_dl/extractor/gotostage.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import 
InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/gputechconf.py b/hypervideo_dl/extractor/gputechconf.py index 73dc62c..2d13bf4 100644 --- a/hypervideo_dl/extractor/gputechconf.py +++ b/hypervideo_dl/extractor/gputechconf.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/gronkh.py b/hypervideo_dl/extractor/gronkh.py index c9f1dd2..b9370e3 100644 --- a/hypervideo_dl/extractor/gronkh.py +++ b/hypervideo_dl/extractor/gronkh.py @@ -1,20 +1,34 @@ -# coding: utf-8 -from __future__ import unicode_literals +import functools from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( + OnDemandPagedList, + traverse_obj, + unified_strdate, +) class GronkhIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gronkh\.tv/(?:watch/)?stream/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?gronkh\.tv/(?:watch/)?streams?/(?P<id>\d+)' _TESTS = [{ + 'url': 'https://gronkh.tv/streams/657', + 'info_dict': { + 'id': '657', + 'ext': 'mp4', + 'title': 'H.O.R.D.E. 
- DAS ZWEiTE ZEiTALTER 🎲 Session 1', + 'view_count': int, + 'thumbnail': 'https://01.cdn.vod.farm/preview/9e2555d3a23bf4e5c5b7c6b3b70a9d84.jpg', + 'upload_date': '20221111' + }, + 'params': {'skip_download': True} + }, { 'url': 'https://gronkh.tv/stream/536', 'info_dict': { 'id': '536', 'ext': 'mp4', 'title': 'GTV0536, 2021-10-01 - MARTHA IS DEAD #FREiAB1830 !FF7 !horde !archiv', - 'view_count': 19491, + 'view_count': int, 'thumbnail': 'https://01.cdn.vod.farm/preview/6436746cce14e25f751260a692872b9b.jpg', 'upload_date': '20211001' }, @@ -34,7 +48,6 @@ class GronkhIE(InfoExtractor): 'url': data_json['vtt_url'], 'ext': 'vtt', }) - self._sort_formats(formats) return { 'id': id, 'title': data_json.get('title'), @@ -44,3 +57,54 @@ class GronkhIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } + + +class GronkhFeedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gronkh\.tv(?:/feed)?/?(?:#|$)' + IE_NAME = 'gronkh:feed' + + _TESTS = [{ + 'url': 'https://gronkh.tv/feed', + 'info_dict': { + 'id': 'feed', + }, + 'playlist_count': 16, + }, { + 'url': 'https://gronkh.tv', + 'only_matching': True, + }] + + def _entries(self): + for type_ in ('recent', 'views'): + info = self._download_json( + f'https://api.gronkh.tv/v1/video/discovery/{type_}', 'feed', note=f'Downloading {type_} API JSON') + for item in traverse_obj(info, ('discovery', ...)) or []: + yield self.url_result(f'https://gronkh.tv/watch/stream/{item["episode"]}', GronkhIE, item.get('title')) + + def _real_extract(self, url): + return self.playlist_result(self._entries(), 'feed') + + +class GronkhVodsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gronkh\.tv/vods/streams/?(?:#|$)' + IE_NAME = 'gronkh:vods' + + _TESTS = [{ + 'url': 'https://gronkh.tv/vods/streams', + 'info_dict': { + 'id': 'vods', + }, + 'playlist_mincount': 150, + }] + _PER_PAGE = 25 + + def _fetch_page(self, page): + items = traverse_obj(self._download_json( + 'https://api.gronkh.tv/v1/search', 'vods', query={'offset': 
self._PER_PAGE * page, 'first': self._PER_PAGE}, + note=f'Downloading stream video page {page + 1}'), ('results', 'videos', ...)) + for item in items or []: + yield self.url_result(f'https://gronkh.tv/watch/stream/{item["episode"]}', GronkhIE, item['episode'], item.get('title')) + + def _real_extract(self, url): + entries = OnDemandPagedList(functools.partial(self._fetch_page), self._PER_PAGE) + return self.playlist_result(entries, 'vods') diff --git a/hypervideo_dl/extractor/groupon.py b/hypervideo_dl/extractor/groupon.py index a6da909..362d3ff 100644 --- a/hypervideo_dl/extractor/groupon.py +++ b/hypervideo_dl/extractor/groupon.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/harpodeon.py b/hypervideo_dl/extractor/harpodeon.py new file mode 100644 index 0000000..0aa4733 --- /dev/null +++ b/hypervideo_dl/extractor/harpodeon.py @@ -0,0 +1,70 @@ +from .common import InfoExtractor +from ..utils import unified_strdate + + +class HarpodeonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?harpodeon\.com/(?:video|preview)/\w+/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.harpodeon.com/video/The_Smoking_Out_of_Bella_Butts/268068288', + 'md5': '727371564a6a9ebccef2073535b5b6bd', + 'skip': 'Free video could become unavailable', + 'info_dict': { + 'id': '268068288', + 'ext': 'mp4', + 'title': 'The Smoking Out of Bella Butts', + 'description': 'md5:47e16bdb41fc8a79c83ab83af11c8b77', + 'creator': 'Vitagraph Company of America', + 'release_date': '19150101' + } + }, { + 'url': 'https://www.harpodeon.com/preview/The_Smoking_Out_of_Bella_Butts/268068288', + 'md5': '6dfea5412845f690c7331be703f884db', + 'info_dict': { + 'id': '268068288', + 'ext': 'mp4', + 'title': 'The Smoking Out of Bella Butts', + 'description': 'md5:47e16bdb41fc8a79c83ab83af11c8b77', + 'creator': 'Vitagraph Company of America', + 'release_date': '19150101' + } + }, { + 'url': 
'https://www.harpodeon.com/preview/Behind_the_Screen/421838710', + 'md5': '7979df9ca04637282cb7d172ab3a9c3b', + 'info_dict': { + 'id': '421838710', + 'ext': 'mp4', + 'title': 'Behind the Screen', + 'description': 'md5:008972a3dc51fba3965ee517d2ba9155', + 'creator': 'Lone Star Corporation', + 'release_date': '19160101' + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title, creator, release_year = self._search_regex( + r'''(?x) + <div[^>]+videoInfo[^<]*<h2[^>]*>(?P<title>[^>]+)</h2> + (?:\s*<p[^>]*>\((?P<creator>.+),\s*)?(?P<release_year>\d{4})?''', + webpage, 'title', group=('title', 'creator', 'release_year'), + fatal=False) or (None, None, None) + + hp_base = self._html_search_regex(r'hpBase\(\s*["\']([^"\']+)', webpage, 'hp_base') + + hp_inject_video, hp_resolution = self._search_regex( + r'''(?x) + hpInjectVideo\([\'\"](?P<hp_inject_video>\w+)[\'\"], + [\'\"](?P<hp_resolution>\d+)[\'\"]''', + webpage, 'hp_inject_video', group=['hp_inject_video', 'hp_resolution']) + + return { + 'id': video_id, + 'title': title, + 'url': f'{hp_base}{hp_inject_video}_{hp_resolution}.mp4', + 'http_headers': {'Referer': url}, + 'description': self._html_search_meta('description', webpage, fatal=False), + 'creator': creator, + 'release_date': unified_strdate(f'{release_year}0101') + } diff --git a/hypervideo_dl/extractor/hbo.py b/hypervideo_dl/extractor/hbo.py index 68df748..530bdb7 100644 --- a/hypervideo_dl/extractor/hbo.py +++ b/hypervideo_dl/extractor/hbo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -115,7 +112,6 @@ class HBOBaseIE(InfoExtractor): 'width': format_info.get('width'), 'height': format_info.get('height'), }) - self._sort_formats(formats) thumbnails = [] card_sizes = xpath_element(video_data, 'titleCardSizes') diff --git a/hypervideo_dl/extractor/hearthisat.py b/hypervideo_dl/extractor/hearthisat.py index 
a3d6a05..d1a400d 100644 --- a/hypervideo_dl/extractor/hearthisat.py +++ b/hypervideo_dl/extractor/hearthisat.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -85,7 +81,6 @@ class HearThisAtIE(InfoExtractor): 'acodec': ext, 'quality': 2, # Usually better quality }) - self._sort_formats(formats) return { 'id': track_id, diff --git a/hypervideo_dl/extractor/heise.py b/hypervideo_dl/extractor/heise.py index cbe564a..27d737c 100644 --- a/hypervideo_dl/extractor/heise.py +++ b/hypervideo_dl/extractor/heise.py @@ -1,13 +1,12 @@ -# coding: utf-8 -from __future__ import unicode_literals +import urllib.parse from .common import InfoExtractor from .kaltura import KalturaIE from .youtube import YoutubeIE from ..utils import ( + NO_DEFAULT, determine_ext, int_or_none, - NO_DEFAULT, parse_iso8601, smuggle_url, xpath_text, @@ -26,6 +25,9 @@ class HeiseIE(InfoExtractor): 'timestamp': 1512734959, 'upload_date': '20171208', 'description': 'md5:c934cbfb326c669c2bcabcbe3d3fcd20', + 'thumbnail': 're:^https?://.*/thumbnail/.*', + 'duration': 2845, + 'view_count': int, }, 'params': { 'skip_download': True, @@ -37,11 +39,27 @@ class HeiseIE(InfoExtractor): 'info_dict': { 'id': '6kmWbXleKW4', 'ext': 'mp4', - 'title': 'NEU IM SEPTEMBER | Netflix', - 'description': 'md5:2131f3c7525e540d5fd841de938bd452', + 'title': 'Neu im September 2017 | Netflix', + 'description': 'md5:d6852d1f96bb80760608eed3b907437c', 'upload_date': '20170830', 'uploader': 'Netflix Deutschland, Österreich und Schweiz', 'uploader_id': 'netflixdach', + 'categories': ['Entertainment'], + 'tags': 'count:27', + 'age_limit': 0, + 'availability': 'public', + 'comment_count': int, + 'channel_id': 'UCZqgRlLcvO3Fnx_npQJygcQ', + 'thumbnail': 'https://i.ytimg.com/vi_webp/6kmWbXleKW4/maxresdefault.webp', + 'uploader_url': 'http://www.youtube.com/user/netflixdach', + 'playable_in_embed': True, + 'live_status': 'not_live', + 
'channel_url': 'https://www.youtube.com/channel/UCZqgRlLcvO3Fnx_npQJygcQ', + 'view_count': int, + 'channel': 'Netflix Deutschland, Österreich und Schweiz', + 'channel_follower_count': int, + 'like_count': int, + 'duration': 67, }, 'params': { 'skip_download': True, @@ -55,11 +73,15 @@ class HeiseIE(InfoExtractor): 'description': 'md5:47e8ffb6c46d85c92c310a512d6db271', 'timestamp': 1512470717, 'upload_date': '20171205', + 'duration': 786, + 'view_count': int, + 'thumbnail': 're:^https?://.*/thumbnail/.*', }, 'params': { 'skip_download': True, }, }, { + # FIXME: Video m3u8 fails to download; issue with Kaltura extractor 'url': 'https://www.heise.de/ct/artikel/c-t-uplink-20-8-Staubsaugerroboter-Xiaomi-Vacuum-2-AR-Brille-Meta-2-und-Android-rooten-3959893.html', 'info_dict': { 'id': '1_59mk80sf', @@ -72,6 +94,18 @@ class HeiseIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # videout + 'url': 'https://www.heise.de/ct/artikel/c-t-uplink-3-8-Anonyme-SIM-Karten-G-Sync-Monitore-Citizenfour-2440327.html', + 'info_dict': { + 'id': '2440327', + 'ext': 'mp4', + 'title': 'c\'t uplink 3.8: Anonyme SIM-Karten, G-Sync-Monitore, Citizenfour', + 'thumbnail': 'http://www.heise.de/imagine/yxM2qmol0xV3iFB7qFb70dGvXjc/gallery/', + 'description': 'md5:fa164d8c8707dff124a9626d39205f5d', + 'timestamp': 1414825200, + 'upload_date': '20141101', + } }, { 'url': 'http://www.heise.de/ct/artikel/c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2403911.html', 'only_matching': True, @@ -124,26 +158,28 @@ class HeiseIE(InfoExtractor): if kaltura_id: return _make_kaltura_result('kaltura:2238431:%s' % kaltura_id) - yt_urls = YoutubeIE._extract_urls(webpage) + yt_urls = tuple(YoutubeIE._extract_embed_urls(url, webpage)) if yt_urls: return self.playlist_from_matches( yt_urls, video_id, title, ie=YoutubeIE.ie_key()) title = extract_title() - - container_id = self._search_regex( - r'<div class="videoplayerjw"[^>]+data-container="([0-9]+)"', - webpage, 'container ID') - - 
sequenz_id = self._search_regex( - r'<div class="videoplayerjw"[^>]+data-sequenz="([0-9]+)"', - webpage, 'sequenz ID') - - doc = self._download_xml( - 'http://www.heise.de/videout/feed', video_id, query={ + api_params = urllib.parse.parse_qs( + self._search_regex(r'/videout/feed\.json\?([^\']+)', webpage, 'feed params', default=None) or '') + if not api_params or 'container' not in api_params or 'sequenz' not in api_params: + container_id = self._search_regex( + r'<div class="videoplayerjw"[^>]+data-container="([0-9]+)"', + webpage, 'container ID') + + sequenz_id = self._search_regex( + r'<div class="videoplayerjw"[^>]+data-sequenz="([0-9]+)"', + webpage, 'sequenz ID') + api_params = { 'container': container_id, 'sequenz': sequenz_id, - }) + } + doc = self._download_xml( + 'http://www.heise.de/videout/feed', video_id, query=api_params) formats = [] for source_node in doc.findall('.//{http://rss.jwpcdn.com/}source'): @@ -158,7 +194,6 @@ class HeiseIE(InfoExtractor): 'format_id': '%s_%s' % (ext, label), 'height': height, }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/hellporno.py b/hypervideo_dl/extractor/hellporno.py index 92d32cd..fa32b27 100644 --- a/hypervideo_dl/extractor/hellporno.py +++ b/hypervideo_dl/extractor/hellporno.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -41,7 +39,6 @@ class HellPornoIE(InfoExtractor): title = remove_end(self._html_extract_title(webpage), ' - Hell Porno') info = self._parse_html5_media_entries(url, webpage, display_id)[0] - self._sort_formats(info['formats']) video_id = self._search_regex( (r'chs_object\s*=\s*["\'](\d+)', diff --git a/hypervideo_dl/extractor/helsinki.py b/hypervideo_dl/extractor/helsinki.py index 575fb33..e518cae 100644 --- a/hypervideo_dl/extractor/helsinki.py +++ b/hypervideo_dl/extractor/helsinki.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from __future__ import unicode_literals - 
from .common import InfoExtractor from ..utils import js_to_json @@ -33,7 +29,6 @@ class HelsinkiIE(InfoExtractor): 'url': s['file'], 'ext': 'mp4', } for s in params['sources']] - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/hentaistigma.py b/hypervideo_dl/extractor/hentaistigma.py index 86a93de..ca5ffc2 100644 --- a/hypervideo_dl/extractor/hentaistigma.py +++ b/hypervideo_dl/extractor/hentaistigma.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/hgtv.py b/hypervideo_dl/extractor/hgtv.py index a4f3325..c40017d 100644 --- a/hypervideo_dl/extractor/hgtv.py +++ b/hypervideo_dl/extractor/hgtv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/hidive.py b/hypervideo_dl/extractor/hidive.py index 46d7d62..3a53f2c 100644 --- a/hypervideo_dl/extractor/hidive.py +++ b/hypervideo_dl/extractor/hidive.py @@ -1,4 +1,3 @@ -# coding: utf-8 import re from .common import InfoExtractor @@ -39,7 +38,9 @@ class HiDiveIE(InfoExtractor): webpage = self._download_webpage(self._LOGIN_URL, None) form = self._search_regex( r'(?s)<form[^>]+action="/account/login"[^>]*>(.+?)</form>', - webpage, 'login form') + webpage, 'login form', default=None) + if not form: # logged in + return data = self._hidden_inputs(form) data.update({ 'Email': username, @@ -102,7 +103,6 @@ class HiDiveIE(InfoExtractor): f['language'] = audio f['format_note'] = f'{version}, {extra}' formats.extend(frmt) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/historicfilms.py b/hypervideo_dl/extractor/historicfilms.py index 56343e9..c428fee 100644 --- a/hypervideo_dl/extractor/historicfilms.py +++ b/hypervideo_dl/extractor/historicfilms.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import 
parse_duration diff --git a/hypervideo_dl/extractor/hitbox.py b/hypervideo_dl/extractor/hitbox.py index 0470d0a..f0c6898 100644 --- a/hypervideo_dl/extractor/hitbox.py +++ b/hypervideo_dl/extractor/hitbox.py @@ -1,16 +1,13 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( clean_html, - parse_iso8601, + determine_ext, float_or_none, int_or_none, - compat_str, - determine_ext, + parse_iso8601, ) @@ -121,7 +118,6 @@ class HitboxIE(InfoExtractor): 'tbr': bitrate, 'format_note': label, }) - self._sort_formats(formats) metadata = self._extract_metadata( 'https://www.smashcast.tv/api/media/video', video_id) @@ -130,7 +126,7 @@ class HitboxIE(InfoExtractor): return metadata -class HitboxLiveIE(HitboxIE): +class HitboxLiveIE(HitboxIE): # XXX: Do not subclass from concrete IE IE_NAME = 'hitbox:live' _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?P<id>[^/?#&]+)' _TESTS = [{ @@ -203,7 +199,6 @@ class HitboxLiveIE(HitboxIE): 'page_url': url, 'player_url': 'http://www.hitbox.tv/static/player/flowplayer/flowplayer.commercial-3.2.16.swf', }) - self._sort_formats(formats) metadata = self._extract_metadata( 'https://www.smashcast.tv/api/media/live', video_id) diff --git a/hypervideo_dl/extractor/hitrecord.py b/hypervideo_dl/extractor/hitrecord.py index fd5dc29..902af44 100644 --- a/hypervideo_dl/extractor/hitrecord.py +++ b/hypervideo_dl/extractor/hitrecord.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/hketv.py b/hypervideo_dl/extractor/hketv.py index 1f3502b..1087956 100644 --- a/hypervideo_dl/extractor/hketv.py +++ b/hypervideo_dl/extractor/hketv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ 
-140,7 +137,6 @@ class HKETVIE(InfoExtractor): 'width': w, 'height': h, }) - self._sort_formats(formats) subtitles = {} tracks = try_get(playlist0, lambda x: x['tracks'], list) or [] diff --git a/hypervideo_dl/extractor/holodex.py b/hypervideo_dl/extractor/holodex.py new file mode 100644 index 0000000..a2b73ec --- /dev/null +++ b/hypervideo_dl/extractor/holodex.py @@ -0,0 +1,100 @@ +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import traverse_obj + + +class HolodexIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.|staging\.)?holodex\.net/(?: + api/v2/playlist/(?P<playlist>\d+)| + watch/(?P<id>[\w-]{11})(?:\?(?:[^#]+&)?playlist=(?P<playlist2>\d+))? + )''' + _TESTS = [{ + 'url': 'https://holodex.net/watch/9kQ2GtvDV3s', + 'md5': 'be5ffce2f0feae8ba4c01553abc0f175', + 'info_dict': { + 'ext': 'mp4', + 'id': '9kQ2GtvDV3s', + 'title': '【おちゃめ機能】ホロライブが吹っ切れた【24人で歌ってみた】', + 'channel_id': 'UCJFZiqLMntJufDCHc6bQixg', + 'playable_in_embed': True, + 'tags': 'count:43', + 'age_limit': 0, + 'live_status': 'not_live', + 'description': 'md5:040e866c09dc4ab899b36479f4b7c7a2', + 'channel_url': 'https://www.youtube.com/channel/UCJFZiqLMntJufDCHc6bQixg', + 'upload_date': '20200406', + 'uploader_url': 'http://www.youtube.com/channel/UCJFZiqLMntJufDCHc6bQixg', + 'view_count': int, + 'channel': 'hololive ホロライブ - VTuber Group', + 'categories': ['Music'], + 'uploader': 'hololive ホロライブ - VTuber Group', + 'channel_follower_count': int, + 'uploader_id': 'UCJFZiqLMntJufDCHc6bQixg', + 'availability': 'public', + 'thumbnail': 'https://i.ytimg.com/vi_webp/9kQ2GtvDV3s/maxresdefault.webp', + 'duration': 263, + 'like_count': int, + }, + }, { + 'url': 'https://holodex.net/api/v2/playlist/239', + 'info_dict': { + 'id': '239', + 'title': 'Songs/Videos that made fall into the rabbit hole (from my google activity history)', + }, + 'playlist_count': 14, + }, { + 'url': 'https://holodex.net/watch/_m2mQyaofjI?foo=bar&playlist=69', + 'info_dict': { + 'id': '69', + 
'title': '拿著金斧頭的藍髮大姊姊' + }, + 'playlist_count': 3, + }, { + 'url': 'https://holodex.net/watch/_m2mQyaofjI?playlist=69', + 'info_dict': { + 'id': '_m2mQyaofjI', + 'ext': 'mp4', + 'playable_in_embed': True, + 'like_count': int, + 'uploader': 'Ernst / エンスト', + 'duration': 11, + 'uploader_url': 'http://www.youtube.com/channel/UCqSX4PPZY0cyetqKVY_wRVA', + 'categories': ['Entertainment'], + 'title': '【星街すいせい】星街向你獻上晚安', + 'upload_date': '20210705', + 'description': 'md5:8b8ffb157bae77f2d109021a0b577d4a', + 'channel': 'Ernst / エンスト', + 'channel_id': 'UCqSX4PPZY0cyetqKVY_wRVA', + 'channel_follower_count': int, + 'view_count': int, + 'tags': [], + 'live_status': 'not_live', + 'channel_url': 'https://www.youtube.com/channel/UCqSX4PPZY0cyetqKVY_wRVA', + 'availability': 'public', + 'thumbnail': 'https://i.ytimg.com/vi_webp/_m2mQyaofjI/maxresdefault.webp', + 'age_limit': 0, + 'uploader_id': 'UCqSX4PPZY0cyetqKVY_wRVA', + 'comment_count': int, + }, + 'params': {'noplaylist': True}, + }, { + 'url': 'https://staging.holodex.net/api/v2/playlist/125', + 'only_matching': True, + }, { + 'url': 'https://staging.holodex.net/watch/rJJTJA_T_b0?playlist=25', + 'only_matching': True, + }, { + 'url': 'https://staging.holodex.net/watch/s1ifBeukThg', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, playlist_id, pl_id2 = self._match_valid_url(url).group('id', 'playlist', 'playlist2') + playlist_id = playlist_id or pl_id2 + + if not self._yes_playlist(playlist_id, video_id): + return self.url_result(f'https://www.youtube.com/watch?v={video_id}', YoutubeIE) + + data = self._download_json(f'https://holodex.net/api/v2/playlist/{playlist_id}', playlist_id) + return self.playlist_from_matches( + traverse_obj(data, ('videos', ..., 'id')), playlist_id, data.get('name'), ie=YoutubeIE) diff --git a/hypervideo_dl/extractor/hornbunny.py b/hypervideo_dl/extractor/hornbunny.py deleted file mode 100644 index c458a95..0000000 --- a/hypervideo_dl/extractor/hornbunny.py +++ /dev/null @@ 
-1,49 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, -) - - -class HornBunnyIE(InfoExtractor): - _VALID_URL = r'http?://(?:www\.)?hornbunny\.com/videos/(?P<title_dash>[a-z-]+)-(?P<id>\d+)\.html' - _TEST = { - 'url': 'http://hornbunny.com/videos/panty-slut-jerk-off-instruction-5227.html', - 'md5': 'e20fd862d1894b67564c96f180f43924', - 'info_dict': { - 'id': '5227', - 'ext': 'mp4', - 'title': 'panty slut jerk off instruction', - 'duration': 550, - 'age_limit': 18, - 'view_count': int, - 'thumbnail': r're:^https?://.*\.jpg$', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage) - info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] - - duration = parse_duration(self._search_regex( - r'<strong>Runtime:</strong>\s*([0-9:]+)</div>', - webpage, 'duration', fatal=False)) - view_count = int_or_none(self._search_regex( - r'<strong>Views:</strong>\s*(\d+)</div>', - webpage, 'view count', fatal=False)) - - info_dict.update({ - 'id': video_id, - 'title': title, - 'duration': duration, - 'view_count': view_count, - 'age_limit': 18, - }) - - return info_dict diff --git a/hypervideo_dl/extractor/hotnewhiphop.py b/hypervideo_dl/extractor/hotnewhiphop.py index 4703e18..f8570cb 100644 --- a/hypervideo_dl/extractor/hotnewhiphop.py +++ b/hypervideo_dl/extractor/hotnewhiphop.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_b64decode from ..utils import ( diff --git a/hypervideo_dl/extractor/hotstar.py b/hypervideo_dl/extractor/hotstar.py index d55a79b..61eec7b 100644 --- a/hypervideo_dl/extractor/hotstar.py +++ b/hypervideo_dl/extractor/hotstar.py @@ -1,31 +1,33 @@ -# coding: utf-8 -from __future__ import unicode_literals - import hashlib import hmac +import json import re 
import time import uuid -import json from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str -) +from ..compat import compat_HTTPError, compat_str from ..utils import ( - determine_ext, ExtractorError, + determine_ext, int_or_none, + join_nonempty, str_or_none, - try_get, + traverse_obj, url_or_none, ) class HotStarBaseIE(InfoExtractor): + _BASE_URL = 'https://www.hotstar.com' + _API_URL = 'https://api.hotstar.com' _AKAMAI_ENCRYPTION_KEY = b'\x05\xfc\x1a\x01\xca\xc9\x4b\xc4\x12\xfc\x53\x12\x07\x75\xf9\xee' + def _call_api_v1(self, path, *args, **kwargs): + return self._download_json( + f'{self._API_URL}/o/v1/{path}', *args, **kwargs, + headers={'x-country-code': 'IN', 'x-platform-code': 'PCTV'}) + def _call_api_impl(self, path, video_id, query, st=None, cookies=None): st = int_or_none(st) or int(time.time()) exp = st + 6000 @@ -36,7 +38,7 @@ class HotStarBaseIE(InfoExtractor): token = cookies.get('userUP').value else: token = self._download_json( - 'https://api.hotstar.com/um/v3/users', + f'{self._API_URL}/um/v3/users', video_id, note='Downloading token', data=json.dumps({"device_ids": [{"id": compat_str(uuid.uuid4()), "type": "device_id"}]}).encode('utf-8'), headers={ @@ -46,58 +48,48 @@ class HotStarBaseIE(InfoExtractor): })['user_identity'] response = self._download_json( - 'https://api.hotstar.com/' + path, video_id, headers={ + f'{self._API_URL}/{path}', video_id, query=query, + headers={ 'hotstarauth': auth, 'x-hs-appversion': '6.72.2', 'x-hs-platform': 'web', 'x-hs-usertoken': token, - }, query=query) + }) if response['message'] != "Playback URL's fetched successfully": raise ExtractorError( response['message'], expected=True) return response['data'] - def _call_api(self, path, video_id, query_name='contentId'): - return self._download_json('https://api.hotstar.com/' + path, video_id=video_id, query={ - query_name: video_id, - 'tas': 10000, - }, headers={ - 'x-country-code': 'IN', - 'x-platform-code': 'PCTV', - }) - def 
_call_api_v2(self, path, video_id, st=None, cookies=None): return self._call_api_impl( - '%s/content/%s' % (path, video_id), video_id, st=st, cookies=cookies, query={ + f'{path}/content/{video_id}', video_id, st=st, cookies=cookies, query={ 'desired-config': 'audio_channel:stereo|container:fmp4|dynamic_range:hdr|encryption:plain|ladder:tv|package:dash|resolution:fhd|subs-tag:HotstarVIP|video_codec:h265', 'device-id': cookies.get('device_id').value if cookies.get('device_id') else compat_str(uuid.uuid4()), 'os-name': 'Windows', 'os-version': '10', }) + def _playlist_entries(self, path, item_id, root=None, **kwargs): + results = self._call_api_v1(path, item_id, **kwargs)['body']['results'] + for video in traverse_obj(results, (('assets', None), 'items', ...)): + if video.get('contentId'): + yield self.url_result( + HotStarIE._video_url(video['contentId'], root=root), HotStarIE, video['contentId']) + class HotStarIE(HotStarBaseIE): IE_NAME = 'hotstar' _VALID_URL = r'''(?x) - (?: - hotstar\:| - https?://(?:www\.)?hotstar\.com(?:/in)?/(?!in/) - ) - (?: - (?P<type>movies|sports|episode|(?P<tv>tv)) - (?: - \:| - /[^/?#]+/ - (?(tv) - (?:[^/?#]+/){2}| - (?:[^/?#]+/)* - ) - )| - [^/?#]+/ - )? - (?P<id>\d{10}) - ''' + https?://(?:www\.)?hotstar\.com(?:/in)?/(?!in/) + (?: + (?P<type>movies|sports|episode|(?P<tv>tv))/ + (?(tv)(?:[^/?#]+/){2}|[^?#]*) + )? 
+ [^/?#]+/ + (?P<id>\d{10}) + ''' + _TESTS = [{ 'url': 'https://www.hotstar.com/can-you-not-spread-rumours/1000076273', 'info_dict': { @@ -108,38 +100,9 @@ class HotStarIE(HotStarBaseIE): 'timestamp': 1447248600, 'upload_date': '20151111', 'duration': 381, + 'episode': 'Can You Not Spread Rumours?', }, - }, { - 'url': 'hotstar:1000076273', - 'only_matching': True, - }, { - 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', - 'info_dict': { - 'id': '1000057157', - 'ext': 'mp4', - 'title': 'Radha Gopalam', - 'description': 'md5:be3bc342cc120bbc95b3b0960e2b0d22', - 'timestamp': 1140805800, - 'upload_date': '20060224', - 'duration': 9182, - }, - }, { - 'url': 'hotstar:movies:1000057157', - 'only_matching': True, - }, { - 'url': 'https://www.hotstar.com/in/sports/cricket/follow-the-blues-2021/recap-eng-fight-back-on-day-2/1260066104', - 'only_matching': True, - }, { - 'url': 'https://www.hotstar.com/in/sports/football/most-costly-pl-transfers-ft-grealish/1260065956', - 'only_matching': True, - }, { - # contentData - 'url': 'hotstar:sports:1260065956', - 'only_matching': True, - }, { - # contentData - 'url': 'hotstar:sports:1260066104', - 'only_matching': True, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://www.hotstar.com/tv/ek-bhram-sarvagun-sampanna/s-2116/janhvi-targets-suman/1000234847', 'info_dict': { @@ -158,12 +121,19 @@ class HotStarIE(HotStarBaseIE): 'season_id': 6771, 'episode': 'Janhvi Targets Suman', 'episode_number': 8, - }, + } }, { - 'url': 'hotstar:episode:1000234847', + 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', + 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/in/sports/cricket/follow-the-blues-2021/recap-eng-fight-back-on-day-2/1260066104', + 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/in/sports/football/most-costly-pl-transfers-ft-grealish/1260065956', 'only_matching': True, }] _GEO_BYPASS = False + _TYPE = { 'movies': 'movie', 'sports': 'match', @@ -172,41 +142,54 
@@ class HotStarIE(HotStarBaseIE): None: 'content', } + _IGNORE_MAP = { + 'res': 'resolution', + 'vcodec': 'video_codec', + 'dr': 'dynamic_range', + } + + @classmethod + def _video_url(cls, video_id, video_type=None, *, slug='ignore_me', root=None): + assert None in (video_type, root) + if not root: + root = join_nonempty(cls._BASE_URL, video_type, delim='/') + return f'{root}/{slug}/{video_id}' + def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - video_type = mobj.group('type') - cookies = self._get_cookies(url) + video_id, video_type = self._match_valid_url(url).group('id', 'type') video_type = self._TYPE.get(video_type, video_type) - video_data = self._call_api(f'o/v1/{video_type}/detail', video_id)['body']['results']['item'] - title = video_data['title'] + cookies = self._get_cookies(url) # Cookies before any request + video_data = self._call_api_v1(f'{video_type}/detail', video_id, + query={'tas': 10000, 'contentId': video_id})['body']['results']['item'] if not self.get_param('allow_unplayable_formats') and video_data.get('drmProtected'): self.report_drm(video_id) - headers = {'Referer': 'https://www.hotstar.com/in'} - formats = [] - subs = {} + # See https://github.com/hypervideo/hypervideo/issues/396 + st = self._download_webpage_handle(f'{self._BASE_URL}/in', video_id)[1].headers.get('x-origin-date') + geo_restricted = False - _, urlh = self._download_webpage_handle('https://www.hotstar.com/in', video_id) - # Required to fix https://github.com/hypervideo/hypervideo/issues/396 - st = urlh.headers.get('x-origin-date') + formats, subs = [], {} + headers = {'Referer': f'{self._BASE_URL}/in'} + # change to v2 in the future playback_sets = self._call_api_v2('play/v1/playback', video_id, st=st, cookies=cookies)['playBackSets'] for playback_set in playback_sets: if not isinstance(playback_set, dict): continue - dr = re.search(r'dynamic_range:(?P<dr>[a-z]+)', playback_set.get('tagsCombination')).group('dr') + tags = 
str_or_none(playback_set.get('tagsCombination')) or '' + if any(f'{prefix}:{ignore}' in tags + for key, prefix in self._IGNORE_MAP.items() + for ignore in self._configuration_arg(key)): + continue + format_url = url_or_none(playback_set.get('playbackUrl')) if not format_url: continue - format_url = re.sub( - r'(?<=//staragvod)(\d)', r'web\1', format_url) - tags = str_or_none(playback_set.get('tagsCombination')) or '' - ingored_res, ignored_vcodec, ignored_dr = self._configuration_arg('res'), self._configuration_arg('vcodec'), self._configuration_arg('dr') - if any(f'resolution:{ig_res}' in tags for ig_res in ingored_res) or any(f'video_codec:{ig_vc}' in tags for ig_vc in ignored_vcodec) or any(f'dynamic_range:{ig_dr}' in tags for ig_dr in ignored_dr): - continue + format_url = re.sub(r'(?<=//staragvod)(\d)', r'web\1', format_url) + dr = re.search(r'dynamic_range:(?P<dr>[a-z]+)', playback_set.get('tagsCombination')).group('dr') ext = determine_ext(format_url) + current_formats, current_subs = [], {} try: if 'package:hls' in tags or ext == 'm3u8': @@ -218,8 +201,7 @@ class HotStarIE(HotStarBaseIE): current_formats, current_subs = self._extract_mpd_formats_and_subtitles( format_url, video_id, mpd_id=f'{dr}-dash', headers=headers) elif ext == 'f4m': - # produce broken files - pass + pass # XXX: produce broken files else: current_formats = [{ 'url': format_url, @@ -230,6 +212,7 @@ class HotStarIE(HotStarBaseIE): if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: geo_restricted = True continue + if tags and 'encryption:plain' not in tags: for f in current_formats: f['has_drm'] = True @@ -238,18 +221,18 @@ class HotStarIE(HotStarBaseIE): for f in current_formats: if not f.get('langauge'): f['language'] = lang + formats.extend(current_formats) subs = self._merge_subtitles(subs, current_subs) + if not formats and geo_restricted: self.raise_geo_restricted(countries=['IN'], metadata_available=True) - self._sort_formats(formats) - for f in formats: 
f.setdefault('http_headers', {}).update(headers) return { 'id': video_id, - 'title': title, + 'title': video_data.get('title'), 'description': video_data.get('description'), 'duration': int_or_none(video_data.get('duration')), 'timestamp': int_or_none(video_data.get('broadcastDate') or video_data.get('startDate')), @@ -261,17 +244,51 @@ class HotStarIE(HotStarBaseIE): 'season': video_data.get('seasonName'), 'season_number': int_or_none(video_data.get('seasonNo')), 'season_id': video_data.get('seasonId'), - 'episode': title, + 'episode': video_data.get('title'), 'episode_number': int_or_none(video_data.get('episodeNo')), - 'http_headers': { - 'Referer': 'https://www.hotstar.com/in', - } } +class HotStarPrefixIE(InfoExtractor): + """ The "hotstar:" prefix is no longer in use, but this is kept for backward compatibility """ + IE_DESC = False + _VALID_URL = r'hotstar:(?:(?P<type>\w+):)?(?P<id>\d+)$' + _TESTS = [{ + 'url': 'hotstar:1000076273', + 'only_matching': True, + }, { + 'url': 'hotstar:movies:1260009879', + 'info_dict': { + 'id': '1260009879', + 'ext': 'mp4', + 'title': 'Nuvvu Naaku Nachav', + 'description': 'md5:d43701b1314e6f8233ce33523c043b7d', + 'timestamp': 1567525674, + 'upload_date': '20190903', + 'duration': 10787, + 'episode': 'Nuvvu Naaku Nachav', + }, + }, { + 'url': 'hotstar:episode:1000234847', + 'only_matching': True, + }, { + # contentData + 'url': 'hotstar:sports:1260065956', + 'only_matching': True, + }, { + # contentData + 'url': 'hotstar:sports:1260066104', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, video_type = self._match_valid_url(url).group('id', 'type') + return self.url_result(HotStarIE._video_url(video_id, video_type), HotStarIE, video_id) + + class HotStarPlaylistIE(HotStarBaseIE): IE_NAME = 'hotstar:playlist' - _VALID_URL = r'https?://(?:www\.)?hotstar\.com/tv/[^/]+/s-\w+/list/[^/]+/t-(?P<id>\w+)' + _VALID_URL = r'https?://(?:www\.)?hotstar\.com(?:/in)?/tv(?:/[^/]+){2}/list/[^/]+/t-(?P<id>\w+)' _TESTS 
= [{ 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/popular-clips/t-3_2_26', 'info_dict': { @@ -281,25 +298,49 @@ class HotStarPlaylistIE(HotStarBaseIE): }, { 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/extras/t-2480', 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/in/tv/karthika-deepam/15457/list/popular-clips/t-3_2_1272', + 'only_matching': True, }] def _real_extract(self, url): - playlist_id = self._match_id(url) + id_ = self._match_id(url) + return self.playlist_result( + self._playlist_entries('tray/find', id_, query={'tas': 10000, 'uqId': id_}), id_) - collection = self._call_api('o/v1/tray/find', playlist_id, 'uqId')['body']['results'] - entries = [ - self.url_result( - 'https://www.hotstar.com/%s' % video['contentId'], - ie=HotStarIE.ie_key(), video_id=video['contentId']) - for video in collection['assets']['items'] - if video.get('contentId')] - return self.playlist_result(entries, playlist_id) +class HotStarSeasonIE(HotStarBaseIE): + IE_NAME = 'hotstar:season' + _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/\w+)/seasons/[^/]+/ss-(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://www.hotstar.com/tv/radhakrishn/1260000646/seasons/season-2/ss-8028', + 'info_dict': { + 'id': '8028', + }, + 'playlist_mincount': 35, + }, { + 'url': 'https://www.hotstar.com/in/tv/ishqbaaz/9567/seasons/season-2/ss-4357', + 'info_dict': { + 'id': '4357', + }, + 'playlist_mincount': 30, + }, { + 'url': 'https://www.hotstar.com/in/tv/bigg-boss/14714/seasons/season-4/ss-8208/', + 'info_dict': { + 'id': '8208', + }, + 'playlist_mincount': 19, + }] + + def _real_extract(self, url): + url, season_id = self._match_valid_url(url).groups() + return self.playlist_result(self._playlist_entries( + 'season/asset', season_id, url, query={'tao': 0, 'tas': 0, 'size': 10000, 'id': season_id}), season_id) class HotStarSeriesIE(HotStarBaseIE): IE_NAME = 'hotstar:series' - _VALID_URL = 
r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P<id>\d+))' + _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P<id>\d+))/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://www.hotstar.com/in/tv/radhakrishn/1260000646', 'info_dict': { @@ -317,25 +358,13 @@ class HotStarSeriesIE(HotStarBaseIE): 'info_dict': { 'id': '435', }, - 'playlist_mincount': 269, + 'playlist_mincount': 267, }] def _real_extract(self, url): url, series_id = self._match_valid_url(url).groups() - headers = { - 'x-country-code': 'IN', - 'x-platform-code': 'PCTV', - } - detail_json = self._download_json('https://api.hotstar.com/o/v1/show/detail?contentId=' + series_id, - video_id=series_id, headers=headers) - id = compat_str(try_get(detail_json, lambda x: x['body']['results']['item']['id'], int)) - item_json = self._download_json('https://api.hotstar.com/o/v1/tray/g/1/items?etid=0&tao=0&tas=10000&eid=' + id, - video_id=series_id, headers=headers) - entries = [ - self.url_result( - '%s/ignoreme/%d' % (url, video['contentId']), - ie=HotStarIE.ie_key(), video_id=video['contentId']) - for video in item_json['body']['results']['items'] - if video.get('contentId')] - - return self.playlist_result(entries, series_id) + id_ = self._call_api_v1( + 'show/detail', series_id, query={'contentId': series_id})['body']['results']['item']['id'] + + return self.playlist_result(self._playlist_entries( + 'tray/g/1/items', series_id, url, query={'tao': 0, 'tas': 10000, 'etid': 0, 'eid': id_}), series_id) diff --git a/hypervideo_dl/extractor/howcast.py b/hypervideo_dl/extractor/howcast.py index 7e36b85..59cf80f 100644 --- a/hypervideo_dl/extractor/howcast.py +++ b/hypervideo_dl/extractor/howcast.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import parse_iso8601 diff --git a/hypervideo_dl/extractor/howstuffworks.py b/hypervideo_dl/extractor/howstuffworks.py index cf90ab3..238fc0b 100644 --- 
a/hypervideo_dl/extractor/howstuffworks.py +++ b/hypervideo_dl/extractor/howstuffworks.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( find_xpath_attr, @@ -77,8 +75,6 @@ class HowStuffWorksIE(InfoExtractor): 'vbr': vbr, }) - self._sort_formats(formats) - return { 'id': '%s' % video_id, 'display_id': display_id, diff --git a/hypervideo_dl/extractor/hrfensehen.py b/hypervideo_dl/extractor/hrfensehen.py index e39ded2..35e9f67 100644 --- a/hypervideo_dl/extractor/hrfensehen.py +++ b/hypervideo_dl/extractor/hrfensehen.py @@ -1,17 +1,19 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re -from ..utils import int_or_none, unified_timestamp, unescapeHTML from .common import InfoExtractor +from ..utils import ( + int_or_none, + traverse_obj, + try_call, + unescapeHTML, + unified_timestamp, +) class HRFernsehenIE(InfoExtractor): IE_NAME = 'hrfernsehen' _VALID_URL = r'^https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P<id>[0-9]{6})\.html' - _TESTS = [{ 'url': 'https://www.hessenschau.de/tv-sendung/hessenschau-vom-26082020,video-130546.html', 'md5': '5c4e0ba94677c516a2f65a84110fc536', @@ -24,10 +26,11 @@ class HRFernsehenIE(InfoExtractor): 'subtitles': {'de': [{ 'url': 'https://hr-a.akamaihd.net/video/as/hessenschau/2020_08/hrLogo_200826200407_L385592_512x288-25p-500kbit.vtt' }]}, - 'timestamp': 1598470200, + 'timestamp': 1598400000, 'upload_date': '20200826', - 'thumbnail': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg', - 'title': 'hessenschau vom 26.08.2020' + 'thumbnail': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9.jpg', + 'title': 'hessenschau vom 26.08.2020', + 'duration': 1654 } }, { 'url': 'https://www.hr-fernsehen.de/sendungen-a-z/mex/sendungen/fair-und-gut---was-hinter-aldis-eigenem-guetesiegel-steckt,video-130544.html', @@ -36,25 +39,18 @@ class HRFernsehenIE(InfoExtractor): 
_GEO_COUNTRIES = ['DE'] - def extract_airdate(self, loader_data): - airdate_str = loader_data.get('mediaMetadata', {}).get('agf', {}).get('airdate') - - if airdate_str is None: - return None - - return unified_timestamp(airdate_str) - def extract_formats(self, loader_data): stream_formats = [] - for stream_obj in loader_data["videoResolutionLevels"]: + data = loader_data['mediaCollection']['streams'][0]['media'] + for inner in data[1:]: stream_format = { - 'format_id': str(stream_obj['verticalResolution']) + "p", - 'height': stream_obj['verticalResolution'], - 'url': stream_obj['url'], + 'format_id': try_call(lambda: f'{inner["maxHResolutionPx"]}p'), + 'height': inner.get('maxHResolutionPx'), + 'url': inner['url'], } quality_information = re.search(r'([0-9]{3,4})x([0-9]{3,4})-([0-9]{2})p-([0-9]{3,4})kbit', - stream_obj['url']) + inner['url']) if quality_information: stream_format['width'] = int_or_none(quality_information.group(1)) stream_format['height'] = int_or_none(quality_information.group(2)) @@ -62,8 +58,6 @@ class HRFernsehenIE(InfoExtractor): stream_format['tbr'] = int_or_none(quality_information.group(4)) stream_formats.append(stream_format) - - self._sort_formats(stream_formats) return stream_formats def _real_extract(self, url): @@ -75,22 +69,22 @@ class HRFernsehenIE(InfoExtractor): description = self._html_search_meta( ['description'], webpage) - loader_str = unescapeHTML(self._search_regex(r"data-new-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader")) + loader_str = unescapeHTML(self._search_regex(r"data-(?:new-)?hr-mediaplayer-loader='([^']*)'", webpage, 'ardloader')) loader_data = json.loads(loader_str) + subtitle = traverse_obj(loader_data, ('mediaCollection', 'subTitles', 0, 'sources', 0, 'url')) + info = { 'id': video_id, 'title': title, 'description': description, 'formats': self.extract_formats(loader_data), - 'timestamp': self.extract_airdate(loader_data) + 'subtitles': {'de': [{'url': subtitle}]}, + 'timestamp': 
unified_timestamp(self._search_regex( + r'<time\sdatetime="(\d{4}\W\d{1,2}\W\d{1,2})', webpage, 'datetime', fatal=False)), + 'duration': int_or_none(traverse_obj( + loader_data, ('playerConfig', 'pluginData', 'trackingAti@all', 'richMedia', 'duration'))), + 'thumbnail': self._search_regex(r'thumbnailUrl\W*([^"]+)', webpage, 'thumbnail', default=None), } - if "subtitle" in loader_data: - info["subtitles"] = {"de": [{"url": loader_data["subtitle"]}]} - - thumbnails = list(set([t for t in loader_data.get("previewImageUrl", {}).values()])) - if len(thumbnails) > 0: - info["thumbnails"] = [{"url": t} for t in thumbnails] - return info diff --git a/hypervideo_dl/extractor/hrti.py b/hypervideo_dl/extractor/hrti.py index 36d6007..cfec80d 100644 --- a/hypervideo_dl/extractor/hrti.py +++ b/hypervideo_dl/extractor/hrti.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -147,7 +144,6 @@ class HRTiIE(HRTiBaseIE): formats = self._extract_m3u8_formats( m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') - self._sort_formats(formats) description = clean_html(title_info.get('summary_long')) age_limit = parse_age_limit(video.get('parental_control', {}).get('rating')) diff --git a/hypervideo_dl/extractor/hse.py b/hypervideo_dl/extractor/hse.py index 9144ff8..3cb21d2 100644 --- a/hypervideo_dl/extractor/hse.py +++ b/hypervideo_dl/extractor/hse.py @@ -1,4 +1,3 @@ -# coding: utf-8 from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -26,7 +25,6 @@ class HSEShowBaseInfoExtractor(InfoExtractor): fmts, subs = self._extract_m3u8_formats_and_subtitles(src['url'], video_id, ext='mp4') formats.extend(fmts) subtitles = self._merge_subtitles(subtitles, subs) - self._sort_formats(formats) return formats, subtitles diff --git a/hypervideo_dl/extractor/huajiao.py b/hypervideo_dl/extractor/huajiao.py index 4ca275d..c498fa3 100644 --- a/hypervideo_dl/extractor/huajiao.py +++ 
b/hypervideo_dl/extractor/huajiao.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( parse_duration, diff --git a/hypervideo_dl/extractor/huffpost.py b/hypervideo_dl/extractor/huffpost.py index 54385ba..69fdc34 100644 --- a/hypervideo_dl/extractor/huffpost.py +++ b/hypervideo_dl/extractor/huffpost.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -19,6 +17,7 @@ class HuffPostIE(InfoExtractor): HPLEmbedPlayer/\?segmentId= ) (?P<id>[0-9a-f]+)''' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1'] _TEST = { 'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677', @@ -80,8 +79,6 @@ class HuffPostIE(InfoExtractor): 'vcodec': 'none' if key.startswith('audio/') else None, }) - self._sort_formats(formats) - return { 'id': video_id, 'title': video_title, diff --git a/hypervideo_dl/extractor/hungama.py b/hypervideo_dl/extractor/hungama.py index 821b16e..2e99396 100644 --- a/hypervideo_dl/extractor/hungama.py +++ b/hypervideo_dl/extractor/hungama.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -23,15 +20,17 @@ class HungamaIE(InfoExtractor): ''' _TESTS = [{ 'url': 'http://www.hungama.com/video/krishna-chants/39349649/', - 'md5': 'a845a6d1ebd08d80c1035126d49bd6a0', + 'md5': '687c5f1e9f832f3b59f44ed0eb1f120a', 'info_dict': { - 'id': '2931166', + 'id': '39349649', 'ext': 'mp4', - 'title': 'Lucky Ali - Kitni Haseen Zindagi', - 'track': 'Kitni Haseen Zindagi', - 'artist': 'Lucky Ali', - 'album': 'Aks', - 'release_year': 2000, + 'title': 'Krishna Chants', + 'description': 'Watch Krishna Chants video now. 
You can also watch other latest videos only at Hungama', + 'upload_date': '20180829', + 'duration': 264, + 'timestamp': 1535500800, + 'view_count': int, + 'thumbnail': 'https://images.hungama.com/c/1/0dc/2ca/39349649/39349649_700x394.jpg', } }, { 'url': 'https://www.hungama.com/movie/kahaani-2/44129919/', @@ -43,12 +42,7 @@ class HungamaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - info = self._search_json_ld(webpage, video_id) - - m3u8_url = self._download_json( + video_json = self._download_json( 'https://www.hungama.com/index.php', video_id, data=urlencode_postdata({'content_id': video_id}), headers={ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', @@ -56,18 +50,24 @@ class HungamaIE(InfoExtractor): }, query={ 'c': 'common', 'm': 'get_video_mdn_url', - })['stream_url'] + }) - formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - self._sort_formats(formats) + formats = self._extract_m3u8_formats(video_json['stream_url'], video_id, ext='mp4', m3u8_id='hls') - info.update({ + json_ld = self._search_json_ld( + self._download_webpage(url, video_id, fatal=False) or '', video_id, fatal=False) + + return { + **json_ld, 'id': video_id, 'formats': formats, - }) - return info + 'subtitles': { + 'en': [{ + 'url': video_json['sub_title'], + 'ext': 'vtt', + }] + } if video_json.get('sub_title') else None, + } class HungamaSongIE(InfoExtractor): diff --git a/hypervideo_dl/extractor/huya.py b/hypervideo_dl/extractor/huya.py index 4e96f22..b6e9eec 100644 --- a/hypervideo_dl/extractor/huya.py +++ b/hypervideo_dl/extractor/huya.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import hashlib import random @@ -9,7 +6,6 @@ from ..compat import compat_urlparse, compat_b64decode from ..utils import ( ExtractorError, int_or_none, - js_to_json, str_or_none, try_get, unescapeHTML, @@ 
-58,11 +54,7 @@ class HuyaLiveIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id=video_id) - json_stream = self._search_regex(r'"stream":\s+"([a-zA-Z0-9+=/]+)"', webpage, 'stream', default=None) - if not json_stream: - raise ExtractorError('Video is offline', expected=True) - stream_data = self._parse_json(compat_b64decode(json_stream).decode(), video_id=video_id, - transform_source=js_to_json) + stream_data = self._search_json(r'stream:\s', webpage, 'stream', video_id=video_id, default=None) room_info = try_get(stream_data, lambda x: x['data'][0]['gameLiveInfo']) if not room_info: raise ExtractorError('Can not extract the room info', expected=True) @@ -70,6 +62,8 @@ class HuyaLiveIE(InfoExtractor): screen_type = room_info.get('screenType') live_source_type = room_info.get('liveSourceType') stream_info_list = stream_data['data'][0]['gameStreamInfoList'] + if not stream_info_list: + raise ExtractorError('Video is offline', expected=True) formats = [] for stream_info in stream_info_list: stream_url = stream_info.get('sFlvUrl') @@ -99,8 +93,6 @@ class HuyaLiveIE(InfoExtractor): **self._RESOLUTION.get(si.get('sDisplayName'), {}), }) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/hypem.py b/hypervideo_dl/extractor/hypem.py index 9ca28d6..54db7b3 100644 --- a/hypervideo_dl/extractor/hypem.py +++ b/hypervideo_dl/extractor/hypem.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import int_or_none diff --git a/hypervideo_dl/extractor/hytale.py b/hypervideo_dl/extractor/hytale.py new file mode 100644 index 0000000..0f4dcc3 --- /dev/null +++ b/hypervideo_dl/extractor/hytale.py @@ -0,0 +1,58 @@ +import re + +from .common import InfoExtractor +from ..utils import traverse_obj + + +class HytaleIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?hytale\.com/news/\d+/\d+/(?P<id>[a-z0-9-]+)' + _TESTS = [{ + 'url': 'https://hytale.com/news/2021/07/summer-2021-development-update', + 'info_dict': { + 'id': 'summer-2021-development-update', + 'title': 'Summer 2021 Development Update', + }, + 'playlist_count': 4, + 'playlist': [{ + 'md5': '0854ebe347d233ee19b86ab7b2ead610', + 'info_dict': { + 'id': 'ed51a2609d21bad6e14145c37c334999', + 'ext': 'mp4', + 'title': 'Avatar Personalization', + 'thumbnail': r're:https://videodelivery\.net/\w+/thumbnails/thumbnail\.jpg', + } + }] + }, { + 'url': 'https://www.hytale.com/news/2019/11/hytale-graphics-update', + 'info_dict': { + 'id': 'hytale-graphics-update', + 'title': 'Hytale graphics update', + }, + 'playlist_count': 2, + }] + + def _real_initialize(self): + media_webpage = self._download_webpage( + 'https://hytale.com/media', None, note='Downloading list of media', fatal=False) or '' + + clips_json = traverse_obj( + self._search_json( + r'window\.__INITIAL_COMPONENTS_STATE__\s*=\s*\[', + media_webpage, 'clips json', None), + ('media', 'clips')) or [] + + self._titles = {clip.get('src'): clip.get('caption') for clip in clips_json} + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + entries = [ + self.url_result( + f'https://cloudflarestream.com/{video_hash}/manifest/video.mpd?parentOrigin=https%3A%2F%2Fhytale.com', + title=self._titles.get(video_hash), url_transparent=True) + for video_hash in re.findall( + r'<stream\s+class\s*=\s*"ql-video\s+cf-stream"\s+src\s*=\s*"([a-f0-9]{32})"', + webpage) + ] + + return self.playlist_result(entries, playlist_id, self._og_search_title(webpage)) diff --git a/hypervideo_dl/extractor/icareus.py b/hypervideo_dl/extractor/icareus.py new file mode 100644 index 0000000..d081cf4 --- /dev/null +++ b/hypervideo_dl/extractor/icareus.py @@ -0,0 +1,179 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + 
determine_ext, + get_element_by_class, + int_or_none, + merge_dicts, + parse_bitrate, + parse_resolution, + remove_end, + str_or_none, + url_or_none, + urlencode_postdata, +) + + +class IcareusIE(InfoExtractor): + _DOMAINS = '|'.join(map(re.escape, ( + 'asahitv.fi', + 'helsinkikanava.fi', + 'hyvinvointitv.fi', + 'inez.fi', + 'permanto.fi', + 'suite.icareus.com', + 'videos.minifiddlers.org', + ))) + _VALID_URL = rf'(?P<base_url>https?://(?:www\.)?(?:{_DOMAINS}))/[^?#]+/player/[^?#]+\?(?:[^#]+&)?(?:assetId|eventId)=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.helsinkikanava.fi/fi_FI/web/helsinkikanava/player/vod?assetId=68021894', + 'md5': 'ca0b62ffc814a5411dfa6349cf5adb8a', + 'info_dict': { + 'id': '68021894', + 'ext': 'mp4', + 'title': 'Perheiden parhaaksi', + 'description': 'md5:295785ea408e5ac00708766465cc1325', + 'thumbnail': 'https://www.helsinkikanava.fi/image/image_gallery?img_id=68022501', + 'upload_date': '20200924', + 'timestamp': 1600938300, + }, + }, { # Recorded livestream + 'url': 'https://www.helsinkikanava.fi/fi/web/helsinkikanava/player/event/view?eventId=76241489', + 'md5': '014327e69dfa7b949fcc861f6d162d6d', + 'info_dict': { + 'id': '76258304', + 'ext': 'mp4', + 'title': 'Helsingin kaupungin ja HUSin tiedotustilaisuus koronaepidemiatilanteesta 24.11.2020', + 'description': 'md5:3129d041c6fbbcdc7fe68d9a938fef1c', + 'thumbnail': 'https://icareus-suite.secure2.footprint.net/image/image_gallery?img_id=76288630', + 'upload_date': '20201124', + 'timestamp': 1606206600, + }, + }, { # Non-m3u8 stream + 'url': 'https://suite.icareus.com/fi/web/westend-indians/player/vod?assetId=47567389', + 'md5': '72fc04ee971bbedc44405cdf16c990b6', + 'info_dict': { + 'id': '47567389', + 'ext': 'mp4', + 'title': 'Omatoiminen harjoittelu - Laukominen', + 'description': '', + 'thumbnail': 'https://suite.icareus.com/image/image_gallery?img_id=47568162', + 'upload_date': '20200319', + 'timestamp': 1584658080, + }, + }, { + 'url': 
'https://asahitv.fi/fi/web/asahi/player/vod?assetId=89415818', + 'only_matching': True + }, { + 'url': 'https://hyvinvointitv.fi/fi/web/hyvinvointitv/player/vod?assetId=89149730', + 'only_matching': True + }, { + 'url': 'https://inez.fi/fi/web/inez-media/player/vod?assetId=71328822', + 'only_matching': True + }, { + 'url': 'https://www.permanto.fi/fi/web/alfatv/player/vod?assetId=135497515', + 'only_matching': True + }, { + 'url': 'https://videos.minifiddlers.org/web/international-minifiddlers/player/vod?assetId=1982759', + 'only_matching': True + }] + + def _real_extract(self, url): + base_url, temp_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, temp_id) + + video_id = self._search_regex(r"_icareus\['itemId'\]\s*=\s*'(\d+)'", webpage, 'video_id') + organization_id = self._search_regex(r"_icareus\['organizationId'\]\s*=\s*'(\d+)'", webpage, 'organization_id') + + assets = self._download_json( + self._search_regex(r'var\s+publishingServiceURL\s*=\s*"(http[^"]+)";', webpage, 'api_base'), + video_id, data=urlencode_postdata({ + 'version': '03', + 'action': 'getAssetPlaybackUrls', + 'organizationId': organization_id, + 'assetId': video_id, + 'token': self._search_regex(r"_icareus\['token'\]\s*=\s*'([a-f0-9]+)'", webpage, 'icareus_token'), + })) + + subtitles = { + remove_end(sdesc.split(' ')[0], ':'): [{'url': url_or_none(surl)}] + for _, sdesc, surl in assets.get('subtitles') or [] + } + + formats = [{ + 'format': item.get('name'), + 'format_id': 'audio', + 'vcodec': 'none', + 'url': url_or_none(item['url']), + 'tbr': int_or_none(self._search_regex( + r'\((\d+)\s*k\)', item.get('name') or '', 'audio bitrate', default=None)), + } for item in assets.get('audio_urls') or [] if url_or_none(item.get('url'))] + + for item in assets.get('urls') or []: + video_url = url_or_none(item.get('url')) + if video_url is None: + continue + ext = determine_ext(video_url) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + 
video_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + fmt = item.get('name') + formats.append({ + 'url': video_url, + 'format': fmt, + 'tbr': parse_bitrate(fmt), + 'format_id': str_or_none(item.get('id')), + **parse_resolution(fmt), + }) + + info, token, live_title = self._search_json_ld(webpage, video_id, default={}), None, None + if not info: + token = self._search_regex( + r'data\s*:\s*{action:"getAsset".*?token:\'([a-f0-9]+)\'}', webpage, 'token', default=None) + if not token: + live_title = get_element_by_class('unpublished-info-item future-event-title', webpage) + + if token: + metadata = self._download_json( + f'{base_url}/icareus-suite-api-portlet/publishing', + video_id, fatal=False, data=urlencode_postdata({ + 'version': '03', + 'action': 'getAsset', + 'organizationId': organization_id, + 'assetId': video_id, + 'languageId': 'en_US', + 'userId': '0', + 'token': token, + })) or {} + info = { + 'title': metadata.get('name'), + 'description': metadata.get('description'), + 'timestamp': int_or_none(metadata.get('date'), scale=1000), + 'duration': int_or_none(metadata.get('duration')), + 'thumbnail': url_or_none(metadata.get('thumbnailMedium')), + } + elif live_title: # Recorded livestream + info = { + 'title': live_title, + 'description': get_element_by_class('unpublished-info-item future-event-description', webpage), + 'timestamp': int_or_none(self._search_regex( + r'var startEvent\s*=\s*(\d+);', webpage, 'uploadDate', fatal=False), scale=1000), + } + + thumbnails = info.get('thumbnails') or [{ + 'url': url_or_none(info.get('thumbnail') or assets.get('thumbnail')) + }] + + return merge_dicts({ + 'id': video_id, + 'title': None, + 'formats': formats, + 'subtitles': subtitles, + 'description': clean_html(info.get('description')), + 'thumbnails': thumbnails if thumbnails[0]['url'] else None, + }, info) diff --git a/hypervideo_dl/extractor/ichinanalive.py 
b/hypervideo_dl/extractor/ichinanalive.py index cb39f82..9d55ddc 100644 --- a/hypervideo_dl/extractor/ichinanalive.py +++ b/hypervideo_dl/extractor/ichinanalive.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ExtractorError, str_or_none, traverse_obj, unified_strdate from ..compat import compat_str @@ -76,8 +73,6 @@ class IchinanaLiveIE(InfoExtractor): 'acodec': 'aac', }) - self._sort_formats(formats) - return { 'id': video_id, 'title': uploader or video_id, @@ -150,8 +145,6 @@ class IchinanaLiveClipIE(InfoExtractor): 'http_headers': {'Referer': url}, }) - self._sort_formats(formats) - return { 'id': video_id, 'title': uploader or video_id, diff --git a/hypervideo_dl/extractor/ign.py b/hypervideo_dl/extractor/ign.py index c826eb3..d4797d3 100644 --- a/hypervideo_dl/extractor/ign.py +++ b/hypervideo_dl/extractor/ign.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -104,8 +102,6 @@ class IGNIE(IGNBaseIE): 'url': mezzanine_url, }) - self._sort_formats(formats) - thumbnails = [] for thumbnail in (video.get('thumbnails') or []): thumbnail_url = thumbnail.get('url') diff --git a/hypervideo_dl/extractor/iheart.py b/hypervideo_dl/extractor/iheart.py index b54c05e..2c6a5b6 100644 --- a/hypervideo_dl/extractor/iheart.py +++ b/hypervideo_dl/extractor/iheart.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( clean_html, diff --git a/hypervideo_dl/extractor/iltalehti.py b/hypervideo_dl/extractor/iltalehti.py new file mode 100644 index 0000000..0e7e82c --- /dev/null +++ b/hypervideo_dl/extractor/iltalehti.py @@ -0,0 +1,51 @@ +from .common import InfoExtractor +from ..utils import js_to_json, traverse_obj + + +class IltalehtiIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?iltalehti\.fi/[^/?#]+/a/(?P<id>[^/?#])' + _TESTS = [ + # jwplatform embed 
main_media + { + 'url': 'https://www.iltalehti.fi/ulkomaat/a/9fbd067f-94e4-46cd-8748-9d958eb4dae2', + 'md5': 'af12d42c539f1f49f0b62d231fe72dcd', + 'info_dict': { + 'id': 'gYjjaf1L', + 'ext': 'mp4', + 'title': 'Sensuroimaton Päivärinta, jakso 227: Vieraana Suomen Venäjän ex-suurlähettiläs René Nyberg ja Kenraalimajuri evp Pekka Toveri', + 'description': '', + 'upload_date': '20220928', + 'timestamp': 1664360878, + 'duration': 2089, + 'thumbnail': r're:^https?://.*\.jpg', + }, + }, + # jwplatform embed body + { + 'url': 'https://www.iltalehti.fi/politiikka/a/1ce49d85-1670-428b-8db8-d2479b9950a4', + 'md5': '9e50334b8f8330ce8828b567a82a3c65', + 'info_dict': { + 'id': '18R6zkLi', + 'ext': 'mp4', + 'title': 'Pekka Toverin arvio: Näin Nord Stream -kaasuputken räjäyttäminen on saatettu toteuttaa', + 'description': 'md5:3d1302c9e17e7ffd564143ff58f8de35', + 'upload_date': '20220929', + 'timestamp': 1664435867, + 'duration': 165.0, + 'thumbnail': r're:^https?://.*\.jpg', + }, + }, + ] + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + info = self._search_json( + r'<script>\s*window.App\s*=', webpage, 'json', article_id, + transform_source=js_to_json) + props = traverse_obj(info, ( + 'state', 'articles', ..., 'items', (('main_media', 'properties'), ('body', ..., 'properties')))) + video_ids = traverse_obj(props, (lambda _, v: v['provider'] == 'jwplayer', 'id')) + return self.playlist_from_matches( + video_ids, article_id, ie='JWPlatform', getter=lambda id: f'jwplatform:{id}', + title=traverse_obj(info, ('state', 'articles', ..., 'items', 'canonical_title'), get_all=False)) diff --git a/hypervideo_dl/extractor/imdb.py b/hypervideo_dl/extractor/imdb.py index 96cee2e..557a3b7 100644 --- a/hypervideo_dl/extractor/imdb.py +++ b/hypervideo_dl/extractor/imdb.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import base64 import json import re @@ -102,7 +100,6 @@ class ImdbIE(InfoExtractor): 'ext': 
ext, 'quality': quality(format_id), }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/imggaming.py b/hypervideo_dl/extractor/imggaming.py index ce7b21a..8e220fd 100644 --- a/hypervideo_dl/extractor/imggaming.py +++ b/hypervideo_dl/extractor/imggaming.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -106,7 +103,6 @@ class ImgGamingBaseIE(InfoExtractor): formats.extend(self._extract_mpd_formats( media_url, media_id, mpd_id='dash', fatal=False, headers=self._MANIFEST_HEADERS)) - self._sort_formats(formats) subtitles = {} for subtitle in video_data.get('subtitles', []): diff --git a/hypervideo_dl/extractor/imgur.py b/hypervideo_dl/extractor/imgur.py index c917cf1..061c4cc 100644 --- a/hypervideo_dl/extractor/imgur.py +++ b/hypervideo_dl/extractor/imgur.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -86,8 +84,6 @@ class ImgurIE(InfoExtractor): }, }) - self._sort_formats(formats) - return { 'id': video_id, 'formats': formats, @@ -140,7 +136,7 @@ class ImgurGalleryIE(InfoExtractor): return self.url_result('http://imgur.com/%s' % gallery_id, ImgurIE.ie_key(), gallery_id) -class ImgurAlbumIE(ImgurGalleryIE): +class ImgurAlbumIE(ImgurGalleryIE): # XXX: Do not subclass from concrete IE IE_NAME = 'imgur:album' _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P<id>[a-zA-Z0-9]+)' diff --git a/hypervideo_dl/extractor/ina.py b/hypervideo_dl/extractor/ina.py index b3b2683..857013d 100644 --- a/hypervideo_dl/extractor/ina.py +++ b/hypervideo_dl/extractor/ina.py @@ -1,26 +1,19 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor -from ..utils import ( - determine_ext, - int_or_none, - strip_or_none, - xpath_attr, - xpath_text, -) +from ..utils import unified_strdate class InaIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:(?:www|m)\.)?ina\.fr/(?:video|audio)/(?P<id>[A-Z0-9_]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?ina\.fr/(?:[^?#]+/)(?P<id>[\w-]+)' _TESTS = [{ - 'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', - 'md5': 'a667021bf2b41f8dc6049479d9bb38a3', + 'url': 'https://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', + 'md5': 'c5a09e5cb5604ed10709f06e7a377dda', 'info_dict': { 'id': 'I12055569', 'ext': 'mp4', 'title': 'François Hollande "Je crois que c\'est clair"', - 'description': 'md5:3f09eb072a06cb286b8f7e4f77109663', + 'description': 'md5:19f61e2b4844ed4bb2e3df9ab9f527ff', + 'upload_date': '20070712', + 'thumbnail': 'https://cdn-hub.ina.fr/notice/690x517/3c4/I12055569.jpeg', } }, { 'url': 'https://www.ina.fr/video/S806544_001/don-d-organes-des-avancees-mais-d-importants-besoins-video.html', @@ -34,53 +27,58 @@ class InaIE(InfoExtractor): }, { 'url': 'http://m.ina.fr/video/I12055569', 'only_matching': True, + }, { + 'url': 'https://www.ina.fr/ina-eclaire-actu/video/cpb8205116303/les-jeux-electroniques', + 'md5': '4b8284a9a3a184fdc7e744225b8251e7', + 'info_dict': { + 'id': 'CPB8205116303', + 'ext': 'mp4', + 'title': 'Les jeux électroniques', + 'description': 'md5:e09f7683dad1cc60b74950490127d233', + 'upload_date': '19821204', + 'duration': 657, + 'thumbnail': 'https://cdn-hub.ina.fr/notice/690x517/203/CPB8205116303.jpeg', + }, + }, { + 'url': 'https://www.ina.fr/ina-eclaire-actu/arletty-carriere-conseils-actrice-marcel-carne', + 'md5': '743d6f069a00e19dda0da166a54eeccb', + 'info_dict': { + 'id': 'I22203233', + 'ext': 'mp4', + 'title': 'Arletty sur le métier d\'actrice', + 'description': 'md5:3d89b5e419d8514c934f146045ccdbad', + 'upload_date': '19581128', + 'thumbnail': 'https://cdn-hub.ina.fr/notice/690x517/082/I22203233.jpeg', + }, + }, { + 'url': 'https://www.ina.fr/ina-eclaire-actu/chasse-croise-sncf-gare-d-austerlitz-vacances-d-ete', + 'md5': 
'a96fb85e9ba3b5c5b2eeb0c5daa55f2f', + 'info_dict': { + 'id': 'CAF91038285', + 'ext': 'mp4', + 'title': 'Les grands départs : les trains', + 'description': 'md5:1630ee819d8d4da97df53459e99f72bb', + 'upload_date': '19740801', + 'thumbnail': 'https://cdn-hub.ina.fr/notice/690x517/2cf/CAF91038285.jpeg', + }, }] def _real_extract(self, url): - video_id = self._match_id(url) - info_doc = self._download_xml( - 'http://player.ina.fr/notices/%s.mrss' % video_id, video_id) - item = info_doc.find('channel/item') - title = xpath_text(item, 'title', fatal=True) - media_ns_xpath = lambda x: self._xpath_ns(x, 'http://search.yahoo.com/mrss/') - content = item.find(media_ns_xpath('content')) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) - get_furl = lambda x: xpath_attr(content, media_ns_xpath(x), 'url') - formats = [] - for q, w, h in (('bq', 400, 300), ('mq', 512, 384), ('hq', 768, 576)): - q_url = get_furl(q) - if not q_url: - continue - formats.append({ - 'format_id': q, - 'url': q_url, - 'width': w, - 'height': h, - }) - if not formats: - furl = get_furl('player') or content.attrib['url'] - ext = determine_ext(furl) - formats = [{ - 'url': furl, - 'vcodec': 'none' if ext == 'mp3' else None, - 'ext': ext, - }] + api_url = self._html_search_regex(r'asset-details-url\s*=\s*["\'](?P<api_url>[^"\']+)', webpage, 'api_url') + asset_id = self._search_regex(r'assets/([^?/]+)', api_url, 'asset_id') - thumbnails = [] - for thumbnail in content.findall(media_ns_xpath('thumbnail')): - thumbnail_url = thumbnail.get('url') - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'height': int_or_none(thumbnail.get('height')), - 'width': int_or_none(thumbnail.get('width')), - }) + api_response = self._download_json(api_url.replace(asset_id, f'{asset_id}.json'), asset_id) return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': strip_or_none(xpath_text(item, 'description')), - 'thumbnails': 
thumbnails, + 'id': asset_id, + 'url': api_response['resourceUrl'], + 'ext': {'video': 'mp4', 'audio': 'mp3'}.get(api_response.get('type')), + 'title': api_response.get('title'), + 'description': api_response.get('description'), + 'upload_date': unified_strdate(api_response.get('dateOfBroadcast')), + 'duration': api_response.get('duration'), + 'thumbnail': api_response.get('resourceThumbnail'), } diff --git a/hypervideo_dl/extractor/inc.py b/hypervideo_dl/extractor/inc.py index d5b258a..9b3fe9a 100644 --- a/hypervideo_dl/extractor/inc.py +++ b/hypervideo_dl/extractor/inc.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from .kaltura import KalturaIE diff --git a/hypervideo_dl/extractor/indavideo.py b/hypervideo_dl/extractor/indavideo.py index 4c16243..4fa97d8 100644 --- a/hypervideo_dl/extractor/indavideo.py +++ b/hypervideo_dl/extractor/indavideo.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -15,6 +10,14 @@ from ..utils import ( class IndavideoEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P<id>[\da-f]+)' + # Some example URLs covered by generic extractor: + # http://indavideo.hu/video/Vicces_cica_1 + # http://index.indavideo.hu/video/2015_0728_beregszasz + # http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko + # http://erotika.indavideo.hu/video/Amator_tini_punci + # http://film.indavideo.hu/video/f_hrom_nagymamm_volt + # http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)'] _TESTS = [{ 'url': 'http://indavideo.hu/player/video/1bdc3c6d80/', 'md5': 'c8a507a1c7410685f83a06eaeeaafeab', @@ -40,20 +43,6 @@ class IndavideoEmbedIE(InfoExtractor): 
'only_matching': True, }] - # Some example URLs covered by generic extractor: - # http://indavideo.hu/video/Vicces_cica_1 - # http://index.indavideo.hu/video/2015_0728_beregszasz - # http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko - # http://erotika.indavideo.hu/video/Amator_tini_punci - # http://film.indavideo.hu/video/f_hrom_nagymamm_volt - # http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)', - webpage) - def _real_extract(self, url): video_id = self._match_id(url) @@ -100,7 +89,6 @@ class IndavideoEmbedIE(InfoExtractor): 'url': video_url, 'height': height, }) - self._sort_formats(formats) timestamp = video.get('date') if timestamp: diff --git a/hypervideo_dl/extractor/infoq.py b/hypervideo_dl/extractor/infoq.py index 347cc51..192bcfe 100644 --- a/hypervideo_dl/extractor/infoq.py +++ b/hypervideo_dl/extractor/infoq.py @@ -1,15 +1,13 @@ -# coding: utf-8 - -from __future__ import unicode_literals - from ..compat import ( compat_b64decode, compat_urllib_parse_unquote, compat_urlparse, ) from ..utils import ( + ExtractorError, determine_ext, update_url_query, + traverse_obj, ) from .bokecc import BokeCCBaseIE @@ -38,6 +36,7 @@ class InfoQIE(BokeCCBaseIE): 'ext': 'flv', 'description': 'md5:308d981fb28fa42f49f9568322c683ff', }, + 'skip': 'Sorry, the page you visited does not exist', }, { 'url': 'https://www.infoq.com/presentations/Simple-Made-Easy', 'md5': '0e34642d4d9ef44bf86f66f6399672db', @@ -90,8 +89,10 @@ class InfoQIE(BokeCCBaseIE): }] def _extract_http_audio(self, webpage, video_id): - fields = self._form_hidden_inputs('mp3Form', webpage) - http_audio_url = fields.get('filename') + try: + http_audio_url = traverse_obj(self._form_hidden_inputs('mp3Form', webpage), 'filename') + except ExtractorError: + http_audio_url = None if not http_audio_url: return [] @@ -127,8 +128,6 
@@ class InfoQIE(BokeCCBaseIE): + self._extract_http_video(webpage) + self._extract_http_audio(webpage, video_id)) - self._sort_formats(formats) - return { 'id': video_id, 'title': video_title, diff --git a/hypervideo_dl/extractor/instagram.py b/hypervideo_dl/extractor/instagram.py index 970f2c8..0233513 100644 --- a/hypervideo_dl/extractor/instagram.py +++ b/hypervideo_dl/extractor/instagram.py @@ -1,19 +1,17 @@ -# coding: utf-8 - -import itertools import hashlib +import itertools import json import re import time +import urllib.error from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, -) from ..utils import ( ExtractorError, - format_field, + decode_base_n, + encode_base_n, float_or_none, + format_field, get_element_by_attribute, int_or_none, lowercase_escape, @@ -24,42 +22,59 @@ from ..utils import ( urlencode_postdata, ) +_ENCODING_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' + + +def _pk_to_id(id): + """Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id""" + return encode_base_n(int(id.split('_')[0]), table=_ENCODING_CHARS) + + +def _id_to_pk(shortcode): + """Covert a shortcode to a numeric value""" + return decode_base_n(shortcode[:11], table=_ENCODING_CHARS) + class InstagramBaseIE(InfoExtractor): _NETRC_MACHINE = 'instagram' _IS_LOGGED_IN = False + _API_BASE_URL = 'https://i.instagram.com/api/v1' + _LOGIN_URL = 'https://www.instagram.com/accounts/login' + _API_HEADERS = { + 'X-IG-App-ID': '936619743392459', + 'X-ASBD-ID': '198387', + 'X-IG-WWW-Claim': '0', + 'Origin': 'https://www.instagram.com', + 'Accept': '*/*', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36', + } + def _perform_login(self, username, password): if self._IS_LOGGED_IN: return login_webpage = self._download_webpage( - 'https://www.instagram.com/accounts/login/', None, - note='Downloading login webpage', 
errnote='Failed to download login webpage') + self._LOGIN_URL, None, note='Downloading login webpage', errnote='Failed to download login webpage') - shared_data = self._parse_json( - self._search_regex( - r'window\._sharedData\s*=\s*({.+?});', - login_webpage, 'shared data', default='{}'), - None) - - login = self._download_json('https://www.instagram.com/accounts/login/ajax/', None, note='Logging in', headers={ - 'Accept': '*/*', - 'X-IG-App-ID': '936619743392459', - 'X-ASBD-ID': '198387', - 'X-IG-WWW-Claim': '0', - 'X-Requested-With': 'XMLHttpRequest', - 'X-CSRFToken': shared_data['config']['csrf_token'], - 'X-Instagram-AJAX': shared_data['rollout_hash'], - 'Referer': 'https://www.instagram.com/', - }, data=urlencode_postdata({ - 'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}', - 'username': username, - 'queryParams': '{}', - 'optIntoOneTap': 'false', - 'stopDeletionNonce': '', - 'trustedDeviceRecords': '{}', - })) + shared_data = self._parse_json(self._search_regex( + r'window\._sharedData\s*=\s*({.+?});', login_webpage, 'shared data', default='{}'), None) + + login = self._download_json( + f'{self._LOGIN_URL}/ajax/', None, note='Logging in', headers={ + **self._API_HEADERS, + 'X-Requested-With': 'XMLHttpRequest', + 'X-CSRFToken': shared_data['config']['csrf_token'], + 'X-Instagram-AJAX': shared_data['rollout_hash'], + 'Referer': 'https://www.instagram.com/', + }, data=urlencode_postdata({ + 'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}', + 'username': username, + 'queryParams': '{}', + 'optIntoOneTap': 'false', + 'stopDeletionNonce': '', + 'trustedDeviceRecords': '{}', + })) if not login.get('authenticated'): if login.get('message'): @@ -124,7 +139,7 @@ class InstagramBaseIE(InfoExtractor): } def _extract_product_media(self, product_media): - media_id = product_media.get('code') or product_media.get('id') + media_id = product_media.get('code') or _pk_to_id(product_media.get('pk')) vcodec = 
product_media.get('video_codec') dash_manifest_raw = product_media.get('video_dash_manifest') videos_list = product_media.get('video_versions') @@ -140,7 +155,6 @@ class InstagramBaseIE(InfoExtractor): } for format in videos_list or []] if dash_manifest_raw: formats.extend(self._parse_mpd_formats(self._parse_xml(dash_manifest_raw, media_id), mpd_id='dash')) - self._sort_formats(formats) thumbnails = [{ 'url': thumbnail.get('url'), @@ -160,7 +174,7 @@ class InstagramBaseIE(InfoExtractor): user_info = product_info.get('user') or {} info_dict = { - 'id': product_info.get('code') or product_info.get('id'), + 'id': _pk_to_id(traverse_obj(product_info, 'pk', 'id', expected_type=str_or_none)[:19]), 'title': product_info.get('title') or f'Video by {user_info.get("username")}', 'description': traverse_obj(product_info, ('caption', 'text'), expected_type=str_or_none), 'timestamp': int_or_none(product_info.get('taken_at')), @@ -170,6 +184,7 @@ class InstagramBaseIE(InfoExtractor): 'view_count': int_or_none(product_info.get('view_count')), 'like_count': int_or_none(product_info.get('like_count')), 'comment_count': int_or_none(product_info.get('comment_count')), + '__post_extractor': self.extract_comments(_pk_to_id(product_info.get('pk'))), 'http_headers': { 'Referer': 'https://www.instagram.com/', } @@ -191,6 +206,23 @@ class InstagramBaseIE(InfoExtractor): **self._extract_product_media(product_info) } + def _get_comments(self, video_id): + comments_info = self._download_json( + f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/comments/?can_support_threading=true&permalink_enabled=false', video_id, + fatal=False, errnote='Comments extraction failed', note='Downloading comments info', headers=self._API_HEADERS) or {} + + comment_data = traverse_obj(comments_info, ('edge_media_to_parent_comment', 'edges'), 'comments') + for comment_dict in comment_data or []: + yield { + 'author': traverse_obj(comment_dict, ('node', 'owner', 'username'), ('user', 'username')), + 'author_id': 
traverse_obj(comment_dict, ('node', 'owner', 'id'), ('user', 'pk')), + 'author_thumbnail': traverse_obj(comment_dict, ('node', 'owner', 'profile_pic_url'), ('user', 'profile_pic_url'), expected_type=url_or_none), + 'id': traverse_obj(comment_dict, ('node', 'id'), 'pk'), + 'text': traverse_obj(comment_dict, ('node', 'text'), 'text'), + 'like_count': traverse_obj(comment_dict, ('node', 'edge_liked_by', 'count'), 'comment_like_count', expected_type=int_or_none), + 'timestamp': traverse_obj(comment_dict, ('node', 'created_at'), 'created_at', expected_type=int_or_none), + } + class InstagramIOSIE(InfoExtractor): IE_DESC = 'IOS instagram:// URL' @@ -216,27 +248,14 @@ class InstagramIOSIE(InfoExtractor): 'add_ie': ['Instagram'] }] - def _get_id(self, id): - """Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id""" - chrs = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' - media_id = int(id.split('_')[0]) - shortened_id = '' - while media_id > 0: - r = media_id % 64 - media_id = (media_id - r) // 64 - shortened_id = chrs[r] + shortened_id - return shortened_id - def _real_extract(self, url): - return { - '_type': 'url_transparent', - 'url': f'http://instagram.com/tv/{self._get_id(self._match_id(url))}/', - 'ie_key': 'Instagram', - } + video_id = _pk_to_id(self._match_id(url)) + return self.url_result(f'http://instagram.com/tv/{video_id}', InstagramIE, video_id) class InstagramIE(InstagramBaseIE): _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com(?:/[^/]+)?/(?:p|tv|reel)/(?P<id>[^/?#&]+))' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1'] _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'md5': '0d2da106a9d2631273e192b372806516', @@ -246,7 +265,7 @@ class InstagramIE(InstagramBaseIE): 'title': 'Video by naomipq', 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 0, + 
'duration': 8.747, 'timestamp': 1371748545, 'upload_date': '20130620', 'uploader_id': '2815873', @@ -256,27 +275,34 @@ class InstagramIE(InstagramBaseIE): 'comment_count': int, 'comments': list, }, + 'expected_warnings': [ + 'General metadata extraction failed', + 'Main webpage is locked behind the login page', + ], }, { - # missing description - 'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears', + # reel + 'url': 'https://www.instagram.com/reel/Chunk8-jurw/', + 'md5': 'f6d8277f74515fa3ff9f5791426e42b1', 'info_dict': { - 'id': 'BA-pQFBG8HZ', + 'id': 'Chunk8-jurw', 'ext': 'mp4', - 'title': 'Video by britneyspears', + 'title': 'Video by instagram', + 'description': 'md5:c9cde483606ed6f80fbe9283a6a2b290', 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 0, - 'timestamp': 1453760977, - 'upload_date': '20160125', - 'uploader_id': '12246775', - 'uploader': 'Britney Spears', - 'channel': 'britneyspears', + 'duration': 5.016, + 'timestamp': 1661529231, + 'upload_date': '20220826', + 'uploader_id': '25025320', + 'uploader': 'Instagram', + 'channel': 'instagram', 'like_count': int, 'comment_count': int, 'comments': list, }, - 'params': { - 'skip_download': True, - }, + 'expected_warnings': [ + 'General metadata extraction failed', + 'Main webpage is locked behind the login page', + ], }, { # multi video post 'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/', @@ -285,18 +311,24 @@ class InstagramIE(InstagramBaseIE): 'id': 'BQ0dSaohpPW', 'ext': 'mp4', 'title': 'Video 1', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, }, }, { 'info_dict': { 'id': 'BQ0dTpOhuHT', 'ext': 'mp4', 'title': 'Video 2', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, }, }, { 'info_dict': { 'id': 'BQ0dT7RBFeF', 'ext': 'mp4', 'title': 'Video 3', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, }, }], 'info_dict': { @@ -304,6 +336,10 @@ class InstagramIE(InstagramBaseIE): 'title': 'Post by instagram', 'description': 
'md5:0f9203fc6a2ce4d228da5754bcf54957', }, + 'expected_warnings': [ + 'General metadata extraction failed', + 'Main webpage is locked behind the login page', + ], }, { # IGTV 'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/', @@ -322,7 +358,11 @@ class InstagramIE(InstagramBaseIE): 'comment_count': int, 'comments': list, 'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.', - } + }, + 'expected_warnings': [ + 'General metadata extraction failed', + 'Main webpage is locked behind the login page', + ], }, { 'url': 'https://instagram.com/p/-Cmh1cukG2/', 'only_matching': True, @@ -340,59 +380,88 @@ class InstagramIE(InstagramBaseIE): 'only_matching': True, }] - @staticmethod - def _extract_embed_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1', - webpage) - if mobj: - return mobj.group('url') - - blockquote_el = get_element_by_attribute( - 'class', 'instagram-media', webpage) - if blockquote_el is None: - return + @classmethod + def _extract_embed_urls(cls, url, webpage): + res = tuple(super()._extract_embed_urls(url, webpage)) + if res: + return res - mobj = re.search( - r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el) + mobj = re.search(r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', + get_element_by_attribute('class', 'instagram-media', webpage) or '') if mobj: - return mobj.group('link') + return [mobj.group('link')] def _real_extract(self, url): video_id, url = self._match_valid_url(url).group('id', 'url') - webpage, urlh = self._download_webpage_handle(url, video_id) - if 'www.instagram.com/accounts/login' in urlh.geturl(): - self.report_warning('Main webpage is locked behind the login page. 
' - 'Retrying with embed webpage (Note that some metadata might be missing)') - webpage = self._download_webpage( - 'https://www.instagram.com/p/%s/embed/' % video_id, video_id, note='Downloading embed webpage') - - shared_data = self._parse_json( - self._search_regex( - r'window\._sharedData\s*=\s*({.+?});', - webpage, 'shared data', default='{}'), - video_id, fatal=False) - media = traverse_obj( - shared_data, - ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'), - ('entry_data', 'PostPage', 0, 'media'), - expected_type=dict) - - # _sharedData.entry_data.PostPage is empty when authenticated (see - # https://github.com/ytdl-org/youtube-dl/pull/22880) - if not media: - additional_data = self._parse_json( - self._search_regex( - r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\);', - webpage, 'additional data', default='{}'), - video_id, fatal=False) - product_item = traverse_obj(additional_data, ('items', 0), expected_type=dict) - if product_item: - return self._extract_product(product_item) - media = traverse_obj(additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {} - - if not media and 'www.instagram.com/accounts/login' in urlh.geturl(): - self.raise_login_required('You need to log in to access this content') + media, webpage = {}, '' + + if self._get_cookies(url).get('sessionid'): + info = traverse_obj(self._download_json( + f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/info/', video_id, + fatal=False, errnote='Video info extraction failed', + note='Downloading video info', headers=self._API_HEADERS), ('items', 0)) + if info: + media.update(info) + return self._extract_product(media) + + api_check = self._download_json( + f'{self._API_BASE_URL}/web/get_ruling_for_content/?content_type=MEDIA&target_id={_id_to_pk(video_id)}', + video_id, headers=self._API_HEADERS, fatal=False, note='Setting up session', errnote=False) or {} + csrf_token = 
self._get_cookies('https://www.instagram.com').get('csrftoken') + + if not csrf_token: + self.report_warning('No csrf token set by Instagram API', video_id) + else: + csrf_token = csrf_token.value if api_check.get('status') == 'ok' else None + if not csrf_token: + self.report_warning('Instagram API is not granting access', video_id) + + variables = { + 'shortcode': video_id, + 'child_comment_count': 3, + 'fetch_comment_count': 40, + 'parent_comment_count': 24, + 'has_threaded_comments': True, + } + general_info = self._download_json( + 'https://www.instagram.com/graphql/query/', video_id, fatal=False, errnote=False, + headers={ + **self._API_HEADERS, + 'X-CSRFToken': csrf_token or '', + 'X-Requested-With': 'XMLHttpRequest', + 'Referer': url, + }, query={ + 'query_hash': '9f8827793ef34641b2fb195d4d41151c', + 'variables': json.dumps(variables, separators=(',', ':')), + }) + media.update(traverse_obj(general_info, ('data', 'shortcode_media')) or {}) + + if not general_info: + self.report_warning('General metadata extraction failed (some metadata might be missing).', video_id) + webpage, urlh = self._download_webpage_handle(url, video_id) + shared_data = self._search_json( + r'window\._sharedData\s*=', webpage, 'shared data', video_id, fatal=False) or {} + + if shared_data and self._LOGIN_URL not in urlh.geturl(): + media.update(traverse_obj( + shared_data, ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'), + ('entry_data', 'PostPage', 0, 'media'), expected_type=dict) or {}) + else: + self.report_warning('Main webpage is locked behind the login page. 
Retrying with embed webpage (some metadata might be missing).') + webpage = self._download_webpage( + f'{url}/embed/', video_id, note='Downloading embed webpage', fatal=False) + additional_data = self._search_json( + r'window\.__additionalDataLoaded\s*\(\s*[^,]+,', webpage, 'additional data', video_id, fatal=False) + if not additional_data and not media: + self.raise_login_required('Requested content is not available, rate-limit reached or login required') + + product_item = traverse_obj(additional_data, ('items', 0), expected_type=dict) + if product_item: + media.update(product_item) + return self._extract_product(media) + + media.update(traverse_obj( + additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {}) username = traverse_obj(media, ('owner', 'username')) or self._search_regex( r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'username', fatal=False) @@ -412,7 +481,7 @@ class InstagramIE(InstagramBaseIE): if nodes: return self.playlist_result( self._extract_nodes(nodes, True), video_id, - format_field(username, template='Post by %s'), description) + format_field(username, None, 'Post by %s'), description) video_url = self._og_search_video_url(webpage, secure=False) @@ -424,7 +493,6 @@ class InstagramIE(InstagramBaseIE): dash = traverse_obj(media, ('dash_info', 'video_dash_manifest')) if dash: formats.extend(self._parse_mpd_formats(self._parse_xml(dash, video_id), mpd_id='dash')) - self._sort_formats(formats) comment_data = traverse_obj(media, ('edge_media_to_parent_comment', 'edges')) comments = [{ @@ -521,7 +589,7 @@ class InstagramPlaylistBaseIE(InstagramBaseIE): except ExtractorError as e: # if it's an error caused by a bad query, and there are # more GIS templates to try, ignore it and keep trying - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403: if gis_tmpl != gis_tmpls[-1]: continue raise @@ -631,41 +699,32 @@ class 
InstagramStoryIE(InstagramBaseIE): def _real_extract(self, url): username, story_id = self._match_valid_url(url).groups() - - story_info_url = f'{username}/{story_id}/?__a=1' if username == 'highlights' else f'{username}/?__a=1' - story_info = self._download_json(f'https://www.instagram.com/stories/{story_info_url}', story_id, headers={ - 'X-IG-App-ID': 936619743392459, - 'X-ASBD-ID': 198387, - 'X-IG-WWW-Claim': 0, - 'X-Requested-With': 'XMLHttpRequest', - 'Referer': url, - }) - user_id = story_info['user']['id'] - highlight_title = traverse_obj(story_info, ('highlight', 'title')) + story_info = self._download_webpage(url, story_id) + user_info = self._search_json(r'"user":', story_info, 'user info', story_id, fatal=False) + if not user_info: + self.raise_login_required('This content is unreachable') + user_id = user_info.get('id') story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}' - videos = self._download_json(f'https://i.instagram.com/api/v1/feed/reels_media/?reel_ids={story_info_url}', story_id, headers={ - 'X-IG-App-ID': 936619743392459, - 'X-ASBD-ID': 198387, - 'X-IG-WWW-Claim': 0, - })['reels'] - - full_name = traverse_obj(videos, ('user', 'full_name')) - - user_info = {} - if not (username and username != 'highlights' and full_name): - user_info = self._download_json( - f'https://i.instagram.com/api/v1/users/{user_id}/info/', story_id, headers={ - 'User-Agent': 'Mozilla/5.0 (Linux; Android 11; SM-A505F Build/RP1A.200720.012; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/96.0.4664.45 Mobile Safari/537.36 Instagram 214.1.0.29.120 Android (30/11; 450dpi; 1080x2122; samsung; SM-A505F; a50; exynos9610; en_US; 333717274)', - }, note='Downloading user info') + videos = traverse_obj(self._download_json( + f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}', + story_id, errnote=False, fatal=False, headers=self._API_HEADERS), 'reels') + if not videos: + self.raise_login_required('You need to log in to 
access this content') - username = traverse_obj(user_info, ('user', 'username')) or username - full_name = traverse_obj(user_info, ('user', 'full_name')) or full_name + full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (str(user_id), 'user', 'full_name')) + story_title = traverse_obj(videos, (f'highlight:{story_id}', 'title')) + if not story_title: + story_title = f'Story by {username}' highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (str(user_id), 'items')) - return self.playlist_result([{ - **self._extract_product(highlight), - 'title': f'Story by {username}', - 'uploader': full_name, - 'uploader_id': user_id, - } for highlight in highlights], playlist_id=story_id, playlist_title=highlight_title) + info_data = [] + for highlight in highlights: + highlight_data = self._extract_product(highlight) + if highlight_data.get('formats'): + info_data.append({ + **highlight_data, + 'uploader': full_name, + 'uploader_id': user_id, + }) + return self.playlist_result(info_data, playlist_id=story_id, playlist_title=story_title) diff --git a/hypervideo_dl/extractor/internazionale.py b/hypervideo_dl/extractor/internazionale.py index 45e2af6..1b1cb57 100644 --- a/hypervideo_dl/extractor/internazionale.py +++ b/hypervideo_dl/extractor/internazionale.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import unified_timestamp @@ -63,7 +60,6 @@ class InternazionaleIE(InfoExtractor): entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) formats.extend(self._extract_mpd_formats( video_base + 'mpd', display_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) timestamp = unified_timestamp(self._html_search_meta( 'article:published_time', webpage, 'timestamp')) diff --git a/hypervideo_dl/extractor/internetvideoarchive.py b/hypervideo_dl/extractor/internetvideoarchive.py index 880918c..9d2574c 100644 --- 
a/hypervideo_dl/extractor/internetvideoarchive.py +++ b/hypervideo_dl/extractor/internetvideoarchive.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json import re @@ -50,7 +48,6 @@ class InternetVideoArchiveIE(InfoExtractor): replace_url('.mpd'), video_id, mpd_id='dash', fatal=False)) formats.extend(self._extract_ism_formats( replace_url('Manifest'), video_id, ism_id='mss', fatal=False)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/iprima.py b/hypervideo_dl/extractor/iprima.py index 1a20384..1818205 100644 --- a/hypervideo_dl/extractor/iprima.py +++ b/hypervideo_dl/extractor/iprima.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re import time @@ -151,9 +148,8 @@ class IPrimaIE(InfoExtractor): elif manifest_type == 'DASH' or ext == 'mpd': formats += self._extract_mpd_formats( manifest_url, video_id, mpd_id='dash', fatal=False) - self._sort_formats(formats) - final_result = self._search_json_ld(webpage, video_id) or {} + final_result = self._search_json_ld(webpage, video_id, default={}) final_result.update({ 'id': video_id, 'title': title, @@ -251,8 +247,6 @@ class IPrimaCNNIE(InfoExtractor): if not formats and '>GEO_IP_NOT_ALLOWED<' in playerpage: self.raise_geo_restricted(countries=['CZ'], metadata_available=True) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/iqiyi.py b/hypervideo_dl/extractor/iqiyi.py index d07b39d..c41f6db 100644 --- a/hypervideo_dl/extractor/iqiyi.py +++ b/hypervideo_dl/extractor/iqiyi.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import hashlib import itertools import re @@ -218,7 +215,6 @@ class IqiyiIE(InfoExtractor): self._sleep(5, video_id) - self._sort_formats(formats) title = (get_element_by_id('widget-videotitle', webpage) or clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)) or 
self._html_search_regex(r'<span[^>]+data-videochanged-title="word"[^>]*>([^<]+)</span>', webpage, 'title')) @@ -274,6 +270,7 @@ class IqIE(InfoExtractor): '1': 'zh_CN', '2': 'zh_TW', '3': 'en', + '4': 'kor', '18': 'th', '21': 'my', '23': 'vi', @@ -354,7 +351,7 @@ class IqIE(InfoExtractor): ''' def _extract_vms_player_js(self, webpage, video_id): - player_js_cache = self._downloader.cache.load('iq', 'player_js') + player_js_cache = self.cache.load('iq', 'player_js') if player_js_cache: return player_js_cache webpack_js_url = self._proto_relative_url(self._search_regex( @@ -367,7 +364,7 @@ class IqIE(InfoExtractor): f'https://stc.iqiyipic.com/_next/static/chunks/{webpack_map1.get(module_index, module_index)}.{webpack_map2[module_index]}.js', video_id, note=f'Downloading #{module_index} module JS', errnote='Unable to download module JS', fatal=False) or '' if 'vms request' in module_js: - self._downloader.cache.store('iq', 'player_js', module_js) + self.cache.store('iq', 'player_js', module_js) return module_js raise ExtractorError('Unable to extract player JS') @@ -420,8 +417,9 @@ class IqIE(InfoExtractor): ut_list = ['0'] # bid 0 as an initial format checker - dash_paths = self._parse_json(PhantomJSwrapper(self).get( - url, html='<!DOCTYPE html>', video_id=video_id, note2='Executing signature code', jscode=self._DASH_JS % { + dash_paths = self._parse_json(PhantomJSwrapper(self, timeout=120_000).get( + url, note2='Executing signature code (this may take a couple minutes)', + html='<!DOCTYPE html>', video_id=video_id, jscode=self._DASH_JS % { 'tvid': video_info['tvId'], 'vid': video_info['vid'], 'src': traverse_obj(next_props, ('initialProps', 'pageProps', 'ptid'), @@ -443,7 +441,7 @@ class IqIE(InfoExtractor): preview_time = traverse_obj( initial_format_data, ('boss_ts', (None, 'data'), ('previewTime', 'rtime')), expected_type=float_or_none, get_all=False) if traverse_obj(initial_format_data, ('boss_ts', 'data', 'prv'), expected_type=int_or_none): - 
self.report_warning('This preview video is limited%s' % format_field(preview_time, template=' to %s seconds')) + self.report_warning('This preview video is limited%s' % format_field(preview_time, None, ' to %s seconds')) # TODO: Extract audio-only formats for bid in set(traverse_obj(initial_format_data, ('program', 'video', ..., 'bid'), expected_type=str_or_none, default=[])): @@ -498,8 +496,6 @@ class IqIE(InfoExtractor): }) formats.extend(extracted_formats) - self._sort_formats(formats) - for sub_format in traverse_obj(initial_format_data, ('program', 'stl', ...), expected_type=dict, default=[]): lang = self._LID_TAGS.get(str_or_none(sub_format.get('lid')), sub_format.get('_name')) subtitles.setdefault(lang, []).extend([{ diff --git a/hypervideo_dl/extractor/ir90tv.py b/hypervideo_dl/extractor/ir90tv.py deleted file mode 100644 index d5a3f6f..0000000 --- a/hypervideo_dl/extractor/ir90tv.py +++ /dev/null @@ -1,42 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import remove_start - - -class Ir90TvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?90tv\.ir/video/(?P<id>[0-9]+)/.*' - _TESTS = [{ - 'url': 'http://90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218', - 'md5': '411dbd94891381960cb9e13daa47a869', - 'info_dict': { - 'id': '95719', - 'ext': 'mp4', - 'title': 'شایعات نقل و انتقالات مهم فوتبال اروپا 94/02/18', - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, { - 'url': 'http://www.90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = 
self._download_webpage(url, video_id) - - title = remove_start(self._html_search_regex( - r'<title>([^<]+)', webpage, 'title'), '90tv.ir :: ') - - video_url = self._search_regex( - r']+src="([^"]+)"', webpage, 'video url') - - thumbnail = self._search_regex(r'poster="([^"]+)"', webpage, 'thumbnail url', fatal=False) - - return { - 'url': video_url, - 'id': video_id, - 'title': title, - 'video_url': video_url, - 'thumbnail': thumbnail, - } diff --git a/hypervideo_dl/extractor/islamchannel.py b/hypervideo_dl/extractor/islamchannel.py new file mode 100644 index 0000000..253a846 --- /dev/null +++ b/hypervideo_dl/extractor/islamchannel.py @@ -0,0 +1,81 @@ +import re + +from .common import InfoExtractor +from ..utils import traverse_obj, urljoin + + +class IslamChannelIE(InfoExtractor): + _VALID_URL = r'https?://watch\.islamchannel\.tv/watch/(?P\d+)' + _TESTS = [{ + 'url': 'https://watch.islamchannel.tv/watch/38604310', + 'info_dict': { + 'id': '38604310', + 'title': 'Omar - Young Omar', + 'description': 'md5:5cc7ddecef064ea7afe52eb5e0e33b55', + 'thumbnail': r're:https?://.+', + 'ext': 'mp4', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + thumbnail = self._search_regex( + r'data-poster="([^"]+)"', webpage, 'data poster', fatal=False) or \ + self._html_search_meta(('og:image', 'twitter:image'), webpage) + + headers = { + 'Token': self._search_regex(r'data-token="([^"]+)"', webpage, 'data token'), + 'Token-Expiry': self._search_regex(r'data-expiry="([^"]+)"', webpage, 'data expiry'), + 'Uvid': video_id, + } + show_stream = self._download_json( + f'https://v2-streams-elb.simplestreamcdn.com/api/show/stream/{video_id}', video_id, + query={ + 'key': self._search_regex(r'data-key="([^"]+)"', webpage, 'data key'), + 'platform': 'chrome', + }, headers=headers) + # TODO: show_stream['stream'] and show_stream['drm'] may contain something interesting + streams = self._download_json( + 
traverse_obj(show_stream, ('response', 'tokenization', 'url')), video_id, + headers=headers) + formats, subs = self._extract_m3u8_formats_and_subtitles(traverse_obj(streams, ('Streams', 'Adaptive')), video_id, 'mp4') + + return { + 'id': video_id, + 'title': self._html_search_meta(('og:title', 'twitter:title'), webpage), + 'description': self._html_search_meta(('og:description', 'twitter:description', 'description'), webpage), + 'formats': formats, + 'subtitles': subs, + 'thumbnails': [{ + 'id': 'unscaled', + 'url': thumbnail.split('?')[0], + 'ext': 'jpg', + 'preference': 2, + }, { + 'id': 'orig', + 'url': thumbnail, + 'ext': 'jpg', + 'preference': 1, + }] if thumbnail else None, + } + + +class IslamChannelSeriesIE(InfoExtractor): + _VALID_URL = r'https?://watch\.islamchannel\.tv/series/(?P[a-f\d-]+)' + _TESTS = [{ + 'url': 'https://watch.islamchannel.tv/series/a6cccef3-3ef1-11eb-bc19-06b69c2357cd', + 'info_dict': { + 'id': 'a6cccef3-3ef1-11eb-bc19-06b69c2357cd', + }, + 'playlist_mincount': 31, + }] + + def _real_extract(self, url): + pl_id = self._match_id(url) + webpage = self._download_webpage(url, pl_id) + + return self.playlist_from_matches( + re.finditer(r']+?data-video-type="show">', webpage), + pl_id, getter=lambda x: urljoin(url, x.group(1)), ie=IslamChannelIE) diff --git a/hypervideo_dl/extractor/israelnationalnews.py b/hypervideo_dl/extractor/israelnationalnews.py new file mode 100644 index 0000000..35040f5 --- /dev/null +++ b/hypervideo_dl/extractor/israelnationalnews.py @@ -0,0 +1,50 @@ +from .common import InfoExtractor +from ..utils import ExtractorError, traverse_obj + + +class IsraelNationalNewsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?israelnationalnews\.com/news/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.israelnationalnews.com/news/354520', + 'info_dict': { + 'id': '354520' + }, + 'playlist': [{ + 'info_dict': { + 'id': 'jA84wQhVvg8', + 'title': 'Even CNN Host Is Shocked by How Bad Biden\'s Approval Ratings Have Gotten | DM CLIPS | 
Rubin Report', + 'ext': 'mp4', + 'description': 'md5:b7325a3d00c7596337dc3ae37e32d35c', + 'channel': 'The Rubin Report', + 'channel_follower_count': int, + 'comment_count': int, + 'categories': ['News & Politics'], + 'like_count': int, + 'uploader_url': 'http://www.youtube.com/user/RubinReport', + 'uploader_id': 'RubinReport', + 'availability': 'public', + 'view_count': int, + 'duration': 240, + 'thumbnail': 'https://i.ytimg.com/vi_webp/jA84wQhVvg8/maxresdefault.webp', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'age_limit': 0, + 'tags': 'count:29', + 'channel_id': 'UCJdKr0Bgd_5saZYqLCa9mng', + 'channel_url': 'https://www.youtube.com/channel/UCJdKr0Bgd_5saZYqLCa9mng', + 'upload_date': '20220606', + 'uploader': 'The Rubin Report', + } + }] + }] + + def _real_extract(self, url): + news_article_id = self._match_id(url) + article_json = self._download_json( + f'https://www.israelnationalnews.com/Generic/NewAPI/Item?type=0&Item={news_article_id}', news_article_id) + + urls = traverse_obj(article_json, ('Content2', ..., 'content', ..., 'attrs', 'src')) + if not urls: + raise ExtractorError('This article does not have any videos', expected=True) + + return self.playlist_from_matches(urls, news_article_id, ie='Youtube') diff --git a/hypervideo_dl/extractor/itprotv.py b/hypervideo_dl/extractor/itprotv.py index 64cb4e6..4ac1260 100644 --- a/hypervideo_dl/extractor/itprotv.py +++ b/hypervideo_dl/extractor/itprotv.py @@ -1,5 +1,3 @@ -# coding: utf-8 - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/itv.py b/hypervideo_dl/extractor/itv.py index 66705a2..0681050 100644 --- a/hypervideo_dl/extractor/itv.py +++ b/hypervideo_dl/extractor/itv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -175,7 +172,6 @@ class ITVIE(InfoExtractor): formats.append({ 'url': href, }) - self._sort_formats(formats) info = self._search_json_ld(webpage, video_id, default={}) if 
not info: json_ld = self._parse_json(self._search_regex( diff --git a/hypervideo_dl/extractor/ivi.py b/hypervideo_dl/extractor/ivi.py index 098ab66..27a222a 100644 --- a/hypervideo_dl/extractor/ivi.py +++ b/hypervideo_dl/extractor/ivi.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re @@ -16,6 +13,7 @@ class IviIE(InfoExtractor): IE_DESC = 'ivi.ru' IE_NAME = 'ivi' _VALID_URL = r'https?://(?:www\.)?ivi\.(?:ru|tv)/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P\d+)' + _EMBED_REGEX = [r']+?src=(["\'])(?Phttps?://(?:www\.)?ivi\.ru/video/player.+?)\1'] _GEO_BYPASS = False _GEO_COUNTRIES = ['RU'] _LIGHT_KEY = b'\xf1\x02\x32\xb7\xbc\x5c\x7a\xe8\xf7\x96\xc1\x33\x2b\x27\xa1\x8c' @@ -168,7 +166,6 @@ class IviIE(InfoExtractor): 'quality': quality(content_format), 'filesize': int_or_none(f.get('size_in_bytes')), }) - self._sort_formats(formats) compilation = result.get('compilation') episode = title if compilation else None diff --git a/hypervideo_dl/extractor/ivideon.py b/hypervideo_dl/extractor/ivideon.py index 44b2208..7d1e554 100644 --- a/hypervideo_dl/extractor/ivideon.py +++ b/hypervideo_dl/extractor/ivideon.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import ( compat_urllib_parse_urlencode, @@ -71,7 +67,6 @@ class IvideonIE(InfoExtractor): 'ext': 'flv', 'quality': quality(format_id), } for format_id in self._QUALITIES] - self._sort_formats(formats) return { 'id': server_id, diff --git a/hypervideo_dl/extractor/iwara.py b/hypervideo_dl/extractor/iwara.py index c0e01e3..ec3e59c 100644 --- a/hypervideo_dl/extractor/iwara.py +++ b/hypervideo_dl/extractor/iwara.py @@ -1,21 +1,29 @@ -# coding: utf-8 -from __future__ import unicode_literals +import itertools import re +import urllib.parse from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse from ..utils import ( int_or_none, mimetype2ext, remove_end, - 
url_or_none, - unified_strdate, strip_or_none, + unified_strdate, + url_or_none, + urljoin, ) -class IwaraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos/(?P[a-zA-Z0-9]+)' +class IwaraBaseIE(InfoExtractor): + _BASE_REGEX = r'(?Phttps?://(?:www\.|ecchi\.)?iwara\.tv)' + + def _extract_playlist(self, base_url, webpage): + for path in re.findall(r'class="title">\s*[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'http://iwara.tv/videos/amVwUl1EHpAD9RD', # md5 is unstable @@ -60,7 +68,7 @@ class IwaraIE(InfoExtractor): webpage, urlh = self._download_webpage_handle(url, video_id) - hostname = compat_urllib_parse_urlparse(urlh.geturl()).hostname + hostname = urllib.parse.urlparse(urlh.geturl()).hostname # ecchi is 'sexy' in Japanese age_limit = 18 if hostname.split('.')[0] == 'ecchi' else 0 @@ -108,8 +116,6 @@ class IwaraIE(InfoExtractor): 'quality': 1 if format_id == 'Source' else 0, }) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, @@ -120,3 +126,114 @@ class IwaraIE(InfoExtractor): 'upload_date': upload_date, 'description': description, } + + +class IwaraPlaylistIE(IwaraBaseIE): + _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/playlist/(?P[^/?#&]+)' + IE_NAME = 'iwara:playlist' + + _TESTS = [{ + 'url': 'https://ecchi.iwara.tv/playlist/best-enf', + 'info_dict': { + 'title': 'Best enf', + 'uploader': 'Jared98112', + 'id': 'best-enf', + }, + 'playlist_mincount': 1097, + }, { + # urlencoded + 'url': 'https://ecchi.iwara.tv/playlist/%E3%83%97%E3%83%AC%E3%82%A4%E3%83%AA%E3%82%B9%E3%83%88-2', + 'info_dict': { + 'id': 'プレイリスト-2', + 'title': 'プレイリスト', + 'uploader': 'mainyu', + }, + 'playlist_mincount': 91, + }] + + def _real_extract(self, url): + playlist_id, base_url = self._match_valid_url(url).group('id', 'base_url') + playlist_id = urllib.parse.unquote(playlist_id) + webpage = self._download_webpage(url, playlist_id) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': 
self._html_search_regex(r'class="title"[^>]*>([^<]+)', webpage, 'title', fatal=False), + 'uploader': self._html_search_regex(r'

    ([^<]+)', webpage, 'uploader', fatal=False), + 'entries': self._extract_playlist(base_url, webpage), + } + + +class IwaraUserIE(IwaraBaseIE): + _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/users/(?P[^/?#&]+)' + IE_NAME = 'iwara:user' + + _TESTS = [{ + 'note': 'number of all videos page is just 1 page. less than 40 videos', + 'url': 'https://ecchi.iwara.tv/users/infinityyukarip', + 'info_dict': { + 'title': 'Uploaded videos from Infinity_YukariP', + 'id': 'infinityyukarip', + 'uploader': 'Infinity_YukariP', + 'uploader_id': 'infinityyukarip', + }, + 'playlist_mincount': 39, + }, { + 'note': 'no even all videos page. probably less than 10 videos', + 'url': 'https://ecchi.iwara.tv/users/mmd-quintet', + 'info_dict': { + 'title': 'Uploaded videos from mmd quintet', + 'id': 'mmd-quintet', + 'uploader': 'mmd quintet', + 'uploader_id': 'mmd-quintet', + }, + 'playlist_mincount': 6, + }, { + 'note': 'has paging. more than 40 videos', + 'url': 'https://ecchi.iwara.tv/users/theblackbirdcalls', + 'info_dict': { + 'title': 'Uploaded videos from TheBlackbirdCalls', + 'id': 'theblackbirdcalls', + 'uploader': 'TheBlackbirdCalls', + 'uploader_id': 'theblackbirdcalls', + }, + 'playlist_mincount': 420, + }, { + 'note': 'foreign chars in URL. 
there must be foreign characters in URL', + 'url': 'https://ecchi.iwara.tv/users/ぶた丼', + 'info_dict': { + 'title': 'Uploaded videos from ぶた丼', + 'id': 'ぶた丼', + 'uploader': 'ぶた丼', + 'uploader_id': 'ぶた丼', + }, + 'playlist_mincount': 170, + }] + + def _entries(self, playlist_id, base_url): + webpage = self._download_webpage( + f'{base_url}/users/{playlist_id}', playlist_id) + videos_url = self._search_regex(r'', webpage, 'all videos url', default=None) + if not videos_url: + yield from self._extract_playlist(base_url, webpage) + return + + videos_url = urljoin(base_url, videos_url) + + for n in itertools.count(1): + page = self._download_webpage( + videos_url, playlist_id, note=f'Downloading playlist page {n}', + query={'page': str(n - 1)} if n > 1 else {}) + yield from self._extract_playlist( + base_url, page) + + if f'page={n}' not in page: + break + + def _real_extract(self, url): + playlist_id, base_url = self._match_valid_url(url).group('id', 'base_url') + playlist_id = urllib.parse.unquote(playlist_id) + + return self.playlist_result( + self._entries(playlist_id, base_url), playlist_id) diff --git a/hypervideo_dl/extractor/ixigua.py b/hypervideo_dl/extractor/ixigua.py new file mode 100644 index 0000000..1f086d2 --- /dev/null +++ b/hypervideo_dl/extractor/ixigua.py @@ -0,0 +1,83 @@ +import base64 + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + get_element_by_id, + int_or_none, + js_to_json, + str_or_none, + traverse_obj, +) + + +class IxiguaIE(InfoExtractor): + _VALID_URL = r'https?://(?:\w+\.)?ixigua\.com/(?:video/)?(?P\d+).+' + _TESTS = [{ + 'url': 'https://www.ixigua.com/6996881461559165471', + 'info_dict': { + 'id': '6996881461559165471', + 'ext': 'mp4', + 'title': '盲目涉水风险大,亲身示范高水位行车注意事项', + 'description': 'md5:8c82f46186299add4a1c455430740229', + 'tags': ['video_car'], + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'uploader': '懂车帝原创', + 'uploader_id': '6480145787', + 'thumbnail': 
r're:^https?://.+\.(avif|webp)', + 'timestamp': 1629088414, + 'duration': 1030, + } + }] + + def _get_json_data(self, webpage, video_id): + js_data = get_element_by_id('SSR_HYDRATED_DATA', webpage) + if not js_data: + if self._cookies_passed: + raise ExtractorError('Failed to get SSR_HYDRATED_DATA') + raise ExtractorError('Cookies (not necessarily logged in) are needed', expected=True) + + return self._parse_json( + js_data.replace('window._SSR_HYDRATED_DATA=', ''), video_id, transform_source=js_to_json) + + def _media_selector(self, json_data): + for path, override in ( + (('video_list', ), {}), + (('dynamic_video', 'dynamic_video_list'), {'acodec': 'none'}), + (('dynamic_video', 'dynamic_audio_list'), {'vcodec': 'none', 'ext': 'm4a'}), + ): + for media in traverse_obj(json_data, (..., *path, lambda _, v: v['main_url'])): + yield { + 'url': base64.b64decode(media['main_url']).decode(), + 'width': int_or_none(media.get('vwidth')), + 'height': int_or_none(media.get('vheight')), + 'fps': int_or_none(media.get('fps')), + 'vcodec': media.get('codec_type'), + 'format_id': str_or_none(media.get('quality_type')), + 'filesize': int_or_none(media.get('size')), + 'ext': 'mp4', + **override, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + json_data = self._get_json_data(webpage, video_id)['anyVideo']['gidInformation']['packerData']['video'] + + formats = list(self._media_selector(json_data.get('videoResource'))) + return { + 'id': video_id, + 'title': json_data.get('title'), + 'description': json_data.get('video_abstract'), + 'formats': formats, + 'like_count': json_data.get('video_like_count'), + 'duration': int_or_none(json_data.get('duration')), + 'tags': [json_data.get('tag')], + 'uploader_id': traverse_obj(json_data, ('user_info', 'user_id')), + 'uploader': traverse_obj(json_data, ('user_info', 'name')), + 'view_count': json_data.get('video_watch_count'), + 'dislike_count': 
json_data.get('video_unlike_count'), + 'timestamp': int_or_none(json_data.get('video_publish_time')), + } diff --git a/hypervideo_dl/extractor/izlesene.py b/hypervideo_dl/extractor/izlesene.py index f8fca6c..5cdf870 100644 --- a/hypervideo_dl/extractor/izlesene.py +++ b/hypervideo_dl/extractor/izlesene.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_str, @@ -81,7 +78,6 @@ class IzleseneIE(InfoExtractor): 'ext': ext, 'height': height, }) - self._sort_formats(formats) description = self._og_search_description(webpage, default=None) thumbnail = video.get('posterURL') or self._proto_relative_url( diff --git a/hypervideo_dl/extractor/jable.py b/hypervideo_dl/extractor/jable.py new file mode 100644 index 0000000..84c3225 --- /dev/null +++ b/hypervideo_dl/extractor/jable.py @@ -0,0 +1,103 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + InAdvancePagedList, + int_or_none, + orderedSet, + unified_strdate, +) + + +class JableIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?jable.tv/videos/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://jable.tv/videos/pppd-812/', + 'md5': 'f1537283a9bc073c31ff86ca35d9b2a6', + 'info_dict': { + 'id': 'pppd-812', + 'ext': 'mp4', + 'title': 'PPPD-812 只要表現好巨乳女教師吉根柚莉愛就獎勵學生們在白虎穴內射出精液', + 'description': 'md5:5b6d4199a854f62c5e56e26ccad19967', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 18, + 'like_count': int, + 'view_count': int, + }, + }, { + 'url': 'https://jable.tv/videos/apak-220/', + 'md5': '71f9239d69ced58ab74a816908847cc1', + 'info_dict': { + 'id': 'apak-220', + 'ext': 'mp4', + 'title': 'md5:5c3861b7cf80112a6e2b70bccf170824', + 'description': '', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 18, + 'like_count': int, + 'view_count': int, + 'upload_date': '20220319', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + 
formats = self._extract_m3u8_formats( + self._search_regex(r'var\s+hlsUrl\s*=\s*\'([^\']+)', webpage, 'hls_url'), video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage, default=''), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'formats': formats, + 'age_limit': 18, + 'upload_date': unified_strdate(self._search_regex( + r'class="inactive-color">\D+\s+(\d{4}-\d+-\d+)', webpage, 'upload_date', default=None)), + 'view_count': int_or_none(self._search_regex( + r'#icon-eye">\n*([\d ]+)', + webpage, 'view_count', default='').replace(' ', '')), + 'like_count': int_or_none(self._search_regex( + r'#icon-heart">(\d+)', webpage, 'link_count', default=None)), + } + + +class JablePlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?jable.tv/(?:categories|models|tags)/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://jable.tv/models/kaede-karen/', + 'info_dict': { + 'id': 'kaede-karen', + 'title': '楓カレン', + }, + 'playlist_count': 34, + }, { + 'url': 'https://jable.tv/categories/roleplay/', + 'only_matching': True, + }, { + 'url': 'https://jable.tv/tags/girl/', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + def page_func(page_num): + return [ + self.url_result(player_url, JableIE) + for player_url in orderedSet(re.findall( + r'href="(https://jable.tv/videos/[\w-]+/?)"', + self._download_webpage(url, playlist_id, query={ + 'mode': 'async', + 'from': page_num + 1, + 'function': 'get_block', + 'block_id': 'list_videos_common_videos_list', + }, note=f'Downloading page {page_num + 1}')))] + + return self.playlist_result( + InAdvancePagedList(page_func, int_or_none(self._search_regex( + r'from:(\d+)">[^<]+\s*»', webpage, 'last page number', default=1)), 24), + playlist_id, self._search_regex( + r'

    ([^<]+)', webpage, 'playlist title', default=None)) diff --git a/hypervideo_dl/extractor/jamendo.py b/hypervideo_dl/extractor/jamendo.py index 755d970..a2bbba3 100644 --- a/hypervideo_dl/extractor/jamendo.py +++ b/hypervideo_dl/extractor/jamendo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import hashlib import random @@ -31,10 +28,11 @@ class JamendoIE(InfoExtractor): 'ext': 'flac', # 'title': 'Maya Filipič - Stories from Emona I', 'title': 'Stories from Emona I', - # 'artist': 'Maya Filipič', + 'artist': 'Maya Filipič', + 'album': 'Between two worlds', 'track': 'Stories from Emona I', 'duration': 210, - 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnail': 'https://usercontent.jamendo.com?type=album&id=29279&width=300&trackid=196219', 'timestamp': 1217438117, 'upload_date': '20080730', 'license': 'by-nc-nd', @@ -48,11 +46,11 @@ class JamendoIE(InfoExtractor): 'only_matching': True, }] - def _call_api(self, resource, resource_id): + def _call_api(self, resource, resource_id, fatal=True): path = '/api/%ss' % resource rand = compat_str(random.random()) return self._download_json( - 'https://www.jamendo.com' + path, resource_id, query={ + 'https://www.jamendo.com' + path, resource_id, fatal=fatal, query={ 'id[]': resource_id, }, headers={ 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand) @@ -74,6 +72,8 @@ class JamendoIE(InfoExtractor): # if artist_name: # title = '%s - %s' % (artist_name, title) # album = get_model('album') + artist = self._call_api("artist", track.get('artistId'), fatal=False) + album = self._call_api("album", track.get('albumId'), fatal=False) formats = [{ 'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294' @@ -87,7 +87,6 @@ class JamendoIE(InfoExtractor): ('ogg1', 'ogg', 'ogg'), ('flac', 'flac', 'flac'), ))] - self._sort_formats(formats) urls = [] thumbnails = [] @@ -121,9 +120,9 @@ class JamendoIE(InfoExtractor): 'title': title, 'description': 
track.get('description'), 'duration': int_or_none(track.get('duration')), - # 'artist': artist_name, + 'artist': artist.get('name'), 'track': track_name, - # 'album': album.get('name'), + 'album': album.get('name'), 'formats': formats, 'license': '-'.join(license) if license else None, 'timestamp': int_or_none(track.get('dateCreated')), @@ -134,7 +133,7 @@ class JamendoIE(InfoExtractor): } -class JamendoAlbumIE(JamendoIE): +class JamendoAlbumIE(JamendoIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P[0-9]+)' _TESTS = [{ 'url': 'https://www.jamendo.com/album/121486/duck-on-cover', @@ -148,22 +147,38 @@ class JamendoAlbumIE(JamendoIE): 'info_dict': { 'id': '1032333', 'ext': 'flac', - 'title': 'Shearer - Warmachine', + 'title': 'Warmachine', 'artist': 'Shearer', 'track': 'Warmachine', 'timestamp': 1368089771, 'upload_date': '20130509', + 'view_count': int, + 'thumbnail': 'https://usercontent.jamendo.com?type=album&id=121486&width=300&trackid=1032333', + 'duration': 190, + 'license': 'by', + 'album': 'Duck On Cover', + 'average_rating': 4, + 'tags': ['rock', 'drums', 'bass', 'world', 'punk', 'neutral'], + 'like_count': int, } }, { 'md5': '1f358d7b2f98edfe90fd55dac0799d50', 'info_dict': { 'id': '1032330', 'ext': 'flac', - 'title': 'Shearer - Without Your Ghost', + 'title': 'Without Your Ghost', 'artist': 'Shearer', 'track': 'Without Your Ghost', 'timestamp': 1368089771, 'upload_date': '20130509', + 'duration': 192, + 'tags': ['rock', 'drums', 'bass', 'world', 'punk'], + 'album': 'Duck On Cover', + 'thumbnail': 'https://usercontent.jamendo.com?type=album&id=121486&width=300&trackid=1032330', + 'view_count': int, + 'average_rating': 4, + 'license': 'by', + 'like_count': int, } }], 'params': { diff --git a/hypervideo_dl/extractor/japandiet.py b/hypervideo_dl/extractor/japandiet.py new file mode 100644 index 0000000..6c65056 --- /dev/null +++ b/hypervideo_dl/extractor/japandiet.py @@ -0,0 +1,274 @@ +import re + +from 
..utils import ( + ExtractorError, + clean_html, + int_or_none, + join_nonempty, + parse_qs, + smuggle_url, + traverse_obj, + try_call, + unsmuggle_url +) +from .common import InfoExtractor + + +def _parse_japanese_date(text): + if not text: + return None + ERA_TABLE = { + '明治': 1868, + '大正': 1912, + '昭和': 1926, + '平成': 1989, + '令和': 2019, + } + ERA_RE = '|'.join(map(re.escape, ERA_TABLE.keys())) + mobj = re.search(rf'({ERA_RE})?(\d+)年(\d+)月(\d+)日', re.sub(r'[\s\u3000]+', '', text)) + if not mobj: + return None + era, year, month, day = mobj.groups() + year, month, day = map(int, (year, month, day)) + if era: + # example input: 令和5年3月34日 + # even though each era have their end, don't check here + year += ERA_TABLE[era] + return '%04d%02d%02d' % (year, month, day) + + +def _parse_japanese_duration(text): + mobj = re.search(r'(?:(\d+)日間?)?(?:(\d+)時間?)?(?:(\d+)分)?(?:(\d+)秒)?', re.sub(r'[\s\u3000]+', '', text or '')) + if not mobj: + return + days, hours, mins, secs = [int_or_none(x, default=0) for x in mobj.groups()] + return secs + mins * 60 + hours * 60 * 60 + days * 24 * 60 * 60 + + +class ShugiinItvBaseIE(InfoExtractor): + _INDEX_ROOMS = None + + @classmethod + def _find_rooms(cls, webpage): + return [{ + '_type': 'url', + 'id': x.group(1), + 'title': clean_html(x.group(2)).strip(), + 'url': smuggle_url(f'https://www.shugiintv.go.jp/jp/index.php?room_id={x.group(1)}', {'g': x.groups()}), + 'ie_key': ShugiinItvLiveIE.ie_key(), + } for x in re.finditer(r'(?s)(.+?)', webpage)] + + def _fetch_rooms(self): + if not self._INDEX_ROOMS: + webpage = self._download_webpage( + 'https://www.shugiintv.go.jp/jp/index.php', None, + encoding='euc-jp', note='Downloading proceedings info') + ShugiinItvBaseIE._INDEX_ROOMS = self._find_rooms(webpage) + return self._INDEX_ROOMS + + +class ShugiinItvLiveIE(ShugiinItvBaseIE): + _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)(?:/index\.php)?$' + IE_DESC = '衆議院インターネット審議中継' + + _TESTS = [{ + 'url': 
'https://www.shugiintv.go.jp/jp/index.php', + 'info_dict': { + '_type': 'playlist', + 'title': 'All proceedings for today', + }, + # expect at least one proceedings is running + 'playlist_mincount': 1, + }] + + @classmethod + def suitable(cls, url): + return super().suitable(url) and not any(x.suitable(url) for x in (ShugiinItvLiveRoomIE, ShugiinItvVodIE)) + + def _real_extract(self, url): + self.to_screen( + 'Downloading all running proceedings. To specify one proceeding, use direct link from the website') + return self.playlist_result(self._fetch_rooms(), playlist_title='All proceedings for today') + + +class ShugiinItvLiveRoomIE(ShugiinItvBaseIE): + _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?room_id=(?Proom\d+)' + IE_DESC = '衆議院インターネット審議中継 (中継)' + + _TESTS = [{ + 'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room01', + 'info_dict': { + 'id': 'room01', + 'title': '内閣委員会', + }, + 'skip': 'this runs for a time and not every day', + }, { + 'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room11', + 'info_dict': { + 'id': 'room11', + 'title': '外務委員会', + }, + 'skip': 'this runs for a time and not every day', + }] + + def _real_extract(self, url): + url, smug = unsmuggle_url(url, default={}) + if smug.get('g'): + room_id, title = smug['g'] + else: + room_id = self._match_id(url) + title = traverse_obj(self._fetch_rooms(), (lambda k, v: v['id'] == room_id, 'title'), get_all=False) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'https://hlslive.shugiintv.go.jp/{room_id}/amlst:{room_id}/playlist.m3u8', + room_id, ext='mp4') + + return { + 'id': room_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': True, + } + + +class ShugiinItvVodIE(ShugiinItvBaseIE): + _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?ex=VL(?:\&[^=]+=[^&]*)*\&deli_id=(?P\d+)' + IE_DESC = '衆議院インターネット審議中継 (ビデオライブラリ)' + _TESTS = [{ + 'url': 
'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id=53846', + 'info_dict': { + 'id': '53846', + 'title': 'ウクライナ大統領国会演説(オンライン)', + 'release_date': '20220323', + 'chapters': 'count:4', + } + }, { + 'url': 'https://www.shugiintv.go.jp/en/index.php?ex=VL&media_type=&deli_id=53846', + 'only_matching': True + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + f'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id={video_id}', video_id, + encoding='euc-jp') + + m3u8_url = self._search_regex( + r'id="vtag_src_base_vod"\s*value="(http.+?\.m3u8)"', webpage, 'm3u8 url') + m3u8_url = re.sub(r'^http://', 'https://', m3u8_url) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + m3u8_url, video_id, ext='mp4') + + title = self._html_search_regex( + (r'(.+)\s*\(\d+分\)', + r'(.+?)\s*\s*(.+?)', + webpage, 'title', fatal=False)) + + chapters = [] + for chp in re.finditer(r'(?i)(?!', webpage): + chapters.append({ + 'title': clean_html(chp.group(2)).strip(), + 'start_time': try_call(lambda: float(parse_qs(chp.group(1))['time'][0].strip())), + }) + # NOTE: there are blanks at the first and the end of the videos, + # so getting/providing the video duration is not possible + # also, the exact end_time for the last chapter is unknown (we can get at most minutes of granularity) + last_tr = re.findall(r'(?s)(.+?)', webpage)[-1] + if last_tr and chapters: + last_td = re.findall(r'', last_tr)[-1] + if last_td: + chapters[-1]['end_time'] = chapters[-1]['start_time'] + _parse_japanese_duration(clean_html(last_td)) + + return { + 'id': video_id, + 'title': title, + 'release_date': release_date, + 'chapters': chapters, + 'formats': formats, + 'subtitles': subtitles, + } + + +class SangiinInstructionIE(InfoExtractor): + _VALID_URL = r'^https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php' + IE_DESC = False # this shouldn't be listed as a supported site + + def _real_extract(self, url): + raise 
ExtractorError('Copy the link from the botton below the video description or player, and use the link to download. If there are no button in the frame, get the URL of the frame showing the video.', expected=True) + + +class SangiinIE(InfoExtractor): + _VALID_URL = r'https?://www\.webtv\.sangiin\.go\.jp/webtv/detail\.php\?sid=(?P\d+)' + IE_DESC = '参議院インターネット審議中継 (archive)' + + _TESTS = [{ + 'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7052', + 'info_dict': { + 'id': '7052', + 'title': '2022年10月7日 本会議', + 'description': 'md5:0a5fed523f95c88105a0b0bf1dd71489', + 'upload_date': '20221007', + 'ext': 'mp4', + }, + }, { + 'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7037', + 'info_dict': { + 'id': '7037', + 'title': '2022年10月3日 開会式', + 'upload_date': '20221003', + 'ext': 'mp4', + }, + }, { + 'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7076', + 'info_dict': { + 'id': '7076', + 'title': '2022年10月27日 法務委員会', + 'upload_date': '20221027', + 'ext': 'mp4', + 'is_live': True, + }, + 'skip': 'this live is turned into archive after it ends', + }, ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + date = self._html_search_regex( + r']*>\s*開会日\s*\s*]*>\s*(.+?)\s*', webpage, + 'date', fatal=False) + upload_date = _parse_japanese_date(date) + + title = self._html_search_regex( + r']*>\s*会議名\s*\s*]*>\s*(.+?)\s*', webpage, + 'date', fatal=False) + + # some videos don't have the elements, so assume it's missing + description = self._html_search_regex( + r'会議の経過\s*

    \s*]*>(.+?)
    ', webpage, + 'description', default=None) + + # this row appears only when it's livestream + is_live = bool(self._html_search_regex( + r']*>\s*公報掲載時刻\s*\s*]*>\s*(.+?)\s*', webpage, + 'is_live', default=None)) + + m3u8_url = self._search_regex( + r'var\s+videopath\s*=\s*(["\'])([^"\']+)\1', webpage, + 'm3u8 url', group=2) + + formats, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4') + + return { + 'id': video_id, + 'title': join_nonempty(date, title, delim=' '), + 'description': description, + 'upload_date': upload_date, + 'formats': formats, + 'subtitles': subs, + 'is_live': is_live, + } diff --git a/hypervideo_dl/extractor/jeuxvideo.py b/hypervideo_dl/extractor/jeuxvideo.py index 77c0f52..56ea15c 100644 --- a/hypervideo_dl/extractor/jeuxvideo.py +++ b/hypervideo_dl/extractor/jeuxvideo.py @@ -1,8 +1,3 @@ -# coding: utf-8 - -from __future__ import unicode_literals - - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/jixie.py b/hypervideo_dl/extractor/jixie.py new file mode 100644 index 0000000..4830e61 --- /dev/null +++ b/hypervideo_dl/extractor/jixie.py @@ -0,0 +1,47 @@ +from .common import InfoExtractor +from ..utils import clean_html, float_or_none, traverse_obj, try_call + + +class JixieBaseIE(InfoExtractor): + """ + API Reference: + https://jixie.atlassian.net/servicedesk/customer/portal/2/article/1339654214?src=-1456335525, + https://scripts.jixie.media/jxvideo.3.1.min.js + """ + + def _extract_data_from_jixie_id(self, display_id, video_id, webpage): + json_data = self._download_json( + 'https://apidam.jixie.io/api/public/stream', display_id, + query={'metadata': 'full', 'video_id': video_id})['data'] + + formats, subtitles = [], {} + for stream in json_data['streams']: + if stream.get('type') == 'HLS': + fmt, sub = self._extract_m3u8_formats_and_subtitles(stream.get('url'), display_id, ext='mp4') + if json_data.get('drm'): + for f in fmt: + f['has_drm'] = True + formats.extend(fmt) + 
self._merge_subtitles(sub, target=subtitles) + else: + formats.append({ + 'url': stream.get('url'), + 'width': stream.get('width'), + 'height': stream.get('height'), + 'ext': 'mp4', + }) + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': json_data.get('title') or self._html_search_meta(['og:title', 'twitter:title'], webpage), + 'description': (clean_html(traverse_obj(json_data, ('metadata', 'description'))) + or self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage)), + 'thumbnails': traverse_obj(json_data, ('metadata', 'thumbnails')), + 'duration': float_or_none(traverse_obj(json_data, ('metadata', 'duration'))), + 'tags': try_call(lambda: (json_data['metadata']['keywords'] or None).split(',')), + 'categories': try_call(lambda: (json_data['metadata']['categories'] or None).split(',')), + 'uploader_id': json_data.get('owner_id'), + } diff --git a/hypervideo_dl/extractor/joj.py b/hypervideo_dl/extractor/joj.py index 7350f53..9b62284 100644 --- a/hypervideo_dl/extractor/joj.py +++ b/hypervideo_dl/extractor/joj.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -21,6 +16,7 @@ class JojIE(InfoExtractor): ) (?P[^/?#^]+) ''' + _EMBED_REGEX = [r']+\bsrc=(["\'])(?P(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1'] _TESTS = [{ 'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932', 'info_dict': { @@ -41,14 +37,6 @@ class JojIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r']+\bsrc=(["\'])(?P(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1', - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) @@ -73,7 +61,7 @@ class JojIE(InfoExtractor): r'(\d+)[pP]\.', format_url, 'height', default=None) 
formats.append({ 'url': format_url, - 'format_id': format_field(height, template='%sp'), + 'format_id': format_field(height, None, '%sp'), 'height': int(height), }) if not formats: @@ -93,7 +81,6 @@ class JojIE(InfoExtractor): r'(\d+)[pP]', format_id or path, 'height', default=None)), }) - self._sort_formats(formats) thumbnail = self._og_search_thumbnail(webpage) diff --git a/hypervideo_dl/extractor/jove.py b/hypervideo_dl/extractor/jove.py index 4b7dfc5..245fe73 100644 --- a/hypervideo_dl/extractor/jove.py +++ b/hypervideo_dl/extractor/jove.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( ExtractorError, diff --git a/hypervideo_dl/extractor/jwplatform.py b/hypervideo_dl/extractor/jwplatform.py index 5aa508b..c949689 100644 --- a/hypervideo_dl/extractor/jwplatform.py +++ b/hypervideo_dl/extractor/jwplatform.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -8,7 +5,7 @@ from ..utils import unsmuggle_url class JWPlatformIE(InfoExtractor): - _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview)s|jw6|v2/media)/|jwplatform:)(?P[a-zA-Z0-9]{8})' + _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|manifest)s|jw6|v2/media)/|jwplatform:)(?P[a-zA-Z0-9]{8})' _TESTS = [{ 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', @@ -25,21 +22,48 @@ class JWPlatformIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - urls = JWPlatformIE._extract_urls(webpage) - return urls[0] if urls else None + _WEBPAGE_TESTS = [{ + # JWPlatform iframe + 'url': 'https://www.covermagazine.co.uk/feature/2465255/business-protection-involved', + 'info_dict': { + 'id': 'AG26UQXM', + 'ext': 'mp4', + 'upload_date': '20160719', + 'timestamp': 1468923808, + 'title': 
'2016_05_18 Cover L&G Business Protection V1 FINAL.mp4', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/AG26UQXM/poster.jpg?width=720', + 'description': '', + 'duration': 294.0, + }, + }, { + # Player url not surrounded by quotes + 'url': 'https://www.deutsche-kinemathek.de/en/online/streaming/darling-berlin', + 'info_dict': { + 'id': 'R10NQdhY', + 'title': 'Playgirl', + 'ext': 'mp4', + 'upload_date': '20220624', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/R10NQdhY/poster.jpg?width=720', + 'timestamp': 1656064800, + 'description': 'BRD 1966, Will Tremper', + 'duration': 5146.0, + }, + 'params': {'allowed_extractors': ['generic', 'jwplatform']}, + }] - @staticmethod - def _extract_urls(webpage): + @classmethod + def _extract_embed_urls(cls, url, webpage): for tag, key in ((r'(?:script|iframe)', 'src'), ('input', 'value')): # is used by hyland.com # if we find