From eaeeef9c1d1bedb76fea953c332ef84d53bffe2c Mon Sep 17 00:00:00 2001
From: Jesús
Date: Fri, 2 Dec 2022 05:21:10 +0800
Subject: update from upstream

---
 hypervideo_dl/YoutubeDL.py | 1613 ++++++----- hypervideo_dl/__init__.py | 233 +- hypervideo_dl/__main__.py | 6 +- hypervideo_dl/aes.py | 101 +- hypervideo_dl/cache.py | 47 +- hypervideo_dl/compat.py | 330 --- hypervideo_dl/compat/__init__.py | 78 + hypervideo_dl/compat/_deprecated.py | 16 + hypervideo_dl/compat/_legacy.py | 97 + hypervideo_dl/compat/compat_utils.py | 70 + hypervideo_dl/compat/functools.py | 26 + hypervideo_dl/compat/imghdr.py | 16 + hypervideo_dl/compat/shutil.py | 30 + hypervideo_dl/cookies.py | 514 ++-- hypervideo_dl/dependencies.py | 97 + hypervideo_dl/downloader/__init__.py | 31 +- hypervideo_dl/downloader/common.py | 333 ++- hypervideo_dl/downloader/dash.py | 23 +- hypervideo_dl/downloader/external.py | 219 +- hypervideo_dl/downloader/f4m.py | 54 +- hypervideo_dl/downloader/fc2.py | 11 +- hypervideo_dl/downloader/fragment.py | 191 +- hypervideo_dl/downloader/hls.py | 66 +- hypervideo_dl/downloader/http.py | 98 +- hypervideo_dl/downloader/ism.py | 64 +- hypervideo_dl/downloader/mhtml.py | 30 +- hypervideo_dl/downloader/niconico.py | 13 +- hypervideo_dl/downloader/rtmp.py | 12 +- hypervideo_dl/downloader/rtsp.py | 9 +- hypervideo_dl/downloader/websocket.py | 19 +- hypervideo_dl/downloader/youtube_live_chat.py | 53 +- hypervideo_dl/extractor/__init__.py | 52 +- hypervideo_dl/extractor/_extractors.py | 2354 +++++++++++++++ hypervideo_dl/extractor/abc.py | 5 - hypervideo_dl/extractor/abcnews.py | 4 - hypervideo_dl/extractor/abcotvs.py | 6 - hypervideo_dl/extractor/abematv.py | 254 +- hypervideo_dl/extractor/academicearth.py | 2 - hypervideo_dl/extractor/acast.py | 4 - hypervideo_dl/extractor/acfun.py | 199 ++ hypervideo_dl/extractor/adn.py | 50 +- hypervideo_dl/extractor/adobeconnect.py | 3 - hypervideo_dl/extractor/adobepass.py | 98 +- hypervideo_dl/extractor/adobetv.py | 5 +- hypervideo_dl/extractor/adultswim.py | 4 - hypervideo_dl/extractor/aenetworks.py | 17 +- hypervideo_dl/extractor/aeonco.py | 40 + hypervideo_dl/extractor/afreecatv.py | 71 +- hypervideo_dl/extractor/agora.py | 251 ++ hypervideo_dl/extractor/airmozilla.py | 3 - hypervideo_dl/extractor/aliexpress.py | 3 - hypervideo_dl/extractor/aljazeera.py | 3 - hypervideo_dl/extractor/allocine.py | 5 - hypervideo_dl/extractor/alphaporno.py | 2 - hypervideo_dl/extractor/alsace20tv.py | 4 - hypervideo_dl/extractor/alura.py | 7 +- hypervideo_dl/extractor/amara.py | 3 - hypervideo_dl/extractor/amazon.py | 31 +- hypervideo_dl/extractor/amazonminitv.py | 290 ++ hypervideo_dl/extractor/amcnetworks.py | 6 +- hypervideo_dl/extractor/americastestkitchen.py | 57 +- hypervideo_dl/extractor/amp.py | 7 +- hypervideo_dl/extractor/angel.py | 56 + hypervideo_dl/extractor/animelab.py | 278 -- hypervideo_dl/extractor/animeondemand.py | 284 -- hypervideo_dl/extractor/ant1newsgr.py | 19 +- hypervideo_dl/extractor/anvato.py | 233 +- .../extractor/anvato_token_generator/__init__.py | 7 - .../extractor/anvato_token_generator/common.py | 6 - .../extractor/anvato_token_generator/nfl.py | 30 - hypervideo_dl/extractor/aol.py | 6 +- hypervideo_dl/extractor/apa.py | 15 +- hypervideo_dl/extractor/aparat.py | 5 +- hypervideo_dl/extractor/appleconnect.py | 3 - hypervideo_dl/extractor/applepodcasts.py | 3 - hypervideo_dl/extractor/appletrailers.py | 5 - hypervideo_dl/extractor/archiveorg.py | 437 ++- hypervideo_dl/extractor/arcpublishing.py | 8 +- hypervideo_dl/extractor/ard.py |
7 - hypervideo_dl/extractor/arkena.py | 17 +- hypervideo_dl/extractor/arnes.py | 6 +- hypervideo_dl/extractor/arte.py | 373 +-- hypervideo_dl/extractor/asiancrush.py | 3 - hypervideo_dl/extractor/atresplayer.py | 5 - hypervideo_dl/extractor/atscaleconf.py | 34 + hypervideo_dl/extractor/atttechchannel.py | 2 - hypervideo_dl/extractor/atvat.py | 4 - hypervideo_dl/extractor/audimedia.py | 4 - hypervideo_dl/extractor/audioboom.py | 76 +- hypervideo_dl/extractor/audiodraft.py | 93 + hypervideo_dl/extractor/audiomack.py | 3 - hypervideo_dl/extractor/audius.py | 11 +- hypervideo_dl/extractor/awaan.py | 5 +- hypervideo_dl/extractor/aws.py | 5 +- hypervideo_dl/extractor/azmedien.py | 3 - hypervideo_dl/extractor/baidu.py | 4 - hypervideo_dl/extractor/banbye.py | 5 - hypervideo_dl/extractor/bandaichannel.py | 7 +- hypervideo_dl/extractor/bandcamp.py | 23 +- hypervideo_dl/extractor/bannedvideo.py | 3 - hypervideo_dl/extractor/bbc.py | 56 +- hypervideo_dl/extractor/beatport.py | 4 - hypervideo_dl/extractor/beeg.py | 4 - hypervideo_dl/extractor/behindkink.py | 4 - hypervideo_dl/extractor/bellmedia.py | 14 +- hypervideo_dl/extractor/berufetv.py | 70 + hypervideo_dl/extractor/bet.py | 2 - hypervideo_dl/extractor/bfi.py | 3 - hypervideo_dl/extractor/bfmtv.py | 5 +- hypervideo_dl/extractor/bibeltv.py | 3 - hypervideo_dl/extractor/bigflix.py | 5 - hypervideo_dl/extractor/bigo.py | 15 +- hypervideo_dl/extractor/bild.py | 3 - hypervideo_dl/extractor/bilibili.py | 1007 ++++--- hypervideo_dl/extractor/biobiochiletv.py | 3 - hypervideo_dl/extractor/biqle.py | 4 - hypervideo_dl/extractor/bitchute.py | 279 +- hypervideo_dl/extractor/bitwave.py | 3 - hypervideo_dl/extractor/blackboardcollaborate.py | 4 - hypervideo_dl/extractor/bleacherreport.py | 3 - hypervideo_dl/extractor/blinkx.py | 86 - hypervideo_dl/extractor/blogger.py | 11 +- hypervideo_dl/extractor/bloomberg.py | 14 +- hypervideo_dl/extractor/bokecc.py | 6 - hypervideo_dl/extractor/bongacams.py | 21 +- hypervideo_dl/extractor/booyah.py | 86 + hypervideo_dl/extractor/bostonglobe.py | 3 - hypervideo_dl/extractor/box.py | 5 - hypervideo_dl/extractor/bpb.py | 6 - hypervideo_dl/extractor/br.py | 5 - hypervideo_dl/extractor/bravotv.py | 3 - hypervideo_dl/extractor/breakcom.py | 4 - hypervideo_dl/extractor/breitbart.py | 6 +- hypervideo_dl/extractor/brightcove.py | 530 +++- hypervideo_dl/extractor/bundesliga.py | 34 + hypervideo_dl/extractor/businessinsider.py | 3 - hypervideo_dl/extractor/buzzfeed.py | 5 +- hypervideo_dl/extractor/byutv.py | 4 - hypervideo_dl/extractor/c56.py | 5 - hypervideo_dl/extractor/cableav.py | 2 - hypervideo_dl/extractor/callin.py | 6 +- hypervideo_dl/extractor/caltrans.py | 4 - hypervideo_dl/extractor/cam4.py | 4 - hypervideo_dl/extractor/camdemy.py | 3 - hypervideo_dl/extractor/cammodels.py | 4 - hypervideo_dl/extractor/camsoda.py | 57 + hypervideo_dl/extractor/camtasia.py | 71 + hypervideo_dl/extractor/camtube.py | 71 - hypervideo_dl/extractor/camwithher.py | 2 - hypervideo_dl/extractor/canalalpha.py | 4 - hypervideo_dl/extractor/canalc2.py | 5 - hypervideo_dl/extractor/canalplus.py | 5 - hypervideo_dl/extractor/canvas.py | 2 - hypervideo_dl/extractor/carambatv.py | 4 - hypervideo_dl/extractor/cartoonnetwork.py | 3 - hypervideo_dl/extractor/cbc.py | 9 +- hypervideo_dl/extractor/cbs.py | 5 +- hypervideo_dl/extractor/cbsinteractive.py | 6 +- hypervideo_dl/extractor/cbslocal.py | 7 +- hypervideo_dl/extractor/cbsnews.py | 8 +- hypervideo_dl/extractor/cbssports.py | 4 - hypervideo_dl/extractor/ccc.py | 5 +- hypervideo_dl/extractor/ccma.py | 
4 - hypervideo_dl/extractor/cctv.py | 5 - hypervideo_dl/extractor/cda.py | 97 +- hypervideo_dl/extractor/cellebrite.py | 63 + hypervideo_dl/extractor/ceskatelevize.py | 77 +- hypervideo_dl/extractor/cgtn.py | 3 - hypervideo_dl/extractor/channel9.py | 10 +- hypervideo_dl/extractor/charlierose.py | 4 - hypervideo_dl/extractor/chaturbate.py | 3 - hypervideo_dl/extractor/chilloutzone.py | 2 - hypervideo_dl/extractor/chingari.py | 18 +- hypervideo_dl/extractor/chirbit.py | 3 - hypervideo_dl/extractor/cinchcast.py | 6 +- hypervideo_dl/extractor/cinemax.py | 4 - hypervideo_dl/extractor/cinetecamilano.py | 61 + hypervideo_dl/extractor/ciscolive.py | 3 - hypervideo_dl/extractor/ciscowebex.py | 4 - hypervideo_dl/extractor/cjsw.py | 4 - hypervideo_dl/extractor/cliphunter.py | 3 - hypervideo_dl/extractor/clippit.py | 4 - hypervideo_dl/extractor/cliprs.py | 3 - hypervideo_dl/extractor/clipsyndicate.py | 2 - hypervideo_dl/extractor/closertotruth.py | 3 - hypervideo_dl/extractor/cloudflarestream.py | 16 +- hypervideo_dl/extractor/cloudy.py | 3 - hypervideo_dl/extractor/clubic.py | 4 - hypervideo_dl/extractor/clyp.py | 3 - hypervideo_dl/extractor/cmt.py | 4 +- hypervideo_dl/extractor/cnbc.py | 4 - hypervideo_dl/extractor/cnn.py | 60 +- hypervideo_dl/extractor/comedycentral.py | 2 - hypervideo_dl/extractor/common.py | 1459 +++++----- hypervideo_dl/extractor/commonmistakes.py | 12 +- hypervideo_dl/extractor/commonprotocols.py | 8 +- hypervideo_dl/extractor/condenast.py | 9 +- hypervideo_dl/extractor/contv.py | 5 - hypervideo_dl/extractor/corus.py | 7 +- hypervideo_dl/extractor/coub.py | 5 - hypervideo_dl/extractor/cozytv.py | 3 - hypervideo_dl/extractor/cpac.py | 12 - hypervideo_dl/extractor/cracked.py | 2 - hypervideo_dl/extractor/crackle.py | 4 - hypervideo_dl/extractor/craftsy.py | 3 - hypervideo_dl/extractor/crooksandliars.py | 5 +- hypervideo_dl/extractor/crowdbunker.py | 4 - hypervideo_dl/extractor/crunchyroll.py | 921 +----- hypervideo_dl/extractor/cspan.py | 8 +- hypervideo_dl/extractor/ctsnews.py | 3 - hypervideo_dl/extractor/ctv.py | 3 - hypervideo_dl/extractor/ctvnews.py | 3 - hypervideo_dl/extractor/cultureunplugged.py | 2 - hypervideo_dl/extractor/curiositystream.py | 19 +- hypervideo_dl/extractor/cwtv.py | 4 +- hypervideo_dl/extractor/cybrary.py | 4 +- hypervideo_dl/extractor/daftsex.py | 5 - hypervideo_dl/extractor/dailymail.py | 13 +- hypervideo_dl/extractor/dailymotion.py | 36 +- hypervideo_dl/extractor/dailywire.py | 113 + hypervideo_dl/extractor/damtomo.py | 4 - hypervideo_dl/extractor/daum.py | 6 +- hypervideo_dl/extractor/daystar.py | 1 - hypervideo_dl/extractor/dbtv.py | 12 +- hypervideo_dl/extractor/dctp.py | 3 - hypervideo_dl/extractor/deezer.py | 4 - hypervideo_dl/extractor/defense.py | 2 - hypervideo_dl/extractor/democracynow.py | 5 - hypervideo_dl/extractor/detik.py | 159 ++ hypervideo_dl/extractor/deuxm.py | 76 + hypervideo_dl/extractor/dfb.py | 4 - hypervideo_dl/extractor/dhm.py | 2 - hypervideo_dl/extractor/digg.py | 2 - hypervideo_dl/extractor/digitalconcerthall.py | 6 +- hypervideo_dl/extractor/digiteka.py | 16 +- hypervideo_dl/extractor/discovery.py | 2 - hypervideo_dl/extractor/discoverygo.py | 3 - hypervideo_dl/extractor/discoverynetworks.py | 42 - hypervideo_dl/extractor/discoveryplusindia.py | 98 - hypervideo_dl/extractor/discoveryvr.py | 59 - hypervideo_dl/extractor/disney.py | 4 - hypervideo_dl/extractor/dispeak.py | 3 - hypervideo_dl/extractor/dlive.py | 4 - hypervideo_dl/extractor/doodstream.py | 76 - hypervideo_dl/extractor/dotsub.py | 2 - 
hypervideo_dl/extractor/douyutv.py | 3 - hypervideo_dl/extractor/dplay.py | 83 +- hypervideo_dl/extractor/drbonanza.py | 4 - hypervideo_dl/extractor/dreisat.py | 4 +- hypervideo_dl/extractor/drooble.py | 3 - hypervideo_dl/extractor/dropbox.py | 8 +- hypervideo_dl/extractor/dropout.py | 34 +- hypervideo_dl/extractor/drtuber.py | 10 +- hypervideo_dl/extractor/drtv.py | 51 +- hypervideo_dl/extractor/dtube.py | 3 - hypervideo_dl/extractor/duboku.py | 53 +- hypervideo_dl/extractor/dumpert.py | 4 - hypervideo_dl/extractor/dvtv.py | 4 - hypervideo_dl/extractor/dw.py | 4 - hypervideo_dl/extractor/eagleplatform.py | 39 +- hypervideo_dl/extractor/ebaumsworld.py | 2 - hypervideo_dl/extractor/echomsk.py | 3 - hypervideo_dl/extractor/egghead.py | 4 - hypervideo_dl/extractor/ehow.py | 2 - hypervideo_dl/extractor/eighttracks.py | 3 - hypervideo_dl/extractor/einthusan.py | 5 - hypervideo_dl/extractor/eitb.py | 5 - hypervideo_dl/extractor/ellentube.py | 4 - hypervideo_dl/extractor/elonet.py | 4 - hypervideo_dl/extractor/elpais.py | 3 - hypervideo_dl/extractor/embedly.py | 14 +- hypervideo_dl/extractor/engadget.py | 2 - hypervideo_dl/extractor/epicon.py | 4 - hypervideo_dl/extractor/epoch.py | 55 + hypervideo_dl/extractor/eporner.py | 5 - hypervideo_dl/extractor/eroprofile.py | 2 - hypervideo_dl/extractor/ertgr.py | 26 +- hypervideo_dl/extractor/escapist.py | 3 - hypervideo_dl/extractor/espn.py | 165 +- hypervideo_dl/extractor/esri.py | 4 - hypervideo_dl/extractor/europa.py | 4 - hypervideo_dl/extractor/europeantour.py | 3 - hypervideo_dl/extractor/eurosport.py | 97 + hypervideo_dl/extractor/euscreen.py | 4 - hypervideo_dl/extractor/everyonesmixtape.py | 76 - hypervideo_dl/extractor/expotv.py | 3 - hypervideo_dl/extractor/expressen.py | 21 +- hypervideo_dl/extractor/extractors.py | 2162 +------------- hypervideo_dl/extractor/extremetube.py | 4 +- hypervideo_dl/extractor/eyedotv.py | 3 - hypervideo_dl/extractor/facebook.py | 79 +- hypervideo_dl/extractor/fancode.py | 5 +- hypervideo_dl/extractor/faz.py | 4 - hypervideo_dl/extractor/fc2.py | 26 +- hypervideo_dl/extractor/fczenit.py | 5 - hypervideo_dl/extractor/fifa.py | 94 + hypervideo_dl/extractor/filmmodu.py | 5 - hypervideo_dl/extractor/filmon.py | 5 - hypervideo_dl/extractor/filmweb.py | 3 - hypervideo_dl/extractor/firsttv.py | 4 - hypervideo_dl/extractor/fivemin.py | 54 - hypervideo_dl/extractor/fivetv.py | 6 +- hypervideo_dl/extractor/flickr.py | 5 +- hypervideo_dl/extractor/folketinget.py | 4 - hypervideo_dl/extractor/footyroom.py | 3 - hypervideo_dl/extractor/formula1.py | 3 - hypervideo_dl/extractor/fourtube.py | 3 - hypervideo_dl/extractor/fourzerostudio.py | 106 + hypervideo_dl/extractor/fox.py | 10 +- hypervideo_dl/extractor/fox9.py | 3 - hypervideo_dl/extractor/foxgay.py | 6 +- hypervideo_dl/extractor/foxnews.py | 43 +- hypervideo_dl/extractor/foxsports.py | 2 - hypervideo_dl/extractor/fptplay.py | 41 +- hypervideo_dl/extractor/franceculture.py | 128 - hypervideo_dl/extractor/franceinter.py | 3 - hypervideo_dl/extractor/francetv.py | 10 +- hypervideo_dl/extractor/freesound.py | 3 - hypervideo_dl/extractor/freespeech.py | 2 - hypervideo_dl/extractor/freetv.py | 139 + hypervideo_dl/extractor/freshlive.py | 83 - hypervideo_dl/extractor/frontendmasters.py | 4 - hypervideo_dl/extractor/fujitv.py | 12 +- hypervideo_dl/extractor/funimation.py | 14 +- hypervideo_dl/extractor/funk.py | 4 - hypervideo_dl/extractor/fusion.py | 3 - hypervideo_dl/extractor/fuyintv.py | 30 + hypervideo_dl/extractor/fxnetworks.py | 77 - hypervideo_dl/extractor/gab.py | 6 - 
hypervideo_dl/extractor/gaia.py | 5 - hypervideo_dl/extractor/gameinformer.py | 3 - hypervideo_dl/extractor/gamejolt.py | 1 - hypervideo_dl/extractor/gamespot.py | 4 - hypervideo_dl/extractor/gamestar.py | 4 - hypervideo_dl/extractor/gaskrank.py | 4 - hypervideo_dl/extractor/gazeta.py | 4 - hypervideo_dl/extractor/gdcvault.py | 2 - hypervideo_dl/extractor/gedidigital.py | 36 +- hypervideo_dl/extractor/generic.py | 1922 ++----------- hypervideo_dl/extractor/genericembeds.py | 114 + hypervideo_dl/extractor/genius.py | 127 + hypervideo_dl/extractor/gettr.py | 7 - hypervideo_dl/extractor/gfycat.py | 17 +- hypervideo_dl/extractor/giantbomb.py | 4 - hypervideo_dl/extractor/giga.py | 13 +- hypervideo_dl/extractor/gigya.py | 2 - hypervideo_dl/extractor/glide.py | 5 +- hypervideo_dl/extractor/globo.py | 24 +- hypervideo_dl/extractor/glomex.py | 16 +- hypervideo_dl/extractor/go.py | 59 +- hypervideo_dl/extractor/godtube.py | 3 - hypervideo_dl/extractor/gofile.py | 53 +- hypervideo_dl/extractor/golem.py | 4 - hypervideo_dl/extractor/goodgame.py | 57 + hypervideo_dl/extractor/googledrive.py | 68 +- hypervideo_dl/extractor/googlepodcasts.py | 3 - hypervideo_dl/extractor/googlesearch.py | 2 - hypervideo_dl/extractor/goplay.py | 394 +++ hypervideo_dl/extractor/gopro.py | 5 - hypervideo_dl/extractor/goshgay.py | 3 - hypervideo_dl/extractor/gotostage.py | 3 - hypervideo_dl/extractor/gputechconf.py | 3 - hypervideo_dl/extractor/gronkh.py | 76 +- hypervideo_dl/extractor/groupon.py | 2 - hypervideo_dl/extractor/harpodeon.py | 70 + hypervideo_dl/extractor/hbo.py | 4 - hypervideo_dl/extractor/hearthisat.py | 5 - hypervideo_dl/extractor/heise.py | 73 +- hypervideo_dl/extractor/hellporno.py | 3 - hypervideo_dl/extractor/helsinki.py | 5 - hypervideo_dl/extractor/hentaistigma.py | 2 - hypervideo_dl/extractor/hgtv.py | 3 - hypervideo_dl/extractor/hidive.py | 6 +- hypervideo_dl/extractor/historicfilms.py | 2 - hypervideo_dl/extractor/hitbox.py | 13 +- hypervideo_dl/extractor/hitrecord.py | 2 - hypervideo_dl/extractor/hketv.py | 4 - hypervideo_dl/extractor/holodex.py | 100 + hypervideo_dl/extractor/hornbunny.py | 49 - hypervideo_dl/extractor/hotnewhiphop.py | 2 - hypervideo_dl/extractor/hotstar.py | 291 +- hypervideo_dl/extractor/howcast.py | 2 - hypervideo_dl/extractor/howstuffworks.py | 4 - hypervideo_dl/extractor/hrfensehen.py | 58 +- hypervideo_dl/extractor/hrti.py | 4 - hypervideo_dl/extractor/hse.py | 2 - hypervideo_dl/extractor/huajiao.py | 3 - hypervideo_dl/extractor/huffpost.py | 5 +- hypervideo_dl/extractor/hungama.py | 48 +- hypervideo_dl/extractor/huya.py | 14 +- hypervideo_dl/extractor/hypem.py | 2 - hypervideo_dl/extractor/hytale.py | 58 + hypervideo_dl/extractor/icareus.py | 179 ++ hypervideo_dl/extractor/ichinanalive.py | 7 - hypervideo_dl/extractor/ign.py | 4 - hypervideo_dl/extractor/iheart.py | 3 - hypervideo_dl/extractor/iltalehti.py | 51 + hypervideo_dl/extractor/imdb.py | 3 - hypervideo_dl/extractor/imggaming.py | 4 - hypervideo_dl/extractor/imgur.py | 6 +- hypervideo_dl/extractor/ina.py | 110 +- hypervideo_dl/extractor/inc.py | 2 - hypervideo_dl/extractor/indavideo.py | 28 +- hypervideo_dl/extractor/infoq.py | 15 +- hypervideo_dl/extractor/instagram.py | 359 ++- hypervideo_dl/extractor/internazionale.py | 4 - hypervideo_dl/extractor/internetvideoarchive.py | 3 - hypervideo_dl/extractor/iprima.py | 8 +- hypervideo_dl/extractor/iqiyi.py | 18 +- hypervideo_dl/extractor/ir90tv.py | 42 - hypervideo_dl/extractor/islamchannel.py | 81 + hypervideo_dl/extractor/israelnationalnews.py | 50 + 
hypervideo_dl/extractor/itprotv.py | 2 - hypervideo_dl/extractor/itv.py | 4 - hypervideo_dl/extractor/ivi.py | 5 +- hypervideo_dl/extractor/ivideon.py | 5 - hypervideo_dl/extractor/iwara.py | 137 +- hypervideo_dl/extractor/ixigua.py | 83 + hypervideo_dl/extractor/izlesene.py | 4 - hypervideo_dl/extractor/jable.py | 103 + hypervideo_dl/extractor/jamendo.py | 41 +- hypervideo_dl/extractor/japandiet.py | 274 ++ hypervideo_dl/extractor/jeuxvideo.py | 5 - hypervideo_dl/extractor/jixie.py | 47 + hypervideo_dl/extractor/joj.py | 17 +- hypervideo_dl/extractor/jove.py | 3 - hypervideo_dl/extractor/jwplatform.py | 46 +- hypervideo_dl/extractor/kakao.py | 6 +- hypervideo_dl/extractor/kaltura.py | 265 +- hypervideo_dl/extractor/kanal2.py | 66 + hypervideo_dl/extractor/kanalplay.py | 96 - hypervideo_dl/extractor/kankan.py | 48 - hypervideo_dl/extractor/karaoketv.py | 3 - hypervideo_dl/extractor/karrierevideos.py | 3 - hypervideo_dl/extractor/keezmovies.py | 11 +- hypervideo_dl/extractor/kelbyone.py | 4 - hypervideo_dl/extractor/ketnet.py | 2 - hypervideo_dl/extractor/khanacademy.py | 19 +- hypervideo_dl/extractor/kicker.py | 55 + hypervideo_dl/extractor/kickstarter.py | 3 - hypervideo_dl/extractor/kinja.py | 17 +- hypervideo_dl/extractor/kinopoisk.py | 4 - hypervideo_dl/extractor/kompas.py | 26 + hypervideo_dl/extractor/konserthusetplay.py | 5 - hypervideo_dl/extractor/koo.py | 3 - hypervideo_dl/extractor/krasview.py | 3 - hypervideo_dl/extractor/kth.py | 28 + hypervideo_dl/extractor/ku6.py | 2 - hypervideo_dl/extractor/kusi.py | 10 +- hypervideo_dl/extractor/kuwo.py | 6 - hypervideo_dl/extractor/la7.py | 8 +- hypervideo_dl/extractor/laola1tv.py | 6 +- hypervideo_dl/extractor/lastfm.py | 5 +- hypervideo_dl/extractor/lbry.py | 86 +- hypervideo_dl/extractor/lci.py | 32 +- hypervideo_dl/extractor/lcp.py | 5 +- hypervideo_dl/extractor/lecture2go.py | 5 - hypervideo_dl/extractor/lecturio.py | 4 - hypervideo_dl/extractor/leeco.py | 6 +- hypervideo_dl/extractor/lego.py | 4 - hypervideo_dl/extractor/lemonde.py | 2 - hypervideo_dl/extractor/lenta.py | 3 - hypervideo_dl/extractor/libraryofcongress.py | 5 - hypervideo_dl/extractor/libsyn.py | 5 +- hypervideo_dl/extractor/lifenews.py | 5 - hypervideo_dl/extractor/likee.py | 192 ++ hypervideo_dl/extractor/limelight.py | 9 +- hypervideo_dl/extractor/line.py | 7 +- hypervideo_dl/extractor/linkedin.py | 13 +- hypervideo_dl/extractor/linuxacademy.py | 3 - hypervideo_dl/extractor/liputan6.py | 64 + hypervideo_dl/extractor/listennotes.py | 86 + hypervideo_dl/extractor/litv.py | 3 - hypervideo_dl/extractor/livejournal.py | 3 - hypervideo_dl/extractor/liveleak.py | 191 -- hypervideo_dl/extractor/livestream.py | 7 +- hypervideo_dl/extractor/livestreamfails.py | 37 + hypervideo_dl/extractor/lnkgo.py | 8 +- hypervideo_dl/extractor/localnews8.py | 4 - hypervideo_dl/extractor/lovehomeporn.py | 3 - hypervideo_dl/extractor/lrt.py | 58 +- hypervideo_dl/extractor/lynda.py | 4 - hypervideo_dl/extractor/m6.py | 3 - hypervideo_dl/extractor/magentamusik360.py | 3 - hypervideo_dl/extractor/mailru.py | 4 - hypervideo_dl/extractor/mainstreaming.py | 11 +- hypervideo_dl/extractor/malltv.py | 37 +- hypervideo_dl/extractor/mangomolo.py | 31 +- hypervideo_dl/extractor/manoto.py | 5 - hypervideo_dl/extractor/manyvids.py | 122 +- hypervideo_dl/extractor/maoritv.py | 3 - hypervideo_dl/extractor/markiza.py | 3 - hypervideo_dl/extractor/massengeschmacktv.py | 4 - hypervideo_dl/extractor/masters.py | 38 + hypervideo_dl/extractor/matchtv.py | 4 - hypervideo_dl/extractor/mdr.py | 5 - 
hypervideo_dl/extractor/medaltv.py | 77 +- hypervideo_dl/extractor/mediaite.py | 3 - hypervideo_dl/extractor/mediaklikk.py | 4 - hypervideo_dl/extractor/medialaan.py | 7 +- hypervideo_dl/extractor/mediaset.py | 43 +- hypervideo_dl/extractor/mediasite.py | 19 +- hypervideo_dl/extractor/mediaworksnz.py | 103 + hypervideo_dl/extractor/medici.py | 3 - hypervideo_dl/extractor/megaphone.py | 11 +- hypervideo_dl/extractor/megatvcom.py | 11 +- hypervideo_dl/extractor/meipai.py | 7 +- hypervideo_dl/extractor/melonvod.py | 4 - hypervideo_dl/extractor/meta.py | 3 - hypervideo_dl/extractor/metacafe.py | 16 +- hypervideo_dl/extractor/metacritic.py | 3 - hypervideo_dl/extractor/mgoon.py | 5 - hypervideo_dl/extractor/mgtv.py | 11 +- hypervideo_dl/extractor/miaopai.py | 3 - hypervideo_dl/extractor/microsoftembed.py | 65 + hypervideo_dl/extractor/microsoftstream.py | 4 - hypervideo_dl/extractor/microsoftvirtualacademy.py | 12 +- hypervideo_dl/extractor/mildom.py | 11 +- hypervideo_dl/extractor/minds.py | 8 +- hypervideo_dl/extractor/ministrygrid.py | 2 - hypervideo_dl/extractor/minoto.py | 5 - hypervideo_dl/extractor/miomio.py | 3 - hypervideo_dl/extractor/mirrativ.py | 3 - hypervideo_dl/extractor/mirrorcouk.py | 98 + hypervideo_dl/extractor/mit.py | 2 - hypervideo_dl/extractor/mitele.py | 5 +- hypervideo_dl/extractor/mixch.py | 2 - hypervideo_dl/extractor/mixcloud.py | 11 +- hypervideo_dl/extractor/mlb.py | 120 +- hypervideo_dl/extractor/mlssoccer.py | 3 - hypervideo_dl/extractor/mnet.py | 4 - hypervideo_dl/extractor/mocha.py | 64 + hypervideo_dl/extractor/moevideo.py | 4 - hypervideo_dl/extractor/mofosex.py | 13 +- hypervideo_dl/extractor/mojvideo.py | 4 - hypervideo_dl/extractor/morningstar.py | 4 - hypervideo_dl/extractor/motherless.py | 31 +- hypervideo_dl/extractor/motorsport.py | 12 +- hypervideo_dl/extractor/movieclips.py | 3 - hypervideo_dl/extractor/moviepilot.py | 112 + hypervideo_dl/extractor/moview.py | 43 + hypervideo_dl/extractor/moviezine.py | 6 - hypervideo_dl/extractor/movingimage.py | 2 - hypervideo_dl/extractor/msn.py | 4 - hypervideo_dl/extractor/mtv.py | 26 +- hypervideo_dl/extractor/muenchentv.py | 4 - hypervideo_dl/extractor/murrtube.py | 5 +- hypervideo_dl/extractor/musescore.py | 3 - hypervideo_dl/extractor/musicdex.py | 5 +- hypervideo_dl/extractor/mwave.py | 3 - hypervideo_dl/extractor/mxplayer.py | 151 +- hypervideo_dl/extractor/mychannels.py | 4 - hypervideo_dl/extractor/myspace.py | 5 - hypervideo_dl/extractor/myspass.py | 3 - hypervideo_dl/extractor/myvi.py | 13 +- hypervideo_dl/extractor/myvideoge.py | 3 - hypervideo_dl/extractor/myvidster.py | 2 - hypervideo_dl/extractor/n1.py | 5 - hypervideo_dl/extractor/nate.py | 4 - hypervideo_dl/extractor/nationalgeographic.py | 4 +- hypervideo_dl/extractor/naver.py | 172 +- hypervideo_dl/extractor/nba.py | 4 - hypervideo_dl/extractor/nbc.py | 191 +- hypervideo_dl/extractor/ndr.py | 252 +- hypervideo_dl/extractor/ndtv.py | 15 +- hypervideo_dl/extractor/nebula.py | 111 +- hypervideo_dl/extractor/nerdcubed.py | 3 - hypervideo_dl/extractor/neteasemusic.py | 176 +- hypervideo_dl/extractor/netverse.py | 176 ++ hypervideo_dl/extractor/netzkino.py | 5 - hypervideo_dl/extractor/newgrounds.py | 4 - hypervideo_dl/extractor/newspicks.py | 53 + hypervideo_dl/extractor/newstube.py | 4 - hypervideo_dl/extractor/newsy.py | 4 - hypervideo_dl/extractor/nextmedia.py | 7 +- hypervideo_dl/extractor/nexx.py | 25 +- hypervideo_dl/extractor/nfb.py | 4 - hypervideo_dl/extractor/nfhsnetwork.py | 7 +- hypervideo_dl/extractor/nfl.py | 15 +- 
hypervideo_dl/extractor/nhk.py | 27 +- hypervideo_dl/extractor/nhl.py | 4 - hypervideo_dl/extractor/nick.py | 6 +- hypervideo_dl/extractor/niconico.py | 56 +- hypervideo_dl/extractor/ninecninemedia.py | 4 - hypervideo_dl/extractor/ninegag.py | 48 +- hypervideo_dl/extractor/ninenow.py | 3 - hypervideo_dl/extractor/nintendo.py | 3 - hypervideo_dl/extractor/nitter.py | 3 - hypervideo_dl/extractor/njpwworld.py | 5 - hypervideo_dl/extractor/nobelprize.py | 4 - hypervideo_dl/extractor/noco.py | 228 -- hypervideo_dl/extractor/nonktube.py | 2 - hypervideo_dl/extractor/noodlemagazine.py | 5 - hypervideo_dl/extractor/noovo.py | 3 - hypervideo_dl/extractor/normalboots.py | 3 - hypervideo_dl/extractor/nosnl.py | 95 + hypervideo_dl/extractor/nosvideo.py | 3 - hypervideo_dl/extractor/nova.py | 5 - hypervideo_dl/extractor/novaplay.py | 54 +- hypervideo_dl/extractor/nowness.py | 3 - hypervideo_dl/extractor/noz.py | 11 +- hypervideo_dl/extractor/npo.py | 10 +- hypervideo_dl/extractor/npr.py | 23 +- hypervideo_dl/extractor/nrk.py | 20 +- hypervideo_dl/extractor/nrl.py | 3 - hypervideo_dl/extractor/ntvcojp.py | 3 - hypervideo_dl/extractor/ntvde.py | 4 - hypervideo_dl/extractor/ntvru.py | 4 - hypervideo_dl/extractor/nuevo.py | 3 - hypervideo_dl/extractor/nuvid.py | 3 - hypervideo_dl/extractor/nytimes.py | 5 +- hypervideo_dl/extractor/nzherald.py | 49 +- hypervideo_dl/extractor/nzz.py | 3 - hypervideo_dl/extractor/odatv.py | 3 - hypervideo_dl/extractor/odnoklassniki.py | 107 +- hypervideo_dl/extractor/oftv.py | 54 + hypervideo_dl/extractor/oktoberfesttv.py | 3 - hypervideo_dl/extractor/olympics.py | 6 +- hypervideo_dl/extractor/on24.py | 4 - hypervideo_dl/extractor/once.py | 5 +- hypervideo_dl/extractor/ondemandkorea.py | 25 +- hypervideo_dl/extractor/onefootball.py | 4 - hypervideo_dl/extractor/onenewsnz.py | 111 + hypervideo_dl/extractor/onet.py | 4 - hypervideo_dl/extractor/onionstudios.py | 13 +- hypervideo_dl/extractor/ooyala.py | 27 +- hypervideo_dl/extractor/opencast.py | 5 - hypervideo_dl/extractor/openload.py | 110 +- hypervideo_dl/extractor/openrec.py | 10 +- hypervideo_dl/extractor/ora.py | 4 - hypervideo_dl/extractor/orf.py | 287 +- hypervideo_dl/extractor/outsidetv.py | 3 - hypervideo_dl/extractor/packtpub.py | 2 - hypervideo_dl/extractor/palcomp3.py | 4 - hypervideo_dl/extractor/pandoratv.py | 5 - hypervideo_dl/extractor/panopto.py | 9 +- hypervideo_dl/extractor/paramountplus.py | 69 +- hypervideo_dl/extractor/parler.py | 111 + hypervideo_dl/extractor/parliamentliveuk.py | 80 - hypervideo_dl/extractor/parlview.py | 4 - hypervideo_dl/extractor/patreon.py | 368 ++- hypervideo_dl/extractor/pbs.py | 4 - hypervideo_dl/extractor/pearvideo.py | 13 +- hypervideo_dl/extractor/peekvids.py | 6 +- hypervideo_dl/extractor/peertube.py | 24 +- hypervideo_dl/extractor/peertv.py | 5 - hypervideo_dl/extractor/peloton.py | 16 +- hypervideo_dl/extractor/people.py | 3 - hypervideo_dl/extractor/performgroup.py | 5 - hypervideo_dl/extractor/periscope.py | 14 +- hypervideo_dl/extractor/philharmoniedeparis.py | 43 +- hypervideo_dl/extractor/phoenix.py | 3 - hypervideo_dl/extractor/photobucket.py | 2 - hypervideo_dl/extractor/piapro.py | 17 +- hypervideo_dl/extractor/picarto.py | 5 - hypervideo_dl/extractor/piksel.py | 15 +- hypervideo_dl/extractor/pinkbike.py | 4 - hypervideo_dl/extractor/pinterest.py | 4 - hypervideo_dl/extractor/pixivsketch.py | 4 - hypervideo_dl/extractor/pladform.py | 15 +- hypervideo_dl/extractor/planetmarathi.py | 4 - hypervideo_dl/extractor/platzi.py | 4 - hypervideo_dl/extractor/playfm.py | 4 - 
hypervideo_dl/extractor/playplustv.py | 4 - hypervideo_dl/extractor/plays.py | 4 - hypervideo_dl/extractor/playstuff.py | 2 - hypervideo_dl/extractor/playsuisse.py | 147 + hypervideo_dl/extractor/playtvak.py | 4 - hypervideo_dl/extractor/playvid.py | 16 +- hypervideo_dl/extractor/playwire.py | 6 +- hypervideo_dl/extractor/pluralsight.py | 4 - hypervideo_dl/extractor/plutotv.py | 4 - hypervideo_dl/extractor/podbayfm.py | 75 + hypervideo_dl/extractor/podchaser.py | 97 + hypervideo_dl/extractor/podomatic.py | 2 - hypervideo_dl/extractor/pokemon.py | 44 - hypervideo_dl/extractor/pokergo.py | 3 - hypervideo_dl/extractor/polsatgo.py | 4 - hypervideo_dl/extractor/polskieradio.py | 5 - hypervideo_dl/extractor/popcorntimes.py | 11 +- hypervideo_dl/extractor/popcorntv.py | 3 - hypervideo_dl/extractor/porn91.py | 3 - hypervideo_dl/extractor/porncom.py | 4 - hypervideo_dl/extractor/pornez.py | 2 - hypervideo_dl/extractor/pornflip.py | 4 - hypervideo_dl/extractor/pornhd.py | 4 - hypervideo_dl/extractor/pornhub.py | 47 +- hypervideo_dl/extractor/pornotube.py | 2 - hypervideo_dl/extractor/pornovoisines.py | 5 - hypervideo_dl/extractor/pornoxo.py | 3 - hypervideo_dl/extractor/prankcast.py | 66 + hypervideo_dl/extractor/premiershiprugby.py | 39 + hypervideo_dl/extractor/presstv.py | 4 - hypervideo_dl/extractor/projectveritas.py | 4 - hypervideo_dl/extractor/prosiebensat1.py | 4 - hypervideo_dl/extractor/prx.py | 3 - hypervideo_dl/extractor/puhutv.py | 4 - hypervideo_dl/extractor/puls4.py | 10 +- hypervideo_dl/extractor/pyvideo.py | 2 - hypervideo_dl/extractor/qingting.py | 47 + hypervideo_dl/extractor/qqmusic.py | 4 - hypervideo_dl/extractor/r7.py | 4 - hypervideo_dl/extractor/radiko.py | 76 +- hypervideo_dl/extractor/radiobremen.py | 4 - hypervideo_dl/extractor/radiocanada.py | 5 - hypervideo_dl/extractor/radiode.py | 3 - hypervideo_dl/extractor/radiofrance.py | 53 +- hypervideo_dl/extractor/radiojavan.py | 3 - hypervideo_dl/extractor/radiokapital.py | 2 - hypervideo_dl/extractor/radiozet.py | 1 - hypervideo_dl/extractor/radlive.py | 7 +- hypervideo_dl/extractor/rai.py | 247 +- hypervideo_dl/extractor/raywenderlich.py | 2 - hypervideo_dl/extractor/rbmaradio.py | 3 - hypervideo_dl/extractor/rcs.py | 49 +- hypervideo_dl/extractor/rcti.py | 5 - hypervideo_dl/extractor/rds.py | 3 - hypervideo_dl/extractor/redbee.py | 379 +++ hypervideo_dl/extractor/redbulltv.py | 7 +- hypervideo_dl/extractor/reddit.py | 89 +- hypervideo_dl/extractor/redgifs.py | 40 +- hypervideo_dl/extractor/redtube.py | 12 +- hypervideo_dl/extractor/regiotv.py | 3 - hypervideo_dl/extractor/rentv.py | 4 - hypervideo_dl/extractor/restudy.py | 4 - hypervideo_dl/extractor/reuters.py | 4 - hypervideo_dl/extractor/reverbnation.py | 2 - hypervideo_dl/extractor/rice.py | 4 - hypervideo_dl/extractor/rmcdecouverte.py | 4 - hypervideo_dl/extractor/ro220.py | 43 - hypervideo_dl/extractor/rockstargames.py | 5 - hypervideo_dl/extractor/rokfin.py | 173 +- hypervideo_dl/extractor/roosterteeth.py | 2 - hypervideo_dl/extractor/rottentomatoes.py | 2 - hypervideo_dl/extractor/roxwel.py | 52 - hypervideo_dl/extractor/rozhlas.py | 3 - hypervideo_dl/extractor/rtbf.py | 159 -- hypervideo_dl/extractor/rte.py | 5 - hypervideo_dl/extractor/rtl2.py | 6 - hypervideo_dl/extractor/rtlnl.py | 156 +- hypervideo_dl/extractor/rtnews.py | 3 - hypervideo_dl/extractor/rtp.py | 3 - hypervideo_dl/extractor/rtrfm.py | 2 - hypervideo_dl/extractor/rts.py | 6 +- hypervideo_dl/extractor/rtve.py | 28 +- hypervideo_dl/extractor/rtvnh.py | 4 - hypervideo_dl/extractor/rtvs.py | 4 - 
hypervideo_dl/extractor/rtvslo.py | 150 + hypervideo_dl/extractor/ruhd.py | 3 - hypervideo_dl/extractor/rule34video.py | 4 - hypervideo_dl/extractor/rumble.py | 213 +- hypervideo_dl/extractor/rutube.py | 13 +- hypervideo_dl/extractor/rutv.py | 33 +- hypervideo_dl/extractor/ruutu.py | 50 +- hypervideo_dl/extractor/ruv.py | 3 - hypervideo_dl/extractor/safari.py | 3 - hypervideo_dl/extractor/saitosan.py | 4 - hypervideo_dl/extractor/samplefocus.py | 3 - hypervideo_dl/extractor/sapo.py | 5 - hypervideo_dl/extractor/savefrom.py | 3 - hypervideo_dl/extractor/sbs.py | 16 +- hypervideo_dl/extractor/screen9.py | 62 + hypervideo_dl/extractor/screencast.py | 14 +- hypervideo_dl/extractor/screencastify.py | 52 + hypervideo_dl/extractor/screencastomatic.py | 27 +- hypervideo_dl/extractor/scrippsnetworks.py | 3 - hypervideo_dl/extractor/scrolller.py | 102 + hypervideo_dl/extractor/scte.py | 2 - hypervideo_dl/extractor/seeker.py | 3 - hypervideo_dl/extractor/senategov.py | 15 +- hypervideo_dl/extractor/senateisvp.py | 153 - hypervideo_dl/extractor/sendtonews.py | 13 +- hypervideo_dl/extractor/servus.py | 4 - hypervideo_dl/extractor/sevenplus.py | 7 +- hypervideo_dl/extractor/sexu.py | 3 - hypervideo_dl/extractor/seznamzpravy.py | 16 +- hypervideo_dl/extractor/shahid.py | 4 - hypervideo_dl/extractor/shared.py | 13 +- hypervideo_dl/extractor/sharevideos.py | 6 + hypervideo_dl/extractor/shemaroome.py | 4 - hypervideo_dl/extractor/showroomlive.py | 4 - hypervideo_dl/extractor/simplecast.py | 19 +- hypervideo_dl/extractor/sina.py | 5 - hypervideo_dl/extractor/sixplay.py | 5 - hypervideo_dl/extractor/skeb.py | 3 - hypervideo_dl/extractor/sky.py | 3 - hypervideo_dl/extractor/skyit.py | 99 +- hypervideo_dl/extractor/skylinewebcams.py | 3 - hypervideo_dl/extractor/skynewsarabia.py | 3 - hypervideo_dl/extractor/skynewsau.py | 3 - hypervideo_dl/extractor/slideshare.py | 2 - hypervideo_dl/extractor/slideslive.py | 5 +- hypervideo_dl/extractor/slutload.py | 2 - hypervideo_dl/extractor/smotrim.py | 65 + hypervideo_dl/extractor/snotr.py | 4 - hypervideo_dl/extractor/sohu.py | 4 - hypervideo_dl/extractor/sonyliv.py | 34 +- hypervideo_dl/extractor/soundcloud.py | 370 ++- hypervideo_dl/extractor/soundgasm.py | 3 - hypervideo_dl/extractor/southpark.py | 54 +- hypervideo_dl/extractor/sovietscloset.py | 18 +- hypervideo_dl/extractor/spankbang.py | 4 - hypervideo_dl/extractor/spankwire.py | 10 +- hypervideo_dl/extractor/spiegel.py | 3 - hypervideo_dl/extractor/spiegeltv.py | 17 - hypervideo_dl/extractor/spike.py | 2 - hypervideo_dl/extractor/sport5.py | 5 - hypervideo_dl/extractor/sportbox.py | 13 +- hypervideo_dl/extractor/sportdeutschland.py | 3 - hypervideo_dl/extractor/spotify.py | 55 +- hypervideo_dl/extractor/spreaker.py | 3 - hypervideo_dl/extractor/springboardplatform.py | 14 +- hypervideo_dl/extractor/sprout.py | 3 - hypervideo_dl/extractor/srgssr.py | 5 - hypervideo_dl/extractor/srmediathek.py | 3 - hypervideo_dl/extractor/stanfordoc.py | 2 - hypervideo_dl/extractor/startrek.py | 75 + hypervideo_dl/extractor/startv.py | 3 - hypervideo_dl/extractor/steam.py | 49 +- hypervideo_dl/extractor/stitcher.py | 2 - hypervideo_dl/extractor/storyfire.py | 5 +- hypervideo_dl/extractor/streamable.py | 15 +- hypervideo_dl/extractor/streamanity.py | 4 - hypervideo_dl/extractor/streamcloud.py | 3 - hypervideo_dl/extractor/streamcz.py | 6 +- hypervideo_dl/extractor/streamff.py | 1 - hypervideo_dl/extractor/streetvoice.py | 3 - hypervideo_dl/extractor/stretchinternet.py | 2 - hypervideo_dl/extractor/stripchat.py | 52 +- 
hypervideo_dl/extractor/stv.py | 6 +- hypervideo_dl/extractor/substack.py | 100 + hypervideo_dl/extractor/sunporno.py | 3 - hypervideo_dl/extractor/sverigesradio.py | 4 - hypervideo_dl/extractor/svt.py | 12 +- hypervideo_dl/extractor/swearnet.py | 73 + hypervideo_dl/extractor/swrmediathek.py | 4 - hypervideo_dl/extractor/syfy.py | 2 - hypervideo_dl/extractor/syvdk.py | 33 + hypervideo_dl/extractor/sztvhu.py | 3 - hypervideo_dl/extractor/tagesschau.py | 5 - hypervideo_dl/extractor/tass.py | 4 - hypervideo_dl/extractor/tastytrade.py | 43 - hypervideo_dl/extractor/tbs.py | 3 - hypervideo_dl/extractor/tdslifeway.py | 2 - hypervideo_dl/extractor/teachable.py | 16 +- hypervideo_dl/extractor/teachertube.py | 5 - hypervideo_dl/extractor/teachingchannel.py | 2 - hypervideo_dl/extractor/teamcoco.py | 4 - hypervideo_dl/extractor/teamtreehouse.py | 3 - hypervideo_dl/extractor/techtalks.py | 2 - hypervideo_dl/extractor/ted.py | 8 +- hypervideo_dl/extractor/tele13.py | 4 - hypervideo_dl/extractor/tele5.py | 5 +- hypervideo_dl/extractor/telebruxelles.py | 4 - hypervideo_dl/extractor/telecinco.py | 4 - hypervideo_dl/extractor/telegraaf.py | 9 +- hypervideo_dl/extractor/telegram.py | 141 +- hypervideo_dl/extractor/telemb.py | 4 - hypervideo_dl/extractor/telemundo.py | 4 - hypervideo_dl/extractor/telequebec.py | 3 - hypervideo_dl/extractor/teletask.py | 2 - hypervideo_dl/extractor/telewebion.py | 3 - hypervideo_dl/extractor/tempo.py | 53 + hypervideo_dl/extractor/tencent.py | 452 +++ hypervideo_dl/extractor/tennistv.py | 186 +- hypervideo_dl/extractor/tenplay.py | 4 - hypervideo_dl/extractor/testurl.py | 47 +- hypervideo_dl/extractor/tf1.py | 3 - hypervideo_dl/extractor/tfo.py | 3 - hypervideo_dl/extractor/theholetv.py | 35 + hypervideo_dl/extractor/theintercept.py | 3 - hypervideo_dl/extractor/theplatform.py | 30 +- hypervideo_dl/extractor/thescene.py | 44 - hypervideo_dl/extractor/thestar.py | 3 - hypervideo_dl/extractor/thesun.py | 2 - hypervideo_dl/extractor/theta.py | 5 - hypervideo_dl/extractor/theweatherchannel.py | 6 +- hypervideo_dl/extractor/thisamericanlife.py | 2 - hypervideo_dl/extractor/thisav.py | 4 - hypervideo_dl/extractor/thisoldhouse.py | 3 - hypervideo_dl/extractor/threeqsdn.py | 26 +- hypervideo_dl/extractor/threespeak.py | 4 - hypervideo_dl/extractor/tiktok.py | 283 +- hypervideo_dl/extractor/tinypic.py | 2 - hypervideo_dl/extractor/tmz.py | 62 +- hypervideo_dl/extractor/tnaflix.py | 205 +- hypervideo_dl/extractor/toggle.py | 4 - hypervideo_dl/extractor/toggo.py | 11 +- hypervideo_dl/extractor/tokentube.py | 5 - hypervideo_dl/extractor/tonline.py | 3 - hypervideo_dl/extractor/toongoggles.py | 4 - hypervideo_dl/extractor/toutv.py | 5 +- hypervideo_dl/extractor/toypics.py | 3 - hypervideo_dl/extractor/traileraddict.py | 2 - hypervideo_dl/extractor/triller.py | 294 ++ hypervideo_dl/extractor/trilulilu.py | 3 - hypervideo_dl/extractor/trovo.py | 308 +- hypervideo_dl/extractor/trueid.py | 3 - hypervideo_dl/extractor/trunews.py | 2 - hypervideo_dl/extractor/truth.py | 69 + hypervideo_dl/extractor/trutv.py | 4 - hypervideo_dl/extractor/tube8.py | 11 +- hypervideo_dl/extractor/tubetugraz.py | 233 ++ hypervideo_dl/extractor/tubitv.py | 57 +- hypervideo_dl/extractor/tudou.py | 49 - hypervideo_dl/extractor/tumblr.py | 5 - hypervideo_dl/extractor/tunein.py | 11 +- hypervideo_dl/extractor/tunepk.py | 3 - hypervideo_dl/extractor/turbo.py | 4 - hypervideo_dl/extractor/turner.py | 7 +- hypervideo_dl/extractor/tv2.py | 37 +- hypervideo_dl/extractor/tv24ua.py | 78 + hypervideo_dl/extractor/tv2dk.py | 4 
- hypervideo_dl/extractor/tv2hu.py | 3 - hypervideo_dl/extractor/tv4.py | 5 - hypervideo_dl/extractor/tv5mondeplus.py | 4 - hypervideo_dl/extractor/tv5unis.py | 4 - hypervideo_dl/extractor/tva.py | 3 - hypervideo_dl/extractor/tvanouvelles.py | 3 - hypervideo_dl/extractor/tvc.py | 14 +- hypervideo_dl/extractor/tver.py | 130 +- hypervideo_dl/extractor/tvigle.py | 6 +- hypervideo_dl/extractor/tviplayer.py | 78 + hypervideo_dl/extractor/tvland.py | 3 - hypervideo_dl/extractor/tvn24.py | 4 - hypervideo_dl/extractor/tvnet.py | 4 - hypervideo_dl/extractor/tvnoe.py | 3 - hypervideo_dl/extractor/tvnow.py | 7 +- hypervideo_dl/extractor/tvopengr.py | 14 +- hypervideo_dl/extractor/tvp.py | 236 +- hypervideo_dl/extractor/tvplay.py | 7 - hypervideo_dl/extractor/tvplayer.py | 4 - hypervideo_dl/extractor/tweakers.py | 3 - hypervideo_dl/extractor/twentyfourvideo.py | 4 - hypervideo_dl/extractor/twentymin.py | 13 +- hypervideo_dl/extractor/twentythreevideo.py | 3 - hypervideo_dl/extractor/twitcasting.py | 33 +- hypervideo_dl/extractor/twitch.py | 155 +- hypervideo_dl/extractor/twitter.py | 788 ++++- hypervideo_dl/extractor/udemy.py | 23 +- hypervideo_dl/extractor/udn.py | 6 +- hypervideo_dl/extractor/ufctv.py | 3 - hypervideo_dl/extractor/ukcolumn.py | 2 - hypervideo_dl/extractor/uktvplay.py | 8 +- hypervideo_dl/extractor/umg.py | 4 - hypervideo_dl/extractor/unistra.py | 3 - hypervideo_dl/extractor/unity.py | 2 - hypervideo_dl/extractor/unscripted.py | 53 + hypervideo_dl/extractor/unsupported.py | 143 + hypervideo_dl/extractor/uol.py | 4 - hypervideo_dl/extractor/uplynk.py | 7 +- hypervideo_dl/extractor/urort.py | 13 +- hypervideo_dl/extractor/urplay.py | 4 - hypervideo_dl/extractor/usanetwork.py | 5 +- hypervideo_dl/extractor/usatoday.py | 3 - hypervideo_dl/extractor/ustream.py | 12 +- hypervideo_dl/extractor/ustudio.py | 5 - hypervideo_dl/extractor/utreon.py | 4 - hypervideo_dl/extractor/varzesh3.py | 3 - hypervideo_dl/extractor/vbox7.py | 14 +- hypervideo_dl/extractor/veehd.py | 2 - hypervideo_dl/extractor/veo.py | 5 - hypervideo_dl/extractor/veoh.py | 67 +- hypervideo_dl/extractor/vesti.py | 3 - hypervideo_dl/extractor/vevo.py | 123 +- hypervideo_dl/extractor/vgtv.py | 12 +- hypervideo_dl/extractor/vh1.py | 3 - hypervideo_dl/extractor/vice.py | 17 +- hypervideo_dl/extractor/vidbit.py | 2 - hypervideo_dl/extractor/viddler.py | 6 +- hypervideo_dl/extractor/videa.py | 21 +- hypervideo_dl/extractor/videocampus_sachsen.py | 237 +- hypervideo_dl/extractor/videodetective.py | 2 - hypervideo_dl/extractor/videofyme.py | 2 - hypervideo_dl/extractor/videomore.py | 25 +- hypervideo_dl/extractor/videopress.py | 13 +- hypervideo_dl/extractor/vidio.py | 42 +- hypervideo_dl/extractor/vidlii.py | 6 +- hypervideo_dl/extractor/vidme.py | 295 -- hypervideo_dl/extractor/vidzi.py | 68 - hypervideo_dl/extractor/vier.py | 264 -- hypervideo_dl/extractor/viewlift.py | 13 +- hypervideo_dl/extractor/viidea.py | 3 - hypervideo_dl/extractor/viki.py | 3 - hypervideo_dl/extractor/vimeo.py | 252 +- hypervideo_dl/extractor/vimm.py | 3 - hypervideo_dl/extractor/vimple.py | 3 - hypervideo_dl/extractor/vine.py | 8 +- hypervideo_dl/extractor/viqeo.py | 15 +- hypervideo_dl/extractor/viu.py | 47 +- hypervideo_dl/extractor/vk.py | 140 +- hypervideo_dl/extractor/vlive.py | 21 +- hypervideo_dl/extractor/vodlocker.py | 3 - hypervideo_dl/extractor/vodpl.py | 3 - hypervideo_dl/extractor/vodplatform.py | 5 +- hypervideo_dl/extractor/voicerepublic.py | 3 - hypervideo_dl/extractor/voicy.py | 8 +- hypervideo_dl/extractor/voot.py | 9 +- 
hypervideo_dl/extractor/voxmedia.py | 6 +- hypervideo_dl/extractor/vrak.py | 3 - hypervideo_dl/extractor/vrt.py | 4 - hypervideo_dl/extractor/vrv.py | 19 +- hypervideo_dl/extractor/vshare.py | 22 +- hypervideo_dl/extractor/vtm.py | 3 - hypervideo_dl/extractor/vube.py | 170 -- hypervideo_dl/extractor/vuclip.py | 2 - hypervideo_dl/extractor/vupload.py | 3 - hypervideo_dl/extractor/vvvvid.py | 21 +- hypervideo_dl/extractor/vyborymos.py | 3 - hypervideo_dl/extractor/vzaar.py | 14 +- hypervideo_dl/extractor/wakanim.py | 3 - hypervideo_dl/extractor/walla.py | 4 - hypervideo_dl/extractor/wasdtv.py | 10 +- hypervideo_dl/extractor/washingtonpost.py | 10 +- hypervideo_dl/extractor/wat.py | 7 +- hypervideo_dl/extractor/watchbox.py | 5 - hypervideo_dl/extractor/watchindianporn.py | 3 - hypervideo_dl/extractor/wdr.py | 7 +- hypervideo_dl/extractor/webcaster.py | 20 +- hypervideo_dl/extractor/webofstories.py | 5 - hypervideo_dl/extractor/weibo.py | 5 - hypervideo_dl/extractor/weiqitv.py | 3 - hypervideo_dl/extractor/whowatch.py | 4 - hypervideo_dl/extractor/wikimedia.py | 55 + hypervideo_dl/extractor/willow.py | 2 - hypervideo_dl/extractor/wimtv.py | 19 +- hypervideo_dl/extractor/wistia.py | 250 +- hypervideo_dl/extractor/wordpress.py | 154 + hypervideo_dl/extractor/worldstarhiphop.py | 2 - hypervideo_dl/extractor/wppilot.py | 10 +- hypervideo_dl/extractor/wsj.py | 7 +- hypervideo_dl/extractor/wwe.py | 2 - hypervideo_dl/extractor/xbef.py | 2 - hypervideo_dl/extractor/xboxclips.py | 3 - hypervideo_dl/extractor/xfileshare.py | 31 +- hypervideo_dl/extractor/xhamster.py | 27 +- hypervideo_dl/extractor/xiami.py | 3 - hypervideo_dl/extractor/ximalaya.py | 161 +- hypervideo_dl/extractor/xinpianchang.py | 5 - hypervideo_dl/extractor/xminus.py | 3 - hypervideo_dl/extractor/xnxx.py | 4 - hypervideo_dl/extractor/xstream.py | 4 - hypervideo_dl/extractor/xtube.py | 3 - hypervideo_dl/extractor/xuite.py | 4 - hypervideo_dl/extractor/xvideos.py | 4 - hypervideo_dl/extractor/xxxymovies.py | 3 - hypervideo_dl/extractor/yahoo.py | 143 +- hypervideo_dl/extractor/yandexdisk.py | 4 - hypervideo_dl/extractor/yandexmusic.py | 6 +- hypervideo_dl/extractor/yandexvideo.py | 182 +- hypervideo_dl/extractor/yapfiles.py | 14 +- hypervideo_dl/extractor/yesjapan.py | 3 - hypervideo_dl/extractor/yinyuetai.py | 4 - hypervideo_dl/extractor/yle_areena.py | 71 + hypervideo_dl/extractor/ynet.py | 10 +- hypervideo_dl/extractor/youjizz.py | 3 - hypervideo_dl/extractor/youku.py | 4 - hypervideo_dl/extractor/younow.py | 5 +- hypervideo_dl/extractor/youporn.py | 39 +- hypervideo_dl/extractor/yourporn.py | 2 - hypervideo_dl/extractor/yourupload.py | 3 - hypervideo_dl/extractor/youtube.py | 2997 +++++++++++++------- hypervideo_dl/extractor/zapiks.py | 5 +- hypervideo_dl/extractor/zaq1.py | 101 - hypervideo_dl/extractor/zattoo.py | 708 ++++- hypervideo_dl/extractor/zdf.py | 150 +- hypervideo_dl/extractor/zee5.py | 49 +- hypervideo_dl/extractor/zeenews.py | 57 + hypervideo_dl/extractor/zhihu.py | 6 +- hypervideo_dl/extractor/zingmp3.py | 392 ++- hypervideo_dl/extractor/zoom.py | 6 - hypervideo_dl/extractor/zype.py | 13 +- hypervideo_dl/jsinterp.py | 785 +++-- hypervideo_dl/minicurses.py | 7 +- hypervideo_dl/options.py | 565 ++-- hypervideo_dl/postprocessor/__init__.py | 15 +- hypervideo_dl/postprocessor/common.py | 64 +- hypervideo_dl/postprocessor/embedthumbnail.py | 79 +- hypervideo_dl/postprocessor/exec.py | 8 +- hypervideo_dl/postprocessor/execafterdownload.py | 31 - hypervideo_dl/postprocessor/ffmpeg.py | 521 ++-- 
hypervideo_dl/postprocessor/metadatafromtitle.py | 48 - hypervideo_dl/postprocessor/metadataparser.py | 28 +- hypervideo_dl/postprocessor/modify_chapters.py | 37 +- .../postprocessor/movefilesafterdownload.py | 7 +- hypervideo_dl/postprocessor/sponskrub.py | 25 +- hypervideo_dl/postprocessor/sponsorblock.py | 29 +- hypervideo_dl/postprocessor/xattrpp.py | 85 +- hypervideo_dl/socks.py | 70 +- hypervideo_dl/utils.py | 2873 ++++++++++++------- hypervideo_dl/version.py | 6 +- hypervideo_dl/webvtt.py | 61 +-
 1092 files changed, 32411 insertions(+), 23399 deletions(-)

 delete mode 100644 hypervideo_dl/compat.py create mode 100644 hypervideo_dl/compat/__init__.py create mode 100644 hypervideo_dl/compat/_deprecated.py create mode 100644 hypervideo_dl/compat/_legacy.py create mode 100644 hypervideo_dl/compat/compat_utils.py create mode 100644 hypervideo_dl/compat/functools.py create mode 100644 hypervideo_dl/compat/imghdr.py create mode 100644 hypervideo_dl/compat/shutil.py create mode 100644 hypervideo_dl/dependencies.py create mode 100644 hypervideo_dl/extractor/_extractors.py create mode 100644 hypervideo_dl/extractor/acfun.py create mode 100644 hypervideo_dl/extractor/aeonco.py create mode 100644 hypervideo_dl/extractor/agora.py create mode 100644 hypervideo_dl/extractor/amazonminitv.py create mode 100644 hypervideo_dl/extractor/angel.py delete mode 100644 hypervideo_dl/extractor/animelab.py delete mode 100644 hypervideo_dl/extractor/animeondemand.py delete mode 100644 hypervideo_dl/extractor/anvato_token_generator/__init__.py delete mode 100644 hypervideo_dl/extractor/anvato_token_generator/common.py delete mode 100644 hypervideo_dl/extractor/anvato_token_generator/nfl.py create mode 100644 hypervideo_dl/extractor/atscaleconf.py create mode 100644 hypervideo_dl/extractor/audiodraft.py create mode 100644 hypervideo_dl/extractor/berufetv.py delete mode 100644 hypervideo_dl/extractor/blinkx.py create mode 100644 hypervideo_dl/extractor/booyah.py create mode 100644 hypervideo_dl/extractor/bundesliga.py create mode 100644 hypervideo_dl/extractor/camsoda.py create mode 100644 hypervideo_dl/extractor/camtasia.py delete mode 100644 hypervideo_dl/extractor/camtube.py create mode 100644 hypervideo_dl/extractor/cellebrite.py create mode 100644 hypervideo_dl/extractor/cinetecamilano.py create mode 100644 hypervideo_dl/extractor/dailywire.py create mode 100644 hypervideo_dl/extractor/detik.py create mode 100644 hypervideo_dl/extractor/deuxm.py delete mode 100644 hypervideo_dl/extractor/discoverynetworks.py delete mode 100644 hypervideo_dl/extractor/discoveryplusindia.py delete mode 100644 hypervideo_dl/extractor/discoveryvr.py delete mode 100644 hypervideo_dl/extractor/doodstream.py create mode 100644 hypervideo_dl/extractor/epoch.py create mode 100644 hypervideo_dl/extractor/eurosport.py delete mode 100644 hypervideo_dl/extractor/everyonesmixtape.py create mode 100644 hypervideo_dl/extractor/fifa.py delete mode 100644 hypervideo_dl/extractor/fivemin.py create mode 100644 hypervideo_dl/extractor/fourzerostudio.py delete mode 100644 hypervideo_dl/extractor/franceculture.py create mode 100644 hypervideo_dl/extractor/freetv.py delete mode 100644 hypervideo_dl/extractor/freshlive.py create mode 100644 hypervideo_dl/extractor/fuyintv.py delete mode 100644 hypervideo_dl/extractor/fxnetworks.py create mode 100644 hypervideo_dl/extractor/genericembeds.py create mode 100644 hypervideo_dl/extractor/genius.py create mode 100644 hypervideo_dl/extractor/goodgame.py create mode 100644 hypervideo_dl/extractor/goplay.py
create mode 100644 hypervideo_dl/extractor/harpodeon.py create mode 100644 hypervideo_dl/extractor/holodex.py delete mode 100644 hypervideo_dl/extractor/hornbunny.py create mode 100644 hypervideo_dl/extractor/hytale.py create mode 100644 hypervideo_dl/extractor/icareus.py create mode 100644 hypervideo_dl/extractor/iltalehti.py delete mode 100644 hypervideo_dl/extractor/ir90tv.py create mode 100644 hypervideo_dl/extractor/islamchannel.py create mode 100644 hypervideo_dl/extractor/israelnationalnews.py create mode 100644 hypervideo_dl/extractor/ixigua.py create mode 100644 hypervideo_dl/extractor/jable.py create mode 100644 hypervideo_dl/extractor/japandiet.py create mode 100644 hypervideo_dl/extractor/jixie.py create mode 100644 hypervideo_dl/extractor/kanal2.py delete mode 100644 hypervideo_dl/extractor/kanalplay.py delete mode 100644 hypervideo_dl/extractor/kankan.py create mode 100644 hypervideo_dl/extractor/kicker.py create mode 100644 hypervideo_dl/extractor/kompas.py create mode 100644 hypervideo_dl/extractor/kth.py create mode 100644 hypervideo_dl/extractor/likee.py create mode 100644 hypervideo_dl/extractor/liputan6.py create mode 100644 hypervideo_dl/extractor/listennotes.py delete mode 100644 hypervideo_dl/extractor/liveleak.py create mode 100644 hypervideo_dl/extractor/livestreamfails.py create mode 100644 hypervideo_dl/extractor/masters.py create mode 100644 hypervideo_dl/extractor/mediaworksnz.py create mode 100644 hypervideo_dl/extractor/microsoftembed.py create mode 100644 hypervideo_dl/extractor/mirrorcouk.py create mode 100644 hypervideo_dl/extractor/mocha.py create mode 100644 hypervideo_dl/extractor/moviepilot.py create mode 100644 hypervideo_dl/extractor/moview.py create mode 100644 hypervideo_dl/extractor/netverse.py create mode 100644 hypervideo_dl/extractor/newspicks.py delete mode 100644 hypervideo_dl/extractor/noco.py create mode 100644 hypervideo_dl/extractor/nosnl.py create mode 100644 hypervideo_dl/extractor/oftv.py create mode 100644 hypervideo_dl/extractor/onenewsnz.py create mode 100644 hypervideo_dl/extractor/parler.py delete mode 100644 hypervideo_dl/extractor/parliamentliveuk.py create mode 100644 hypervideo_dl/extractor/playsuisse.py create mode 100644 hypervideo_dl/extractor/podbayfm.py create mode 100644 hypervideo_dl/extractor/podchaser.py create mode 100644 hypervideo_dl/extractor/prankcast.py create mode 100644 hypervideo_dl/extractor/premiershiprugby.py create mode 100644 hypervideo_dl/extractor/qingting.py create mode 100644 hypervideo_dl/extractor/redbee.py delete mode 100644 hypervideo_dl/extractor/ro220.py delete mode 100644 hypervideo_dl/extractor/roxwel.py delete mode 100644 hypervideo_dl/extractor/rtbf.py create mode 100644 hypervideo_dl/extractor/rtvslo.py create mode 100644 hypervideo_dl/extractor/screen9.py create mode 100644 hypervideo_dl/extractor/screencastify.py create mode 100644 hypervideo_dl/extractor/scrolller.py delete mode 100644 hypervideo_dl/extractor/senateisvp.py create mode 100644 hypervideo_dl/extractor/sharevideos.py create mode 100644 hypervideo_dl/extractor/smotrim.py delete mode 100644 hypervideo_dl/extractor/spiegeltv.py create mode 100644 hypervideo_dl/extractor/startrek.py create mode 100644 hypervideo_dl/extractor/substack.py create mode 100644 hypervideo_dl/extractor/swearnet.py create mode 100644 hypervideo_dl/extractor/syvdk.py delete mode 100644 hypervideo_dl/extractor/tastytrade.py create mode 100644 hypervideo_dl/extractor/tempo.py create mode 100644 hypervideo_dl/extractor/tencent.py create mode 100644 
hypervideo_dl/extractor/theholetv.py delete mode 100644 hypervideo_dl/extractor/thescene.py create mode 100644 hypervideo_dl/extractor/triller.py create mode 100644 hypervideo_dl/extractor/truth.py create mode 100644 hypervideo_dl/extractor/tubetugraz.py delete mode 100644 hypervideo_dl/extractor/tudou.py create mode 100644 hypervideo_dl/extractor/tv24ua.py create mode 100644 hypervideo_dl/extractor/tviplayer.py create mode 100644 hypervideo_dl/extractor/unscripted.py create mode 100644 hypervideo_dl/extractor/unsupported.py delete mode 100644 hypervideo_dl/extractor/vidme.py delete mode 100644 hypervideo_dl/extractor/vidzi.py delete mode 100644 hypervideo_dl/extractor/vier.py delete mode 100644 hypervideo_dl/extractor/vube.py create mode 100644 hypervideo_dl/extractor/wikimedia.py create mode 100644 hypervideo_dl/extractor/wordpress.py create mode 100644 hypervideo_dl/extractor/yle_areena.py delete mode 100644 hypervideo_dl/extractor/zaq1.py create mode 100644 hypervideo_dl/extractor/zeenews.py delete mode 100644 hypervideo_dl/postprocessor/execafterdownload.py delete mode 100644 hypervideo_dl/postprocessor/metadatafromtitle.py

diff --git a/hypervideo_dl/YoutubeDL.py b/hypervideo_dl/YoutubeDL.py
index 276f42d..012c3b8 100644
--- a/hypervideo_dl/YoutubeDL.py
+++ b/hypervideo_dl/YoutubeDL.py
@@ -1,8 +1,3 @@
-#!/usr/bin/env python3
-# coding: utf-8
-
-from __future__ import absolute_import, unicode_literals
-
 import collections
 import contextlib
 import datetime
@@ -15,7 +10,7 @@ import json
 import locale
 import operator
 import os
-import platform
+import random
 import re
 import shutil
 import subprocess
@@ -24,151 +19,141 @@ import tempfile
 import time
 import tokenize
 import traceback
-import random
 import unicodedata
-
-from enum import Enum
+import urllib.request
 from string import ascii_letters

-from .compat import (
-    compat_basestring,
-    compat_brotli,
-    compat_get_terminal_size,
-    compat_kwargs,
-    compat_numeric_types,
-    compat_os_name,
-    compat_pycrypto_AES,
-    compat_shlex_quote,
-    compat_str,
-    compat_tokenize_tokenize,
-    compat_urllib_error,
-    compat_urllib_request,
-    compat_urllib_request_DataHandler,
-    windows_enable_vt_mode,
-)
+from .cache import Cache
+from .compat import compat_os_name, compat_shlex_quote
 from .cookies import load_cookies
+from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
+from .downloader.rtmp import rtmpdump_version
+from .extractor import gen_extractor_classes, get_info_extractor
+from .extractor.common import UnsupportedURLIE
+from .extractor.openload import PhantomJSwrapper
+from .minicurses import format_text
+from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors
+from .postprocessor import (
+    EmbedThumbnailPP,
+    FFmpegFixupDuplicateMoovPP,
+    FFmpegFixupDurationPP,
+    FFmpegFixupM3u8PP,
+    FFmpegFixupM4aPP,
+    FFmpegFixupStretchedPP,
+    FFmpegFixupTimestampPP,
+    FFmpegMergerPP,
+    FFmpegPostProcessor,
+    FFmpegVideoConvertorPP,
+    MoveFilesAfterDownloadPP,
+    get_postprocessor,
+)
 from .utils import (
+    DEFAULT_OUTTMPL,
+    IDENTITY,
+    LINK_TEMPLATES,
+    MEDIA_EXTENSIONS,
+    NO_DEFAULT,
+    NUMBER_RE,
+    OUTTMPL_TYPES,
+    POSTPROCESS_WHEN,
+    STR_FORMAT_RE_TMPL,
+    STR_FORMAT_TYPES,
+    ContentTooShortError,
+    DateRange,
+    DownloadCancelled,
+    DownloadError,
+    EntryNotInPlaylist,
+    ExistingVideoReached,
+    ExtractorError,
+    FormatSorter,
+    GeoRestrictedError,
+    HEADRequest,
+    ISO3166Utils,
+    LazyList,
+    MaxDownloadsReached,
+    Namespace,
+    PagedList,
+    PerRequestProxyHandler,
+    PlaylistEntries,
+    Popen,
+ PostProcessingError, + ReExtractInfo, + RejectedVideoReached, + SameFileError, + UnavailableVideoError, + UserNotLive, + YoutubeDLCookieProcessor, + YoutubeDLHandler, + YoutubeDLRedirectHandler, age_restricted, args_to_str, - ContentTooShortError, + bug_reports_message, date_from_str, - DateRange, - DEFAULT_OUTTMPL, + deprecation_warning, determine_ext, determine_protocol, - DownloadCancelled, - DownloadError, encode_compat_str, encodeFilename, - EntryNotInPlaylist, error_to_compat_str, - ExistingVideoReached, + escapeHTML, expand_path, - ExtractorError, filter_dict, float_or_none, format_bytes, - format_field, format_decimal_suffix, + format_field, formatSeconds, - GeoRestrictedError, + get_compatible_ext, get_domain, - has_certifi, - HEADRequest, - InAdvancePagedList, int_or_none, iri_to_uri, - ISO3166Utils, + is_path_like, join_nonempty, - LazyList, - LINK_TEMPLATES, locked_file, + make_archive_id, make_dir, make_HTTPS_handler, - MaxDownloadsReached, merge_headers, network_exceptions, - NO_DEFAULT, number_of_digits, orderedSet, - OUTTMPL_TYPES, - PagedList, + orderedSet_from_options, parse_filesize, - PerRequestProxyHandler, - platform_name, - Popen, - POSTPROCESS_WHEN, - PostProcessingError, preferredencoding, prepend_extension, - ReExtractInfo, register_socks_protocols, - RejectedVideoReached, remove_terminal_sequences, render_table, replace_extension, - SameFileError, sanitize_filename, sanitize_path, sanitize_url, sanitized_Request, std_headers, - STR_FORMAT_RE_TMPL, - STR_FORMAT_TYPES, str_or_none, strftime_or_none, subtitles_filename, supports_terminal_sequences, + system_identifier, timetuple_from_msec, to_high_limit_path, traverse_obj, + try_call, try_get, - UnavailableVideoError, url_basename, variadic, version_tuple, + windows_enable_vt_mode, write_json_file, write_string, - YoutubeDLCookieProcessor, - YoutubeDLHandler, - YoutubeDLRedirectHandler, -) -from .cache import Cache -from .minicurses import format_text -from .extractor import ( - gen_extractor_classes, - get_info_extractor, - _LAZY_LOADER, - _PLUGIN_CLASSES as plugin_extractors -) -from .extractor.openload import PhantomJSwrapper -from .downloader import ( - FFmpegFD, - get_suitable_downloader, - shorten_protocol_name -) -from .downloader.rtmp import rtmpdump_version -from .postprocessor import ( - get_postprocessor, - EmbedThumbnailPP, - FFmpegFixupDuplicateMoovPP, - FFmpegFixupDurationPP, - FFmpegFixupM3u8PP, - FFmpegFixupM4aPP, - FFmpegFixupStretchedPP, - FFmpegFixupTimestampPP, - FFmpegMergerPP, - FFmpegPostProcessor, - MoveFilesAfterDownloadPP, - _PLUGIN_CLASSES as plugin_postprocessors ) -from .version import __version__ +from .version import RELEASE_GIT_HEAD, VARIANT, __version__ if compat_os_name == 'nt': import ctypes -class YoutubeDL(object): +class YoutubeDL: """YoutubeDL class. YoutubeDL objects are the ones responsible of downloading the @@ -211,13 +196,6 @@ class YoutubeDL(object): For compatibility, a single list is also accepted print_to_file: A dict with keys WHEN (same as forceprint) mapped to a list of tuples with (template, filename) - forceurl: Force printing final URL. (Deprecated) - forcetitle: Force printing title. (Deprecated) - forceid: Force printing ID. (Deprecated) - forcethumbnail: Force printing thumbnail URL. (Deprecated) - forcedescription: Force printing description. (Deprecated) - forcefilename: Force printing final filename. (Deprecated) - forceduration: Force printing duration. (Deprecated) forcejson: Force printing info_dict as JSON. 
dump_single_json: Force printing the info_dict of the whole playlist (or video) as a single JSON line. @@ -261,22 +239,20 @@ class YoutubeDL(object): Default is 'only_download' for CLI, but False for API skip_playlist_after_errors: Number of allowed failures until the rest of the playlist is skipped - force_generic_extractor: Force downloader to use the generic extractor + allowed_extractors: List of regexes to match against extractor names that are allowed overwrites: Overwrite all video and metadata files if True, overwrite only non-video files if None and don't overwrite any file if False For compatibility with youtube-dl, "nooverwrites" may also be used instead - playliststart: Playlist item to start at. - playlistend: Playlist item to end at. playlist_items: Specific indices of playlist to download. - playlistreverse: Download playlist items in reverse order. playlistrandom: Download playlist items in random order. + lazy_playlist: Process playlist entries as they are received. matchtitle: Download only matching titles. rejecttitle: Reject downloads for matching titles. logger: Log messages to a logging.Logger instance. - logtostderr: Log messages to stderr instead of stdout. - consoletitle: Display progress in console window's titlebar. + logtostderr: Print everything to stderr instead of stdout. + consoletitle: Display progress in console window's titlebar. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file clean_infojson: Remove private fields from the infojson @@ -294,15 +270,12 @@ class YoutubeDL(object): writedesktoplink: Write a Linux internet shortcut file (.desktop) writesubtitles: Write the video subtitles to a file writeautomaticsub: Write the automatically generated subtitles to a file - allsubtitles: Deprecated - Use subtitleslangs = ['all'] - Downloads all the subtitles of the video - (requires writesubtitles or writeautomaticsub) listsubtitles: Lists all available subtitles for the video subtitlesformat: The format code for subtitles subtitleslangs: List of languages of the subtitles to download (can be regex). The list may contain "all" to refer to all the available subtitles. The language can be prefixed with a "-" to - exclude it from the requested languages. Eg: ['all', '-live_chat'] + exclude it from the requested languages, e.g. ['all', '-live_chat'] keepvideo: Keep the video file after post-processing daterange: A DateRange object, download only if the upload_date is in the range. skip_download: Skip the actual download of the video file @@ -320,24 +293,28 @@ class YoutubeDL(object): downloaded. Videos without view count information are always downloaded. None for no limit. - download_archive: File name of a file where all downloads are recorded. - Videos already present in the file are not downloaded - again. + download_archive: A set, or the name of a file where all downloads are recorded. + Videos already present in the file are not downloaded again. break_on_existing: Stop the download process after attempting to download a file that is in the archive. break_on_reject: Stop the download process when encountering a video that has been filtered out. 
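A sketch of the two forms of download_archive documented above (the set form assumes the same 'extractor video_id' entry format that the file form records):

    # file form: archive entries are read from, and appended to, downloaded.txt;
    # break_on_existing stops the queue at the first already-archived video
    params = {'download_archive': 'downloaded.txt', 'break_on_existing': True}

    # set form: pass preloaded archive entries directly
    params = {'download_archive': {'youtube dQw4w9WgXcQ'}}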
break_per_url: Whether break_on_reject and break_on_existing should act on each input URL as opposed to for the entire queue - cookiefile: File name where cookies should be read from and dumped to + cookiefile: File name or text stream from where cookies should be read and dumped to cookiesfrombrowser: A tuple containing the name of the browser, the profile - name/pathfrom where cookies are loaded, and the name of the - keyring. Eg: ('chrome', ) or ('vivaldi', 'default', 'BASICTEXT') + name/path from where cookies are loaded, the name of the keyring, + and the container name, e.g. ('chrome', ) or + ('vivaldi', 'default', 'BASICTEXT') or ('firefox', 'default', None, 'Meta') legacyserverconnect: Explicitly allow HTTPS connection to servers that do not support RFC 5746 secure renegotiation nocheckcertificate: Do not verify SSL certificates + client_certificate: Path to client certificate file in PEM format. May include the private key + client_certificate_key: Path to private key file for client certificate + client_certificate_password: Password for client certificate private key, if encrypted. + If not provided and the key is encrypted, hypervideo will ask interactively prefer_insecure: Use HTTP instead of HTTPS to retrieve information. - At the moment, this is only supported by YouTube. + (Only supported by some extractors) http_headers: A dictionary of custom headers to be used for all requests proxy: URL of the proxy server to use geo_verification_proxy: URL of the proxy to use for IP address verification @@ -346,13 +323,17 @@ class YoutubeDL(object): bidi_workaround: Work around buggy terminals without bidirectional text support, using fridibi debug_printtraffic:Print out sent and received HTTP traffic - include_ads: Download ads as well (deprecated) default_search: Prepend this string if an input url is not valid. 'auto' for elaborate guessing encoding: Use this encoding instead of the system-specified. - extract_flat: Do not resolve URLs, return the immediate result. - Pass in 'in_playlist' to only show this behavior for - playlist items. + extract_flat: Whether to resolve and process url_results further + * False: Always process (default) + * True: Never process + * 'in_playlist': Do not process inside playlist/multi_video + * 'discard': Always process, but don't return the result + from inside playlist/multi_video + * 'discard_in_playlist': Same as "discard", but only for + playlists (not multi_video) wait_for_video: If given, wait for scheduled streams to become available. The value should be a tuple containing the range (min_secs, max_secs) to wait between retries @@ -362,10 +343,6 @@ class YoutubeDL(object): * when: When to run the postprocessor. Allowed values are the entries of utils.POSTPROCESS_WHEN Assumed to be 'post_process' if not given - post_hooks: Deprecated - Register a custom postprocessor instead - A list of functions that get called as the final step - for each video file, after all postprocessors have been - called. The filename will be passed as the only argument. progress_hooks: A list of functions that get called on download progress, with a dictionary with the entries * status: One of "downloading", "error", or "finished". @@ -400,7 +377,7 @@ class YoutubeDL(object): Progress hooks are guaranteed to be called at least twice (with status "started" and "finished") if the processing is successful. - merge_output_format: Extension to use when merging formats. + merge_output_format: "/" separated list of extensions to use when merging formats. 
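For example (all values are placeholders), the expanded cookiesfrombrowser tuple and the extract_flat modes documented above combine as:

    params = {
        # (browser, profile name/path, keyring, container); trailing elements may be omitted
        'cookiesfrombrowser': ('firefox', 'default', None, 'Meta'),
        # keep playlist items as unresolved url results instead of extracting them
        'extract_flat': 'in_playlist',
    }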
final_ext: Expected final extension; used to detect when the file was already downloaded and converted fixup: Automatically correct known faults of the file. @@ -410,8 +387,6 @@ class YoutubeDL(object): - "detect_or_warn": check whether we can do anything about it, warn otherwise (default) source_address: Client-side IP address to bind to. - call_home: Boolean, true iff we are allowed to contact the - hypervideo servers for debugging. (BROKEN) sleep_interval_requests: Number of seconds to sleep between requests during extraction sleep_interval: Number of seconds to sleep before each download when @@ -427,10 +402,14 @@ class YoutubeDL(object): sleep_interval_subtitles: Number of seconds to sleep before each subtitle download listformats: Print an overview of available video formats and exit. list_thumbnails: Print a table of all thumbnails and exit. - match_filter: A function that gets called with the info_dict of - every video. - If it returns a message, the video is ignored. - If it returns None, the video is downloaded. + match_filter: A function that gets called for every video with the signature + (info_dict, *, incomplete: bool) -> Optional[str] + For backward compatibility with youtube-dl, the signature + (info_dict) -> Optional[str] is also allowed. + - If it returns a message, the video is ignored. + - If it returns None, the video is downloaded. + - If it returns utils.NO_DEFAULT, the user is interactively + asked whether to download the video. match_filter_func in utils.py is one example for this. no_color: Do not emit color codes in output. geo_bypass: Bypass geographic restriction via faking X-Forwarded-For @@ -442,17 +421,10 @@ class YoutubeDL(object): geo_bypass_ip_block: IP range in CIDR notation that will be used similarly to geo_bypass_country - - The following options determine which downloader is picked: external_downloader: A dictionary of protocol keys and the executable of the external downloader to use for it. The allowed protocols are default|http|ftp|m3u8|dash|rtsp|rtmp|mms. Set the value to 'native' to use the native downloader - hls_prefer_native: Deprecated - Use external_downloader = {'m3u8': 'native'} - or {'m3u8': 'ffmpeg'} instead. - Use the native HLS downloader instead of ffmpeg/avconv - if True, otherwise use ffmpeg/avconv if False, otherwise - use downloader suggested by extractor if None. compat_opts: Compatibility options. See "Differences in default behavior". The following options do not work when used through the API: filename, abort-on-error, multistreams, no-live-chat, format-sort @@ -462,17 +434,29 @@ class YoutubeDL(object): Allowed keys are 'download', 'postprocess', 'download-title' (console title) and 'postprocess-title'. The template is mapped on a dictionary with keys 'progress' and 'info' + retry_sleep_functions: Dictionary of functions that takes the number of attempts + as argument and returns the time to sleep in seconds. + Allowed keys are 'http', 'fragment', 'file_access' + download_ranges: A callback function that gets called for every video with + the signature (info_dict, ydl) -> Iterable[Section]. + Only the returned sections will be downloaded. 
+ Each Section is a dict with the following keys: + * start_time: Start time of the section in seconds + * end_time: End time of the section in seconds + * title: Section title (Optional) + * index: Section number (Optional) + force_keyframes_at_cuts: Re-encode the video when downloading ranges to get precise cuts + noprogress: Do not print the progress bar + live_from_start: Whether to download livestreams videos from the start The following parameters are not used by YoutubeDL itself, they are used by the downloader (see hypervideo_dl/downloader/common.py): nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize, max_filesize, test, noresizebuffer, retries, file_access_retries, fragment_retries, - continuedl, noprogress, xattr_set_filesize, hls_use_mpegts, http_chunk_size, + continuedl, xattr_set_filesize, hls_use_mpegts, http_chunk_size, external_downloader_args, concurrent_fragment_downloads. The following options are used by the post processors: - prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available, - otherwise prefer ffmpeg. (avconv support is deprecated) ffmpeg_location: Location of the ffmpeg/avconv binary; either the path to the binary or its containing directory. postprocessor_args: A dictionary of postprocessor/executable keys (in lower case) @@ -490,44 +474,89 @@ class YoutubeDL(object): discontinuities such as ad breaks (default: False) extractor_args: A dictionary of arguments to be passed to the extractors. See "EXTRACTOR ARGUMENTS" for details. - Eg: {'youtube': {'skip': ['dash', 'hls']}} + E.g. {'youtube': {'skip': ['dash', 'hls']}} mark_watched: Mark videos watched (even with --simulate). Only for YouTube - youtube_include_dash_manifest: Deprecated - Use extractor_args instead. + + The following options are deprecated and may be removed in the future: + + force_generic_extractor: Force downloader to use the generic extractor + - Use allowed_extractors = ['generic', 'default'] + playliststart: - Use playlist_items + Playlist item to start at. + playlistend: - Use playlist_items + Playlist item to end at. + playlistreverse: - Use playlist_items + Download playlist items in reverse order. + forceurl: - Use forceprint + Force printing final URL. + forcetitle: - Use forceprint + Force printing title. + forceid: - Use forceprint + Force printing ID. + forcethumbnail: - Use forceprint + Force printing thumbnail URL. + forcedescription: - Use forceprint + Force printing description. + forcefilename: - Use forceprint + Force printing final filename. + forceduration: - Use forceprint + Force printing duration. + allsubtitles: - Use subtitleslangs = ['all'] + Downloads all the subtitles of the video + (requires writesubtitles or writeautomaticsub) + include_ads: - Doesn't work + Download ads as well + call_home: - Not implemented + Boolean, true iff we are allowed to contact the + hypervideo servers for debugging. + post_hooks: - Register a custom postprocessor + A list of functions that get called as the final step + for each video file, after all postprocessors have been + called. The filename will be passed as the only argument. + hls_prefer_native: - Use external_downloader = {'m3u8': 'native'} or {'m3u8': 'ffmpeg'}. + Use the native HLS downloader instead of ffmpeg/avconv + if True, otherwise use ffmpeg/avconv if False, otherwise + use downloader suggested by extractor if None. + prefer_ffmpeg: - avconv support is deprecated + If False, use avconv instead of ffmpeg if both are available, + otherwise prefer ffmpeg. 
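A sketch of the match_filter and download_ranges callbacks documented above (function names are illustrative; a returned string skips the video, None downloads it, and utils.NO_DEFAULT prompts the user):

    def skip_shorts(info_dict, *, incomplete=False):
        duration = info_dict.get('duration')
        if incomplete and duration is None:
            return None  # metadata not extracted yet; let the video through
        if duration is not None and duration < 60:
            return 'Skipping video shorter than a minute'
        return None  # None means: download

    def first_half_minute(info_dict, ydl):
        # request a single section covering the first 30 seconds
        yield {'start_time': 0, 'end_time': 30, 'title': 'intro'}

    params = {
        'match_filter': skip_shorts,
        'download_ranges': first_half_minute,
        'force_keyframes_at_cuts': True,  # re-encode so the cut is precise
    }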
+ youtube_include_dash_manifest: - Use extractor_args If True (default), DASH manifests and related data will be downloaded and processed by extractor. You can reduce network I/O by disabling it if you don't care about DASH. (only for youtube) - youtube_include_hls_manifest: Deprecated - Use extractor_args instead. + youtube_include_hls_manifest: - Use extractor_args If True (default), HLS manifests and related data will be downloaded and processed by extractor. You can reduce network I/O by disabling it if you don't care about HLS. (only for youtube) """ - _NUMERIC_FIELDS = set(( - 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx', + _NUMERIC_FIELDS = { + 'width', 'height', 'asr', 'audio_channels', 'fps', + 'tbr', 'abr', 'vbr', 'filesize', 'filesize_approx', 'timestamp', 'release_timestamp', 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count', 'average_rating', 'comment_count', 'age_limit', 'start_time', 'end_time', 'chapter_number', 'season_number', 'episode_number', 'track_number', 'disc_number', 'release_year', - )) + } _format_fields = { # NB: Keep in sync with the docstring of extractor/common.py 'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note', - 'width', 'height', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', - 'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', + 'width', 'height', 'aspect_ratio', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', 'audio_channels', + 'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns', 'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', 'preference', 'language', 'language_preference', 'quality', 'source_preference', 'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'downloader_options', 'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time' } _format_selection_exts = { - 'audio': {'m4a', 'mp3', 'ogg', 'aac'}, - 'video': {'mp4', 'flv', 'webm', '3gp'}, - 'storyboards': {'mhtml'}, + 'audio': set(MEDIA_EXTENSIONS.common_audio), + 'video': set(MEDIA_EXTENSIONS.common_video + ('3gp', )), + 'storyboards': set(MEDIA_EXTENSIONS.storyboards), } def __init__(self, params=None, auto_init=True): @@ -554,21 +583,30 @@ class YoutubeDL(object): self.cache = Cache(self) windows_enable_vt_mode() - self._out_files = { - 'error': sys.stderr, - 'print': sys.stderr if self.params.get('logtostderr') else sys.stdout, - 'console': None if compat_os_name == 'nt' else next( + stdout = sys.stderr if self.params.get('logtostderr') else sys.stdout + self._out_files = Namespace( + out=stdout, + error=sys.stderr, + screen=sys.stderr if self.params.get('quiet') else stdout, + console=None if compat_os_name == 'nt' else next( filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None) - } - self._out_files['screen'] = sys.stderr if self.params.get('quiet') else self._out_files['print'] - self._allow_colors = { - type_: not self.params.get('no_color') and supports_terminal_sequences(self._out_files[type_]) - for type_ in ('screen', 'error') - } - - if sys.version_info < (3, 6): - self.report_warning( - 'Python version %d.%d is not supported! 
Please update to Python 3.6 or above' % sys.version_info[:2]) + ) + self._allow_colors = Namespace(**{ + type_: not self.params.get('no_color') and supports_terminal_sequences(stream) + for type_, stream in self._out_files.items_ if type_ != 'console' + }) + + # The code is left like this to be reused for future deprecations + MIN_SUPPORTED, MIN_RECOMMENDED = (3, 7), (3, 7) + current_version = sys.version_info[:2] + if current_version < MIN_RECOMMENDED: + msg = ('Support for Python version %d.%d has been deprecated. ' + 'See https://github.com/hypervideo/hypervideo/issues/3764 for more details.' + '\n You will no longer receive updates on this version') + if current_version < MIN_SUPPORTED: + msg = 'Python version %d.%d is no longer supported' + self.deprecation_warning( + f'{msg}! Please update to Python %d.%d or above' % (*current_version, *MIN_RECOMMENDED)) if self.params.get('allow_unplayable_formats'): self.report_warning( @@ -577,9 +615,33 @@ class YoutubeDL(object): ' If you experience any issues while using this option, ' f'{self._format_err("DO NOT", self.Styles.ERROR)} open a bug report') + if self.params.get('bidi_workaround', False): + try: + import pty + master, slave = pty.openpty() + width = shutil.get_terminal_size().columns + width_args = [] if width is None else ['-w', str(width)] + sp_kwargs = {'stdin': subprocess.PIPE, 'stdout': slave, 'stderr': self._out_files.error} + try: + self._output_process = Popen(['bidiv'] + width_args, **sp_kwargs) + except OSError: + self._output_process = Popen(['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs) + self._output_channel = os.fdopen(master, 'rb') + except OSError as ose: + if ose.errno == errno.ENOENT: + self.report_warning( + 'Could not find fribidi executable, ignoring --bidi-workaround. ' + 'Make sure that fribidi is an executable file in one of the directories in your $PATH.') + else: + raise + + self.params['compat_opts'] = set(self.params.get('compat_opts', ())) + if auto_init and auto_init != 'no_verbose_header': + self.print_debug_header() + def check_deprecated(param, option, suggestion): if self.params.get(param) is not None: - self.report_warning('%s is deprecated. Use %s instead' % (option, suggestion)) + self.report_warning(f'{option} is deprecated. 
Use {suggestion} instead') return True return False @@ -594,9 +656,9 @@ class YoutubeDL(object): for msg in self.params.get('_warnings', []): self.report_warning(msg) for msg in self.params.get('_deprecation_warnings', []): - self.deprecation_warning(msg) + self.deprecated_feature(msg) - if 'list-formats' in self.params.get('compat_opts', []): + if 'list-formats' in self.params['compat_opts']: self.params['listformats_table'] = False if 'overwrites' not in self.params and self.params.get('nooverwrites') is not None: @@ -609,6 +671,13 @@ class YoutubeDL(object): else: self.params['nooverwrites'] = not self.params['overwrites'] + if self.params.get('simulate') is None and any(( + self.params.get('list_thumbnails'), + self.params.get('listformats'), + self.params.get('listsubtitles'), + )): + self.params['simulate'] = 'list_only' + self.params.setdefault('forceprint', {}) self.params.setdefault('print_to_file', {}) @@ -616,31 +685,8 @@ class YoutubeDL(object): if not isinstance(params['forceprint'], dict): self.params['forceprint'] = {'video': params['forceprint']} - if self.params.get('bidi_workaround', False): - try: - import pty - master, slave = pty.openpty() - width = compat_get_terminal_size().columns - if width is None: - width_args = [] - else: - width_args = ['-w', str(width)] - sp_kwargs = dict( - stdin=subprocess.PIPE, - stdout=slave, - stderr=self._out_files['error']) - try: - self._output_process = Popen(['bidiv'] + width_args, **sp_kwargs) - except OSError: - self._output_process = Popen(['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs) - self._output_channel = os.fdopen(master, 'rb') - except OSError as ose: - if ose.errno == errno.ENOENT: - self.report_warning( - 'Could not find fribidi executable, ignoring --bidi-workaround. ' - 'Make sure that fribidi is an executable file in one of the directories in your $PATH.') - else: - raise + if auto_init: + self.add_default_info_extractors() if (sys.platform != 'win32' and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] @@ -652,7 +698,7 @@ class YoutubeDL(object): 'Set the LC_ALL environment variable to fix this.') self.params['restrictfilenames'] = True - self.outtmpl_dict = self.parse_outtmpl() + self._parse_outtmpl() # Creating format selector here allows us to catch syntax errors before the extraction self.format_selector = ( @@ -663,13 +709,6 @@ class YoutubeDL(object): # Set http_headers defaults according to std_headers self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {})) - self._setup_opener() - - if auto_init: - if auto_init != 'no_verbose_header': - self.print_debug_header() - self.add_default_info_extractors() - hooks = { 'post_hooks': self.add_post_hook, 'progress_hooks': self.add_progress_hook, @@ -683,28 +722,31 @@ class YoutubeDL(object): pp_def = dict(pp_def_raw) when = pp_def.pop('when', 'post_process') self.add_post_processor( - get_postprocessor(pp_def.pop('key'))(self, **compat_kwargs(pp_def)), + get_postprocessor(pp_def.pop('key'))(self, **pp_def), when=when) + self._setup_opener() register_socks_protocols() def preload_download_archive(fn): """Preload the archive, if any is specified""" + archive = set() if fn is None: - return False + return archive + elif not is_path_like(fn): + return fn + self.write_debug(f'Loading archive file {fn!r}') try: with locked_file(fn, 'r', encoding='utf-8') as archive_file: for line in archive_file: - self.archive.add(line.strip()) - except IOError as ioe: + archive.add(line.strip()) + except OSError as ioe: if ioe.errno != 
errno.ENOENT: raise - return False - return True + return archive - self.archive = set() - preload_download_archive(self.params.get('download_archive')) + self.archive = preload_download_archive(self.params.get('download_archive')) def warn_if_short_id(self, argv): # short YouTube ID starting with dash? @@ -730,13 +772,6 @@ class YoutubeDL(object): self._ies_instances[ie_key] = ie ie.set_downloader(self) - def _get_info_extractor_class(self, ie_key): - ie = self._ies.get(ie_key) - if ie is None: - ie = get_info_extractor(ie_key) - self.add_info_extractor(ie) - return ie - def get_info_extractor(self, ie_key): """ Get an instance of an IE with name ie_key, it will try to get one from @@ -753,11 +788,23 @@ class YoutubeDL(object): """ Add the InfoExtractors returned by gen_extractors to the end of the list """ - for ie in gen_extractor_classes(): - self.add_info_extractor(ie) + all_ies = {ie.IE_NAME.lower(): ie for ie in gen_extractor_classes()} + all_ies['end'] = UnsupportedURLIE() + try: + ie_names = orderedSet_from_options( + self.params.get('allowed_extractors', ['default']), { + 'all': list(all_ies), + 'default': [name for name, ie in all_ies.items() if ie._ENABLED], + }, use_regex=True) + except re.error as e: + raise ValueError(f'Wrong regex for allowed_extractors: {e.pattern}') + for name in ie_names: + self.add_info_extractor(all_ies[name]) + self.write_debug(f'Loaded {len(ie_names)} extractors') def add_post_processor(self, pp, when='post_process'): """Add a PostProcessor object to the end of the chain.""" + assert when in POSTPROCESS_WHEN, f'Invalid when={when}' self._pps[when].append(pp) pp.set_downloader(self) @@ -781,11 +828,11 @@ class YoutubeDL(object): return message assert hasattr(self, '_output_process') - assert isinstance(message, compat_str) + assert isinstance(message, str) line_count = message.count('\n') + 1 - self._output_process.stdin.write((message + '\n').encode('utf-8')) + self._output_process.stdin.write((message + '\n').encode()) self._output_process.stdin.flush() - res = ''.join(self._output_channel.readline().decode('utf-8') + res = ''.join(self._output_channel.readline().decode() for _ in range(line_count)) return res[:-len('\n')] @@ -799,12 +846,14 @@ class YoutubeDL(object): def to_stdout(self, message, skip_eol=False, quiet=None): """Print message to stdout""" if quiet is not None: - self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument quiet. Use "YoutubeDL.to_screen" instead') - self._write_string( - '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')), - self._out_files['print']) - - def to_screen(self, message, skip_eol=False, quiet=None): + self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument quiet. ' + 'Use "YoutubeDL.to_screen" instead') + if skip_eol is not False: + self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument skip_eol. 
' + 'Use "YoutubeDL.to_screen" instead') + self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.out) + + def to_screen(self, message, skip_eol=False, quiet=None, only_once=False): """Print message to screen if not in quiet mode""" if self.params.get('logger'): self.params['logger'].debug(message) @@ -813,20 +862,20 @@ class YoutubeDL(object): return self._write_string( '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')), - self._out_files['screen']) + self._out_files.screen, only_once=only_once) def to_stderr(self, message, only_once=False): """Print message to stderr""" - assert isinstance(message, compat_str) + assert isinstance(message, str) if self.params.get('logger'): self.params['logger'].error(message) else: - self._write_string('%s\n' % self._bidi_workaround(message), self._out_files['error'], only_once=only_once) + self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.error, only_once=only_once) def _send_console_code(self, code): - if compat_os_name == 'nt' or not self._out_files['console']: + if compat_os_name == 'nt' or not self._out_files.console: return - self._write_string(code, self._out_files['console']) + self._write_string(code, self._out_files.console) def to_console_title(self, message): if not self.params.get('consoletitle', False): @@ -894,16 +943,19 @@ class YoutubeDL(object): raise DownloadError(message, exc_info) self._download_retcode = 1 - class Styles(Enum): - HEADERS = 'yellow' - EMPHASIS = 'light blue' - ID = 'green' - DELIM = 'blue' - ERROR = 'red' - WARNING = 'yellow' - SUPPRESS = 'light black' + Styles = Namespace( + HEADERS='yellow', + EMPHASIS='light blue', + FILENAME='green', + ID='green', + DELIM='blue', + ERROR='red', + WARNING='yellow', + SUPPRESS='light black', + ) def _format_text(self, handle, allow_colors, text, f, fallback=None, *, test_encoding=False): + text = str(text) if test_encoding: original_text = text # handle.encoding can be None. 
See https://github.com/hypervideo/hypervideo/issues/2711 @@ -911,17 +963,16 @@ class YoutubeDL(object): text = text.encode(encoding, 'ignore').decode(encoding) if fallback is not None and text != original_text: text = fallback - if isinstance(f, self.Styles): - f = f.value return format_text(text, f) if allow_colors else text if fallback is None else fallback + def _format_out(self, *args, **kwargs): + return self._format_text(self._out_files.out, self._allow_colors.out, *args, **kwargs) + def _format_screen(self, *args, **kwargs): - return self._format_text( - self._out_files['screen'], self._allow_colors['screen'], *args, **kwargs) + return self._format_text(self._out_files.screen, self._allow_colors.screen, *args, **kwargs) def _format_err(self, *args, **kwargs): - return self._format_text( - self._out_files['error'], self._allow_colors['error'], *args, **kwargs) + return self._format_text(self._out_files.error, self._allow_colors.error, *args, **kwargs) def report_warning(self, message, only_once=False): ''' @@ -935,11 +986,14 @@ class YoutubeDL(object): return self.to_stderr(f'{self._format_err("WARNING:", self.Styles.WARNING)} {message}', only_once) - def deprecation_warning(self, message): + def deprecation_warning(self, message, *, stacklevel=0): + deprecation_warning( + message, stacklevel=stacklevel + 1, printer=self.report_error, is_error=False) + + def deprecated_feature(self, message): if self.params.get('logger') is not None: - self.params['logger'].warning(f'DeprecationWarning: {message}') - else: - self.to_stderr(f'{self._format_err("DeprecationWarning:", self.Styles.ERROR)} {message}', True) + self.params['logger'].warning(f'Deprecated Feature: {message}') + self.to_stderr(f'{self._format_err("Deprecated Feature:", self.Styles.ERROR)} {message}', True) def report_error(self, message, *args, **kwargs): ''' @@ -952,7 +1006,7 @@ class YoutubeDL(object): '''Log debug message or Print message to stderr''' if not self.params.get('verbose', False): return - message = '[debug] %s' % message + message = f'[debug] {message}' if self.params.get('logger'): self.params['logger'].debug(message) else: @@ -973,7 +1027,7 @@ class YoutubeDL(object): self.to_screen('Deleting existing file') def raise_no_formats(self, info, forced=False, *, msg=None): - has_drm = info.get('__has_drm') + has_drm = info.get('_has_drm') ignored, expected = self.params.get('ignore_no_formats_error'), bool(msg) msg = msg or has_drm and 'This video is DRM protected' or 'No video formats found!' if forced or not ignored: @@ -983,37 +1037,27 @@ class YoutubeDL(object): self.report_warning(msg) def parse_outtmpl(self): - outtmpl_dict = self.params.get('outtmpl', {}) - if not isinstance(outtmpl_dict, dict): - outtmpl_dict = {'default': outtmpl_dict} - # Remove spaces in the default template - if self.params.get('restrictfilenames'): + self.deprecation_warning('"YoutubeDL.parse_outtmpl" is deprecated and may be removed in a future version') + self._parse_outtmpl() + return self.params['outtmpl'] + + def _parse_outtmpl(self): + sanitize = IDENTITY + if self.params.get('restrictfilenames'): # Remove spaces in the default template sanitize = lambda x: x.replace(' - ', ' ').replace(' ', '-') - else: - sanitize = lambda x: x - outtmpl_dict.update({ - k: sanitize(v) for k, v in DEFAULT_OUTTMPL.items() - if outtmpl_dict.get(k) is None}) - for key, val in outtmpl_dict.items(): - if isinstance(val, bytes): - self.report_warning( - 'Parameter outtmpl is bytes, but should be a unicode string. 
' - 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.') - return outtmpl_dict + + outtmpl = self.params.setdefault('outtmpl', {}) + if not isinstance(outtmpl, dict): + self.params['outtmpl'] = outtmpl = {'default': outtmpl} + outtmpl.update({k: sanitize(v) for k, v in DEFAULT_OUTTMPL.items() if outtmpl.get(k) is None}) def get_output_path(self, dir_type='', filename=None): paths = self.params.get('paths', {}) - assert isinstance(paths, dict) + assert isinstance(paths, dict), '"paths" parameter must be a dictionary' path = os.path.join( expand_path(paths.get('home', '').strip()), expand_path(paths.get(dir_type, '').strip()) if dir_type else '', filename or '') - - # Temporary fix for #4787 - # 'Treat' all problem characters by passing filename through preferredencoding - # to workaround encoding issues with subprocess on python2 @ Windows - if sys.version_info < (3, 0) and sys.platform == 'win32': - path = encodeFilename(path, True).decode(preferredencoding()) return sanitize_path(path, force=self.params.get('windowsfilenames')) @staticmethod @@ -1023,11 +1067,11 @@ class YoutubeDL(object): # '%%' intact for template dict substitution step. Working around # with boundary-alike separator hack. sep = ''.join([random.choice(ascii_letters) for _ in range(32)]) - outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep)) + outtmpl = outtmpl.replace('%%', f'%{sep}%').replace('$$', f'${sep}$') # outtmpl should be expand_path'ed before template dict substitution # because meta fields may contain env variables we don't want to - # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and + # be expanded. E.g. for outtmpl "%(title)s.%(ext)s" and # title "Hello $PATH", we don't want `$PATH` to be expanded. 
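To make the escaping behaviour concrete, a small sketch (field values are made up): because only the template itself passes through expand_path before template dict substitution, environment-variable-like text coming from the info dict is left alone:

    ydl = YoutubeDL({'outtmpl': '%(title)s.%(ext)s'})
    print(ydl.prepare_filename({'id': 'x', 'title': 'Hello $PATH', 'ext': 'mp4'}))
    # -> 'Hello $PATH.mp4' ($PATH is not expanded)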
return expand_path(outtmpl).replace(sep, '') @@ -1043,7 +1087,7 @@ def validate_outtmpl(cls, outtmpl): ''' @return None or Exception object ''' outtmpl = re.sub( - STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBUDS]'), + STR_FORMAT_RE_TMPL.format('[^)]*', '[ljhqBUDS]'), lambda mobj: f'{mobj.group(0)[:-1]}s', cls._outtmpl_expandpath(outtmpl)) try: @@ -1056,6 +1100,7 @@ def _copy_infodict(info_dict): info_dict = dict(info_dict) info_dict.pop('__postprocessors', None) + info_dict.pop('__pending_error', None) return info_dict def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False): @@ -1071,7 +1116,7 @@ formatSeconds(info_dict['duration'], '-' if sanitize else ':') if info_dict.get('duration', None) is not None else None) - info_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads + info_dict['autonumber'] = int(self.params.get('autonumber_start', 1) - 1 + self._num_downloads) info_dict['video_autonumber'] = self._num_videos if info_dict.get('resolution') is None: info_dict['resolution'] = self.format_resolution(info_dict, default=None) @@ -1079,38 +1124,51 @@ # For fields playlist_index, playlist_autonumber and autonumber convert all occurrences # of %(field)s to %(field)0Nd for backward compatibility field_size_compat_map = { - 'playlist_index': number_of_digits(info_dict.get('_last_playlist_index') or 0), + 'playlist_index': number_of_digits(info_dict.get('__last_playlist_index') or 0), 'playlist_autonumber': number_of_digits(info_dict.get('n_entries') or 0), 'autonumber': self.params.get('autonumber_size') or 5, } TMPL_DICT = {} - EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBUDS]')) + EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljhqBUDS]')) MATH_FUNCTIONS = { '+': float.__add__, '-': float.__sub__, } # Field is of the form key1.key2... - # where keys (except first) can be string, int or slice - FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)') - MATH_FIELD_RE = r'''(?:{field}|{num})'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?') + # where keys (except first) can be string, int, slice or "{field, ...}" + FIELD_INNER_RE = r'(?:\w+|%(num)s|%(num)s?(?::%(num)s?){1,2})' % {'num': r'(?:-?\d+)'} + FIELD_RE = r'\w*(?:\.(?:%(inner)s|{%(field)s(?:,%(field)s)*}))*' % { + 'inner': FIELD_INNER_RE, + 'field': rf'\w*(?:\.{FIELD_INNER_RE})*' + } + MATH_FIELD_RE = rf'(?:{FIELD_RE}|-?{NUMBER_RE})' MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys())) - INTERNAL_FORMAT_RE = re.compile(r'''(?x) + INTERNAL_FORMAT_RE = re.compile(rf'''(?x) (?P<negate>-)? - (?P<fields>{field}) - (?P<maths>(?:{math_op}{math_field})*) + (?P<fields>{FIELD_RE}) + (?P<maths>(?:{MATH_OPERATORS_RE}{MATH_FIELD_RE})*) (?:>(?P<strf_format>.+?))? (?P<remaining> (?P<alternate>(?<!\\),[^|&)]+)? (?:&(?P<replacement>.*?))? (?:\|(?P<default>.*?))?
- )$'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE)) + )$''') + + def _traverse_infodict(fields): + fields = [f for x in re.split(r'\.({.+?})\.?', fields) + for f in ([x] if x.startswith('{') else x.split('.'))] + for i in (0, -1): + if fields and not fields[i]: + fields.pop(i) - def _traverse_infodict(k): - k = k.split('.') - if k[0] == '': - k.pop(0) - return traverse_obj(info_dict, k, is_user_input=True, traverse_string=True) + for i, f in enumerate(fields): + if not f.startswith('{'): + continue + assert f.endswith('}'), f'No closing brace for {f} in {fields}' + fields[i] = {k: k.split('.') for k in f[1:-1].split(',')} + + return traverse_obj(info_dict, fields, is_user_input=True, traverse_string=True) def get_value(mdict): # Object traversal @@ -1146,6 +1204,9 @@ class YoutubeDL(object): if mdict['strf_format']: value = strftime_or_none(value, mdict['strf_format'].replace('\\,', ',')) + # XXX: Workaround for https://github.com/hypervideo/hypervideo/issues/4485 + if sanitize and value == '': + value = None return value na = self.params.get('outtmpl_na_placeholder', 'NA') @@ -1153,7 +1214,7 @@ class YoutubeDL(object): def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')): return sanitize_filename(str(value), restricted=restricted, is_id=( bool(re.search(r'(^|[_.])id(\.|$)', key)) - if 'filename-sanitization' in self.params.get('compat_opts', []) + if 'filename-sanitization' in self.params['compat_opts'] else NO_DEFAULT)) sanitizer = sanitize if callable(sanitize) else filename_sanitizer @@ -1183,7 +1244,7 @@ class YoutubeDL(object): fmt = outer_mobj.group('format') if fmt == 's' and value is not None and key in field_size_compat_map.keys(): - fmt = '0{:d}d'.format(field_size_compat_map[key]) + fmt = f'0{field_size_compat_map[key]:d}d' value = default if value is None else value if replacement is None else replacement @@ -1193,12 +1254,16 @@ class YoutubeDL(object): delim = '\n' if '#' in flags else ', ' value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt elif fmt[-1] == 'j': # json - value, fmt = json.dumps(value, default=_dumpjson_default, indent=4 if '#' in flags else None), str_fmt + value, fmt = json.dumps( + value, default=_dumpjson_default, + indent=4 if '#' in flags else None, ensure_ascii='+' not in flags), str_fmt + elif fmt[-1] == 'h': # html + value, fmt = escapeHTML(str(value)), str_fmt elif fmt[-1] == 'q': # quoted value = map(str, variadic(value) if '#' in flags else [value]) value, fmt = ' '.join(map(compat_shlex_quote, value)), str_fmt elif fmt[-1] == 'B': # bytes - value = f'%{str_fmt}'.encode('utf-8') % str(value).encode('utf-8') + value = f'%{str_fmt}'.encode() % str(value).encode() value, fmt = value.decode('utf-8', 'ignore'), 's' elif fmt[-1] == 'U': # unicode normalized value, fmt = unicodedata.normalize( @@ -1242,7 +1307,7 @@ class YoutubeDL(object): def _prepare_filename(self, info_dict, *, outtmpl=None, tmpl_type=None): assert None in (outtmpl, tmpl_type), 'outtmpl and tmpl_type are mutually exclusive' if outtmpl is None: - outtmpl = self.outtmpl_dict.get(tmpl_type or 'default', self.outtmpl_dict['default']) + outtmpl = self.params['outtmpl'].get(tmpl_type or 'default', self.params['outtmpl']['default']) try: outtmpl = self._outtmpl_expandpath(outtmpl) filename = self.evaluate_outtmpl(outtmpl, info_dict, True) @@ -1291,11 +1356,19 @@ class YoutubeDL(object): return self.get_output_path(dir_type, filename) def _match_entry(self, info_dict, incomplete=False, 
silent=False): - """ Returns None if the file should be downloaded """ + """Returns None if the file should be downloaded""" + _type = info_dict.get('_type', 'video') + assert incomplete or _type == 'video', 'Only video result can be considered complete' - video_title = info_dict.get('title', info_dict.get('id', 'video')) + video_title = info_dict.get('title', info_dict.get('id', 'entry')) def check_filter(): + if _type in ('playlist', 'multi_video'): + return + elif _type in ('url', 'url_transparent') and not try_call( + lambda: self.get_info_extractor(info_dict['ie_key']).is_single_video(info_dict['url'])): + return + if 'title' in info_dict: # This can happen when we're just evaluating the playlist title = info_dict['title'] @@ -1307,11 +1380,12 @@ class YoutubeDL(object): if rejecttitle: if re.search(rejecttitle, title, re.IGNORECASE): return '"' + title + '" title matched reject pattern "' + rejecttitle + '"' + date = info_dict.get('upload_date') if date is not None: dateRange = self.params.get('daterange', DateRange()) if date not in dateRange: - return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange) + return f'{date_from_str(date).isoformat()} upload date is not in range {dateRange}' view_count = info_dict.get('view_count') if view_count is not None: min_views = self.params.get('min_views') @@ -1330,7 +1404,16 @@ class YoutubeDL(object): except TypeError: # For backward compatibility ret = None if incomplete else match_filter(info_dict) - if ret is not None: + if ret is NO_DEFAULT: + while True: + filename = self._format_screen(self.prepare_filename(info_dict), self.Styles.FILENAME) + reply = input(self._format_screen( + f'Download "{filename}"? (Y/n): ', self.Styles.EMPHASIS)).lower().strip() + if reply in {'y', ''}: + return None + elif reply == 'n': + return f'Skipping {video_title}' + elif ret is not None: return ret return None @@ -1356,18 +1439,19 @@ class YoutubeDL(object): def extract_info(self, url, download=True, ie_key=None, extra_info=None, process=True, force_generic_extractor=False): """ - Return a list with a dictionary for each video extracted. + Extract and return the information dictionary of the URL Arguments: - url -- URL to extract + @param url URL to extract Keyword arguments: - download -- whether to download videos during extraction - ie_key -- extractor key hint - extra_info -- dictionary containing the extra values to add to each result - process -- whether to resolve all unresolved references (URLs, playlist items), - must be True for download to work. - force_generic_extractor -- force using the generic extractor + @param download Whether to download videos + @param process Whether to resolve all unresolved references (URLs, playlist items). 
+ Must be True for download to work + @param ie_key Use only the extractor with this key + + @param extra_info Dictionary containing the extra values to add to the info (For internal use only) + @force_generic_extractor Force using the generic extractor (Deprecated; use ie_key='Generic') """ if extra_info is None: @@ -1377,11 +1461,11 @@ class YoutubeDL(object): ie_key = 'Generic' if ie_key: - ies = {ie_key: self._get_info_extractor_class(ie_key)} + ies = {ie_key: self._ies[ie_key]} if ie_key in self._ies else {} else: ies = self._ies - for ie_key, ie in ies.items(): + for key, ie in ies.items(): if not ie.suitable(url): continue @@ -1390,16 +1474,18 @@ class YoutubeDL(object): 'and will probably not work.') temp_id = ie.get_temp_id(url) - if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}): - self.to_screen(f'[{ie_key}] {temp_id}: has already been recorded in the archive') + if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': key}): + self.to_screen(f'[{key}] {temp_id}: has already been recorded in the archive') if self.params.get('break_on_existing', False): raise ExistingVideoReached() break - return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process) + return self.__extract_info(url, self.get_info_extractor(key), download, extra_info, process) else: - self.report_error('no suitable InfoExtractor for URL %s' % url) + extractors_restricted = self.params.get('allowed_extractors') not in (None, ['default']) + self.report_error(f'No suitable extractor{format_field(ie_key, None, " (%s)")} found for URL {url}', + tb=False if extractors_restricted else None) - def __handle_extraction_exceptions(func): + def _handle_extraction_exceptions(func): @functools.wraps(func) def wrapper(self, *args, **kwargs): while True: @@ -1431,7 +1517,7 @@ class YoutubeDL(object): break return wrapper - def _wait_for_video(self, ie_result): + def _wait_for_video(self, ie_result={}): if (not self.params.get('wait_for_video') or ie_result.get('_type', 'video') != 'video' or ie_result.get('formats') or ie_result.get('url')): @@ -1442,7 +1528,12 @@ class YoutubeDL(object): def progress(msg): nonlocal last_msg - self.to_screen(msg + ' ' * (len(last_msg) - len(msg)) + '\r', skip_eol=True) + full_msg = f'{msg}\n' + if not self.params.get('noprogress'): + full_msg = msg + ' ' * (len(last_msg) - len(msg)) + '\r' + elif last_msg: + return + self.to_screen(full_msg, skip_eol=True) last_msg = msg min_wait, max_wait = self.params.get('wait_for_video') @@ -1450,7 +1541,7 @@ class YoutubeDL(object): if diff is None and ie_result.get('live_status') == 'is_upcoming': diff = round(random.uniform(min_wait, max_wait) if (max_wait and min_wait) else (max_wait or min_wait), 0) self.report_warning('Release time of video is not known') - elif (diff or 0) <= 0: + elif ie_result and (diff or 0) <= 0: self.report_warning('Video should already be available according to extracted info') diff = min(max(diff or 0, min_wait or 0), max_wait or float('inf')) self.to_screen(f'[wait] Waiting for {format_dur(diff)} - Press Ctrl+C to try now') @@ -1472,10 +1563,18 @@ class YoutubeDL(object): self.to_screen('') raise - @__handle_extraction_exceptions + @_handle_extraction_exceptions def __extract_info(self, url, ie, download, extra_info, process): - ie_result = ie.extract(url) + try: + ie_result = ie.extract(url) + except UserNotLive as e: + if process: + if self.params.get('wait_for_video'): + self.report_warning(e) + self._wait_for_video() + raise if 
ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) + self.report_warning(f'Extractor {ie.IE_NAME} returned nothing{bug_reports_message()}') return if isinstance(ie_result, list): # Backwards compatibility: old IE result format @@ -1523,7 +1622,8 @@ class YoutubeDL(object): result_type = ie_result.get('_type', 'video') if result_type in ('url', 'url_transparent'): - ie_result['url'] = sanitize_url(ie_result['url']) + ie_result['url'] = sanitize_url( + ie_result['url'], scheme='http' if self.params.get('prefer_insecure') else 'https') if ie_result.get('original_url'): extra_info.setdefault('original_url', ie_result['original_url']) @@ -1537,7 +1637,9 @@ class YoutubeDL(object): self.add_default_extra_info(info_copy, ie, ie_result['url']) self.add_extra_info(info_copy, extra_info) info_copy, _ = self.pre_process(info_copy) + self._fill_common_fields(info_copy, False) self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True) + self._raise_pending_errors(info_copy) if self.params.get('force_write_download_archive', False): self.record_download_archive(info_copy) return ie_result @@ -1545,10 +1647,11 @@ class YoutubeDL(object): if result_type == 'video': self.add_extra_info(ie_result, extra_info) ie_result = self.process_video_result(ie_result, download=download) + self._raise_pending_errors(ie_result) additional_urls = (ie_result or {}).get('additional_urls') if additional_urls: # TODO: Improve MetadataParserPP to allow setting a list - if isinstance(additional_urls, compat_str): + if isinstance(additional_urls, str): additional_urls = [additional_urls] self.to_screen( '[info] %s: %d additional URL(s) requested' % (ie_result['id'], len(additional_urls))) @@ -1579,9 +1682,13 @@ class YoutubeDL(object): if not info: return info + exempted_fields = {'_type', 'url', 'ie_key'} + if not ie_result.get('section_end') and ie_result.get('section_start') is None: + # For video clips, the id etc of the clip extractor should be used + exempted_fields |= {'id', 'extractor', 'extractor_key'} + new_result = info.copy() - new_result.update(filter_dict(ie_result, lambda k, v: ( - v is not None and k not in {'_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'}))) + new_result.update(filter_dict(ie_result, lambda k, v: v is not None and k not in exempted_fields)) # Extracted info may not be a video result (i.e. 
# info.get('_type', 'video') != video) but rather an url or @@ -1597,8 +1704,8 @@ elif result_type in ('playlist', 'multi_video'): # Protect from infinite recursion due to recursively nested playlists # (see https://github.com/ytdl-org/youtube-dl/issues/27833) - webpage_url = ie_result['webpage_url'] - if webpage_url in self._playlist_urls: + webpage_url = ie_result.get('webpage_url') # Playlists may not have webpage_url + if webpage_url and webpage_url in self._playlist_urls: self.to_screen( '[download] Skipping already downloaded playlist: %s' % ie_result.get('title') or ie_result.get('id')) @@ -1640,124 +1747,65 @@ return make_dir(path, self.report_error) @staticmethod - def _playlist_infodict(ie_result, **kwargs): - return { - **ie_result, + def _playlist_infodict(ie_result, strict=False, **kwargs): + info = { + 'playlist_count': ie_result.get('playlist_count'), 'playlist': ie_result.get('title') or ie_result.get('id'), 'playlist_id': ie_result.get('id'), 'playlist_title': ie_result.get('title'), 'playlist_uploader': ie_result.get('uploader'), 'playlist_uploader_id': ie_result.get('uploader_id'), - 'playlist_index': 0, **kwargs, } + if strict: + return info + if ie_result.get('webpage_url'): + info.update({ + 'webpage_url': ie_result['webpage_url'], + 'webpage_url_basename': url_basename(ie_result['webpage_url']), + 'webpage_url_domain': get_domain(ie_result['webpage_url']), + }) + return { + **info, + 'playlist_index': 0, + '__last_playlist_index': max(ie_result['requested_entries'] or (0, 0)), + 'extractor': ie_result['extractor'], + 'extractor_key': ie_result['extractor_key'], + } def __process_playlist(self, ie_result, download): - # We process each entry in the playlist - playlist = ie_result.get('title') or ie_result.get('id') - self.to_screen('[download] Downloading playlist: %s' % playlist) - - if 'entries' not in ie_result: - raise EntryNotInPlaylist('There are no entries') - - MissingEntry = object() - incomplete_entries = bool(ie_result.get('requested_entries')) - if incomplete_entries: - def fill_missing_entries(entries, indices): - ret = [MissingEntry] * max(indices) - for i, entry in zip(indices, entries): - ret[i - 1] = entry - return ret - ie_result['entries'] = fill_missing_entries(ie_result['entries'], ie_result['requested_entries']) - - playlist_results = [] - - playliststart = self.params.get('playliststart', 1) - playlistend = self.params.get('playlistend') - # For backwards compatibility, interpret -1 as whole list - if playlistend == -1: - playlistend = None - - playlistitems_str = self.params.get('playlist_items') - playlistitems = None - if playlistitems_str is not None: - def iter_playlistitems(format): - for string_segment in format.split(','): - if '-' in string_segment: - start, end = string_segment.split('-') - for item in range(int(start), int(end) + 1): - yield int(item) - else: - yield int(string_segment) - playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) + """Process each entry in the playlist""" + assert ie_result['_type'] in ('playlist', 'multi_video') - ie_entries = ie_result['entries'] - if isinstance(ie_entries, list): - playlist_count = len(ie_entries) - msg = f'Collected {playlist_count} videos; downloading %d of them' - ie_result['playlist_count'] = ie_result.get('playlist_count') or playlist_count - - def get_entry(i): - return ie_entries[i - 1] - else: - msg = 'Downloading %d videos' - if not isinstance(ie_entries, (PagedList, LazyList)): - ie_entries = LazyList(ie_entries) -
elif isinstance(ie_entries, InAdvancePagedList): - if ie_entries._pagesize == 1: - playlist_count = ie_entries._pagecount - - def get_entry(i): - return YoutubeDL.__handle_extraction_exceptions( - lambda self, i: ie_entries[i - 1] - )(self, i) - - entries, broken = [], False - items = playlistitems if playlistitems is not None else itertools.count(playliststart) - for i in items: - if i == 0: - continue - if playlistitems is None and playlistend is not None and playlistend < i: - break - entry = None - try: - entry = get_entry(i) - if entry is MissingEntry: - raise EntryNotInPlaylist() - except (IndexError, EntryNotInPlaylist): - if incomplete_entries: - raise EntryNotInPlaylist(f'Entry {i} cannot be found') - elif not playlistitems: - break - entries.append(entry) - try: - if entry is not None: - self._match_entry(entry, incomplete=True, silent=True) - except (ExistingVideoReached, RejectedVideoReached): - broken = True - break - ie_result['entries'] = entries + common_info = self._playlist_infodict(ie_result, strict=True) + title = common_info.get('playlist') or '' + if self._match_entry(common_info, incomplete=True) is not None: + return + self.to_screen(f'[download] Downloading {ie_result["_type"]}: {title}') - # Save playlist_index before re-ordering - entries = [ - ((playlistitems[i - 1] if playlistitems else i + playliststart - 1), entry) - for i, entry in enumerate(entries, 1) - if entry is not None] - n_entries = len(entries) + all_entries = PlaylistEntries(self, ie_result) + entries = orderedSet(all_entries.get_requested_items(), lazy=True) - if not (ie_result.get('playlist_count') or broken or playlistitems or playlistend): - ie_result['playlist_count'] = n_entries + lazy = self.params.get('lazy_playlist') + if lazy: + resolved_entries, n_entries = [], 'N/A' + ie_result['requested_entries'], ie_result['entries'] = None, None + else: + entries = resolved_entries = list(entries) + n_entries = len(resolved_entries) + ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], []) + if not ie_result.get('playlist_count'): + # Better to do this after potentially exhausting entries + ie_result['playlist_count'] = all_entries.get_full_count() - if not playlistitems and (playliststart != 1 or playlistend): - playlistitems = list(range(playliststart, playliststart + n_entries)) - ie_result['requested_entries'] = playlistitems + extra = self._playlist_infodict(ie_result, n_entries=int_or_none(n_entries)) + ie_copy = collections.ChainMap(ie_result, extra) _infojson_written = False write_playlist_files = self.params.get('allow_playlist_files', True) if write_playlist_files and self.params.get('list_thumbnails'): self.list_thumbnails(ie_result) if write_playlist_files and not self.params.get('simulate'): - ie_copy = self._playlist_infodict(ie_result, n_entries=n_entries) _infojson_written = self._write_info_json( 'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson')) if _infojson_written is None: @@ -1766,57 +1814,72 @@ class YoutubeDL(object): self.prepare_filename(ie_copy, 'pl_description')) is None: return # TODO: This should be passed to ThumbnailsConvertor if necessary - self._write_thumbnails('playlist', ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail')) - - if self.params.get('playlistreverse', False): - entries = entries[::-1] - if self.params.get('playlistrandom', False): + self._write_thumbnails('playlist', ie_result, self.prepare_filename(ie_copy, 'pl_thumbnail')) + + if lazy: + if self.params.get('playlistreverse') or 
self.params.get('playlistrandom'): + self.report_warning('playlistreverse and playlistrandom are not supported with lazy_playlist', only_once=True) + elif self.params.get('playlistreverse'): + entries.reverse() + elif self.params.get('playlistrandom'): random.shuffle(entries) - x_forwarded_for = ie_result.get('__x_forwarded_for_ip') + self.to_screen(f'[{ie_result["extractor"]}] Playlist {title}: Downloading {n_entries} items' + f'{format_field(ie_result, "playlist_count", " of %s")}') + + keep_resolved_entries = self.params.get('extract_flat') != 'discard' + if self.params.get('extract_flat') == 'discard_in_playlist': + keep_resolved_entries = ie_result['_type'] != 'playlist' + if keep_resolved_entries: + self.write_debug('The information of all playlist entries will be held in memory') - self.to_screen('[%s] playlist %s: %s' % (ie_result['extractor'], playlist, msg % n_entries)) failures = 0 max_failures = self.params.get('skip_playlist_after_errors') or float('inf') - for i, entry_tuple in enumerate(entries, 1): - playlist_index, entry = entry_tuple - if 'playlist-index' in self.params.get('compat_opts', []): - playlist_index = playlistitems[i - 1] if playlistitems else i + playliststart - 1 - self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) - # This __x_forwarded_for_ip thing is a bit ugly but requires - # minimal changes - if x_forwarded_for: - entry['__x_forwarded_for_ip'] = x_forwarded_for - extra = { - 'n_entries': n_entries, - '_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries), - 'playlist_count': ie_result.get('playlist_count'), + for i, (playlist_index, entry) in enumerate(entries): + if lazy: + resolved_entries.append((playlist_index, entry)) + if not entry: + continue + + entry['__x_forwarded_for_ip'] = ie_result.get('__x_forwarded_for_ip') + if not lazy and 'playlist-index' in self.params.get('compat_opts', []): + playlist_index = ie_result['requested_entries'][i] + + entry_copy = collections.ChainMap(entry, { + **common_info, + 'n_entries': int_or_none(n_entries), 'playlist_index': playlist_index, - 'playlist_autonumber': i, - 'playlist': playlist, - 'playlist_id': ie_result.get('id'), - 'playlist_title': ie_result.get('title'), - 'playlist_uploader': ie_result.get('uploader'), - 'playlist_uploader_id': ie_result.get('uploader_id'), - 'extractor': ie_result['extractor'], - 'webpage_url': ie_result['webpage_url'], - 'webpage_url_basename': url_basename(ie_result['webpage_url']), - 'webpage_url_domain': get_domain(ie_result['webpage_url']), - 'extractor_key': ie_result['extractor_key'], - } + 'playlist_autonumber': i + 1, + }) - if self._match_entry(entry, incomplete=True) is not None: + if self._match_entry(entry_copy, incomplete=True) is not None: + # For compatibility with youtube-dl.
See https://github.com/hypervideo/hypervideo/issues/4369 + resolved_entries[i] = (playlist_index, NO_DEFAULT) continue + self.to_screen('[download] Downloading item %s of %s' % ( + self._format_screen(i + 1, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS))) + + extra.update({ + 'playlist_index': playlist_index, + 'playlist_autonumber': i + 1, + }) entry_result = self.__process_iterable_entry(entry, download, extra) if not entry_result: failures += 1 if failures >= max_failures: self.report_error( - 'Skipping the remaining entries in playlist "%s" since %d items failed extraction' % (playlist, failures)) + f'Skipping the remaining entries in playlist "{title}" since {failures} items failed extraction') break - playlist_results.append(entry_result) - ie_result['entries'] = playlist_results + if keep_resolved_entries: + resolved_entries[i] = (playlist_index, entry_result) + + # Update with processed data + ie_result['entries'] = [e for _, e in resolved_entries if e is not NO_DEFAULT] + ie_result['requested_entries'] = [i for i, e in resolved_entries if e is not NO_DEFAULT] + if ie_result['requested_entries'] == try_call(lambda: list(range(1, ie_result['playlist_count'] + 1))): + # Do not set for full playlist + ie_result.pop('requested_entries') # Write the updated info to json if _infojson_written is True and self._write_info_json( @@ -1825,10 +1888,10 @@ class YoutubeDL(object): return ie_result = self.run_all_pps('playlist', ie_result) - self.to_screen(f'[download] Finished downloading playlist: {playlist}') + self.to_screen(f'[download] Finished downloading playlist: {title}') return ie_result - @__handle_extraction_exceptions + @_handle_extraction_exceptions def __process_iterable_entry(self, entry, download, extra_info): return self.process_ie_result( entry, download=download, extra_info=extra_info) @@ -1910,7 +1973,7 @@ class YoutubeDL(object): temp_file.close() try: success, _ = self.dl(temp_file.name, f, test=True) - except (DownloadError, IOError, OSError, ValueError) + network_exceptions: + except (DownloadError, OSError, ValueError) + network_exceptions: success = False finally: if os.path.exists(temp_file.name): @@ -1934,12 +1997,12 @@ class YoutubeDL(object): and download and ( not can_merge() - or info_dict.get('is_live', False) - or self.outtmpl_dict['default'] == '-')) + or info_dict.get('is_live') and not self.params.get('live_from_start') + or self.params['outtmpl']['default'] == '-')) compat = ( prefer_best or self.params.get('allow_multiple_audio_streams', False) - or 'format-spec' in self.params.get('compat_opts', [])) + or 'format-spec' in self.params['compat_opts']) return ( 'best/bestvideo+bestaudio' if prefer_best @@ -1950,7 +2013,7 @@ class YoutubeDL(object): def syntax_error(note, start): message = ( 'Invalid format specification: ' - '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1])) + '{}\n\t{}\n\t{}^'.format(note, format_spec, ' ' * start[1])) return SyntaxError(message) PICKFIRST = 'PICKFIRST' @@ -1973,8 +2036,8 @@ class YoutubeDL(object): filter_parts.append(string) def _remove_unused_ops(tokens): - # Remove operators that we don't use and join them with the surrounding strings - # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' + # Remove operators that we don't use and join them with the surrounding strings. + # E.g. 
'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' ALLOWED_OPS = ('/', '+', ',', '(', ')') last_string, last_start, last_end, last_line = None, None, None, None for type, string, start, end, line in tokens: @@ -2054,7 +2117,7 @@ class YoutubeDL(object): raise syntax_error('Expected a selector', start) current_selector = FormatSelector(MERGE, (selector_1, selector_2), []) else: - raise syntax_error('Operator not recognized: "{0}"'.format(string), start) + raise syntax_error(f'Operator not recognized: "{string}"', start) elif type == tokenize.ENDMARKER: break if current_selector: @@ -2090,14 +2153,13 @@ class YoutubeDL(object): the_only_video = video_fmts[0] if len(video_fmts) == 1 else None the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None - output_ext = self.params.get('merge_output_format') - if not output_ext: - if the_only_video: - output_ext = the_only_video['ext'] - elif the_only_audio and not video_fmts: - output_ext = the_only_audio['ext'] - else: - output_ext = 'mkv' + output_ext = get_compatible_ext( + vcodecs=[f.get('vcodec') for f in video_fmts], + acodecs=[f.get('acodec') for f in audio_fmts], + vexts=[f['ext'] for f in video_fmts], + aexts=[f['ext'] for f in audio_fmts], + preferences=(try_call(lambda: self.params['merge_output_format'].split('/')) + or self.params.get('prefer_free_formats') and ('webm', 'mkv'))) filtered = lambda *keys: filter(None, (traverse_obj(fmt, *keys) for fmt in formats_info)) @@ -2123,6 +2185,7 @@ class YoutubeDL(object): 'vcodec': the_only_video.get('vcodec'), 'vbr': the_only_video.get('vbr'), 'stretched_ratio': the_only_video.get('stretched_ratio'), + 'aspect_ratio': the_only_video.get('aspect_ratio'), }) if the_only_audio: @@ -2130,6 +2193,7 @@ class YoutubeDL(object): 'acodec': the_only_audio.get('acodec'), 'abr': the_only_audio.get('abr'), 'asr': the_only_audio.get('asr'), + 'audio_channels': the_only_audio.get('audio_channels') }) return new_dict @@ -2178,7 +2242,8 @@ class YoutubeDL(object): yield from _check_formats(ctx['formats'][::-1]) elif format_spec == 'mergeall': def selector_function(ctx): - formats = list(_check_formats(ctx['formats'])) + formats = list(_check_formats( + f for f in ctx['formats'] if f.get('vcodec') != 'none' or f.get('acodec') != 'none')) if not formats: return merged_format = formats[-1] @@ -2235,7 +2300,7 @@ class YoutubeDL(object): matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1])) try: yield matches[format_idx - 1] - except IndexError: + except LazyList.IndexError: return filters = [self._build_format_filter(f) for f in selector.filters] @@ -2247,13 +2312,13 @@ class YoutubeDL(object): return selector_function(ctx_copy) return final_selector - stream = io.BytesIO(format_spec.encode('utf-8')) + stream = io.BytesIO(format_spec.encode()) try: - tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline))) + tokens = list(_remove_unused_ops(tokenize.tokenize(stream.readline))) except tokenize.TokenError: raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) - class TokenIterator(object): + class TokenIterator: def __init__(self, tokens): self.tokens = tokens self.counter = 0 @@ -2279,7 +2344,7 @@ class YoutubeDL(object): def _calc_headers(self, info_dict): res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {}) - cookies = self._calc_cookies(info_dict) + cookies = self._calc_cookies(info_dict['url']) if cookies: res['Cookie'] = cookies @@ -2290,8 +2355,8 @@ class YoutubeDL(object): 
return res - def _calc_cookies(self, info_dict): - pr = sanitized_Request(info_dict['url']) + def _calc_cookies(self, url): + pr = sanitized_Request(url) self.cookiejar.add_cookie_header(pr) return pr.get_header('Cookie') @@ -2335,17 +2400,20 @@ class YoutubeDL(object): else: info_dict['thumbnails'] = thumbnails - def _fill_common_fields(self, info_dict, is_video=True): + def _fill_common_fields(self, info_dict, final=True): # TODO: move sanitization here - if is_video: - # playlists are allowed to lack "title" - info_dict['fulltitle'] = info_dict.get('title') - if 'title' not in info_dict: + if final: + title = info_dict.get('title', NO_DEFAULT) + if title is NO_DEFAULT: raise ExtractorError('Missing "title" field in extractor result', video_id=info_dict['id'], ie=info_dict['extractor']) - elif not info_dict.get('title'): - self.report_warning('Extractor failed to obtain "title". Creating a generic title instead') - info_dict['title'] = f'{info_dict["extractor"]} video #{info_dict["id"]}' + info_dict['fulltitle'] = title + if not title: + if title == '': + self.write_debug('Extractor gave empty title. Creating a generic title') + else: + self.report_warning('Extractor failed to obtain "title". Creating a generic title instead') + info_dict['title'] = f'{info_dict["extractor"].replace(":", "-")} video #{info_dict["id"]}' if info_dict.get('duration') is not None: info_dict['duration_string'] = formatSeconds(info_dict['duration']) @@ -2358,11 +2426,9 @@ class YoutubeDL(object): if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None: # Working around out-of-range timestamp values (e.g. negative ones on Windows, # see http://bugs.python.org/issue1646728) - try: + with contextlib.suppress(ValueError, OverflowError, OSError): upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) info_dict[date_key] = upload_date.strftime('%Y%m%d') - except (ValueError, OverflowError, OSError): - pass live_keys = ('is_live', 'was_live') live_status = info_dict.get('live_status') @@ -2380,13 +2446,32 @@ class YoutubeDL(object): for key in live_keys: if info_dict.get(key) is None: info_dict[key] = (live_status == key) + if live_status == 'post_live': + info_dict['was_live'] = True # Auto generate title fields corresponding to the *_number fields when missing # in order to always have clean titles. This is very common for TV series. 
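        # (illustrative note, not in the original patch) e.g. an entry carrying
        # {'episode_number': 3} and no 'episode' field ends up with episode == 'Episode 3'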
for field in ('chapter', 'season', 'episode'):
-            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
+            if final and info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

+    def _raise_pending_errors(self, info):
+        err = info.pop('__pending_error', None)
+        if err:
+            self.report_error(err, tb=False)
+
+    def sort_formats(self, info_dict):
+        formats = self._get_formats(info_dict)
+        if not formats:
+            return
+        # Backward compatibility with InfoExtractor._sort_formats
+        field_preference = formats[0].pop('__sort_fields', None)
+        if field_preference:
+            info_dict['_format_sort_fields'] = field_preference
+
+        formats.sort(key=FormatSorter(
+            self, info_dict.get('_format_sort_fields', [])).calculate_preference)
+
     def process_video_result(self, info_dict, download=True):
         assert info_dict.get('_type', 'video') == 'video'
         self._num_videos += 1
@@ -2403,24 +2488,40 @@ class YoutubeDL(object):

         def sanitize_string_field(info, string_field):
             field = info.get(string_field)
-            if field is None or isinstance(field, compat_str):
+            if field is None or isinstance(field, str):
                 return
             report_force_conversion(string_field, 'a string', 'string')
-            info[string_field] = compat_str(field)
+            info[string_field] = str(field)

         def sanitize_numeric_fields(info):
             for numeric_field in self._NUMERIC_FIELDS:
                 field = info.get(numeric_field)
-                if field is None or isinstance(field, compat_numeric_types):
+                if field is None or isinstance(field, (int, float)):
                     continue
                 report_force_conversion(numeric_field, 'numeric', 'int')
                 info[numeric_field] = int_or_none(field)

         sanitize_string_field(info_dict, 'id')
         sanitize_numeric_fields(info_dict)
+        if info_dict.get('section_end') and info_dict.get('section_start') is not None:
+            info_dict['duration'] = round(info_dict['section_end'] - info_dict['section_start'], 3)
         if (info_dict.get('duration') or 0) <= 0 and info_dict.pop('duration', None):
             self.report_warning('"duration" field is negative, there is an error in extractor')

+        chapters = info_dict.get('chapters') or []
+        if chapters and chapters[0].get('start_time'):
+            chapters.insert(0, {'start_time': 0})
+
+        dummy_chapter = {'end_time': 0, 'start_time': info_dict.get('duration')}
+        for idx, (prev, current, next_) in enumerate(zip(
+                (dummy_chapter, *chapters), chapters, (*chapters[1:], dummy_chapter)), 1):
+            if current.get('start_time') is None:
+                current['start_time'] = prev.get('end_time')
+            if not current.get('end_time'):
+                current['end_time'] = next_.get('start_time')
+            if not current.get('title'):
+                current['title'] = f'<Untitled Chapter {idx}>'
+
         if 'playlist' not in info_dict:
             # It isn't part of a playlist
             info_dict['playlist'] = None
@@ -2456,20 +2557,18 @@ class YoutubeDL(object):
         info_dict['requested_subtitles'] = self.process_subtitles(
             info_dict['id'], subtitles, automatic_captions)

-        if info_dict.get('formats') is None:
-            # There's only one format available
-            formats = [info_dict]
-        else:
-            formats = info_dict['formats']
+        self.sort_formats(info_dict)
+        formats = self._get_formats(info_dict)

-        info_dict['__has_drm'] = any(f.get('has_drm') for f in formats)
+        # or None ensures --clean-infojson removes it
+        info_dict['_has_drm'] = any(f.get('has_drm') for f in formats) or None
         if not self.params.get('allow_unplayable_formats'):
             formats = [f for f in formats if not f.get('has_drm')]
-            if info_dict['__has_drm'] and all(
-                    f.get('acodec') == f.get('vcodec') == 'none' for f in formats):
-                self.report_warning(
-                    'This video is DRM 
protected and only images are available for download. ' - 'Use --list-formats to see them') + + if formats and all(f.get('acodec') == f.get('vcodec') == 'none' for f in formats): + self.report_warning( + f'{"This video is DRM protected and " if info_dict["_has_drm"] else ""}' + 'only images are available for download. Use --list-formats to see them'.capitalize()) get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start')) if not get_from_start: @@ -2481,9 +2580,6 @@ class YoutubeDL(object): '--live-from-start is passed, but there are no formats that can be downloaded from the start. ' 'If you want to download from the current time, use --no-live-from-start')) - if not formats: - self.raise_no_formats(info_dict) - def is_wellformed(f): url = f.get('url') if not url: @@ -2496,7 +2592,10 @@ class YoutubeDL(object): return True # Filter out malformed formats for better extraction robustness - formats = list(filter(is_wellformed, formats)) + formats = list(filter(is_wellformed, formats or [])) + + if not formats: + self.raise_no_formats(info_dict) formats_dict = {} @@ -2506,7 +2605,7 @@ class YoutubeDL(object): sanitize_numeric_fields(format) format['url'] = sanitize_url(format['url']) if not format.get('format_id'): - format['format_id'] = compat_str(i) + format['format_id'] = str(i) else: # Sanitize format_id from characters used in format selector expression format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id']) @@ -2542,9 +2641,11 @@ class YoutubeDL(object): format['resolution'] = self.format_resolution(format, default=None) if format.get('dynamic_range') is None and format.get('vcodec') != 'none': format['dynamic_range'] = 'SDR' + if format.get('aspect_ratio') is None: + format['aspect_ratio'] = try_call(lambda: round(format['width'] / format['height'], 2)) if (info_dict.get('duration') and format.get('tbr') and not format.get('filesize') and not format.get('filesize_approx')): - format['filesize_approx'] = info_dict['duration'] * format['tbr'] * (1024 / 8) + format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8)) # Add HTTP headers, so that external programs can use them from the # json output @@ -2574,10 +2675,9 @@ class YoutubeDL(object): info_dict, _ = self.pre_process(info_dict, 'after_filter') # The pre-processors may have modified the formats - formats = info_dict.get('formats', [info_dict]) + formats = self._get_formats(info_dict) - list_only = self.params.get('simulate') is None and ( - self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles')) + list_only = self.params.get('simulate') == 'list_only' interactive_format_selection = not list_only and self.format_selector == '-' if self.params.get('list_thumbnails'): self.list_thumbnails(info_dict) @@ -2591,7 +2691,7 @@ class YoutubeDL(object): if list_only: # Without this printing, -F --print-json will not work self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True) - return + return info_dict format_selector = self.format_selector if format_selector is None: @@ -2632,20 +2732,39 @@ class YoutubeDL(object): # Process what we can, even without any available formats. 
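            # (annotation, not in the original patch) the single empty dict keeps the
            # download loop below running exactly once: new_info.update({}) is a no-op,
            # so processing falls through to the bare info_dict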
formats_to_download = [{}] - best_format = formats_to_download[-1] + requested_ranges = tuple(self.params.get('download_ranges', lambda *_: [{}])(info_dict, self)) + best_format, downloaded_formats = formats_to_download[-1], [] if download: - if best_format: - self.to_screen( - f'[info] {info_dict["id"]}: Downloading {len(formats_to_download)} format(s): ' - + ', '.join([f['format_id'] for f in formats_to_download])) + if best_format and requested_ranges: + def to_screen(*msg): + self.to_screen(f'[info] {info_dict["id"]}: {" ".join(", ".join(variadic(m)) for m in msg)}') + + to_screen(f'Downloading {len(formats_to_download)} format(s):', + (f['format_id'] for f in formats_to_download)) + if requested_ranges != ({}, ): + to_screen(f'Downloading {len(requested_ranges)} time ranges:', + (f'{c["start_time"]:.1f}-{c["end_time"]:.1f}' for c in requested_ranges)) max_downloads_reached = False - for i, fmt in enumerate(formats_to_download): - formats_to_download[i] = new_info = self._copy_infodict(info_dict) + + for fmt, chapter in itertools.product(formats_to_download, requested_ranges): + new_info = self._copy_infodict(info_dict) new_info.update(fmt) + offset, duration = info_dict.get('section_start') or 0, info_dict.get('duration') or float('inf') + end_time = offset + min(chapter.get('end_time', duration), duration) + if chapter or offset: + new_info.update({ + 'section_start': offset + chapter.get('start_time', 0), + # duration may not be accurate. So allow deviations <1sec + 'section_end': end_time if end_time <= offset + duration + 1 else None, + 'section_title': chapter.get('title'), + 'section_number': chapter.get('index'), + }) + downloaded_formats.append(new_info) try: self.process_info(new_info) except MaxDownloadsReached: max_downloads_reached = True + self._raise_pending_errors(new_info) # Remove copied info for key, val in tuple(new_info.items()): if info_dict.get(key) == val: @@ -2653,12 +2772,12 @@ class YoutubeDL(object): if max_downloads_reached: break - write_archive = set(f.get('__write_download_archive', False) for f in formats_to_download) + write_archive = {f.get('__write_download_archive', False) for f in downloaded_formats} assert write_archive.issubset({True, False, 'ignore'}) if True in write_archive and False not in write_archive: self.record_download_archive(info_dict) - info_dict['requested_downloads'] = formats_to_download + info_dict['requested_downloads'] = downloaded_formats info_dict = self.run_all_pps('after_video', info_dict) if max_downloads_reached: raise MaxDownloadsReached() @@ -2669,50 +2788,35 @@ class YoutubeDL(object): def process_subtitles(self, video_id, normal_subtitles, automatic_captions): """Select the requested subtitles and their format""" - available_subs = {} + available_subs, normal_sub_langs = {}, [] if normal_subtitles and self.params.get('writesubtitles'): available_subs.update(normal_subtitles) + normal_sub_langs = tuple(normal_subtitles.keys()) if automatic_captions and self.params.get('writeautomaticsub'): for lang, cap_info in automatic_captions.items(): if lang not in available_subs: available_subs[lang] = cap_info - if (not self.params.get('writesubtitles') and not - self.params.get('writeautomaticsub') or not - available_subs): + if not available_subs or ( + not self.params.get('writesubtitles') + and not self.params.get('writeautomaticsub')): return None - all_sub_langs = available_subs.keys() + all_sub_langs = tuple(available_subs.keys()) if self.params.get('allsubtitles', False): requested_langs = all_sub_langs elif 
self.params.get('subtitleslangs', False):
-            # A list is used so that the order of languages will be the same as
-            # given in subtitleslangs. See https://github.com/hypervideo/hypervideo/issues/1041
-            requested_langs = []
-            for lang_re in self.params.get('subtitleslangs'):
-                discard = lang_re[0] == '-'
-                if discard:
-                    lang_re = lang_re[1:]
-                if lang_re == 'all':
-                    if discard:
-                        requested_langs = []
-                    else:
-                        requested_langs.extend(all_sub_langs)
-                    continue
-                current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs)
-                if discard:
-                    for lang in current_langs:
-                        while lang in requested_langs:
-                            requested_langs.remove(lang)
-                else:
-                    requested_langs.extend(current_langs)
-            requested_langs = orderedSet(requested_langs)
-        elif 'en' in available_subs:
-            requested_langs = ['en']
+            try:
+                requested_langs = orderedSet_from_options(
+                    self.params.get('subtitleslangs'), {'all': all_sub_langs}, use_regex=True)
+            except re.error as e:
+                raise ValueError(f'Wrong regex for subtitleslangs: {e.pattern}')
+        elif normal_sub_langs:
+            requested_langs = ['en'] if 'en' in normal_sub_langs else normal_sub_langs[:1]
         else:
-            requested_langs = [list(all_sub_langs)[0]]
+            requested_langs = ['en'] if 'en' in all_sub_langs else all_sub_langs[:1]
         if requested_langs:
-            self.write_debug('Downloading subtitles: %s' % ', '.join(requested_langs))
+            self.to_screen(f'[info] {video_id}: Downloading subtitles: {", ".join(requested_langs)}')

         formats_query = self.params.get('subtitlesformat', 'best')
         formats_preference = formats_query.split('/') if formats_query else []
@@ -2720,7 +2824,7 @@ class YoutubeDL(object):
         for lang in requested_langs:
             formats = available_subs.get(lang)
             if formats is None:
-                self.report_warning('%s subtitles not available for %s' % (lang, video_id))
+                self.report_warning(f'{lang} subtitles not available for {video_id}')
                 continue
             for ext in formats_preference:
                 if ext == 'best':
@@ -2748,12 +2852,16 @@ class YoutubeDL(object):
            info_copy['automatic_captions_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('automatic_captions'))

         def format_tmpl(tmpl):
-            mobj = re.match(r'\w+(=?)$', tmpl)
-            if mobj and mobj.group(1):
-                return f'{tmpl[:-1]} = %({tmpl[:-1]})r'
-            elif mobj:
-                return f'%({tmpl})s'
-            return tmpl
+            mobj = re.fullmatch(r'([\w.:,]|-\d|(?P<dict>{([\w.:,]|-\d)+}))+=?', tmpl)
+            if not mobj:
+                return tmpl
+
+            fmt = '%({})s'
+            if tmpl.startswith('{'):
+                tmpl = f'.{tmpl}'
+            if tmpl.endswith('='):
+                tmpl, fmt = tmpl[:-1], '{0} = %({0})#j'
+            return '\n'.join(map(fmt.format, [tmpl] if mobj.group('dict') else tmpl.split(',')))

         for tmpl in self.params['forceprint'].get(key, []):
             self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy))
@@ -2763,7 +2871,7 @@ class YoutubeDL(object):
             tmpl = format_tmpl(tmpl)
             self.to_screen(f'[info] Writing {tmpl!r} to: {filename}')
             if self._ensure_dir_exists(filename):
-                with io.open(filename, 'a', encoding='utf-8') as f:
+                with open(filename, 'a', encoding='utf-8') as f:
                     f.write(self.evaluate_outtmpl(tmpl, info_copy) + '\n')

     def __forced_printings(self, info_dict, filename, incomplete):
@@ -2833,7 +2941,7 @@ class YoutubeDL(object):
         urls = '", "'.join(
            (f['url'].split(',')[0] + ',' if f['url'].startswith('data:') else f['url'])
            for f in info.get('requested_formats', []) or [info])
-        self.write_debug('Invoking downloader on "%s"' % urls)
+        self.write_debug(f'Invoking {fd.FD_NAME} downloader on "{urls}"')

        # Note: Ideally info should be deep-copied so that hooks cannot modify it.
# But it may contain objects that are not deep-copyable @@ -2861,8 +2969,6 @@ class YoutubeDL(object): if 'format' not in info_dict and 'ext' in info_dict: info_dict['format'] = info_dict['ext'] - # This is mostly just for backward compatibility of process_info - # As a side-effect, this allows for format-specific filters if self._match_entry(info_dict) is not None: info_dict['__write_download_archive'] = 'ignore' return @@ -2879,8 +2985,13 @@ class YoutubeDL(object): # Forced printings self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict)) + def check_max_downloads(): + if self._num_downloads >= float(self.params.get('max_downloads') or 'inf'): + raise MaxDownloadsReached() + if self.params.get('simulate'): info_dict['__write_download_archive'] = self.params.get('force_write_download_archive') + check_max_downloads() return if full_filename is None: @@ -2928,11 +3039,11 @@ class YoutubeDL(object): else: try: self.to_screen('[info] Writing video annotations to: ' + annofn) - with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: + with open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: annofile.write(info_dict['annotations']) except (KeyError, TypeError): self.report_warning('There are no annotations to write.') - except (OSError, IOError): + except OSError: self.report_error('Cannot write annotations file: ' + annofn) return @@ -2951,13 +3062,13 @@ class YoutubeDL(object): return True try: self.to_screen(f'[info] Writing internet shortcut (.{link_type}) to: {linkfn}') - with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', - newline='\r\n' if link_type == 'url' else '\n') as linkfile: + with open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', + newline='\r\n' if link_type == 'url' else '\n') as linkfile: template_vars = {'url': url} if link_type == 'desktop': template_vars['filename'] = linkfn[:-(len(link_type) + 1)] linkfile.write(LINK_TEMPLATES[link_type] % template_vars) - except (OSError, IOError): + except OSError: self.report_error(f'Cannot write internet shortcut {linkfn}') return False return True @@ -2984,12 +3095,8 @@ class YoutubeDL(object): info_dict.clear() info_dict.update(new_info) - try: - new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move) - replace_info_dict(new_info) - except PostProcessingError as err: - self.report_error('Preprocessing: %s' % str(err)) - return + new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move) + replace_info_dict(new_info) if self.params.get('skip_download'): info_dict['filepath'] = temp_filename @@ -3011,40 +3118,25 @@ class YoutubeDL(object): info_dict['ext'] = os.path.splitext(file)[1][1:] return file - success = True - if info_dict.get('requested_formats') is not None: - - def compatible_formats(formats): - # TODO: some formats actually allow this (mkv, webm, ogg, mp4), but not all of them. 
-                video_formats = [format for format in formats if format.get('vcodec') != 'none']
-                audio_formats = [format for format in formats if format.get('acodec') != 'none']
-                if len(video_formats) > 2 or len(audio_formats) > 2:
-                    return False
-
-                # Check extension
-                exts = set(format.get('ext') for format in formats)
-                COMPATIBLE_EXTS = (
-                    set(('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma')),
-                    set(('webm',)),
-                )
-                for ext_sets in COMPATIBLE_EXTS:
-                    if ext_sets.issuperset(exts):
-                        return True
-                # TODO: Check acodec/vcodec
-                return False
+        fd, success = None, True
+        if info_dict.get('protocol') or info_dict.get('url'):
+            fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
+            if fd is not FFmpegFD and (
+                    info_dict.get('section_start') or info_dict.get('section_end')):
+                msg = ('This format cannot be partially downloaded' if FFmpegFD.available()
+                       else 'You have requested downloading the video partially, but ffmpeg is not installed')
+                self.report_error(f'{msg}. Aborting')
+                return

+        if info_dict.get('requested_formats') is not None:
             requested_formats = info_dict['requested_formats']
             old_ext = info_dict['ext']
             if self.params.get('merge_output_format') is None:
-                if not compatible_formats(requested_formats):
-                    info_dict['ext'] = 'mkv'
-                    self.report_warning(
-                        'Requested formats are incompatible for merge and will be merged into mkv')
                 if (info_dict['ext'] == 'webm'
                         and info_dict.get('thumbnails')
                         # check with type instead of pp_key, __name__, or isinstance
                         # since we don't want any custom PPs to trigger this
-                        and any(type(pp) == EmbedThumbnailPP for pp in self._pps['post_process'])):
+                        and any(type(pp) == EmbedThumbnailPP for pp in self._pps['post_process'])):  # noqa: E721
                     info_dict['ext'] = 'mkv'
                     self.report_warning(
                         'webm doesn\'t support embedding a thumbnail, mkv will be used')
@@ -3058,7 +3150,7 @@ class YoutubeDL(object):
                     os.path.splitext(filename)[0]
                     if filename_real_ext in (old_ext, new_ext)
                     else filename)
-                return '%s.%s' % (filename_wo_ext, ext)
+                return f'{filename_wo_ext}.{ext}'

             # Ensure filename always has a correct extension for successful merge
             full_filename = correct_ext(full_filename)
@@ -3066,10 +3158,8 @@ class YoutubeDL(object):
             dl_filename = existing_video_file(full_filename, temp_filename)
             info_dict['__real_download'] = False

-            downloaded = []
             merger = FFmpegMergerPP(self)
-
-            fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
+            downloaded = []
             if dl_filename is not None:
                 self.report_file_already_downloaded(dl_filename)
             elif fd:
@@ -3143,12 +3233,13 @@ class YoutubeDL(object):
         except network_exceptions as err:
             self.report_error('unable to download video data: %s' % error_to_compat_str(err))
             return
-        except (OSError, IOError) as err:
+        except OSError as err:
             raise UnavailableVideoError(err)
         except (ContentTooShortError, ) as err:
-            self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
+            self.report_error(f'content too short (expected {err.expected} bytes and served {err.downloaded})')
             return

+        self._raise_pending_errors(info_dict)
         if success and full_filename != '-':

             def fixup():
@@ -3159,16 +3250,16 @@ class YoutubeDL(object):
                 if fixup_policy in ('ignore', 'never'):
                     return
                 elif fixup_policy == 'warn':
-                    do_fixup = False
+                    do_fixup = 'warn'
                 elif fixup_policy != 'force':
                     assert fixup_policy in ('detect_or_warn', None)
                     if not info_dict.get('__real_download'):
                         do_fixup = False

                 def ffmpeg_fixup(cndn, msg, cls):
-                    if not cndn:
+                    if not (do_fixup and 
cndn): return - if not do_fixup: + elif do_fixup == 'warn': self.report_warning(f'{vid}: {msg}') return pp = cls(self) @@ -3178,30 +3269,32 @@ class YoutubeDL(object): self.report_warning(f'{vid}: {msg}. Install ffmpeg to fix this automatically') stretched_ratio = info_dict.get('stretched_ratio') - ffmpeg_fixup( - stretched_ratio not in (1, None), - f'Non-uniform pixel ratio {stretched_ratio}', - FFmpegFixupStretchedPP) - - ffmpeg_fixup( - (info_dict.get('requested_formats') is None - and info_dict.get('container') == 'm4a_dash' - and info_dict.get('ext') == 'm4a'), - 'writing DASH m4a. Only some players support this container', - FFmpegFixupM4aPP) + ffmpeg_fixup(stretched_ratio not in (1, None), + f'Non-uniform pixel ratio {stretched_ratio}', + FFmpegFixupStretchedPP) downloader = get_suitable_downloader(info_dict, self.params) if 'protocol' in info_dict else None - downloader = downloader.__name__ if downloader else None + downloader = downloader.FD_NAME if downloader else None - if info_dict.get('requested_formats') is None: # Not necessary if doing merger - ffmpeg_fixup(downloader == 'HlsFD', + ext = info_dict.get('ext') + postprocessed_by_ffmpeg = info_dict.get('requested_formats') or any(( + isinstance(pp, FFmpegVideoConvertorPP) + and resolve_recode_mapping(ext, pp.mapping)[0] not in (ext, None) + ) for pp in self._pps['post_process']) + + if not postprocessed_by_ffmpeg: + ffmpeg_fixup(ext == 'm4a' and info_dict.get('container') == 'm4a_dash', + 'writing DASH m4a. Only some players support this container', + FFmpegFixupM4aPP) + ffmpeg_fixup(downloader == 'hlsnative' and not self.params.get('hls_use_mpegts') + or info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None, 'Possible MPEG-TS in MP4 container or malformed AAC timestamps', FFmpegFixupM3u8PP) ffmpeg_fixup(info_dict.get('is_live') and downloader == 'DashSegmentsFD', 'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP) - ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'Malformed timestamps detected', FFmpegFixupTimestampPP) - ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'Malformed duration detected', FFmpegFixupDurationPP) + ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed timestamps detected', FFmpegFixupTimestampPP) + ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed duration detected', FFmpegFixupDurationPP) fixup() try: @@ -3217,15 +3310,10 @@ class YoutubeDL(object): return info_dict['__write_download_archive'] = True + assert info_dict is original_infodict # Make sure the info_dict was modified in-place if self.params.get('force_write_download_archive'): info_dict['__write_download_archive'] = True - - # Make sure the info_dict was modified in-place - assert info_dict is original_infodict - - max_downloads = self.params.get('max_downloads') - if max_downloads is not None and self._num_downloads >= int(max_downloads): - raise MaxDownloadsReached() + check_max_downloads() def __download_wrapper(self, func): @functools.wraps(func) @@ -3234,13 +3322,11 @@ class YoutubeDL(object): res = func(*args, **kwargs) except UnavailableVideoError as e: self.report_error(e) - except MaxDownloadsReached as e: - self.to_screen(f'[info] {e}') - raise except DownloadCancelled as e: self.to_screen(f'[info] {e}') if not self.params.get('break_per_url'): raise + self._num_downloads = 0 else: if self.params.get('dump_single_json', False): self.post_extract(res) @@ -3250,7 +3336,7 @@ class YoutubeDL(object): def download(self, url_list): """Download a given list of URLs.""" url_list = 
variadic(url_list) # Passing a single URL is a common mistake - outtmpl = self.outtmpl_dict['default'] + outtmpl = self.params['outtmpl']['default'] if (len(url_list) > 1 and outtmpl != '-' and '%' not in outtmpl @@ -3289,11 +3375,17 @@ class YoutubeDL(object): return info_dict info_dict.setdefault('epoch', int(time.time())) info_dict.setdefault('_type', 'video') + info_dict.setdefault('_version', { + 'version': __version__, + 'current_git_head': current_git_head(), + 'release_git_head': RELEASE_GIT_HEAD, + 'repository': REPOSITORY, + }) if remove_private_keys: - reject = lambda k, v: v is None or (k.startswith('_') and k != '_type') or k in { + reject = lambda k, v: v is None or k.startswith('__') or k in { 'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries', - 'entries', 'filepath', 'infojson_filename', 'original_url', 'playlist_autonumber', + 'entries', 'filepath', '_filename', 'infojson_filename', 'original_url', 'playlist_autonumber', } else: reject = lambda k, v: False @@ -3315,6 +3407,17 @@ class YoutubeDL(object): ''' Alias of sanitize_info for backward compatibility ''' return YoutubeDL.sanitize_info(info_dict, actually_filter) + def _delete_downloaded_files(self, *files_to_delete, info={}, msg=None): + for filename in set(filter(None, files_to_delete)): + if msg: + self.to_screen(msg % filename) + try: + os.remove(filename) + except OSError: + self.report_warning(f'Unable to delete file {filename}') + if filename in info.get('__files_to_move', []): # NB: Delete even if None + del info['__files_to_move'][filename] + @staticmethod def post_extract(info_dict): def actual_post_extract(info_dict): @@ -3347,14 +3450,8 @@ class YoutubeDL(object): for f in files_to_delete: infodict['__files_to_move'].setdefault(f, '') else: - for old_filename in set(files_to_delete): - self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename) - try: - os.remove(encodeFilename(old_filename)) - except (IOError, OSError): - self.report_warning('Unable to remove downloaded original file') - if old_filename in infodict['__files_to_move']: - del infodict['__files_to_move'][old_filename] + self._delete_downloaded_files( + *files_to_delete, info=infodict, msg='Deleting original file %s (pass -k to keep)') return infodict def run_all_pps(self, key, info, *, additional_pps=None): @@ -3366,7 +3463,12 @@ class YoutubeDL(object): def pre_process(self, ie_info, key='pre_process', files_to_move=None): info = dict(ie_info) info['__files_to_move'] = files_to_move or {} - info = self.run_all_pps(key, info) + try: + info = self.run_all_pps(key, info) + except PostProcessingError as err: + msg = f'Preprocessing: {err}' + info.setdefault('__pending_error', msg) + self.report_error(msg, is_error=False) return info, info.pop('__files_to_move', None) def post_process(self, filename, info, files_to_move=None): @@ -3396,18 +3498,15 @@ class YoutubeDL(object): break else: return - return '%s %s' % (extractor.lower(), video_id) + return make_archive_id(extractor, video_id) def in_download_archive(self, info_dict): - fn = self.params.get('download_archive') - if fn is None: + if not self.archive: return False - vid_id = self._make_archive_id(info_dict) - if not vid_id: - return False # Incomplete video information - - return vid_id in self.archive + vid_ids = [self._make_archive_id(info_dict)] + vid_ids.extend(info_dict.get('_old_archive_ids') or []) + return any(id_ in self.archive for id_ in vid_ids) def record_download_archive(self, info_dict): fn = 
self.params.get('download_archive') @@ -3415,9 +3514,11 @@ class YoutubeDL(object): return vid_id = self._make_archive_id(info_dict) assert vid_id + self.write_debug(f'Adding to archive: {vid_id}') - with locked_file(fn, 'a', encoding='utf-8') as archive_file: - archive_file.write(vid_id + '\n') + if is_path_like(fn): + with locked_file(fn, 'a', encoding='utf-8') as archive_file: + archive_file.write(vid_id + '\n') self.archive.add(vid_id) @staticmethod @@ -3436,7 +3537,7 @@ class YoutubeDL(object): def _list_format_headers(self, *headers): if self.params.get('listformats_table', True) is not False: - return [self._format_screen(header, self.Styles.HEADERS) for header in headers] + return [self._format_out(header, self.Styles.HEADERS) for header in headers] return headers def _format_note(self, fdict): @@ -3499,11 +3600,17 @@ class YoutubeDL(object): res += '~' + format_bytes(fdict['filesize_approx']) return res - def render_formats_table(self, info_dict): - if not info_dict.get('formats') and not info_dict.get('url'): - return None + def _get_formats(self, info_dict): + if info_dict.get('formats') is None: + if info_dict.get('url') and info_dict.get('_type', 'video') == 'video': + return [info_dict] + return [] + return info_dict['formats'] - formats = info_dict.get('formats', [info_dict]) + def render_formats_table(self, info_dict): + formats = self._get_formats(info_dict) + if not formats: + return if not self.params.get('listformats_table', True) is not False: table = [ [ @@ -3511,33 +3618,45 @@ class YoutubeDL(object): format_field(f, 'ext'), self.format_resolution(f), self._format_note(f) - ] for f in formats if f.get('preference') is None or f['preference'] >= -1000] + ] for f in formats if (f.get('preference') or 0) >= -1000] return render_table(['format code', 'extension', 'resolution', 'note'], table, extra_gap=1) - delim = self._format_screen('\u2502', self.Styles.DELIM, '|', test_encoding=True) + def simplified_codec(f, field): + assert field in ('acodec', 'vcodec') + codec = f.get(field, 'unknown') + if not codec: + return 'unknown' + elif codec != 'none': + return '.'.join(codec.split('.')[:4]) + + if field == 'vcodec' and f.get('acodec') == 'none': + return 'images' + elif field == 'acodec' and f.get('vcodec') == 'none': + return '' + return self._format_out('audio only' if field == 'vcodec' else 'video only', + self.Styles.SUPPRESS) + + delim = self._format_out('\u2502', self.Styles.DELIM, '|', test_encoding=True) table = [ [ - self._format_screen(format_field(f, 'format_id'), self.Styles.ID), + self._format_out(format_field(f, 'format_id'), self.Styles.ID), format_field(f, 'ext'), format_field(f, func=self.format_resolution, ignore=('audio only', 'images')), - format_field(f, 'fps', '\t%d'), + format_field(f, 'fps', '\t%d', func=round), format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''), + format_field(f, 'audio_channels', '\t%s'), delim, format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes), - format_field(f, 'tbr', '\t%dk'), + format_field(f, 'tbr', '\t%dk', func=round), shorten_protocol_name(f.get('protocol', '')), delim, - format_field(f, 'vcodec', default='unknown').replace( - 'none', 'images' if f.get('acodec') == 'none' - else self._format_screen('audio only', self.Styles.SUPPRESS)), - format_field(f, 'vbr', '\t%dk'), - format_field(f, 'acodec', default='unknown').replace( - 'none', '' if f.get('vcodec') == 'none' - else self._format_screen('video only', 
self.Styles.SUPPRESS)), - format_field(f, 'abr', '\t%dk'), - format_field(f, 'asr', '\t%dHz'), + simplified_codec(f, 'vcodec'), + format_field(f, 'vbr', '\t%dk', func=round), + simplified_codec(f, 'acodec'), + format_field(f, 'abr', '\t%dk', func=round), + format_field(f, 'asr', '\t%s', func=format_decimal_suffix), join_nonempty( - self._format_screen('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None, + self._format_out('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None, format_field(f, 'language', '[%s]'), join_nonempty(format_field(f, 'format_note'), format_field(f, 'container', ignore=(None, f.get('ext'))), @@ -3545,12 +3664,12 @@ class YoutubeDL(object): delim=' '), ] for f in formats if f.get('preference') is None or f['preference'] >= -1000] header_line = self._list_format_headers( - 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', delim, '\tFILESIZE', '\tTBR', 'PROTO', + 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', 'CH', delim, '\tFILESIZE', '\tTBR', 'PROTO', delim, 'VCODEC', '\tVBR', 'ACODEC', '\tABR', '\tASR', 'MORE INFO') return render_table( header_line, table, hide_empty=True, - delim=self._format_screen('\u2500', self.Styles.DELIM, '-', test_encoding=True)) + delim=self._format_out('\u2500', self.Styles.DELIM, '-', test_encoding=True)) def render_thumbnails_table(self, info_dict): thumbnails = list(info_dict.get('thumbnails') or []) @@ -3558,7 +3677,7 @@ class YoutubeDL(object): return None return render_table( self._list_format_headers('ID', 'Width', 'Height', 'URL'), - [[t.get('id'), t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]) + [[t.get('id'), t.get('width') or 'unknown', t.get('height') or 'unknown', t['url']] for t in thumbnails]) def render_subtitles_table(self, video_id, subtitles): def _row(lang, formats): @@ -3593,7 +3712,7 @@ class YoutubeDL(object): def urlopen(self, req): """ Start an HTTP download """ - if isinstance(req, compat_basestring): + if isinstance(req, str): req = sanitized_Request(req) return self._opener.open(req, timeout=self._socket_timeout) @@ -3601,18 +3720,27 @@ class YoutubeDL(object): if not self.params.get('verbose'): return + from . import _IN_CLI # Must be delayed import + + # These imports can be slow. 
So import them only as needed + from .extractor.extractors import _LAZY_LOADER + from .extractor.extractors import _PLUGIN_CLASSES as plugin_extractors + def get_encoding(stream): ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__)) if not supports_terminal_sequences(stream): - from .compat import WINDOWS_VT_MODE + from .utils import WINDOWS_VT_MODE # Must be imported locally ret += ' (No VT)' if WINDOWS_VT_MODE is False else ' (No ANSI)' return ret - encoding_str = 'Encodings: locale %s, fs %s, out %s, err %s, pref %s' % ( + encoding_str = 'Encodings: locale %s, fs %s, pref %s, %s' % ( locale.getpreferredencoding(), sys.getfilesystemencoding(), - get_encoding(self._out_files['screen']), get_encoding(self._out_files['error']), - self.get_encoding()) + self.get_encoding(), + ', '.join( + f'{key} {get_encoding(stream)}' for key, stream in self._out_files.items_ + if stream is not None and key != 'console') + ) logger = self.params.get('logger') if logger: @@ -3623,11 +3751,19 @@ class YoutubeDL(object): write_debug = lambda msg: self._write_string(f'[debug] {msg}\n') source = detect_variant() + if VARIANT not in (None, 'pip'): + source += '*' write_debug(join_nonempty( - 'hypervideo version', __version__, + f'{"hypervideo" if REPOSITORY == "hypervideo/hypervideo" else REPOSITORY} version', + __version__, f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '', '' if source == 'unknown' else f'({source})', + '' if _IN_CLI else 'API', delim=' ')) + + if not _IN_CLI: + write_debug(f'params: {self.params}') + if not _LAZY_LOADER: if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): write_debug('Lazy loading extractors is forcibly disabled') @@ -3637,41 +3773,17 @@ class YoutubeDL(object): write_debug('Plugins: %s' % [ '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}') for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())]) - if self.params.get('compat_opts'): - write_debug('Compatibility options: %s' % ', '.join(self.params.get('compat_opts'))) + if self.params['compat_opts']: + write_debug('Compatibility options: %s' % ', '.join(self.params['compat_opts'])) - if source == 'source': - try: - sp = Popen( - ['git', 'rev-parse', '--short', 'HEAD'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE, - cwd=os.path.dirname(os.path.abspath(__file__))) - out, err = sp.communicate_or_kill() - out = out.decode().strip() - if re.match('[0-9a-f]+', out): - write_debug('Git HEAD: %s' % out) - except Exception: - try: - sys.exc_clear() - except Exception: - pass - - def python_implementation(): - impl_name = platform.python_implementation() - if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'): - return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3] - return impl_name - - write_debug('Python version %s (%s %s) - %s' % ( - platform.python_version(), - python_implementation(), - platform.architecture()[0], - platform_name())) + if current_git_head(): + write_debug(f'Git HEAD: {current_git_head()}') + write_debug(system_identifier()) exe_versions, ffmpeg_features = FFmpegPostProcessor.get_versions_and_features(self) ffmpeg_features = {key for key, val in ffmpeg_features.items() if val} if ffmpeg_features: - exe_versions['ffmpeg'] += ' (%s)' % ','.join(ffmpeg_features) + exe_versions['ffmpeg'] += ' (%s)' % ','.join(sorted(ffmpeg_features)) exe_versions['rtmpdump'] = rtmpdump_version() exe_versions['phantomjs'] = PhantomJSwrapper._version() @@ -3680,21 +3792,14 @@ class YoutubeDL(object): ) or 'none' 
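        # (illustrative, assumed output shape, not in the original patch) exe_str then
        # reads something like 'ffmpeg 5.1 (setts), rtmpdump 2.4', depending on which
        # of these tools are actually installed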
write_debug('exe versions: %s' % exe_str) - from .downloader.websocket import has_websockets - from .postprocessor.embedthumbnail import has_mutagen - from .cookies import SQLITE_AVAILABLE, SECRETSTORAGE_AVAILABLE - - lib_str = join_nonempty( - compat_brotli and compat_brotli.__name__, - has_certifi and 'certifi', - compat_pycrypto_AES and compat_pycrypto_AES.__name__.split('.')[0], - SECRETSTORAGE_AVAILABLE and 'secretstorage', - has_mutagen and 'mutagen', - SQLITE_AVAILABLE and 'sqlite', - has_websockets and 'websockets', - delim=', ') or 'none' - write_debug('Optional libraries: %s' % lib_str) + from .compat.compat_utils import get_package_info + from .dependencies import available_dependencies + + write_debug('Optional libraries: %s' % (', '.join(sorted({ + join_nonempty(*get_package_info(m)) for m in available_dependencies.values() + })) or 'none')) + self._setup_opener() proxy_map = {} for handler in self._opener.handlers: if hasattr(handler, 'proxies'): @@ -3703,10 +3808,10 @@ class YoutubeDL(object): # Not implemented if False and self.params.get('call_home'): - ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8') + ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode() write_debug('Public IP address: %s' % ipaddr) latest_version = self.urlopen( - 'https://yt-dl.org/latest/version').read().decode('utf-8') + 'https://yt-dl.org/latest/version').read().decode() if version_tuple(latest_version) > version_tuple(__version__): self.report_warning( 'You are using an outdated version (newest version: %s)! ' @@ -3714,6 +3819,8 @@ class YoutubeDL(object): latest_version) def _setup_opener(self): + if hasattr(self, '_opener'): + return timeout_val = self.params.get('socket_timeout') self._socket_timeout = 20 if timeout_val is None else float(timeout_val) @@ -3730,7 +3837,7 @@ class YoutubeDL(object): else: proxies = {'http': opts_proxy, 'https': opts_proxy} else: - proxies = compat_urllib_request.getproxies() + proxies = urllib.request.getproxies() # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805) if 'http' in proxies and 'https' not in proxies: proxies['https'] = proxies['http'] @@ -3740,19 +3847,19 @@ class YoutubeDL(object): https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) redirect_handler = YoutubeDLRedirectHandler() - data_handler = compat_urllib_request_DataHandler() + data_handler = urllib.request.DataHandler() # When passing our own FileHandler instance, build_opener won't add the # default FileHandler and allows us to disable the file protocol, which # can be used for malicious purposes (see # https://github.com/ytdl-org/youtube-dl/issues/8227) - file_handler = compat_urllib_request.FileHandler() + file_handler = urllib.request.FileHandler() def file_open(*args, **kwargs): - raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in hypervideo for security reasons') + raise urllib.error.URLError('file:// scheme is explicitly disabled in hypervideo for security reasons') file_handler.file_open = file_open - opener = compat_urllib_request.build_opener( + opener = urllib.request.build_opener( proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler) # Delete the default user-agent header, which would otherwise apply in @@ -3796,7 +3903,7 @@ class YoutubeDL(object): try: write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn) return True - except 
(OSError, IOError): + except OSError: self.report_error(f'Cannot write {label} metadata to JSON file {infofn}') return None @@ -3817,9 +3924,9 @@ class YoutubeDL(object): else: try: self.to_screen(f'[info] Writing {label} description to: {descfn}') - with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: + with open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: descfile.write(ie_result['description']) - except (OSError, IOError): + except OSError: self.report_error(f'Cannot write {label} description file {descfn}') return None return True @@ -3853,12 +3960,12 @@ class YoutubeDL(object): try: # Use newline='' to prevent conversion of newline characters # See https://github.com/ytdl-org/youtube-dl/issues/10268 - with io.open(sub_filename, 'w', encoding='utf-8', newline='') as subfile: + with open(sub_filename, 'w', encoding='utf-8', newline='') as subfile: subfile.write(sub_info['data']) sub_info['filepath'] = sub_filename ret.append((sub_filename, sub_filename_final)) continue - except (OSError, IOError): + except OSError: self.report_error(f'Cannot write video subtitles file {sub_filename}') return None diff --git a/hypervideo_dl/__init__.py b/hypervideo_dl/__init__.py index dc53a9e..8ac1c0c 100644 --- a/hypervideo_dl/__init__.py +++ b/hypervideo_dl/__init__.py @@ -1,81 +1,80 @@ #!/usr/bin/python -# coding: utf-8 +f'You are using an unsupported version of Python. Only Python versions 3.6 and above are supported by hypervideo' # noqa: F541 __license__ = 'CC0-1.0' -import codecs -import io +import getpass import itertools +import optparse import os -import random import re import sys +from .compat import compat_shlex_quote, workaround_optparse_bug9161 +from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS +from .downloader import FileDownloader +from .downloader.external import get_external_downloader +from .extractor import list_extractor_classes +from .extractor.adobepass import MSO_INFO +from .extractor.common import InfoExtractor from .options import parseOpts -from .compat import ( - compat_getpass, - compat_os_name, - compat_shlex_quote, - workaround_optparse_bug9161, +from .postprocessor import ( + FFmpegExtractAudioPP, + FFmpegSubtitlesConvertorPP, + FFmpegThumbnailsConvertorPP, + FFmpegVideoConvertorPP, + FFmpegVideoRemuxerPP, + MetadataFromFieldPP, + MetadataParserPP, ) -from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS from .utils import ( + NO_DEFAULT, + POSTPROCESS_WHEN, DateRange, - decodeOption, DownloadCancelled, DownloadError, + GeoUtils, + PlaylistEntries, + SameFileError, + decodeOption, + download_range_func, expand_path, float_or_none, - GeoUtils, + format_field, int_or_none, match_filter_func, - NO_DEFAULT, parse_duration, preferredencoding, read_batch_urls, + read_stdin, render_table, - SameFileError, setproctitle, std_headers, traverse_obj, + variadic, write_string, ) -from .downloader import ( - FileDownloader, -) -from .extractor import gen_extractors, list_extractors -from .extractor.common import InfoExtractor -from .extractor.adobepass import MSO_INFO -from .postprocessor import ( - FFmpegExtractAudioPP, - FFmpegSubtitlesConvertorPP, - FFmpegThumbnailsConvertorPP, - FFmpegVideoConvertorPP, - FFmpegVideoRemuxerPP, - MetadataFromFieldPP, - MetadataParserPP, -) from .YoutubeDL import YoutubeDL +def _exit(status=0, *args): + for msg in args: + sys.stderr.write(msg) + raise SystemExit(status) + + def get_urls(urls, batchfile, verbose): # Batch file verification batch_urls = [] if batchfile is not None: try: - if 
batchfile == '-': - write_string('Reading URLs from stdin - EOF (%s) to end:\n' % ( - 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D')) - batchfd = sys.stdin - else: - batchfd = io.open( - expand_path(batchfile), - 'r', encoding='utf-8', errors='ignore') - batch_urls = read_batch_urls(batchfd) + batch_urls = read_batch_urls( + read_stdin('URLs') if batchfile == '-' + else open(expand_path(batchfile), encoding='utf-8', errors='ignore')) if verbose: write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') - except IOError: - sys.exit('ERROR: batch file %s could not be read' % batchfile) + except OSError: + _exit(f'ERROR: batch file {batchfile} could not be read') _enc = preferredencoding() return [ url.strip().decode(_enc, 'ignore') if isinstance(url, bytes) else url.strip() @@ -83,6 +82,11 @@ def get_urls(urls, batchfile, verbose): def print_extractor_information(opts, urls): + # Importing GenericIE is currently slow since it imports other extractors + # TODO: Move this back to module level after generalization of embed detection + from .extractor.generic import GenericIE + + out = '' if opts.list_extractors: for ie in list_extractors(opts.age_limit): write_string(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie.working() else '') + '\n', out=sys.stdout) @@ -218,15 +222,11 @@ def validate_options(opts): validate_regex('format sorting', f, InfoExtractor.FormatSort.regex) # Postprocessor formats - validate_in('audio format', opts.audioformat, ['best'] + list(FFmpegExtractAudioPP.SUPPORTED_EXTS)) + validate_regex('audio format', opts.audioformat, FFmpegExtractAudioPP.FORMAT_RE) validate_in('subtitle format', opts.convertsubtitles, FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS) - validate_in('thumbnail format', opts.convertthumbnails, FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS) - if opts.recodevideo is not None: - opts.recodevideo = opts.recodevideo.replace(' ', '') - validate_regex('video recode format', opts.recodevideo, FFmpegVideoConvertorPP.FORMAT_RE) - if opts.remuxvideo is not None: - opts.remuxvideo = opts.remuxvideo.replace(' ', '') - validate_regex('video remux format', opts.remuxvideo, FFmpegVideoRemuxerPP.FORMAT_RE) + validate_regex('thumbnail format', opts.convertthumbnails, FFmpegThumbnailsConvertorPP.FORMAT_RE) + validate_regex('recode video format', opts.recodevideo, FFmpegVideoConvertorPP.FORMAT_RE) + validate_regex('remux video format', opts.remuxvideo, FFmpegVideoRemuxerPP.FORMAT_RE) if opts.audioquality: opts.audioquality = opts.audioquality.strip('k').strip('K') # int_or_none prevents inf, nan @@ -248,6 +248,28 @@ def validate_options(opts): opts.extractor_retries = parse_retries('extractor', opts.extractor_retries) opts.file_access_retries = parse_retries('file access', opts.file_access_retries) + # Retry sleep function + def parse_sleep_func(expr): + NUMBER_RE = r'\d+(?:\.\d+)?' 
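+            # (illustrative, hypothetical inputs, not in the original patch)
+            # 'linear=1::2' yields 1, 3, 5, ... while 'exp=1:20' yields 1, 2, 4, ..., capped at 20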
+ op, start, limit, step, *_ = tuple(re.fullmatch( + rf'(?:(linear|exp)=)?({NUMBER_RE})(?::({NUMBER_RE})?)?(?::({NUMBER_RE}))?', + expr.strip()).groups()) + (None, None) + + if op == 'exp': + return lambda n: min(float(start) * (float(step or 2) ** n), float(limit or 'inf')) + else: + default_step = start if op or limit else 0 + return lambda n: min(float(start) + float(step or default_step) * n, float(limit or 'inf')) + + for key, expr in opts.retry_sleep.items(): + if not expr: + del opts.retry_sleep[key] + continue + try: + opts.retry_sleep[key] = parse_sleep_func(expr) + except AttributeError: + raise ValueError(f'invalid {key} retry sleep expression {expr!r}') + # Bytes def parse_bytes(name, value): if value is None: @@ -292,20 +314,25 @@ def validate_options(opts): 'Cannot download a video and extract audio into the same file! ' f'Use "{outtmpl_default}.%(ext)s" instead of "{outtmpl_default}" as the output template') - # Remove chapters - remove_chapters_patterns, opts.remove_ranges = [], [] - for regex in opts.remove_chapters or []: - if regex.startswith('*'): - dur = list(map(parse_duration, regex[1:].split('-'))) - if len(dur) == 2 and all(t is not None for t in dur): - opts.remove_ranges.append(tuple(dur)) + def parse_chapters(name, value): + chapters, ranges = [], [] + for regex in value or []: + if regex.startswith('*'): + for range in regex[1:].split(','): + dur = tuple(map(parse_duration, range.strip().split('-'))) + if len(dur) == 2 and all(t is not None for t in dur): + ranges.append(dur) + else: + raise ValueError(f'invalid {name} time range "{regex}". Must be of the form *start-end') continue - raise ValueError(f'invalid --remove-chapters time range "{regex}". Must be of the form *start-end') - try: - remove_chapters_patterns.append(re.compile(regex)) - except re.error as err: - raise ValueError(f'invalid --remove-chapters regex "{regex}" - {err}') - opts.remove_chapters = remove_chapters_patterns + try: + chapters.append(re.compile(regex)) + except re.error as err: + raise ValueError(f'invalid {name} regex "{regex}" - {err}') + return chapters, ranges + + opts.remove_chapters, opts.remove_ranges = parse_chapters('--remove-chapters', opts.remove_chapters) + opts.download_ranges = download_range_func(*parse_chapters('--download-sections', opts.download_ranges)) # Cookies from browser if opts.cookiesfrombrowser: @@ -349,6 +376,12 @@ def validate_options(opts): opts.parse_metadata = list(itertools.chain(*map(metadataparser_actions, parse_metadata))) # Other options + if opts.playlist_items is not None: + try: + tuple(PlaylistEntries.parse_playlist_items(opts.playlist_items)) + except Exception as err: + raise ValueError(f'Invalid playlist-items {opts.playlist_items!r}: {err}') + geo_bypass_code = opts.geo_bypass_ip_block or opts.geo_bypass_country if geo_bypass_code is not None: try: @@ -369,6 +402,17 @@ def validate_options(opts): if opts.no_sponsorblock: opts.sponsorblock_mark = opts.sponsorblock_remove = set() + default_downloader = None + for proto, path in opts.external_downloader.items(): + if path == 'native': + continue + ed = get_external_downloader(path) + if ed is None: + raise ValueError( + f'No such {format_field(proto, None, "%s ", ignore="default")}external downloader "{path}"') + elif ed and proto == 'default': + default_downloader = ed.get_basename() + warnings, deprecation_warnings = [], [] # Common mistake: -f best @@ -379,13 +423,18 @@ def validate_options(opts): 'If you know what you are doing and want only the best pre-merged format, use "-f b" instead 
to suppress this warning'))) # --(postprocessor/downloader)-args without name - def report_args_compat(name, value, key1, key2=None): + def report_args_compat(name, value, key1, key2=None, where=None): if key1 in value and key2 not in value: - warnings.append(f'{name} arguments given without specifying name. The arguments will be given to all {name}s') + warnings.append(f'{name.title()} arguments given without specifying name. ' + f'The arguments will be given to {where or f"all {name}s"}') return True return False - report_args_compat('external downloader', opts.external_downloader_args, 'default') + if report_args_compat('external downloader', opts.external_downloader_args, + 'default', where=default_downloader) and default_downloader: + # Compat with youtube-dl's behavior. See https://github.com/ytdl-org/youtube-dl/commit/49c5293014bc11ec8c009856cd63cffa6296c1e1 + opts.external_downloader_args.setdefault(default_downloader, opts.external_downloader_args.pop('default')) + if report_args_compat('post-processor', opts.postprocessor_args, 'default-compat', 'default'): opts.postprocessor_args['default'] = opts.postprocessor_args.pop('default-compat') opts.postprocessor_args.setdefault('sponskrub', []) @@ -404,6 +453,9 @@ def validate_options(opts): setattr(opts, opt1, default) # Conflicting options + report_conflict('--playlist-reverse', 'playlist_reverse', '--playlist-random', 'playlist_random') + report_conflict('--playlist-reverse', 'playlist_reverse', '--lazy-playlist', 'lazy_playlist') + report_conflict('--playlist-random', 'playlist_random', '--lazy-playlist', 'lazy_playlist') report_conflict('--dateafter', 'dateafter', '--date', 'date', default=None) report_conflict('--datebefore', 'datebefore', '--date', 'date', default=None) report_conflict('--exec-before-download', 'exec_before_dl_cmd', '"--exec before_dl:"', 'exec_cmd', opts.exec_cmd.get('before_dl')) @@ -478,9 +530,9 @@ def validate_options(opts): # Ask for passwords if opts.username is not None and opts.password is None: - opts.password = compat_getpass('Type account password and press [Return]: ') + opts.password = getpass.getpass('Type account password and press [Return]: ') if opts.ap_username is not None and opts.ap_password is None: - opts.ap_password = compat_getpass('Type TV provider account password and press [Return]: ') + opts.ap_password = getpass.getpass('Type TV provider account password and press [Return]: ') return warnings, deprecation_warnings @@ -634,7 +686,7 @@ def parse_options(argv=None): final_ext = ( opts.recodevideo if opts.recodevideo in FFmpegVideoConvertorPP.SUPPORTED_EXTS else opts.remuxvideo if opts.remuxvideo in FFmpegVideoRemuxerPP.SUPPORTED_EXTS - else opts.audioformat if (opts.extractaudio and opts.audioformat != 'best') + else opts.audioformat if (opts.extractaudio and opts.audioformat in FFmpegExtractAudioPP.SUPPORTED_EXTS) else None) return parser, opts, urls, { @@ -690,6 +742,7 @@ def parse_options(argv=None): 'file_access_retries': opts.file_access_retries, 'fragment_retries': opts.fragment_retries, 'extractor_retries': opts.extractor_retries, + 'retry_sleep_functions': opts.retry_sleep, 'skip_unavailable_fragments': opts.skip_unavailable_fragments, 'keep_fragments': opts.keep_fragments, 'concurrent_fragment_downloads': opts.concurrent_fragment_downloads, @@ -704,6 +757,7 @@ def parse_options(argv=None): 'playlistend': opts.playlistend, 'playlistreverse': opts.playlist_reverse, 'playlistrandom': opts.playlist_random, + 'lazy_playlist': opts.lazy_playlist, 'noplaylist': opts.noplaylist, 
'logtostderr': opts.outtmpl.get('default') == '-', 'consoletitle': opts.consoletitle, @@ -735,6 +789,7 @@ def parse_options(argv=None): 'verbose': opts.verbose, 'dump_intermediate_pages': opts.dump_intermediate_pages, 'write_pages': opts.write_pages, + 'load_pages': opts.load_pages, 'test': opts.test, 'keepvideo': opts.keepvideo, 'min_filesize': opts.min_filesize, @@ -783,6 +838,8 @@ def parse_options(argv=None): 'max_sleep_interval': opts.max_sleep_interval, 'sleep_interval_subtitles': opts.sleep_interval_subtitles, 'external_downloader': opts.external_downloader, + 'download_ranges': opts.download_ranges, + 'force_keyframes_at_cuts': opts.force_keyframes_at_cuts, 'list_thumbnails': opts.list_thumbnails, 'playlist_items': opts.playlist_items, 'xattr_set_filesize': opts.xattr_set_filesize, @@ -821,52 +878,66 @@ def _real_main(argv=None): if opts.dump_user_agent: ua = traverse_obj(opts.headers, 'User-Agent', casesense=False, default=std_headers['User-Agent']) write_string(f'{ua}\n', out=sys.stdout) - sys.exit(0) + return if print_extractor_information(opts, all_urls): - sys.exit(0) + return with YoutubeDL(ydl_opts) as ydl: + pre_process = opts.update_self or opts.rm_cachedir actual_use = all_urls or opts.load_info_filename - # Remove cache dir if opts.rm_cachedir: ydl.cache.remove() - # Maybe do nothing + updater = Updater(ydl) + if opts.update_self and updater.update() and actual_use: + if updater.cmd: + return updater.restart() + # This code is reachable only for zip variant in py < 3.10 + # It makes sense to exit here, but the old behavior is to continue + ydl.report_warning('Restart hypervideo to use the updated version') + # return 100, 'ERROR: The program must exit for the update to complete' + if not actual_use: + if pre_process: + return ydl._download_retcode + ydl.warn_if_short_id(sys.argv[1:] if argv is None else argv) parser.error( 'You must provide at least one URL.\n' 'Type hypervideo --help to see a list of all options.') + parser.destroy() try: if opts.load_info_filename is not None: - retcode = ydl.download_with_info_file(expand_path(opts.load_info_filename)) + return ydl.download_with_info_file(expand_path(opts.load_info_filename)) else: - retcode = ydl.download(all_urls) + return ydl.download(all_urls) except DownloadCancelled: ydl.to_screen('Aborting remaining downloads') - retcode = 101 - - sys.exit(retcode) + return 101 def main(argv=None): try: - _real_main(argv) + _exit(*variadic(_real_main(argv))) except DownloadError: - sys.exit(1) + _exit(1) except SameFileError as e: - sys.exit(f'ERROR: {e}') + _exit(f'ERROR: {e}') except KeyboardInterrupt: - sys.exit('\nERROR: Interrupted by user') + _exit('\nERROR: Interrupted by user') except BrokenPipeError as e: # https://docs.python.org/3/library/signal.html#note-on-sigpipe devnull = os.open(os.devnull, os.O_WRONLY) os.dup2(devnull, sys.stdout.fileno()) - sys.exit(f'\nERROR: {e}') + _exit(f'\nERROR: {e}') + except optparse.OptParseError as e: + _exit(2, f'\n{e}') + +from .extractor import gen_extractors, list_extractors __all__ = [ 'main', diff --git a/hypervideo_dl/__main__.py b/hypervideo_dl/__main__.py index 49765e4..c45082e 100644 --- a/hypervideo_dl/__main__.py +++ b/hypervideo_dl/__main__.py @@ -1,13 +1,11 @@ #!/usr/bin/env python3 -from __future__ import unicode_literals # Execute with -# $ python hypervideo_dl/__main__.py (2.6+) -# $ python -m hypervideo_dl (2.7+) +# $ python -m hypervideo_dl import sys -if __package__ is None and not hasattr(sys, 'frozen'): +if __package__ is None and not getattr(sys, 'frozen', 
False): # direct call of __main__.py import os.path path = os.path.realpath(os.path.abspath(__file__)) diff --git a/hypervideo_dl/aes.py b/hypervideo_dl/aes.py index b37f0dd..60ce99c 100644 --- a/hypervideo_dl/aes.py +++ b/hypervideo_dl/aes.py @@ -1,26 +1,18 @@ -from __future__ import unicode_literals - +import base64 from math import ceil -from .compat import ( - compat_b64decode, - compat_ord, - compat_pycrypto_AES, -) -from .utils import ( - bytes_to_intlist, - intlist_to_bytes, -) - +from .compat import compat_ord +from .dependencies import Cryptodome_AES +from .utils import bytes_to_intlist, intlist_to_bytes -if compat_pycrypto_AES: +if Cryptodome_AES: def aes_cbc_decrypt_bytes(data, key, iv): """ Decrypt bytes with AES-CBC using pycryptodome """ - return compat_pycrypto_AES.new(key, compat_pycrypto_AES.MODE_CBC, iv).decrypt(data) + return Cryptodome_AES.new(key, Cryptodome_AES.MODE_CBC, iv).decrypt(data) def aes_gcm_decrypt_and_verify_bytes(data, key, tag, nonce): """ Decrypt bytes with AES-GCM using pycryptodome """ - return compat_pycrypto_AES.new(key, compat_pycrypto_AES.MODE_GCM, nonce).decrypt_and_verify(data, tag) + return Cryptodome_AES.new(key, Cryptodome_AES.MODE_GCM, nonce).decrypt_and_verify(data, tag) else: def aes_cbc_decrypt_bytes(data, key, iv): @@ -32,16 +24,59 @@ else: return intlist_to_bytes(aes_gcm_decrypt_and_verify(*map(bytes_to_intlist, (data, key, tag, nonce)))) +def aes_cbc_encrypt_bytes(data, key, iv, **kwargs): + return intlist_to_bytes(aes_cbc_encrypt(*map(bytes_to_intlist, (data, key, iv)), **kwargs)) + + +BLOCK_SIZE_BYTES = 16 + + def unpad_pkcs7(data): return data[:-compat_ord(data[-1])] -BLOCK_SIZE_BYTES = 16 +def pkcs7_padding(data): + """ + PKCS#7 padding + + @param {int[]} data cleartext + @returns {int[]} padding data + """ + + remaining_length = BLOCK_SIZE_BYTES - len(data) % BLOCK_SIZE_BYTES + return data + [remaining_length] * remaining_length + + +def pad_block(block, padding_mode): + """ + Pad a block with the given padding mode + @param {int[]} block block to pad + @param padding_mode padding mode + """ + padding_size = BLOCK_SIZE_BYTES - len(block) + + PADDING_BYTE = { + 'pkcs7': padding_size, + 'iso7816': 0x0, + 'whitespace': 0x20, + 'zero': 0x0, + } + + if padding_size < 0: + raise ValueError('Block size exceeded') + elif padding_mode not in PADDING_BYTE: + raise NotImplementedError(f'Padding mode {padding_mode} is not implemented') + + if padding_mode == 'iso7816' and padding_size: + block = block + [0x80] # NB: += mutates list + padding_size -= 1 + + return block + [PADDING_BYTE[padding_mode]] * padding_size def aes_ecb_encrypt(data, key, iv=None): """ - Encrypt with aes in ECB mode + Encrypt with aes in ECB mode. Using PKCS#7 padding @param {int[]} data cleartext @param {int[]} key 16/24/32-Byte cipher key @@ -54,8 +89,7 @@ def aes_ecb_encrypt(data, key, iv=None): encrypted_data = [] for i in range(block_count): block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] - encrypted_data += aes_encrypt(block, expanded_key) - encrypted_data = encrypted_data[:len(data)] + encrypted_data += aes_encrypt(pkcs7_padding(block), expanded_key) return encrypted_data @@ -145,13 +179,14 @@ def aes_cbc_decrypt(data, key, iv): return decrypted_data -def aes_cbc_encrypt(data, key, iv): +def aes_cbc_encrypt(data, key, iv, *, padding_mode='pkcs7'): """ - Encrypt with aes in CBC mode. 
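The `pad_block` helper added above generalizes block padding: PKCS#7 stores the pad length in every pad byte, ISO 7816-4 writes a single 0x80 marker followed by zeros, and the whitespace/zero modes fill with a constant byte. Both helpers are exported from `hypervideo_dl.aes` (see the updated `__all__` further down), so on a 5-byte block:

from hypervideo_dl.aes import pad_block, pkcs7_padding

data = list(b'hello')                     # 5 bytes -> 11 bytes of padding to reach 16
print(bytes(pad_block(data, 'pkcs7')))    # b'hello' + b'\x0b' * 11
print(bytes(pad_block(data, 'iso7816')))  # b'hello' + b'\x80' + b'\x00' * 10
print(bytes(pkcs7_padding(data)))         # identical to the 'pkcs7' mode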
Using PKCS#7 padding + Encrypt with aes in CBC mode @param {int[]} data cleartext @param {int[]} key 16/24/32-Byte cipher key @param {int[]} iv 16-Byte IV + @param padding_mode Padding mode to use @returns {int[]} encrypted data """ expanded_key = key_expansion(key) @@ -161,8 +196,8 @@ def aes_cbc_encrypt(data, key, iv): previous_cipher_block = iv for i in range(block_count): block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] - remaining_length = BLOCK_SIZE_BYTES - len(block) - block += [remaining_length] * remaining_length + block = pad_block(block, padding_mode) + mixed_block = xor(block, previous_cipher_block) encrypted_block = aes_encrypt(mixed_block, expanded_key) @@ -273,8 +308,8 @@ def aes_decrypt_text(data, password, key_size_bytes): """ NONCE_LENGTH_BYTES = 8 - data = bytes_to_intlist(compat_b64decode(data)) - password = bytes_to_intlist(password.encode('utf-8')) + data = bytes_to_intlist(base64.b64decode(data)) + password = bytes_to_intlist(password.encode()) key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) key = aes_encrypt(key[:BLOCK_SIZE_BYTES], key_expansion(key)) * (key_size_bytes // BLOCK_SIZE_BYTES) @@ -503,20 +538,30 @@ def ghash(subkey, data): last_y = [0] * BLOCK_SIZE_BYTES for i in range(0, len(data), BLOCK_SIZE_BYTES): - block = data[i : i + BLOCK_SIZE_BYTES] # noqa: E203 + block = data[i: i + BLOCK_SIZE_BYTES] last_y = block_product(xor(last_y, block), subkey) return last_y __all__ = [ - 'aes_ctr_decrypt', 'aes_cbc_decrypt', 'aes_cbc_decrypt_bytes', + 'aes_ctr_decrypt', 'aes_decrypt_text', - 'aes_encrypt', + 'aes_decrypt', + 'aes_ecb_decrypt', 'aes_gcm_decrypt_and_verify', 'aes_gcm_decrypt_and_verify_bytes', + + 'aes_cbc_encrypt', + 'aes_cbc_encrypt_bytes', + 'aes_ctr_encrypt', + 'aes_ecb_encrypt', + 'aes_encrypt', + 'key_expansion', + 'pad_block', + 'pkcs7_padding', 'unpad_pkcs7', ] diff --git a/hypervideo_dl/cache.py b/hypervideo_dl/cache.py index 24acb1b..2e9c1ef 100644 --- a/hypervideo_dl/cache.py +++ b/hypervideo_dl/cache.py @@ -1,28 +1,23 @@ -from __future__ import unicode_literals - +import contextlib import errno -import io import json import os import re import shutil import traceback -from .compat import compat_getenv -from .utils import ( - expand_path, - write_json_file, -) +from .utils import expand_path, traverse_obj, version_tuple, write_json_file +from .version import __version__ -class Cache(object): +class Cache: def __init__(self, ydl): self._ydl = ydl def _get_root_dir(self): res = self._ydl.params.get('cachedir') if res is None: - cache_root = compat_getenv('XDG_CACHE_HOME', '~/.cache') + cache_root = os.getenv('XDG_CACHE_HOME', '~/.cache') res = os.path.join(cache_root, 'hypervideo') return expand_path(res) @@ -31,7 +26,7 @@ class Cache(object): 'invalid section %r' % section assert re.match(r'^[a-zA-Z0-9_.-]+$', key), 'invalid key %r' % key return os.path.join( - self._get_root_dir(), section, '%s.%s' % (key, dtype)) + self._get_root_dir(), section, f'{key}.{dtype}') @property def enabled(self): @@ -51,33 +46,37 @@ class Cache(object): if ose.errno != errno.EEXIST: raise self._ydl.write_debug(f'Saving {section}.{key} to cache') - write_json_file(data, fn) + write_json_file({'hypervideo_version': __version__, 'data': data}, fn) except Exception: tb = traceback.format_exc() - self._ydl.report_warning( - 'Writing cache to %r failed: %s' % (fn, tb)) + self._ydl.report_warning(f'Writing cache to {fn!r} failed: {tb}') + + def _validate(self, data, min_ver): + version = traverse_obj(data, 'hypervideo_version') + if 
not version: # Backward compatibility + data, version = {'data': data}, '2022.08.19' + if not min_ver or version_tuple(version) >= version_tuple(min_ver): + return data['data'] + self._ydl.write_debug(f'Discarding old cache from version {version} (needs {min_ver})') - def load(self, section, key, dtype='json', default=None): + def load(self, section, key, dtype='json', default=None, *, min_ver=None): assert dtype in ('json',) if not self.enabled: return default cache_fn = self._get_cache_fn(section, key, dtype) - try: + with contextlib.suppress(OSError): try: - with io.open(cache_fn, 'r', encoding='utf-8') as cachef: + with open(cache_fn, encoding='utf-8') as cachef: self._ydl.write_debug(f'Loading {section}.{key} from cache') - return json.load(cachef) - except ValueError: + return self._validate(json.load(cachef), min_ver) + except (ValueError, KeyError): try: file_size = os.path.getsize(cache_fn) - except (OSError, IOError) as oe: + except OSError as oe: file_size = str(oe) - self._ydl.report_warning( - 'Cache retrieval from %s failed (%s)' % (cache_fn, file_size)) - except IOError: - pass # No cache available + self._ydl.report_warning(f'Cache retrieval from {cache_fn} failed ({file_size})') return default diff --git a/hypervideo_dl/compat.py b/hypervideo_dl/compat.py deleted file mode 100644 index bdea14c..0000000 --- a/hypervideo_dl/compat.py +++ /dev/null @@ -1,330 +0,0 @@ -# coding: utf-8 - -import asyncio -import base64 -import collections -import ctypes -import getpass -import html -import html.parser -import http -import http.client -import http.cookiejar -import http.cookies -import http.server -import itertools -import optparse -import os -import re -import shlex -import shutil -import socket -import struct -import subprocess -import sys -import tokenize -import urllib -import xml.etree.ElementTree as etree -from subprocess import DEVNULL - - -# HTMLParseError has been deprecated in Python 3.3 and removed in -# Python 3.5. 
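With this change every cache entry is stored as `{'hypervideo_version': ..., 'data': ...}`, and `load(..., min_ver=...)` silently discards entries written by versions older than `min_ver`; bare pre-change payloads are treated as version 2022.08.19. A sketch of the comparison rule, assuming `version_tuple` splits the version string into numeric components as in `utils`:

def version_tuple(v):
    # assumption: behaves like utils.version_tuple for dotted versions
    return tuple(int(x) for x in v.split('.'))

stored = {'hypervideo_version': '2022.10.04', 'data': {'token': 'abc'}}
min_ver = '2022.09.01'
if version_tuple(stored['hypervideo_version']) >= version_tuple(min_ver):
    print(stored['data'])                 # fresh enough -> {'token': 'abc'}
else:
    print('discarding old cache entry')   # load() would fall back to `default`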
Introducing dummy exception for Python >3.5 for compatible -# and uniform cross-version exception handling -class compat_HTMLParseError(Exception): - pass - - -# compat_ctypes_WINFUNCTYPE = ctypes.WINFUNCTYPE -# will not work since ctypes.WINFUNCTYPE does not exist in UNIX machines -def compat_ctypes_WINFUNCTYPE(*args, **kwargs): - return ctypes.WINFUNCTYPE(*args, **kwargs) - - -class _TreeBuilder(etree.TreeBuilder): - def doctype(self, name, pubid, system): - pass - - -def compat_etree_fromstring(text): - return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder())) - - -compat_os_name = os._name if os.name == 'java' else os.name - - -if compat_os_name == 'nt': - def compat_shlex_quote(s): - return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"') -else: - from shlex import quote as compat_shlex_quote - - -def compat_ord(c): - if type(c) is int: - return c - else: - return ord(c) - - -def compat_setenv(key, value, env=os.environ): - env[key] = value - - -if compat_os_name == 'nt' and sys.version_info < (3, 8): - # os.path.realpath on Windows does not follow symbolic links - # prior to Python 3.8 (see https://bugs.python.org/issue9949) - def compat_realpath(path): - while os.path.islink(path): - path = os.path.abspath(os.readlink(path)) - return path -else: - compat_realpath = os.path.realpath - - -def compat_print(s): - assert isinstance(s, compat_str) - print(s) - - -# Fix https://github.com/ytdl-org/youtube-dl/issues/4223 -# See http://bugs.python.org/issue9161 for what is broken -def workaround_optparse_bug9161(): - op = optparse.OptionParser() - og = optparse.OptionGroup(op, 'foo') - try: - og.add_option('-t') - except TypeError: - real_add_option = optparse.OptionGroup.add_option - - def _compat_add_option(self, *args, **kwargs): - enc = lambda v: ( - v.encode('ascii', 'replace') if isinstance(v, compat_str) - else v) - bargs = [enc(a) for a in args] - bkwargs = dict( - (k, enc(v)) for k, v in kwargs.items()) - return real_add_option(self, *bargs, **bkwargs) - optparse.OptionGroup.add_option = _compat_add_option - - -try: - compat_Pattern = re.Pattern -except AttributeError: - compat_Pattern = type(re.compile('')) - - -try: - compat_Match = re.Match -except AttributeError: - compat_Match = type(re.compile('').match('')) - - -try: - compat_asyncio_run = asyncio.run # >= 3.7 -except AttributeError: - def compat_asyncio_run(coro): - try: - loop = asyncio.get_event_loop() - except RuntimeError: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - loop.run_until_complete(coro) - - asyncio.run = compat_asyncio_run - - -try: # >= 3.7 - asyncio.tasks.all_tasks -except AttributeError: - asyncio.tasks.all_tasks = asyncio.tasks.Task.all_tasks - -try: - import websockets as compat_websockets -except ImportError: - compat_websockets = None - -# Python 3.8+ does not honor %HOME% on windows, but this breaks compatibility with youtube-dl -# See https://github.com/hypervideo/hypervideo/issues/792 -# https://docs.python.org/3/library/os.path.html#os.path.expanduser -if compat_os_name in ('nt', 'ce') and 'HOME' in os.environ: - _userhome = os.environ['HOME'] - - def compat_expanduser(path): - if not path.startswith('~'): - return path - i = path.replace('\\', '/', 1).find('/') # ~user - if i < 0: - i = len(path) - userhome = os.path.join(os.path.dirname(_userhome), path[1:i]) if i > 1 else _userhome - return userhome + path[i:] -else: - compat_expanduser = os.path.expanduser - - -try: - from Cryptodome.Cipher import AES as compat_pycrypto_AES -except 
ImportError: - try: - from Crypto.Cipher import AES as compat_pycrypto_AES - except ImportError: - compat_pycrypto_AES = None - -try: - import brotlicffi as compat_brotli -except ImportError: - try: - import brotli as compat_brotli - except ImportError: - compat_brotli = None - -WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None - - -def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075 - if compat_os_name != 'nt': - return - global WINDOWS_VT_MODE - startupinfo = subprocess.STARTUPINFO() - startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW - try: - subprocess.Popen('', shell=True, startupinfo=startupinfo) - WINDOWS_VT_MODE = True - except Exception: - pass - - -# Deprecated - -compat_basestring = str -compat_chr = chr -compat_filter = filter -compat_input = input -compat_integer_types = (int, ) -compat_kwargs = lambda kwargs: kwargs -compat_map = map -compat_numeric_types = (int, float, complex) -compat_str = str -compat_xpath = lambda xpath: xpath -compat_zip = zip - -compat_collections_abc = collections.abc -compat_HTMLParser = html.parser.HTMLParser -compat_HTTPError = urllib.error.HTTPError -compat_Struct = struct.Struct -compat_b64decode = base64.b64decode -compat_cookiejar = http.cookiejar -compat_cookiejar_Cookie = compat_cookiejar.Cookie -compat_cookies = http.cookies -compat_cookies_SimpleCookie = compat_cookies.SimpleCookie -compat_etree_Element = etree.Element -compat_etree_register_namespace = etree.register_namespace -compat_get_terminal_size = shutil.get_terminal_size -compat_getenv = os.getenv -compat_getpass = getpass.getpass -compat_html_entities = html.entities -compat_html_entities_html5 = compat_html_entities.html5 -compat_http_client = http.client -compat_http_server = http.server -compat_itertools_count = itertools.count -compat_parse_qs = urllib.parse.parse_qs -compat_shlex_split = shlex.split -compat_socket_create_connection = socket.create_connection -compat_struct_pack = struct.pack -compat_struct_unpack = struct.unpack -compat_subprocess_get_DEVNULL = lambda: DEVNULL -compat_tokenize_tokenize = tokenize.tokenize -compat_urllib_error = urllib.error -compat_urllib_parse = urllib.parse -compat_urllib_parse_quote = urllib.parse.quote -compat_urllib_parse_quote_plus = urllib.parse.quote_plus -compat_urllib_parse_unquote = urllib.parse.unquote -compat_urllib_parse_unquote_plus = urllib.parse.unquote_plus -compat_urllib_parse_unquote_to_bytes = urllib.parse.unquote_to_bytes -compat_urllib_parse_urlencode = urllib.parse.urlencode -compat_urllib_parse_urlparse = urllib.parse.urlparse -compat_urllib_parse_urlunparse = urllib.parse.urlunparse -compat_urllib_request = urllib.request -compat_urllib_request_DataHandler = urllib.request.DataHandler -compat_urllib_response = urllib.response -compat_urlparse = urllib.parse -compat_urlretrieve = urllib.request.urlretrieve -compat_xml_parse_error = etree.ParseError - - -# Set public objects - -__all__ = [ - 'WINDOWS_VT_MODE', - 'compat_HTMLParseError', - 'compat_HTMLParser', - 'compat_HTTPError', - 'compat_Match', - 'compat_Pattern', - 'compat_Struct', - 'compat_asyncio_run', - 'compat_b64decode', - 'compat_basestring', - 'compat_brotli', - 'compat_chr', - 'compat_collections_abc', - 'compat_cookiejar', - 'compat_cookiejar_Cookie', - 'compat_cookies', - 'compat_cookies_SimpleCookie', - 'compat_ctypes_WINFUNCTYPE', - 'compat_etree_Element', - 'compat_etree_fromstring', - 'compat_etree_register_namespace', - 'compat_expanduser', - 'compat_filter', - 'compat_get_terminal_size', - 
'compat_getenv', - 'compat_getpass', - 'compat_html_entities', - 'compat_html_entities_html5', - 'compat_http_client', - 'compat_http_server', - 'compat_input', - 'compat_integer_types', - 'compat_itertools_count', - 'compat_kwargs', - 'compat_map', - 'compat_numeric_types', - 'compat_ord', - 'compat_os_name', - 'compat_parse_qs', - 'compat_print', - 'compat_pycrypto_AES', - 'compat_realpath', - 'compat_setenv', - 'compat_shlex_quote', - 'compat_shlex_split', - 'compat_socket_create_connection', - 'compat_str', - 'compat_struct_pack', - 'compat_struct_unpack', - 'compat_subprocess_get_DEVNULL', - 'compat_tokenize_tokenize', - 'compat_urllib_error', - 'compat_urllib_parse', - 'compat_urllib_parse_quote', - 'compat_urllib_parse_quote_plus', - 'compat_urllib_parse_unquote', - 'compat_urllib_parse_unquote_plus', - 'compat_urllib_parse_unquote_to_bytes', - 'compat_urllib_parse_urlencode', - 'compat_urllib_parse_urlparse', - 'compat_urllib_parse_urlunparse', - 'compat_urllib_request', - 'compat_urllib_request_DataHandler', - 'compat_urllib_response', - 'compat_urlparse', - 'compat_urlretrieve', - 'compat_websockets', - 'compat_xml_parse_error', - 'compat_xpath', - 'compat_zip', - 'windows_enable_vt_mode', - 'workaround_optparse_bug9161', -] diff --git a/hypervideo_dl/compat/__init__.py b/hypervideo_dl/compat/__init__.py new file mode 100644 index 0000000..2f2621b --- /dev/null +++ b/hypervideo_dl/compat/__init__.py @@ -0,0 +1,78 @@ +import os +import sys +import warnings +import xml.etree.ElementTree as etree + +from ._deprecated import * # noqa: F401, F403 +from .compat_utils import passthrough_module + +# XXX: Implement this the same way as other DeprecationWarnings without circular import +passthrough_module(__name__, '._legacy', callback=lambda attr: warnings.warn( + DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=3)) + + +# HTMLParseError has been deprecated in Python 3.3 and removed in +# Python 3.5. 
Introducing dummy exception for Python >3.5 for compatible +# and uniform cross-version exception handling +class compat_HTMLParseError(ValueError): + pass + + +class _TreeBuilder(etree.TreeBuilder): + def doctype(self, name, pubid, system): + pass + + +def compat_etree_fromstring(text): + return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder())) + + +compat_os_name = os._name if os.name == 'java' else os.name + + +if compat_os_name == 'nt': + def compat_shlex_quote(s): + import re + return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"') +else: + from shlex import quote as compat_shlex_quote # noqa: F401 + + +def compat_ord(c): + return c if isinstance(c, int) else ord(c) + + +if compat_os_name == 'nt' and sys.version_info < (3, 8): + # os.path.realpath on Windows does not follow symbolic links + # prior to Python 3.8 (see https://bugs.python.org/issue9949) + def compat_realpath(path): + while os.path.islink(path): + path = os.path.abspath(os.readlink(path)) + return os.path.realpath(path) +else: + compat_realpath = os.path.realpath + + +# Python 3.8+ does not honor %HOME% on windows, but this breaks compatibility with youtube-dl +# See https://github.com/hypervideo/hypervideo/issues/792 +# https://docs.python.org/3/library/os.path.html#os.path.expanduser +if compat_os_name in ('nt', 'ce'): + def compat_expanduser(path): + HOME = os.environ.get('HOME') + if not HOME: + return os.path.expanduser(path) + elif not path.startswith('~'): + return path + i = path.replace('\\', '/', 1).find('/') # ~user + if i < 0: + i = len(path) + userhome = os.path.join(os.path.dirname(HOME), path[1:i]) if i > 1 else HOME + return userhome + path[i:] +else: + compat_expanduser = os.path.expanduser + + +# NB: Add modules that are imported dynamically here so that PyInstaller can find them +# See https://github.com/pyinstaller/pyinstaller-hooks-contrib/issues/438 +if False: + from . import _legacy # noqa: F401 diff --git a/hypervideo_dl/compat/_deprecated.py b/hypervideo_dl/compat/_deprecated.py new file mode 100644 index 0000000..342f1f8 --- /dev/null +++ b/hypervideo_dl/compat/_deprecated.py @@ -0,0 +1,16 @@ +"""Deprecated - New code should avoid these""" + +import base64 +import urllib.error +import urllib.parse + +compat_str = str + +compat_b64decode = base64.b64decode + +compat_HTTPError = urllib.error.HTTPError +compat_urlparse = urllib.parse +compat_parse_qs = urllib.parse.parse_qs +compat_urllib_parse_unquote = urllib.parse.unquote +compat_urllib_parse_urlencode = urllib.parse.urlencode +compat_urllib_parse_urlparse = urllib.parse.urlparse diff --git a/hypervideo_dl/compat/_legacy.py b/hypervideo_dl/compat/_legacy.py new file mode 100644 index 0000000..d19333d --- /dev/null +++ b/hypervideo_dl/compat/_legacy.py @@ -0,0 +1,97 @@ +""" Do not use! 
""" + +import collections +import ctypes +import getpass +import html.entities +import html.parser +import http.client +import http.cookiejar +import http.cookies +import http.server +import itertools +import os +import shlex +import shutil +import socket +import struct +import tokenize +import urllib.error +import urllib.parse +import urllib.request +import xml.etree.ElementTree as etree +from subprocess import DEVNULL + +# isort: split +import asyncio # noqa: F401 +import re # noqa: F401 +from asyncio import run as compat_asyncio_run # noqa: F401 +from re import Pattern as compat_Pattern # noqa: F401 +from re import match as compat_Match # noqa: F401 + +from .compat_utils import passthrough_module +from ..dependencies import Cryptodome_AES as compat_pycrypto_AES # noqa: F401 +from ..dependencies import brotli as compat_brotli # noqa: F401 +from ..dependencies import websockets as compat_websockets # noqa: F401 + +passthrough_module(__name__, '...utils', ('WINDOWS_VT_MODE', 'windows_enable_vt_mode')) + + +# compat_ctypes_WINFUNCTYPE = ctypes.WINFUNCTYPE +# will not work since ctypes.WINFUNCTYPE does not exist in UNIX machines +def compat_ctypes_WINFUNCTYPE(*args, **kwargs): + return ctypes.WINFUNCTYPE(*args, **kwargs) + + +def compat_setenv(key, value, env=os.environ): + env[key] = value + + +compat_basestring = str +compat_casefold = str.casefold +compat_chr = chr +compat_collections_abc = collections.abc +compat_cookiejar = http.cookiejar +compat_cookiejar_Cookie = http.cookiejar.Cookie +compat_cookies = http.cookies +compat_cookies_SimpleCookie = http.cookies.SimpleCookie +compat_etree_Element = etree.Element +compat_etree_register_namespace = etree.register_namespace +compat_filter = filter +compat_get_terminal_size = shutil.get_terminal_size +compat_getenv = os.getenv +compat_getpass = getpass.getpass +compat_html_entities = html.entities +compat_html_entities_html5 = html.entities.html5 +compat_HTMLParser = html.parser.HTMLParser +compat_http_client = http.client +compat_http_server = http.server +compat_input = input +compat_integer_types = (int, ) +compat_itertools_count = itertools.count +compat_kwargs = lambda kwargs: kwargs +compat_map = map +compat_numeric_types = (int, float, complex) +compat_print = print +compat_shlex_split = shlex.split +compat_socket_create_connection = socket.create_connection +compat_Struct = struct.Struct +compat_struct_pack = struct.pack +compat_struct_unpack = struct.unpack +compat_subprocess_get_DEVNULL = lambda: DEVNULL +compat_tokenize_tokenize = tokenize.tokenize +compat_urllib_error = urllib.error +compat_urllib_parse = urllib.parse +compat_urllib_parse_quote = urllib.parse.quote +compat_urllib_parse_quote_plus = urllib.parse.quote_plus +compat_urllib_parse_unquote_plus = urllib.parse.unquote_plus +compat_urllib_parse_unquote_to_bytes = urllib.parse.unquote_to_bytes +compat_urllib_parse_urlunparse = urllib.parse.urlunparse +compat_urllib_request = urllib.request +compat_urllib_request_DataHandler = urllib.request.DataHandler +compat_urllib_response = urllib.response +compat_urlretrieve = urllib.request.urlretrieve +compat_xml_parse_error = etree.ParseError +compat_xpath = lambda xpath: xpath +compat_zip = zip +workaround_optparse_bug9161 = lambda: None diff --git a/hypervideo_dl/compat/compat_utils.py b/hypervideo_dl/compat/compat_utils.py new file mode 100644 index 0000000..1bf6566 --- /dev/null +++ b/hypervideo_dl/compat/compat_utils.py @@ -0,0 +1,70 @@ +import collections +import contextlib +import importlib +import sys +import types + 
+_NO_ATTRIBUTE = object() + +_Package = collections.namedtuple('Package', ('name', 'version')) + + +def get_package_info(module): + parent = module.__name__.split('.')[0] + parent_module = None + with contextlib.suppress(ImportError): + parent_module = importlib.import_module(parent) + + for attr in ('__version__', 'version_string', 'version'): + version = getattr(parent_module, attr, None) + if version is not None: + break + return _Package(getattr(module, '_hypervideo_dl__identifier', parent), str(version)) + + +def _is_package(module): + try: + module.__getattribute__('__path__') + except AttributeError: + return False + return True + + +def passthrough_module(parent, child, allowed_attributes=None, *, callback=lambda _: None): + parent_module = importlib.import_module(parent) + child_module = None # Import child module only as needed + + class PassthroughModule(types.ModuleType): + def __getattr__(self, attr): + if _is_package(parent_module): + with contextlib.suppress(ImportError): + return importlib.import_module(f'.{attr}', parent) + + ret = self.__from_child(attr) + if ret is _NO_ATTRIBUTE: + raise AttributeError(f'module {parent} has no attribute {attr}') + callback(attr) + return ret + + def __from_child(self, attr): + if allowed_attributes is None: + if attr.startswith('__') and attr.endswith('__'): + return _NO_ATTRIBUTE + elif attr not in allowed_attributes: + return _NO_ATTRIBUTE + + nonlocal child_module + child_module = child_module or importlib.import_module(child, parent) + + with contextlib.suppress(AttributeError): + return getattr(child_module, attr) + + if _is_package(child_module): + with contextlib.suppress(ImportError): + return importlib.import_module(f'.{attr}', child) + + return _NO_ATTRIBUTE + + # Python 3.6 does not have module level __getattr__ + # https://peps.python.org/pep-0562/ + sys.modules[parent].__class__ = PassthroughModule diff --git a/hypervideo_dl/compat/functools.py b/hypervideo_dl/compat/functools.py new file mode 100644 index 0000000..ec003ea --- /dev/null +++ b/hypervideo_dl/compat/functools.py @@ -0,0 +1,26 @@ +# flake8: noqa: F405 +from functools import * # noqa: F403 + +from .compat_utils import passthrough_module + +passthrough_module(__name__, 'functools') +del passthrough_module + +try: + cache # >= 3.9 +except NameError: + cache = lru_cache(maxsize=None) + +try: + cached_property # >= 3.8 +except NameError: + class cached_property: + def __init__(self, func): + update_wrapper(self, func) + self.func = func + + def __get__(self, instance, _): + if instance is None: + return self + setattr(instance, self.func.__name__, self.func(instance)) + return getattr(instance, self.func.__name__) diff --git a/hypervideo_dl/compat/imghdr.py b/hypervideo_dl/compat/imghdr.py new file mode 100644 index 0000000..5d64ab0 --- /dev/null +++ b/hypervideo_dl/compat/imghdr.py @@ -0,0 +1,16 @@ +tests = { + 'webp': lambda h: h[0:4] == b'RIFF' and h[8:] == b'WEBP', + 'png': lambda h: h[:8] == b'\211PNG\r\n\032\n', + 'jpeg': lambda h: h[6:10] in (b'JFIF', b'Exif'), + 'gif': lambda h: h[:6] in (b'GIF87a', b'GIF89a'), +} + + +def what(file=None, h=None): + """Detect format of image (Currently supports jpeg, png, webp, gif only) + Ref: https://github.com/python/cpython/blob/3.10/Lib/imghdr.py + """ + if h is None: + with open(file, 'rb') as f: + h = f.read(12) + return next((type_ for type_, test in tests.items() if test(h)), None) diff --git a/hypervideo_dl/compat/shutil.py b/hypervideo_dl/compat/shutil.py new file mode 100644 index 0000000..23239d5 --- /dev/null 
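The vendored `imghdr` replacement above keeps only the four signatures that thumbnail handling needs, but its interface matches the deprecated stdlib module:

from hypervideo_dl.compat.imghdr import what

print(what(h=b'\x89PNG\r\n\x1a\n' + b'\x00' * 4))  # -> 'png'
print(what(h=b'RIFF' + b'\x00' * 4 + b'WEBP'))     # -> 'webp'
print(what(h=b'GIF89a' + b'\x00' * 6))             # -> 'gif'
# what('thumbnail.webp') would read the first 12 bytes itself (hypothetical path)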
+++ b/hypervideo_dl/compat/shutil.py @@ -0,0 +1,30 @@ +# flake8: noqa: F405 +from shutil import * # noqa: F403 + +from .compat_utils import passthrough_module + +passthrough_module(__name__, 'shutil') +del passthrough_module + + +import sys + +if sys.platform.startswith('freebsd'): + import errno + import os + import shutil + + # Workaround for PermissionError when using restricted ACL mode on FreeBSD + def copy2(src, dst, *args, **kwargs): + if os.path.isdir(dst): + dst = os.path.join(dst, os.path.basename(src)) + shutil.copyfile(src, dst, *args, **kwargs) + try: + shutil.copystat(src, dst, *args, **kwargs) + except PermissionError as e: + if e.errno != getattr(errno, 'EPERM', None): + raise + return dst + + def move(*args, copy_function=copy2, **kwargs): + return shutil.move(*args, copy_function=copy_function, **kwargs) diff --git a/hypervideo_dl/cookies.py b/hypervideo_dl/cookies.py index f963729..97457a1 100644 --- a/hypervideo_dl/cookies.py +++ b/hypervideo_dl/cookies.py @@ -1,12 +1,16 @@ +import base64 import contextlib -import ctypes +import http.cookiejar +import http.cookies import json import os +import re import shutil import struct import subprocess import sys import tempfile +import time from datetime import datetime, timedelta, timezone from enum import Enum, auto from hashlib import pbkdf2_hmac @@ -16,39 +20,21 @@ from .aes import ( aes_gcm_decrypt_and_verify_bytes, unpad_pkcs7, ) -from .compat import ( - compat_b64decode, - compat_cookiejar_Cookie, +from .dependencies import ( + _SECRETSTORAGE_UNAVAILABLE_REASON, + secretstorage, + sqlite3, ) +from .minicurses import MultilinePrinter, QuietMultilinePrinter from .utils import ( - error_to_str, - expand_path, Popen, YoutubeDLCookieJar, + error_to_str, + expand_path, + is_path_like, + try_call, ) -try: - import sqlite3 - SQLITE_AVAILABLE = True -except ImportError: - # although sqlite3 is part of the standard library, it is possible to compile python without - # sqlite support. See: https://github.com/hypervideo/hypervideo/issues/544 - SQLITE_AVAILABLE = False - - -try: - import secretstorage - SECRETSTORAGE_AVAILABLE = True -except ImportError: - SECRETSTORAGE_AVAILABLE = False - SECRETSTORAGE_UNAVAILABLE_REASON = ( - 'as the `secretstorage` module is not installed. ' - 'Please install by running `python3 -m pip install secretstorage`.') -except Exception as _err: - SECRETSTORAGE_AVAILABLE = False - SECRETSTORAGE_UNAVAILABLE_REASON = f'as the `secretstorage` module could not be initialized. {_err}' - - CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'} @@ -73,37 +59,72 @@ class YDLLogger: if self._ydl: self._ydl.report_error(message) + class ProgressBar(MultilinePrinter): + _DELAY, _timer = 0.1, 0 + + def print(self, message): + if time.time() - self._timer > self._DELAY: + self.print_at_line(f'[Cookies] {message}', 0) + self._timer = time.time() + + def progress_bar(self): + """Return a context manager with a print method. 
(Optional)""" + # Do not print to files/pipes, loggers, or when --no-progress is used + if not self._ydl or self._ydl.params.get('noprogress') or self._ydl.params.get('logger'): + return + file = self._ydl._out_files.error + try: + if not file.isatty(): + return + except BaseException: + return + return self.ProgressBar(file, preserve_output=False) + + +def _create_progress_bar(logger): + if hasattr(logger, 'progress_bar'): + printer = logger.progress_bar() + if printer: + return printer + printer = QuietMultilinePrinter() + printer.print = lambda _: None + return printer + def load_cookies(cookie_file, browser_specification, ydl): cookie_jars = [] if browser_specification is not None: - browser_name, profile, keyring = _parse_browser_specification(*browser_specification) - cookie_jars.append(extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl), keyring=keyring)) + browser_name, profile, keyring, container = _parse_browser_specification(*browser_specification) + cookie_jars.append( + extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl), keyring=keyring, container=container)) if cookie_file is not None: - cookie_file = expand_path(cookie_file) + is_filename = is_path_like(cookie_file) + if is_filename: + cookie_file = expand_path(cookie_file) + jar = YoutubeDLCookieJar(cookie_file) - if os.access(cookie_file, os.R_OK): + if not is_filename or os.access(cookie_file, os.R_OK): jar.load(ignore_discard=True, ignore_expires=True) cookie_jars.append(jar) return _merge_cookie_jars(cookie_jars) -def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger(), *, keyring=None): +def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger(), *, keyring=None, container=None): if browser_name == 'firefox': - return _extract_firefox_cookies(profile, logger) + return _extract_firefox_cookies(profile, container, logger) elif browser_name == 'safari': return _extract_safari_cookies(profile, logger) elif browser_name in CHROMIUM_BASED_BROWSERS: return _extract_chrome_cookies(browser_name, profile, keyring, logger) else: - raise ValueError('unknown browser: {}'.format(browser_name)) + raise ValueError(f'unknown browser: {browser_name}') -def _extract_firefox_cookies(profile, logger): +def _extract_firefox_cookies(profile, container, logger): logger.info('Extracting cookies from firefox') - if not SQLITE_AVAILABLE: + if not sqlite3: logger.warning('Cannot extract cookies from firefox without sqlite3 support. 
' 'Please use a python interpreter compiled with sqlite3 support') return YoutubeDLCookieJar() @@ -115,25 +136,54 @@ def _extract_firefox_cookies(profile, logger): else: search_root = os.path.join(_firefox_browser_dir(), profile) - cookie_database_path = _find_most_recently_used_file(search_root, 'cookies.sqlite') + cookie_database_path = _find_most_recently_used_file(search_root, 'cookies.sqlite', logger) if cookie_database_path is None: - raise FileNotFoundError('could not find firefox cookies database in {}'.format(search_root)) - logger.debug('Extracting cookies from: "{}"'.format(cookie_database_path)) + raise FileNotFoundError(f'could not find firefox cookies database in {search_root}') + logger.debug(f'Extracting cookies from: "{cookie_database_path}"') + + container_id = None + if container not in (None, 'none'): + containers_path = os.path.join(os.path.dirname(cookie_database_path), 'containers.json') + if not os.path.isfile(containers_path) or not os.access(containers_path, os.R_OK): + raise FileNotFoundError(f'could not read containers.json in {search_root}') + with open(containers_path) as containers: + identities = json.load(containers).get('identities', []) + container_id = next((context.get('userContextId') for context in identities if container in ( + context.get('name'), + try_call(lambda: re.fullmatch(r'userContext([^\.]+)\.label', context['l10nID']).group()) + )), None) + if not isinstance(container_id, int): + raise ValueError(f'could not find firefox container "{container}" in containers.json') with tempfile.TemporaryDirectory(prefix='hypervideo_dl') as tmpdir: cursor = None try: cursor = _open_database_copy(cookie_database_path, tmpdir) - cursor.execute('SELECT host, name, value, path, expiry, isSecure FROM moz_cookies') + if isinstance(container_id, int): + logger.debug( + f'Only loading cookies from firefox container "{container}", ID {container_id}') + cursor.execute( + 'SELECT host, name, value, path, expiry, isSecure FROM moz_cookies WHERE originAttributes LIKE ? 
OR originAttributes LIKE ?', + (f'%userContextId={container_id}', f'%userContextId={container_id}&%')) + elif container == 'none': + logger.debug('Only loading cookies not belonging to any container') + cursor.execute( + 'SELECT host, name, value, path, expiry, isSecure FROM moz_cookies WHERE NOT INSTR(originAttributes,"userContextId=")') + else: + cursor.execute('SELECT host, name, value, path, expiry, isSecure FROM moz_cookies') jar = YoutubeDLCookieJar() - for host, name, value, path, expiry, is_secure in cursor.fetchall(): - cookie = compat_cookiejar_Cookie( - version=0, name=name, value=value, port=None, port_specified=False, - domain=host, domain_specified=bool(host), domain_initial_dot=host.startswith('.'), - path=path, path_specified=bool(path), secure=is_secure, expires=expiry, discard=False, - comment=None, comment_url=None, rest={}) - jar.set_cookie(cookie) - logger.info('Extracted {} cookies from firefox'.format(len(jar))) + with _create_progress_bar(logger) as progress_bar: + table = cursor.fetchall() + total_cookie_count = len(table) + for i, (host, name, value, path, expiry, is_secure) in enumerate(table): + progress_bar.print(f'Loading cookie {i: 6d}/{total_cookie_count: 6d}') + cookie = http.cookiejar.Cookie( + version=0, name=name, value=value, port=None, port_specified=False, + domain=host, domain_specified=bool(host), domain_initial_dot=host.startswith('.'), + path=path, path_specified=bool(path), secure=is_secure, expires=expiry, discard=False, + comment=None, comment_url=None, rest={}) + jar.set_cookie(cookie) + logger.info(f'Extracted {len(jar)} cookies from firefox') return jar finally: if cursor is not None: @@ -141,39 +191,25 @@ def _extract_firefox_cookies(profile, logger): def _firefox_browser_dir(): - if sys.platform in ('linux', 'linux2'): - return os.path.expanduser('~/.mozilla/firefox') - elif sys.platform == 'win32': - return os.path.expandvars(r'%APPDATA%\Mozilla\Firefox\Profiles') + if sys.platform in ('cygwin', 'win32'): + return os.path.expandvars(R'%APPDATA%\Mozilla\Firefox\Profiles') elif sys.platform == 'darwin': return os.path.expanduser('~/Library/Application Support/Firefox') - else: - raise ValueError('unsupported platform: {}'.format(sys.platform)) + return os.path.expanduser('~/.mozilla/firefox') def _get_chromium_based_browser_settings(browser_name): # https://chromium.googlesource.com/chromium/src/+/HEAD/docs/user_data_dir.md - if sys.platform in ('linux', 'linux2'): - config = _config_home() - browser_dir = { - 'brave': os.path.join(config, 'BraveSoftware/Brave-Browser'), - 'chrome': os.path.join(config, 'google-chrome'), - 'chromium': os.path.join(config, 'chromium'), - 'edge': os.path.join(config, 'microsoft-edge'), - 'opera': os.path.join(config, 'opera'), - 'vivaldi': os.path.join(config, 'vivaldi'), - }[browser_name] - - elif sys.platform == 'win32': + if sys.platform in ('cygwin', 'win32'): appdata_local = os.path.expandvars('%LOCALAPPDATA%') appdata_roaming = os.path.expandvars('%APPDATA%') browser_dir = { - 'brave': os.path.join(appdata_local, r'BraveSoftware\Brave-Browser\User Data'), - 'chrome': os.path.join(appdata_local, r'Google\Chrome\User Data'), - 'chromium': os.path.join(appdata_local, r'Chromium\User Data'), - 'edge': os.path.join(appdata_local, r'Microsoft\Edge\User Data'), - 'opera': os.path.join(appdata_roaming, r'Opera Software\Opera Stable'), - 'vivaldi': os.path.join(appdata_local, r'Vivaldi\User Data'), + 'brave': os.path.join(appdata_local, R'BraveSoftware\Brave-Browser\User Data'), + 'chrome': 
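Container support resolves a name from `containers.json` to its `userContextId` and then filters `moz_cookies` on `originAttributes`; the two LIKE patterns above (`%userContextId=N` and `%userContextId=N&%`) avoid matching, say, ID 1 inside ID 10. A sketch of the lookup with an illustrative `containers.json` payload:

import json

containers = json.loads('''{"identities": [
    {"userContextId": 1, "name": "Personal", "l10nID": "userContextPersonal.label"},
    {"userContextId": 4, "name": "Shopping"}
]}''')

wanted = 'Personal'
container_id = next((identity.get('userContextId')
                     for identity in containers.get('identities', [])
                     if identity.get('name') == wanted), None)
print(container_id)  # -> 1; a non-integer result means the container was not found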
os.path.join(appdata_local, R'Google\Chrome\User Data'), + 'chromium': os.path.join(appdata_local, R'Chromium\User Data'), + 'edge': os.path.join(appdata_local, R'Microsoft\Edge\User Data'), + 'opera': os.path.join(appdata_roaming, R'Opera Software\Opera Stable'), + 'vivaldi': os.path.join(appdata_local, R'Vivaldi\User Data'), }[browser_name] elif sys.platform == 'darwin': @@ -188,7 +224,15 @@ def _get_chromium_based_browser_settings(browser_name): }[browser_name] else: - raise ValueError('unsupported platform: {}'.format(sys.platform)) + config = _config_home() + browser_dir = { + 'brave': os.path.join(config, 'BraveSoftware/Brave-Browser'), + 'chrome': os.path.join(config, 'google-chrome'), + 'chromium': os.path.join(config, 'chromium'), + 'edge': os.path.join(config, 'microsoft-edge'), + 'opera': os.path.join(config, 'opera'), + 'vivaldi': os.path.join(config, 'vivaldi'), + }[browser_name] # Linux keyring names can be determined by snooping on dbus while opening the browser in KDE: # dbus-monitor "interface='org.kde.KWallet'" "type=method_return" @@ -211,11 +255,11 @@ def _get_chromium_based_browser_settings(browser_name): def _extract_chrome_cookies(browser_name, profile, keyring, logger): - logger.info('Extracting cookies from {}'.format(browser_name)) + logger.info(f'Extracting cookies from {browser_name}') - if not SQLITE_AVAILABLE: - logger.warning(('Cannot extract cookies from {} without sqlite3 support. ' - 'Please use a python interpreter compiled with sqlite3 support').format(browser_name)) + if not sqlite3: + logger.warning(f'Cannot extract cookies from {browser_name} without sqlite3 support. ' + 'Please use a python interpreter compiled with sqlite3 support') return YoutubeDLCookieJar() config = _get_chromium_based_browser_settings(browser_name) @@ -229,13 +273,13 @@ def _extract_chrome_cookies(browser_name, profile, keyring, logger): if config['supports_profiles']: search_root = os.path.join(config['browser_dir'], profile) else: - logger.error('{} does not support profiles'.format(browser_name)) + logger.error(f'{browser_name} does not support profiles') search_root = config['browser_dir'] - cookie_database_path = _find_most_recently_used_file(search_root, 'Cookies') + cookie_database_path = _find_most_recently_used_file(search_root, 'Cookies', logger) if cookie_database_path is None: - raise FileNotFoundError('could not find {} cookies database in "{}"'.format(browser_name, search_root)) - logger.debug('Extracting cookies from: "{}"'.format(cookie_database_path)) + raise FileNotFoundError(f'could not find {browser_name} cookies database in "{search_root}"') + logger.debug(f'Extracting cookies from: "{cookie_database_path}"') decryptor = get_cookie_decryptor(config['browser_dir'], config['keyring_name'], logger, keyring=keyring) @@ -246,45 +290,55 @@ def _extract_chrome_cookies(browser_name, profile, keyring, logger): cursor.connection.text_factory = bytes column_names = _get_column_names(cursor, 'cookies') secure_column = 'is_secure' if 'is_secure' in column_names else 'secure' - cursor.execute('SELECT host_key, name, value, encrypted_value, path, ' - 'expires_utc, {} FROM cookies'.format(secure_column)) + cursor.execute(f'SELECT host_key, name, value, encrypted_value, path, expires_utc, {secure_column} FROM cookies') jar = YoutubeDLCookieJar() failed_cookies = 0 unencrypted_cookies = 0 - for host_key, name, value, encrypted_value, path, expires_utc, is_secure in cursor.fetchall(): - host_key = host_key.decode('utf-8') - name = name.decode('utf-8') - value = 
value.decode('utf-8') - path = path.decode('utf-8') - - if not value and encrypted_value: - value = decryptor.decrypt(encrypted_value) - if value is None: + with _create_progress_bar(logger) as progress_bar: + table = cursor.fetchall() + total_cookie_count = len(table) + for i, line in enumerate(table): + progress_bar.print(f'Loading cookie {i: 6d}/{total_cookie_count: 6d}') + is_encrypted, cookie = _process_chrome_cookie(decryptor, *line) + if not cookie: failed_cookies += 1 continue - else: - unencrypted_cookies += 1 - - cookie = compat_cookiejar_Cookie( - version=0, name=name, value=value, port=None, port_specified=False, - domain=host_key, domain_specified=bool(host_key), domain_initial_dot=host_key.startswith('.'), - path=path, path_specified=bool(path), secure=is_secure, expires=expires_utc, discard=False, - comment=None, comment_url=None, rest={}) - jar.set_cookie(cookie) + elif not is_encrypted: + unencrypted_cookies += 1 + jar.set_cookie(cookie) if failed_cookies > 0: - failed_message = ' ({} could not be decrypted)'.format(failed_cookies) + failed_message = f' ({failed_cookies} could not be decrypted)' else: failed_message = '' - logger.info('Extracted {} cookies from {}{}'.format(len(jar), browser_name, failed_message)) - counts = decryptor.cookie_counts.copy() + logger.info(f'Extracted {len(jar)} cookies from {browser_name}{failed_message}') + counts = decryptor._cookie_counts.copy() counts['unencrypted'] = unencrypted_cookies - logger.debug('cookie version breakdown: {}'.format(counts)) + logger.debug(f'cookie version breakdown: {counts}') return jar finally: if cursor is not None: cursor.connection.close() +def _process_chrome_cookie(decryptor, host_key, name, value, encrypted_value, path, expires_utc, is_secure): + host_key = host_key.decode() + name = name.decode() + value = value.decode() + path = path.decode() + is_encrypted = not value and encrypted_value + + if is_encrypted: + value = decryptor.decrypt(encrypted_value) + if value is None: + return is_encrypted, None + + return is_encrypted, http.cookiejar.Cookie( + version=0, name=name, value=value, port=None, port_specified=False, + domain=host_key, domain_specified=bool(host_key), domain_initial_dot=host_key.startswith('.'), + path=path, path_specified=bool(path), secure=is_secure, expires=expires_utc, discard=False, + comment=None, comment_url=None, rest={}) + + class ChromeCookieDecryptor: """ Overview: @@ -311,24 +365,18 @@ class ChromeCookieDecryptor: - KeyStorageLinux::CreateService """ - def decrypt(self, encrypted_value): - raise NotImplementedError + _cookie_counts = {} - @property - def cookie_counts(self): - raise NotImplementedError + def decrypt(self, encrypted_value): + raise NotImplementedError('Must be implemented by sub classes') def get_cookie_decryptor(browser_root, browser_keyring_name, logger, *, keyring=None): - if sys.platform in ('linux', 'linux2'): - return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring) - elif sys.platform == 'darwin': + if sys.platform == 'darwin': return MacChromeCookieDecryptor(browser_keyring_name, logger) - elif sys.platform == 'win32': + elif sys.platform in ('win32', 'cygwin'): return WindowsChromeCookieDecryptor(browser_root, logger) - else: - raise NotImplementedError('Chrome cookie decryption is not supported ' - 'on this platform: {}'.format(sys.platform)) + return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring) class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): @@ -345,10 +393,6 @@ class 
LinuxChromeCookieDecryptor(ChromeCookieDecryptor): # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_linux.cc return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1, key_length=16) - @property - def cookie_counts(self): - return self._cookie_counts - def decrypt(self, encrypted_value): version = encrypted_value[:3] ciphertext = encrypted_value[3:] @@ -382,10 +426,6 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor): # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1003, key_length=16) - @property - def cookie_counts(self): - return self._cookie_counts - def decrypt(self, encrypted_value): version = encrypted_value[:3] ciphertext = encrypted_value[3:] @@ -411,10 +451,6 @@ class WindowsChromeCookieDecryptor(ChromeCookieDecryptor): self._v10_key = _get_windows_v10_key(browser_root, logger) self._cookie_counts = {'v10': 0, 'other': 0} - @property - def cookie_counts(self): - return self._cookie_counts - def decrypt(self, encrypted_value): version = encrypted_value[:3] ciphertext = encrypted_value[3:] @@ -443,14 +479,14 @@ class WindowsChromeCookieDecryptor(ChromeCookieDecryptor): self._cookie_counts['other'] += 1 # any other prefix means the data is DPAPI encrypted # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_win.cc - return _decrypt_windows_dpapi(encrypted_value, self._logger).decode('utf-8') + return _decrypt_windows_dpapi(encrypted_value, self._logger).decode() def _extract_safari_cookies(profile, logger): if profile is not None: logger.error('safari does not support profiles') if sys.platform != 'darwin': - raise ValueError('unsupported platform: {}'.format(sys.platform)) + raise ValueError(f'unsupported platform: {sys.platform}') cookies_path = os.path.expanduser('~/Library/Cookies/Cookies.binarycookies') @@ -464,7 +500,7 @@ def _extract_safari_cookies(profile, logger): cookies_data = f.read() jar = parse_safari_cookies(cookies_data, logger=logger) - logger.info('Extracted {} cookies from safari'.format(len(jar))) + logger.info(f'Extracted {len(jar)} cookies from safari') return jar @@ -480,7 +516,7 @@ class DataParser: def read_bytes(self, num_bytes): if num_bytes < 0: - raise ParserError('invalid read of {} bytes'.format(num_bytes)) + raise ParserError(f'invalid read of {num_bytes} bytes') end = self.cursor + num_bytes if end > len(self._data): raise ParserError('reached end of input') @@ -491,7 +527,7 @@ class DataParser: def expect_bytes(self, expected_value, message): value = self.read_bytes(len(expected_value)) if value != expected_value: - raise ParserError('unexpected value: {} != {} ({})'.format(value, expected_value, message)) + raise ParserError(f'unexpected value: {value} != {expected_value} ({message})') def read_uint(self, big_endian=False): data_format = '>I' if big_endian else ' 0: - self._logger.debug('skipping {} bytes ({}): {}'.format( - num_bytes, description, self.read_bytes(num_bytes))) + self._logger.debug(f'skipping {num_bytes} bytes ({description}): {self.read_bytes(num_bytes)!r}') elif num_bytes < 0: - raise ParserError('invalid skip of {} bytes'.format(num_bytes)) + raise ParserError(f'invalid skip of {num_bytes} bytes') def skip_to(self, offset, description='unknown'): self.skip(offset - self.cursor, description) @@ -542,15 +577,17 @@ def _parse_safari_cookies_page(data, jar, logger): number_of_cookies = p.read_uint() record_offsets = 
[p.read_uint() for _ in range(number_of_cookies)] if number_of_cookies == 0: - logger.debug('a cookies page of size {} has no cookies'.format(len(data))) + logger.debug(f'a cookies page of size {len(data)} has no cookies') return p.skip_to(record_offsets[0], 'unknown page header field') - for record_offset in record_offsets: - p.skip_to(record_offset, 'space between records') - record_length = _parse_safari_cookies_record(data[record_offset:], jar, logger) - p.read_bytes(record_length) + with _create_progress_bar(logger) as progress_bar: + for i, record_offset in enumerate(record_offsets): + progress_bar.print(f'Loading cookie {i: 6d}/{number_of_cookies: 6d}') + p.skip_to(record_offset, 'space between records') + record_length = _parse_safari_cookies_record(data[record_offset:], jar, logger) + p.read_bytes(record_length) p.skip_to_end('space in between pages') @@ -587,7 +624,7 @@ def _parse_safari_cookies_record(data, jar, logger): p.skip_to(record_size, 'space at the end of the record') - cookie = compat_cookiejar_Cookie( + cookie = http.cookiejar.Cookie( version=0, name=name, value=value, port=None, port_specified=False, domain=domain, domain_specified=bool(domain), domain_initial_dot=domain.startswith('.'), path=path, path_specified=bool(path), secure=is_secure, expires=expiration_date, discard=False, @@ -686,7 +723,7 @@ def _choose_linux_keyring(logger): SelectBackend """ desktop_environment = _get_linux_desktop_environment(os.environ) - logger.debug('detected desktop environment: {}'.format(desktop_environment.name)) + logger.debug(f'detected desktop environment: {desktop_environment.name}') if desktop_environment == _LinuxDesktopEnvironment.KDE: linux_keyring = _LinuxKeyring.KWALLET elif desktop_environment == _LinuxDesktopEnvironment.OTHER: @@ -707,23 +744,21 @@ def _get_kwallet_network_wallet(logger): """ default_wallet = 'kdewallet' try: - proc = Popen([ + stdout, _, returncode = Popen.run([ 'dbus-send', '--session', '--print-reply=literal', '--dest=org.kde.kwalletd5', '/modules/kwalletd5', 'org.kde.KWallet.networkWallet' - ], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + ], text=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) - stdout, stderr = proc.communicate_or_kill() - if proc.returncode != 0: + if returncode: logger.warning('failed to read NetworkWallet') return default_wallet else: - network_wallet = stdout.decode('utf-8').strip() - logger.debug('NetworkWallet = "{}"'.format(network_wallet)) - return network_wallet + logger.debug(f'NetworkWallet = "{stdout.strip()}"') + return stdout.strip() except Exception as e: - logger.warning('exception while obtaining NetworkWallet: {}'.format(e)) + logger.warning(f'exception while obtaining NetworkWallet: {e}') return default_wallet @@ -739,17 +774,16 @@ def _get_kwallet_password(browser_keyring_name, logger): network_wallet = _get_kwallet_network_wallet(logger) try: - proc = Popen([ + stdout, _, returncode = Popen.run([ 'kwallet-query', - '--read-password', '{} Safe Storage'.format(browser_keyring_name), - '--folder', '{} Keys'.format(browser_keyring_name), + '--read-password', f'{browser_keyring_name} Safe Storage', + '--folder', f'{browser_keyring_name} Keys', network_wallet ], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) - stdout, stderr = proc.communicate_or_kill() - if proc.returncode != 0: - logger.error('kwallet-query failed with return code {}. 
Please consult ' - 'the kwallet-query man page for details'.format(proc.returncode)) + if returncode: + logger.error(f'kwallet-query failed with return code {returncode}. ' + 'Please consult the kwallet-query man page for details') return b'' else: if stdout.lower().startswith(b'failed to read'): @@ -764,17 +798,15 @@ def _get_kwallet_password(browser_keyring_name, logger): return b'' else: logger.debug('password found') - if stdout[-1:] == b'\n': - stdout = stdout[:-1] - return stdout + return stdout.rstrip(b'\n') except Exception as e: logger.warning(f'exception running kwallet-query: {error_to_str(e)}') return b'' def _get_gnome_keyring_password(browser_keyring_name, logger): - if not SECRETSTORAGE_AVAILABLE: - logger.error('secretstorage not available {}'.format(SECRETSTORAGE_UNAVAILABLE_REASON)) + if not secretstorage: + logger.error(f'secretstorage not available {_SECRETSTORAGE_UNAVAILABLE_REASON}') return b'' # the Gnome keyring does not seem to organise keys in the same way as KWallet, # using `dbus-monitor` during startup, it can be observed that chromium lists all keys @@ -783,7 +815,7 @@ def _get_gnome_keyring_password(browser_keyring_name, logger): with contextlib.closing(secretstorage.dbus_init()) as con: col = secretstorage.get_default_collection(con) for item in col.get_all_items(): - if item.get_label() == '{} Safe Storage'.format(browser_keyring_name): + if item.get_label() == f'{browser_keyring_name} Safe Storage': return item.get_secret() else: logger.error('failed to read from keyring') @@ -813,35 +845,35 @@ def _get_linux_keyring_password(browser_keyring_name, keyring, logger): def _get_mac_keyring_password(browser_keyring_name, logger): logger.debug('using find-generic-password to obtain password from OSX keychain') try: - proc = Popen( + stdout, _, returncode = Popen.run( ['security', 'find-generic-password', '-w', # write password to stdout '-a', browser_keyring_name, # match 'account' - '-s', '{} Safe Storage'.format(browser_keyring_name)], # match 'service' + '-s', f'{browser_keyring_name} Safe Storage'], # match 'service' stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) - - stdout, stderr = proc.communicate_or_kill() - if stdout[-1:] == b'\n': - stdout = stdout[:-1] - return stdout + if returncode: + logger.warning('find-generic-password failed') + return None + return stdout.rstrip(b'\n') except Exception as e: logger.warning(f'exception running find-generic-password: {error_to_str(e)}') return None def _get_windows_v10_key(browser_root, logger): - path = _find_most_recently_used_file(browser_root, 'Local State') + path = _find_most_recently_used_file(browser_root, 'Local State', logger) if path is None: logger.error('could not find local state file') return None - with open(path, 'r', encoding='utf8') as f: + logger.debug(f'Found local state file at "{path}"') + with open(path, encoding='utf8') as f: data = json.load(f) try: base64_key = data['os_crypt']['encrypted_key'] except KeyError: logger.error('no encrypted key in Local State') return None - encrypted_key = compat_b64decode(base64_key) + encrypted_key = base64.b64decode(base64_key) prefix = b'DPAPI' if not encrypted_key.startswith(prefix): logger.error('invalid key') @@ -856,7 +888,7 @@ def pbkdf2_sha1(password, salt, iterations, key_length): def _decrypt_aes_cbc(ciphertext, key, logger, initialization_vector=b' ' * 16): plaintext = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector)) try: - return plaintext.decode('utf-8') + return plaintext.decode() except UnicodeDecodeError: 
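Reading the v10 key out of Chromium's "Local State" file, as done above, reduces to a few stdlib calls; a sketch (the final unwrapping step is _decrypt_windows_dpapi, shown further down):

import base64
import json


def read_v10_key(local_state_path):
    with open(local_state_path, encoding='utf8') as f:
        data = json.load(f)
    encrypted_key = base64.b64decode(data['os_crypt']['encrypted_key'])
    # the key is DPAPI-wrapped and tagged with a 5-byte prefix
    if not encrypted_key.startswith(b'DPAPI'):
        return None
    return encrypted_key[len(b'DPAPI'):]  # still needs CryptUnprotectData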
logger.warning('failed to decrypt cookie (AES-CBC) because UTF-8 decoding failed. Possibly the key is wrong?', only_once=True) return None @@ -870,7 +902,7 @@ def _decrypt_aes_gcm(ciphertext, key, nonce, authentication_tag, logger): return None try: - return plaintext.decode('utf-8') + return plaintext.decode() except UnicodeDecodeError: logger.warning('failed to decrypt cookie (AES-GCM) because UTF-8 decoding failed. Possibly the key is wrong?', only_once=True) return None @@ -881,10 +913,12 @@ def _decrypt_windows_dpapi(ciphertext, logger): References: - https://docs.microsoft.com/en-us/windows/win32/api/dpapi/nf-dpapi-cryptunprotectdata """ - from ctypes.wintypes import DWORD + + import ctypes + import ctypes.wintypes class DATA_BLOB(ctypes.Structure): - _fields_ = [('cbData', DWORD), + _fields_ = [('cbData', ctypes.wintypes.DWORD), ('pbData', ctypes.POINTER(ctypes.c_char))] buffer = ctypes.create_string_buffer(ciphertext) @@ -921,17 +955,20 @@ def _open_database_copy(database_path, tmpdir): def _get_column_names(cursor, table_name): - table_info = cursor.execute('PRAGMA table_info({})'.format(table_name)).fetchall() - return [row[1].decode('utf-8') for row in table_info] + table_info = cursor.execute(f'PRAGMA table_info({table_name})').fetchall() + return [row[1].decode() for row in table_info] -def _find_most_recently_used_file(root, filename): +def _find_most_recently_used_file(root, filename, logger): # if there are multiple browser profiles, take the most recently used one - paths = [] - for root, dirs, files in os.walk(root): - for file in files: - if file == filename: - paths.append(os.path.join(root, file)) + i, paths = 0, [] + with _create_progress_bar(logger) as progress_bar: + for curr_root, dirs, files in os.walk(root): + for file in files: + i += 1 + progress_bar.print(f'Searching for "{filename}": {i: 6d} files searched') + if file == filename: + paths.append(os.path.join(curr_root, file)) return None if not paths else max(paths, key=lambda path: os.lstat(path).st_mtime) @@ -949,11 +986,102 @@ def _is_path(value): return os.path.sep in value -def _parse_browser_specification(browser_name, profile=None, keyring=None): +def _parse_browser_specification(browser_name, profile=None, keyring=None, container=None): if browser_name not in SUPPORTED_BROWSERS: raise ValueError(f'unsupported browser: "{browser_name}"') if keyring not in (None, *SUPPORTED_KEYRINGS): raise ValueError(f'unsupported keyring: "{keyring}"') - if profile is not None and _is_path(profile): - profile = os.path.expanduser(profile) - return browser_name, profile, keyring + if profile is not None and _is_path(expand_path(profile)): + profile = expand_path(profile) + return browser_name, profile, keyring, container + + +class LenientSimpleCookie(http.cookies.SimpleCookie): + """More lenient version of http.cookies.SimpleCookie""" + # From https://github.com/python/cpython/blob/v3.10.7/Lib/http/cookies.py + # We use Morsel's legal key chars to avoid errors on setting values + _LEGAL_KEY_CHARS = r'\w\d' + re.escape('!#$%&\'*+-.:^_`|~') + _LEGAL_VALUE_CHARS = _LEGAL_KEY_CHARS + re.escape('(),/<=>?@[]{}') + + _RESERVED = { + "expires", + "path", + "comment", + "domain", + "max-age", + "secure", + "httponly", + "version", + "samesite", + } + + _FLAGS = {"secure", "httponly"} + + # Added 'bad' group to catch the remaining value + _COOKIE_PATTERN = re.compile(r""" + \s* # Optional whitespace at start of cookie + (?P # Start of group 'key' + [""" + _LEGAL_KEY_CHARS + r"""]+?# Any word of at least one letter + ) # End of 
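_decrypt_windows_dpapi continues past this hunk; its core is a single CryptUnprotectData call. A hedged sketch of that call, reusing the DATA_BLOB layout from the hunk above (error handling trimmed):

import ctypes
import ctypes.wintypes


class DATA_BLOB(ctypes.Structure):
    _fields_ = [('cbData', ctypes.wintypes.DWORD),
                ('pbData', ctypes.POINTER(ctypes.c_char))]


def dpapi_decrypt(ciphertext):
    buffer = ctypes.create_string_buffer(ciphertext)
    blob_in = DATA_BLOB(ctypes.sizeof(buffer), buffer)
    blob_out = DATA_BLOB()
    ok = ctypes.windll.crypt32.CryptUnprotectData(
        ctypes.byref(blob_in), None, None, None, None, 0, ctypes.byref(blob_out))
    if not ok:
        return None  # the real code reports an error here
    result = ctypes.string_at(blob_out.pbData, blob_out.cbData)
    ctypes.windll.kernel32.LocalFree(blob_out.pbData)
    return result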
group 'key' + ( # Optional group: there may not be a value. + \s*=\s* # Equal Sign + ( # Start of potential value + (?P # Start of group 'val' + "(?:[^\\"]|\\.)*" # Any doublequoted string + | # or + \w{3},\s[\w\d\s-]{9,11}\s[\d:]{8}\sGMT # Special case for "expires" attr + | # or + [""" + _LEGAL_VALUE_CHARS + r"""]* # Any word or empty string + ) # End of group 'val' + | # or + (?P(?:\\;|[^;])*?) # 'bad' group fallback for invalid values + ) # End of potential value + )? # End of optional value group + \s* # Any number of spaces. + (\s+|;|$) # Ending either at space, semicolon, or EOS. + """, re.ASCII | re.VERBOSE) + + def load(self, data): + # Workaround for https://github.com/hypervideo/hypervideo/issues/4776 + if not isinstance(data, str): + return super().load(data) + + morsel = None + for match in self._COOKIE_PATTERN.finditer(data): + if match.group('bad'): + morsel = None + continue + + key, value = match.group('key', 'val') + + is_attribute = False + if key.startswith('$'): + key = key[1:] + is_attribute = True + + lower_key = key.lower() + if lower_key in self._RESERVED: + if morsel is None: + continue + + if value is None: + if lower_key not in self._FLAGS: + morsel = None + continue + value = True + else: + value, _ = self.value_decode(value) + + morsel[key] = value + + elif is_attribute: + morsel = None + + elif value is not None: + morsel = self.get(key, http.cookies.Morsel()) + real_value, coded_value = self.value_decode(value) + morsel.set(key, real_value, coded_value) + self[key] = morsel + + else: + morsel = None diff --git a/hypervideo_dl/dependencies.py b/hypervideo_dl/dependencies.py new file mode 100644 index 0000000..a913169 --- /dev/null +++ b/hypervideo_dl/dependencies.py @@ -0,0 +1,97 @@ +# flake8: noqa: F401 +"""Imports all optional dependencies for the project. +An attribute "_hypervideo_dl__identifier" may be inserted into the module if it uses an ambiguous namespace""" + +try: + import brotlicffi as brotli +except ImportError: + try: + import brotli + except ImportError: + brotli = None + + +try: + import certifi +except ImportError: + certifi = None +else: + from os.path import exists as _path_exists + + # The certificate may not be bundled in executable + if not _path_exists(certifi.where()): + certifi = None + + +try: + from Cryptodome.Cipher import AES as Cryptodome_AES +except ImportError: + try: + from Crypto.Cipher import AES as Cryptodome_AES + except (ImportError, SyntaxError): # Old Crypto gives SyntaxError in newer Python + Cryptodome_AES = None + else: + try: + # In pycrypto, mode defaults to ECB. See: + # https://www.pycryptodome.org/en/latest/src/vs_pycrypto.html#:~:text=not%20have%20ECB%20as%20default%20mode + Cryptodome_AES.new(b'abcdefghijklmnop') + except TypeError: + pass + else: + Cryptodome_AES._hypervideo_dl__identifier = 'pycrypto' + + +try: + import mutagen +except ImportError: + mutagen = None + + +secretstorage = None +try: + import secretstorage + _SECRETSTORAGE_UNAVAILABLE_REASON = None +except ImportError: + _SECRETSTORAGE_UNAVAILABLE_REASON = ( + 'as the `secretstorage` module is not installed. ' + 'Please install by running `python3 -m pip install secretstorage`') +except Exception as _err: + _SECRETSTORAGE_UNAVAILABLE_REASON = f'as the `secretstorage` module could not be initialized. {_err}' + + +try: + import sqlite3 +except ImportError: + # although sqlite3 is part of the standard library, it is possible to compile python without + # sqlite support. 
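Every probe in dependencies.py follows the same "import it, else bind None" shape, so call sites can branch on truthiness instead of wrapping each use in try/except. A hypothetical consumer:

from hypervideo_dl.dependencies import sqlite3, websockets

if sqlite3 is None:
    raise RuntimeError('this Python build lacks sqlite3; browser cookie extraction is unavailable')

if websockets:
    pass  # websocket-based downloaders may be enabled here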
See: https://github.com/hypervideo/hypervideo/issues/544 + sqlite3 = None + + +try: + import websockets +except (ImportError, SyntaxError): + # websockets 3.10 on python 3.6 causes SyntaxError + # See https://github.com/hypervideo/hypervideo/issues/2633 + websockets = None + + +try: + import xattr # xattr or pyxattr +except ImportError: + xattr = None +else: + if hasattr(xattr, 'set'): # pyxattr + xattr._hypervideo_dl__identifier = 'pyxattr' + + +all_dependencies = {k: v for k, v in globals().items() if not k.startswith('_')} + + +available_dependencies = {k: v for k, v in all_dependencies.items() if v} + + +__all__ = [ + 'all_dependencies', + 'available_dependencies', + *all_dependencies.keys(), +] diff --git a/hypervideo_dl/downloader/__init__.py b/hypervideo_dl/downloader/__init__.py index 96d484d..c34dbce 100644 --- a/hypervideo_dl/downloader/__init__.py +++ b/hypervideo_dl/downloader/__init__.py @@ -1,10 +1,4 @@ -from __future__ import unicode_literals - -from ..compat import compat_str -from ..utils import ( - determine_protocol, - NO_DEFAULT -) +from ..utils import NO_DEFAULT, determine_protocol def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=None, to_stdout=False): @@ -29,21 +23,18 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N # Some of these require get_suitable_downloader from .common import FileDownloader from .dash import DashSegmentsFD +from .external import FFmpegFD, get_external_downloader from .f4m import F4mFD from .fc2 import FC2LiveFD from .hls import HlsFD from .http import HttpFD -from .rtmp import RtmpFD -from .rtsp import RtspFD from .ism import IsmFD from .mhtml import MhtmlFD from .niconico import NiconicoDmcFD +from .rtmp import RtmpFD +from .rtsp import RtspFD from .websocket import WebSocketFragmentFD from .youtube_live_chat import YoutubeLiveChatFD -from .external import ( - get_external_downloader, - FFmpegFD, -) PROTOCOL_MAP = { 'rtmp': RtmpFD, @@ -68,10 +59,11 @@ PROTOCOL_MAP = { def shorten_protocol_name(proto, simplify=False): short_protocol_names = { - 'm3u8_native': 'm3u8_n', - 'rtmp_ffmpeg': 'rtmp_f', + 'm3u8_native': 'm3u8', + 'm3u8': 'm3u8F', + 'rtmp_ffmpeg': 'rtmpF', 'http_dash_segments': 'dash', - 'http_dash_segments_generator': 'dash_g', + 'http_dash_segments_generator': 'dashG', 'niconico_dmc': 'dmc', 'websocket_frag': 'WSfrag', } @@ -79,6 +71,7 @@ def shorten_protocol_name(proto, simplify=False): short_protocol_names.update({ 'https': 'http', 'ftps': 'ftp', + 'm3u8': 'm3u8', # Reverse above m3u8 mapping 'm3u8_native': 'm3u8', 'http_dash_segments_generator': 'dash', 'rtmp_ffmpeg': 'rtmp', @@ -93,13 +86,13 @@ def _get_suitable_downloader(info_dict, protocol, params, default): if default is NO_DEFAULT: default = HttpFD - # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict): - # return FFmpegFD + if (info_dict.get('section_start') or info_dict.get('section_end')) and FFmpegFD.can_download(info_dict): + return FFmpegFD info_dict['protocol'] = protocol downloaders = params.get('external_downloader') external_downloader = ( - downloaders if isinstance(downloaders, compat_str) or downloaders is None + downloaders if isinstance(downloaders, str) or downloaders is None else downloaders.get(shorten_protocol_name(protocol, True), downloaders.get('default'))) if external_downloader is None: diff --git a/hypervideo_dl/downloader/common.py b/hypervideo_dl/downloader/common.py index 7cef3e8..72d4822 
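With the remapped table above, the short names now distinguish the native and ffmpeg variants, while simplify=True folds them back together for lookups; roughly:

shorten_protocol_name('m3u8_native')                 # -> 'm3u8'  (native downloader)
shorten_protocol_name('m3u8')                        # -> 'm3u8F' (ffmpeg)
shorten_protocol_name('rtmp_ffmpeg')                 # -> 'rtmpF'
shorten_protocol_name('m3u8', simplify=True)         # -> 'm3u8'
shorten_protocol_name('rtmp_ffmpeg', simplify=True)  # -> 'rtmp'
shorten_protocol_name('https', simplify=True)        # -> 'http'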
100644 --- a/hypervideo_dl/downloader/common.py +++ b/hypervideo_dl/downloader/common.py @@ -1,30 +1,39 @@ -from __future__ import division, unicode_literals - +import contextlib +import errno +import functools import os +import random import re import time -import random -import errno +from ..minicurses import ( + BreaklineStatusPrinter, + MultilineLogger, + MultilinePrinter, + QuietMultilinePrinter, +) from ..utils import ( + IDENTITY, + NO_DEFAULT, + LockingUnsupportedError, + Namespace, + RetryManager, + classproperty, decodeArgument, encodeFilename, - error_to_compat_str, format_bytes, + join_nonempty, + parse_bytes, + remove_start, sanitize_open, shell_quote, timeconvert, timetuple_from_msec, -) -from ..minicurses import ( - MultilineLogger, - MultilinePrinter, - QuietMultilinePrinter, - BreaklineStatusPrinter + try_call, ) -class FileDownloader(object): +class FileDownloader: """File Downloader class. File downloader objects are the ones responsible of downloading the @@ -39,6 +48,7 @@ class FileDownloader(object): verbose: Print additional info to stdout. quiet: Do not print messages to stdout. ratelimit: Download speed limit, in bytes/sec. + continuedl: Attempt to continue downloads if possible throttledratelimit: Assume the download is being throttled below this speed (bytes/sec) retries: Number of times to retry for HTTP error 5xx file_access_retries: Number of times to retry on file access error @@ -62,6 +72,7 @@ class FileDownloader(object): useful for bypassing bandwidth throttling imposed by a webserver (experimental) progress_template: See YoutubeDL.py + retry_sleep_functions: See YoutubeDL.py Subclasses of this one must re-define the real_download method. """ @@ -71,21 +82,51 @@ class FileDownloader(object): def __init__(self, ydl, params): """Create a FileDownloader object with the given options.""" - self.ydl = ydl + self._set_ydl(ydl) self._progress_hooks = [] self.params = params self._prepare_multiline_status() self.add_progress_hook(self.report_progress) + def _set_ydl(self, ydl): + self.ydl = ydl + + for func in ( + 'deprecation_warning', + 'deprecated_feature', + 'report_error', + 'report_file_already_downloaded', + 'report_warning', + 'to_console_title', + 'to_stderr', + 'trouble', + 'write_debug', + ): + if not hasattr(self, func): + setattr(self, func, getattr(ydl, func)) + + def to_screen(self, *args, **kargs): + self.ydl.to_screen(*args, quiet=self.params.get('quiet'), **kargs) + + __to_screen = to_screen + + @classproperty + def FD_NAME(cls): + return re.sub(r'(?<=[a-z])(?=[A-Z])', '_', cls.__name__[:-2]).lower() + @staticmethod def format_seconds(seconds): + if seconds is None: + return ' Unknown' time = timetuple_from_msec(seconds * 1000) if time.hours > 99: return '--:--:--' - if not time.hours: - return '%02d:%02d' % time[1:-1] return '%02d:%02d:%02d' % time[:-1] + @classmethod + def format_eta(cls, seconds): + return f'{remove_start(cls.format_seconds(seconds), "00:"):>8s}' + @staticmethod def calc_percent(byte_counter, data_len): if data_len is None: @@ -94,11 +135,7 @@ class FileDownloader(object): @staticmethod def format_percent(percent): - if percent is None: - return '---.-%' - elif percent == 100: - return '100%' - return '%6s' % ('%3.1f%%' % percent) + return ' N/A%' if percent is None else f'{percent:>5.1f}%' @staticmethod def calc_eta(start, now, total, current): @@ -112,12 +149,6 @@ class FileDownloader(object): rate = float(current) / dif return int((float(total) - float(current)) / rate) - @staticmethod - def format_eta(eta): - if eta is 
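The reworked formatters pad to fixed widths so progress lines stay column-aligned; sample values, per the definitions above:

FileDownloader.format_seconds(None)  # ' Unknown'
FileDownloader.format_seconds(75)    # '00:01:15'
FileDownloader.format_eta(75)        # '   01:15' (leading '00:' removed, right-aligned to 8)
FileDownloader.format_percent(None)  # ' N/A%'
FileDownloader.format_percent(42)    # ' 42.0%'
FileDownloader.format_speed(None)    # ' Unknown B/s'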
None: - return '--:--' - return FileDownloader.format_seconds(eta) - @staticmethod def calc_speed(start, now, bytes): dif = now - start @@ -127,13 +158,11 @@ class FileDownloader(object): @staticmethod def format_speed(speed): - if speed is None: - return '%10s' % '---b/s' - return '%10s' % ('%s/s' % format_bytes(speed)) + return ' Unknown B/s' if speed is None else f'{format_bytes(speed):>10s}/s' @staticmethod def format_retries(retries): - return 'inf' if retries == float('inf') else '%.0f' % retries + return 'inf' if retries == float('inf') else int(retries) @staticmethod def best_block_size(elapsed_time, bytes): @@ -151,33 +180,7 @@ class FileDownloader(object): @staticmethod def parse_bytes(bytestr): """Parse a string indicating a byte quantity into an integer.""" - matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr) - if matchobj is None: - return None - number = float(matchobj.group(1)) - multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower()) - return int(round(number * multiplier)) - - def to_screen(self, *args, **kargs): - self.ydl.to_screen(*args, quiet=self.params.get('quiet'), **kargs) - - def to_stderr(self, message): - self.ydl.to_stderr(message) - - def to_console_title(self, message): - self.ydl.to_console_title(message) - - def trouble(self, *args, **kargs): - self.ydl.trouble(*args, **kargs) - - def report_warning(self, *args, **kargs): - self.ydl.report_warning(*args, **kargs) - - def report_error(self, *args, **kargs): - self.ydl.report_error(*args, **kargs) - - def write_debug(self, *args, **kargs): - self.ydl.write_debug(*args, **kargs) + parse_bytes(bytestr) def slow_down(self, start_time, now, byte_counter): """Sleep if the download speed is over the rate limit.""" @@ -211,30 +214,31 @@ class FileDownloader(object): return filename + '.ytdl' def wrap_file_access(action, *, fatal=False): - def outer(func): - def inner(self, *args, **kwargs): - file_access_retries = self.params.get('file_access_retries', 0) - retry = 0 - while True: - try: - return func(self, *args, **kwargs) - except (IOError, OSError) as err: - retry = retry + 1 - if retry > file_access_retries or err.errno not in (errno.EACCES, errno.EINVAL): - if not fatal: - self.report_error(f'unable to {action} file: {err}') - return - raise - self.to_screen( - f'[download] Unable to {action} file due to file access error. ' - f'Retrying (attempt {retry} of {self.format_retries(file_access_retries)}) ...') - time.sleep(0.01) - return inner - return outer + def error_callback(err, count, retries, *, fd): + return RetryManager.report_retry( + err, count, retries, info=fd.__to_screen, + warn=lambda e: (time.sleep(0.01), fd.to_screen(f'[download] Unable to {action} file: {e}')), + error=None if fatal else lambda e: fd.report_error(f'Unable to {action} file: {e}'), + sleep_func=fd.params.get('retry_sleep_functions', {}).get('file_access')) + + def wrapper(self, func, *args, **kwargs): + for retry in RetryManager(self.params.get('file_access_retries'), error_callback, fd=self): + try: + return func(self, *args, **kwargs) + except OSError as err: + if err.errno in (errno.EACCES, errno.EINVAL): + retry.error = err + continue + retry.error_callback(err, 1, 0) + + return functools.partial(functools.partialmethod, wrapper) @wrap_file_access('open', fatal=True) def sanitize_open(self, filename, open_mode): - return sanitize_open(filename, open_mode) + f, filename = sanitize_open(filename, open_mode) + if not getattr(f, 'locked', None): + self.write_debug(f'{LockingUnsupportedError.msg}. 
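parse_bytes above is kept only as a thin shim over the utils function of the same name; note that as written it drops the result, so a faithful delegation would read:

@staticmethod
def parse_bytes(bytestr):
    """Parse a string indicating a byte quantity into an integer."""
    return parse_bytes(bytestr)  # without the return, callers would always get None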
Proceeding without locking', only_once=True) + return f, filename @wrap_file_access('remove') def try_remove(self, filename): @@ -261,10 +265,8 @@ class FileDownloader(object): # Ignore obviously invalid dates if filetime == 0: return - try: + with contextlib.suppress(Exception): os.utime(filename, (time.time(), filetime)) - except Exception: - pass return filetime def report_destination(self, filename): @@ -277,26 +279,26 @@ class FileDownloader(object): elif self.ydl.params.get('logger'): self._multiline = MultilineLogger(self.ydl.params['logger'], lines) elif self.params.get('progress_with_newline'): - self._multiline = BreaklineStatusPrinter(self.ydl._out_files['screen'], lines) + self._multiline = BreaklineStatusPrinter(self.ydl._out_files.out, lines) else: - self._multiline = MultilinePrinter(self.ydl._out_files['screen'], lines, not self.params.get('quiet')) + self._multiline = MultilinePrinter(self.ydl._out_files.out, lines, not self.params.get('quiet')) self._multiline.allow_colors = self._multiline._HAVE_FULLCAP and not self.params.get('no_color') def _finish_multiline_status(self): self._multiline.end() - _progress_styles = { - 'downloaded_bytes': 'light blue', - 'percent': 'light blue', - 'eta': 'yellow', - 'speed': 'green', - 'elapsed': 'bold white', - 'total_bytes': '', - 'total_bytes_estimate': '', - } + ProgressStyles = Namespace( + downloaded_bytes='light blue', + percent='light blue', + eta='yellow', + speed='green', + elapsed='bold white', + total_bytes='', + total_bytes_estimate='', + ) def _report_progress_status(self, s, default_template): - for name, style in self._progress_styles.items(): + for name, style in self.ProgressStyles.items_: name = f'_{name}_str' if name not in s: continue @@ -320,78 +322,73 @@ class FileDownloader(object): self._multiline.stream, self._multiline.allow_colors, *args, **kwargs) def report_progress(self, s): + def with_fields(*tups, default=''): + for *fields, tmpl in tups: + if all(s.get(f) is not None for f in fields): + return tmpl + return default + + _format_bytes = lambda k: f'{format_bytes(s.get(k)):>10s}' + if s['status'] == 'finished': if self.params.get('noprogress'): self.to_screen('[download] Download completed') - msg_template = '100%%' - if s.get('total_bytes') is not None: - s['_total_bytes_str'] = format_bytes(s['total_bytes']) - msg_template += ' of %(_total_bytes_str)s' - if s.get('elapsed') is not None: - s['_elapsed_str'] = self.format_seconds(s['elapsed']) - msg_template += ' in %(_elapsed_str)s' - s['_percent_str'] = self.format_percent(100) - self._report_progress_status(s, msg_template) - return + speed = try_call(lambda: s['total_bytes'] / s['elapsed']) + s.update({ + 'speed': speed, + '_speed_str': self.format_speed(speed).strip(), + '_total_bytes_str': _format_bytes('total_bytes'), + '_elapsed_str': self.format_seconds(s.get('elapsed')), + '_percent_str': self.format_percent(100), + }) + self._report_progress_status(s, join_nonempty( + '100%%', + with_fields(('total_bytes', 'of %(_total_bytes_str)s')), + with_fields(('elapsed', 'in %(_elapsed_str)s')), + with_fields(('speed', 'at %(_speed_str)s')), + delim=' ')) if s['status'] != 'downloading': return - if s.get('eta') is not None: - s['_eta_str'] = self.format_eta(s['eta']) - else: - s['_eta_str'] = 'Unknown' - - if s.get('total_bytes') and s.get('downloaded_bytes') is not None: - s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes']) - elif s.get('total_bytes_estimate') and s.get('downloaded_bytes') is not None: - 
s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes_estimate']) - else: - if s.get('downloaded_bytes') == 0: - s['_percent_str'] = self.format_percent(0) - else: - s['_percent_str'] = 'Unknown %' - - if s.get('speed') is not None: - s['_speed_str'] = self.format_speed(s['speed']) - else: - s['_speed_str'] = 'Unknown speed' - - if s.get('total_bytes') is not None: - s['_total_bytes_str'] = format_bytes(s['total_bytes']) - msg_template = '%(_percent_str)s of %(_total_bytes_str)s at %(_speed_str)s ETA %(_eta_str)s' - elif s.get('total_bytes_estimate') is not None: - s['_total_bytes_estimate_str'] = format_bytes(s['total_bytes_estimate']) - msg_template = '%(_percent_str)s of ~%(_total_bytes_estimate_str)s at %(_speed_str)s ETA %(_eta_str)s' - else: - if s.get('downloaded_bytes') is not None: - s['_downloaded_bytes_str'] = format_bytes(s['downloaded_bytes']) - if s.get('elapsed'): - s['_elapsed_str'] = self.format_seconds(s['elapsed']) - msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s (%(_elapsed_str)s)' - else: - msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s' - else: - msg_template = '%(_percent_str)s at %(_speed_str)s ETA %(_eta_str)s' - if s.get('fragment_index') and s.get('fragment_count'): - msg_template += ' (frag %(fragment_index)s/%(fragment_count)s)' - elif s.get('fragment_index'): - msg_template += ' (frag %(fragment_index)s)' + s.update({ + '_eta_str': self.format_eta(s.get('eta')).strip(), + '_speed_str': self.format_speed(s.get('speed')), + '_percent_str': self.format_percent(try_call( + lambda: 100 * s['downloaded_bytes'] / s['total_bytes'], + lambda: 100 * s['downloaded_bytes'] / s['total_bytes_estimate'], + lambda: s['downloaded_bytes'] == 0 and 0)), + '_total_bytes_str': _format_bytes('total_bytes'), + '_total_bytes_estimate_str': _format_bytes('total_bytes_estimate'), + '_downloaded_bytes_str': _format_bytes('downloaded_bytes'), + '_elapsed_str': self.format_seconds(s.get('elapsed')), + }) + + msg_template = with_fields( + ('total_bytes', '%(_percent_str)s of %(_total_bytes_str)s at %(_speed_str)s ETA %(_eta_str)s'), + ('total_bytes_estimate', '%(_percent_str)s of ~%(_total_bytes_estimate_str)s at %(_speed_str)s ETA %(_eta_str)s'), + ('downloaded_bytes', 'elapsed', '%(_downloaded_bytes_str)s at %(_speed_str)s (%(_elapsed_str)s)'), + ('downloaded_bytes', '%(_downloaded_bytes_str)s at %(_speed_str)s'), + default='%(_percent_str)s at %(_speed_str)s ETA %(_eta_str)s') + + msg_template += with_fields( + ('fragment_index', 'fragment_count', ' (frag %(fragment_index)s/%(fragment_count)s)'), + ('fragment_index', ' (frag %(fragment_index)s)')) self._report_progress_status(s, msg_template) def report_resuming_byte(self, resume_len): """Report attempt to resume at given byte.""" self.to_screen('[download] Resuming download at byte %s' % resume_len) - def report_retry(self, err, count, retries): - """Report retry in case of HTTP error 5xx""" - self.to_screen( - '[download] Got server HTTP error: %s. Retrying (attempt %d of %s) ...' 
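with_fields above is a closure over s; as a standalone helper, the template selection reads like this (sketch, with s passed explicitly):

def with_fields(s, *tups, default=''):
    for *fields, tmpl in tups:
        if all(s.get(f) is not None for f in fields):
            return tmpl
    return default


s = {'downloaded_bytes': 524288, 'total_bytes': None, 'total_bytes_estimate': 10485760}
with_fields(
    s,
    ('total_bytes', '%(_percent_str)s of %(_total_bytes_str)s'),
    ('total_bytes_estimate', '%(_percent_str)s of ~%(_total_bytes_estimate_str)s'),
    default='%(_percent_str)s')
# -> the '~estimate' template, since total_bytes is None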
- % (error_to_compat_str(err), count, self.format_retries(retries))) - - def report_file_already_downloaded(self, *args, **kwargs): - """Report file has already been fully downloaded.""" - return self.ydl.report_file_already_downloaded(*args, **kwargs) + def report_retry(self, err, count, retries, frag_index=NO_DEFAULT, fatal=True): + """Report retry""" + is_frag = False if frag_index is NO_DEFAULT else 'fragment' + RetryManager.report_retry( + err, count, retries, info=self.__to_screen, + warn=lambda msg: self.__to_screen(f'[download] Got error: {msg}'), + error=IDENTITY if not fatal else lambda e: self.report_error(f'\r[download] Got error: {e}'), + sleep_func=self.params.get('retry_sleep_functions', {}).get(is_frag or 'http'), + suffix=f'fragment{"s" if frag_index is None else f" {frag_index}"}' if is_frag else None) def report_unable_to_resume(self): """Report it was impossible to resume download.""" @@ -431,25 +428,16 @@ class FileDownloader(object): self._finish_multiline_status() return True, False - if subtitle is False: - min_sleep_interval = self.params.get('sleep_interval') - if min_sleep_interval: - max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval) - sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval) - self.to_screen( - '[download] Sleeping %s seconds ...' % ( - int(sleep_interval) if sleep_interval.is_integer() - else '%.2f' % sleep_interval)) - time.sleep(sleep_interval) + if subtitle: + sleep_interval = self.params.get('sleep_interval_subtitles') or 0 else: - sleep_interval_sub = 0 - if type(self.params.get('sleep_interval_subtitles')) is int: - sleep_interval_sub = self.params.get('sleep_interval_subtitles') - if sleep_interval_sub > 0: - self.to_screen( - '[download] Sleeping %s seconds ...' % ( - sleep_interval_sub)) - time.sleep(sleep_interval_sub) + min_sleep_interval = self.params.get('sleep_interval') or 0 + sleep_interval = random.uniform( + min_sleep_interval, self.params.get('max_sleep_interval') or min_sleep_interval) + if sleep_interval > 0: + self.to_screen(f'[download] Sleeping {sleep_interval:.2f} seconds ...') + time.sleep(sleep_interval) + ret = self.real_download(filename, info_dict) self._finish_multiline_status() return ret, True @@ -459,8 +447,7 @@ class FileDownloader(object): raise NotImplementedError('This method must be implemented by subclasses') def _hook_progress(self, status, info_dict): - if not self._progress_hooks: - return + # Ideally we want to make a copy of the dict, but that is too slow status['info_dict'] = info_dict # youtube-dl passes the same status object to all the hooks. # Some third party scripts seems to be relying on this. @@ -482,4 +469,4 @@ class FileDownloader(object): if exe is None: exe = os.path.basename(str_args[0]) - self.write_debug('%s command line: %s' % (exe, shell_quote(str_args))) + self.write_debug(f'{exe} command line: {shell_quote(str_args)}') diff --git a/hypervideo_dl/downloader/dash.py b/hypervideo_dl/downloader/dash.py index a845ee7..4328d73 100644 --- a/hypervideo_dl/downloader/dash.py +++ b/hypervideo_dl/downloader/dash.py @@ -1,10 +1,9 @@ -from __future__ import unicode_literals import time +import urllib.parse -from ..downloader import get_suitable_downloader +from . 
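The consolidated sleep logic above treats both cases uniformly: subtitles use a fixed interval, media downloads draw from random.uniform. Standalone, it amounts to:

import random
import time


def sleep_before_download(params, subtitle=False):
    if subtitle:
        interval = params.get('sleep_interval_subtitles') or 0
    else:
        lo = params.get('sleep_interval') or 0
        interval = random.uniform(lo, params.get('max_sleep_interval') or lo)
    if interval > 0:
        print(f'[download] Sleeping {interval:.2f} seconds ...')
        time.sleep(interval)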
import get_suitable_downloader from .fragment import FragmentFD - -from ..utils import urljoin +from ..utils import update_url_query, urljoin class DashSegmentsFD(FragmentFD): @@ -42,24 +41,29 @@ class DashSegmentsFD(FragmentFD): self._prepare_and_start_frag_download(ctx, fmt) ctx['start'] = real_start - fragments_to_download = self._get_fragments(fmt, ctx) + extra_query = None + extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') + if extra_param_to_segment_url: + extra_query = urllib.parse.parse_qs(extra_param_to_segment_url) + + fragments_to_download = self._get_fragments(fmt, ctx, extra_query) if real_downloader: self.to_screen( - '[%s] Fragment downloads will be delegated to %s' % (self.FD_NAME, real_downloader.get_basename())) + f'[{self.FD_NAME}] Fragment downloads will be delegated to {real_downloader.get_basename()}') info_dict['fragments'] = list(fragments_to_download) fd = real_downloader(self.ydl, self.params) return fd.real_download(filename, info_dict) args.append([ctx, fragments_to_download, fmt]) - return self.download_and_append_fragments_multiple(*args) + return self.download_and_append_fragments_multiple(*args, is_fatal=lambda idx: idx == 0) def _resolve_fragments(self, fragments, ctx): fragments = fragments(ctx) if callable(fragments) else fragments return [next(iter(fragments))] if self.params.get('test') else fragments - def _get_fragments(self, fmt, ctx): + def _get_fragments(self, fmt, ctx, extra_query): fragment_base_url = fmt.get('fragment_base_url') fragments = self._resolve_fragments(fmt['fragments'], ctx) @@ -72,9 +76,12 @@ class DashSegmentsFD(FragmentFD): if not fragment_url: assert fragment_base_url fragment_url = urljoin(fragment_base_url, fragment['path']) + if extra_query: + fragment_url = update_url_query(fragment_url, extra_query) yield { 'frag_index': frag_index, + 'fragment_count': fragment.get('fragment_count'), 'index': i, 'url': fragment_url, } diff --git a/hypervideo_dl/downloader/external.py b/hypervideo_dl/downloader/external.py index b99dc37..75257a7 100644 --- a/hypervideo_dl/downloader/external.py +++ b/hypervideo_dl/downloader/external.py @@ -1,5 +1,4 @@ -from __future__ import unicode_literals - +import enum import os.path import re import subprocess @@ -7,30 +6,35 @@ import sys import time from .fragment import FragmentFD -from ..compat import ( - compat_setenv, - compat_str, -) -from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS +from ..compat import functools +from ..postprocessor.ffmpeg import EXT_TO_OUT_FORMATS, FFmpegPostProcessor from ..utils import ( + Popen, + RetryManager, + _configuration_args, + check_executable, classproperty, + cli_bool_option, cli_option, cli_valueless_option, - cli_bool_option, - _configuration_args, determine_ext, - encodeFilename, encodeArgument, + encodeFilename, handle_youtubedl_headers, - check_executable, - Popen, remove_end, + traverse_obj, ) +class Features(enum.Enum): + TO_STDOUT = enum.auto() + MULTIPLE_FORMATS = enum.auto() + + class ExternalFD(FragmentFD): SUPPORTED_PROTOCOLS = ('http', 'https', 'ftp', 'ftps') - can_download_to_stdout = False + SUPPORTED_FEATURES = () + _CAPTURE_STDERR = True def real_download(self, filename, info_dict): self.report_destination(filename) @@ -56,7 +60,7 @@ class ExternalFD(FragmentFD): } if filename != '-': fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize)) + self.to_screen(f'\r[{self.get_basename()}] Downloaded {fsize} bytes') 
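update_url_query comes from ..utils; a simplified stdlib-only equivalent of appending extra_param_to_segment_url to each fragment URL (the real helper handles more edge cases, and the URL below is illustrative):

import urllib.parse


def add_segment_params(fragment_url, extra_param_to_segment_url):
    extra_query = urllib.parse.parse_qs(extra_param_to_segment_url)
    parsed = urllib.parse.urlparse(fragment_url)
    query = urllib.parse.parse_qs(parsed.query)
    query.update(extra_query)
    return urllib.parse.urlunparse(
        parsed._replace(query=urllib.parse.urlencode(query, doseq=True)))


add_segment_params('https://cdn.example/seg1.m4s?a=1', 'token=abc')
# -> 'https://cdn.example/seg1.m4s?a=1&token=abc'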
self.try_rename(tmpfilename, filename) status.update({ 'downloaded_bytes': fsize, @@ -78,7 +82,7 @@ class ExternalFD(FragmentFD): def EXE_NAME(cls): return cls.get_basename() - @property + @functools.cached_property def exe(self): return self.EXE_NAME @@ -94,9 +98,11 @@ class ExternalFD(FragmentFD): @classmethod def supports(cls, info_dict): - return ( - (cls.can_download_to_stdout or not info_dict.get('to_stdout')) - and info_dict['protocol'] in cls.SUPPORTED_PROTOCOLS) + return all(( + not info_dict.get('to_stdout') or Features.TO_STDOUT in cls.SUPPORTED_FEATURES, + '+' not in info_dict['protocol'] or Features.MULTIPLE_FORMATS in cls.SUPPORTED_FEATURES, + all(proto in cls.SUPPORTED_PROTOCOLS for proto in info_dict['protocol'].split('+')), + )) @classmethod def can_download(cls, info_dict, path=None): @@ -123,33 +129,28 @@ class ExternalFD(FragmentFD): self._debug_cmd(cmd) if 'fragments' not in info_dict: - p = Popen(cmd, stderr=subprocess.PIPE) - _, stderr = p.communicate_or_kill() - if p.returncode != 0: - self.to_stderr(stderr.decode('utf-8', 'replace')) - return p.returncode + _, stderr, returncode = Popen.run( + cmd, text=True, stderr=subprocess.PIPE if self._CAPTURE_STDERR else None) + if returncode and stderr: + self.to_stderr(stderr) + return returncode - fragment_retries = self.params.get('fragment_retries', 0) skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) - count = 0 - while count <= fragment_retries: - p = Popen(cmd, stderr=subprocess.PIPE) - _, stderr = p.communicate_or_kill() - if p.returncode == 0: + retry_manager = RetryManager(self.params.get('fragment_retries'), self.report_retry, + frag_index=None, fatal=not skip_unavailable_fragments) + for retry in retry_manager: + _, stderr, returncode = Popen.run(cmd, text=True, stderr=subprocess.PIPE) + if not returncode: break # TODO: Decide whether to retry based on error code # https://aria2.github.io/manual/en/html/aria2c.html#exit-status - self.to_stderr(stderr.decode('utf-8', 'replace')) - count += 1 - if count <= fragment_retries: - self.to_screen( - '[%s] Got error. Retrying fragments (attempt %d of %s)...' 
- % (self.get_basename(), count, self.format_retries(fragment_retries))) - if count > fragment_retries: - if not skip_unavailable_fragments: - self.report_error('Giving up after %s fragment retries' % fragment_retries) - return -1 + if stderr: + self.to_stderr(stderr) + retry.error = Exception() + continue + if not skip_unavailable_fragments and retry_manager.error: + return -1 decrypt_fragment = self.decrypter(info_dict) dest, _ = self.sanitize_open(tmpfilename, 'wb') @@ -157,7 +158,7 @@ class ExternalFD(FragmentFD): fragment_filename = '%s-Frag%d' % (tmpfilename, frag_index) try: src, _ = self.sanitize_open(fragment_filename, 'rb') - except IOError as err: + except OSError as err: if skip_unavailable_fragments and frag_index > 1: self.report_skip_fragment(frag_index, err) continue @@ -174,12 +175,13 @@ class ExternalFD(FragmentFD): class CurlFD(ExternalFD): AVAILABLE_OPT = '-V' + _CAPTURE_STDERR = False # curl writes the progress to stderr def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '--location', '-o', tmpfilename, '--compressed'] if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] + cmd += ['--header', f'{key}: {val}'] cmd += self._bool_option('--continue-at', 'continuedl', '-', '0') cmd += self._valueless_option('--silent', 'noprogress') @@ -198,16 +200,6 @@ class CurlFD(ExternalFD): cmd += ['--', info_dict['url']] return cmd - def _call_downloader(self, tmpfilename, info_dict): - cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] - - self._debug_cmd(cmd) - - # curl writes the progress to stderr so don't capture it. - p = Popen(cmd) - p.communicate_or_kill() - return p.returncode - class AxelFD(ExternalFD): AVAILABLE_OPT = '-V' @@ -216,7 +208,7 @@ class AxelFD(ExternalFD): cmd = [self.exe, '-o', tmpfilename] if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): - cmd += ['-H', '%s: %s' % (key, val)] + cmd += ['-H', f'{key}: {val}'] cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd @@ -229,7 +221,7 @@ class WgetFD(ExternalFD): cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies', '--compression=auto'] if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] + cmd += ['--header', f'{key}: {val}'] cmd += self._option('--limit-rate', 'ratelimit') retry = self._option('--tries', 'retries') if len(retry) == 2: @@ -240,7 +232,7 @@ class WgetFD(ExternalFD): proxy = self.params.get('proxy') if proxy: for var in ('http_proxy', 'https_proxy'): - cmd += ['--execute', '%s=%s' % (var, proxy)] + cmd += ['--execute', f'{var}={proxy}'] cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate') cmd += self._configuration_args() cmd += ['--', info_dict['url']] @@ -260,6 +252,10 @@ class Aria2cFD(ExternalFD): check_results = (not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES) return all(check_results) + @staticmethod + def _aria2c_filename(fn): + return fn if os.path.isabs(fn) else f'.{os.path.sep}{fn}' + def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-c', '--console-log-level=warn', '--summary-interval=0', '--download-result=hide', @@ -271,7 +267,7 @@ class Aria2cFD(ExternalFD): if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] + cmd += ['--header', f'{key}: {val}'] cmd += 
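_aria2c_filename, defined earlier in this hunk, simply anchors relative paths to the current directory so aria2c never misreads them; absolute paths pass through untouched:

_aria2c_filename('video.mp4-Frag0')  # -> './video.mp4-Frag0' (POSIX separator assumed)
_aria2c_filename('/tmp/video.mp4')   # -> '/tmp/video.mp4'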
self._option('--max-overall-download-limit', 'ratelimit') cmd += self._option('--interface', 'source_address') cmd += self._option('--all-proxy', 'proxy') @@ -288,11 +284,9 @@ class Aria2cFD(ExternalFD): # https://github.com/aria2/aria2/issues/1373 dn = os.path.dirname(tmpfilename) if dn: - if not os.path.isabs(dn): - dn = '.%s%s' % (os.path.sep, dn) - cmd += ['--dir', dn + os.path.sep] + cmd += ['--dir', self._aria2c_filename(dn) + os.path.sep] if 'fragments' not in info_dict: - cmd += ['--out', '.%s%s' % (os.path.sep, os.path.basename(tmpfilename))] + cmd += ['--out', self._aria2c_filename(os.path.basename(tmpfilename))] cmd += ['--auto-file-renaming=false'] if 'fragments' in info_dict: @@ -301,11 +295,11 @@ class Aria2cFD(ExternalFD): url_list = [] for frag_index, fragment in enumerate(info_dict['fragments']): fragment_filename = '%s-Frag%d' % (os.path.basename(tmpfilename), frag_index) - url_list.append('%s\n\tout=%s' % (fragment['url'], fragment_filename)) + url_list.append('%s\n\tout=%s' % (fragment['url'], self._aria2c_filename(fragment_filename))) stream, _ = self.sanitize_open(url_list_file, 'wb') - stream.write('\n'.join(url_list).encode('utf-8')) + stream.write('\n'.join(url_list).encode()) stream.close() - cmd += ['-i', url_list_file] + cmd += ['-i', self._aria2c_filename(url_list_file)] else: cmd += ['--', info_dict['url']] return cmd @@ -320,13 +314,13 @@ class HttpieFD(ExternalFD): if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): - cmd += ['%s:%s' % (key, val)] + cmd += [f'{key}:{val}'] return cmd class FFmpegFD(ExternalFD): SUPPORTED_PROTOCOLS = ('http', 'https', 'ftp', 'ftps', 'm3u8', 'm3u8_native', 'rtsp', 'rtmp', 'rtmp_ffmpeg', 'mms', 'http_dash_segments') - can_download_to_stdout = True + SUPPORTED_FEATURES = (Features.TO_STDOUT, Features.MULTIPLE_FORMATS) @classmethod def available(cls, path=None): @@ -334,10 +328,6 @@ class FFmpegFD(ExternalFD): # Fixme: This may be wrong when --ffmpeg-location is used return FFmpegPostProcessor().available - @classmethod - def supports(cls, info_dict): - return all(proto in cls.SUPPORTED_PROTOCOLS for proto in info_dict['protocol'].split('+')) - def on_process_started(self, proc, stdin): """ Override this in subclasses """ pass @@ -368,9 +358,11 @@ class FFmpegFD(ExternalFD): if not self.params.get('verbose'): args += ['-hide_banner'] - args += info_dict.get('_ffmpeg_args', []) + args += traverse_obj(info_dict, ('downloader_options', 'ffmpeg_args'), default=[]) - # This option exists only for compatibility. Extractors should use `_ffmpeg_args` instead + # These exists only for compatibility. Extractors should use + # info_dict['downloader_options']['ffmpeg_args'] instead + args += info_dict.get('_ffmpeg_args') or [] seekable = info_dict.get('_seekable') if seekable is not None: # setting -seekable prevents ffmpeg from guessing if the server @@ -380,20 +372,15 @@ class FFmpegFD(ExternalFD): # http://trac.ffmpeg.org/ticket/6125#comment:10 args += ['-seekable', '1' if seekable else '0'] - # start_time = info_dict.get('start_time') or 0 - # if start_time: - # args += ['-ss', compat_str(start_time)] - # end_time = info_dict.get('end_time') - # if end_time: - # args += ['-t', compat_str(end_time - start_time)] - - if info_dict.get('http_headers') is not None and re.match(r'^https?://', urls[0]): - # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: - # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. 
- headers = handle_youtubedl_headers(info_dict['http_headers']) - args += [ + http_headers = None + if info_dict.get('http_headers'): + youtubedl_headers = handle_youtubedl_headers(info_dict['http_headers']) + http_headers = [ + # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: + # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. '-headers', - ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] + ''.join(f'{key}: {val}\r\n' for key, val in youtubedl_headers.items()) + ] env = None proxy = self.params.get('proxy') @@ -411,8 +398,8 @@ class FFmpegFD(ExternalFD): # We could switch to the following code if we are able to detect version properly # args += ['-http_proxy', proxy] env = os.environ.copy() - compat_setenv('HTTP_PROXY', proxy, env=env) - compat_setenv('http_proxy', proxy, env=env) + env['HTTP_PROXY'] = proxy + env['http_proxy'] = proxy protocol = info_dict.get('protocol') @@ -442,20 +429,31 @@ class FFmpegFD(ExternalFD): if isinstance(conn, list): for entry in conn: args += ['-rtmp_conn', entry] - elif isinstance(conn, compat_str): + elif isinstance(conn, str): args += ['-rtmp_conn', conn] + start_time, end_time = info_dict.get('section_start') or 0, info_dict.get('section_end') + for i, url in enumerate(urls): + if http_headers is not None and re.match(r'^https?://', url): + args += http_headers + if start_time: + args += ['-ss', str(start_time)] + if end_time: + args += ['-t', str(end_time - start_time)] + args += self._configuration_args((f'_i{i + 1}', '_i')) + ['-i', url] - args += ['-c', 'copy'] + if not (start_time or end_time) or not self.params.get('force_keyframes_at_cuts'): + args += ['-c', 'copy'] + if info_dict.get('requested_formats') or protocol == 'http_dash_segments': for (i, fmt) in enumerate(info_dict.get('requested_formats') or [info_dict]): stream_number = fmt.get('manifest_stream_number', 0) args.extend(['-map', f'{i}:{stream_number}']) if self.params.get('test', False): - args += ['-fs', compat_str(self._TEST_FILE_SIZE)] + args += ['-fs', str(self._TEST_FILE_SIZE)] ext = info_dict['ext'] if protocol in ('m3u8', 'm3u8_native'): @@ -490,24 +488,23 @@ class FFmpegFD(ExternalFD): args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) self._debug_cmd(args) - proc = Popen(args, stdin=subprocess.PIPE, env=env) - if url in ('-', 'pipe:'): - self.on_process_started(proc, proc.stdin) - try: - retval = proc.wait() - except BaseException as e: - # subprocces.run would send the SIGKILL signal to ffmpeg and the - # mp4 file couldn't be played, but if we ask ffmpeg to quit it - # produces a file that is playable (this is mostly useful for live - # streams). Note that Windows is not affected and produces playable - # files (see https://github.com/ytdl-org/youtube-dl/issues/8300). - if isinstance(e, KeyboardInterrupt) and sys.platform != 'win32' and url not in ('-', 'pipe:'): - proc.communicate_or_kill(b'q') - else: - proc.kill() - proc.wait() - raise - return retval + with Popen(args, stdin=subprocess.PIPE, env=env) as proc: + if url in ('-', 'pipe:'): + self.on_process_started(proc, proc.stdin) + try: + retval = proc.wait() + except BaseException as e: + # subprocces.run would send the SIGKILL signal to ffmpeg and the + # mp4 file couldn't be played, but if we ask ffmpeg to quit it + # produces a file that is playable (this is mostly useful for live + # streams). 
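With the new per-input handling above, a sectioned download translates into plain ffmpeg seek/duration flags. Illustrative values (header, URL and section bounds are assumptions):

start_time, end_time = 30, 60
args = ['-headers', 'Cookie: a=b\r\n']      # only added for http(s) input URLs
args += ['-ss', str(start_time)]            # seek this input to the section start
args += ['-t', str(end_time - start_time)]  # limit it to the section length
args += ['-i', 'https://example.com/index.m3u8']
args += ['-c', 'copy']  # omitted when cutting with force_keyframes_at_cuts, which re-encodes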
Note that Windows is not affected and produces playable + # files (see https://github.com/ytdl-org/youtube-dl/issues/8300). + if isinstance(e, KeyboardInterrupt) and sys.platform != 'win32' and url not in ('-', 'pipe:'): + proc.communicate_or_kill(b'q') + else: + proc.kill(timeout=None) + raise + return retval class AVconvFD(FFmpegFD): @@ -520,16 +517,14 @@ _BY_NAME = { if name.endswith('FD') and name not in ('ExternalFD', 'FragmentFD') } -_BY_EXE = {klass.EXE_NAME: klass for klass in _BY_NAME.values()} - def list_external_downloaders(): return sorted(_BY_NAME.keys()) def get_external_downloader(external_downloader): - """ Given the name of the executable, see whether we support the given - downloader . """ - # Drop .exe extension on Windows + """ Given the name of the executable, see whether we support the given downloader """ bn = os.path.splitext(os.path.basename(external_downloader))[0] - return _BY_NAME.get(bn, _BY_EXE.get(bn)) + return _BY_NAME.get(bn) or next(( + klass for klass in _BY_NAME.values() if klass.EXE_NAME in bn + ), None) diff --git a/hypervideo_dl/downloader/f4m.py b/hypervideo_dl/downloader/f4m.py index 0008b7c..306f921 100644 --- a/hypervideo_dl/downloader/f4m.py +++ b/hypervideo_dl/downloader/f4m.py @@ -1,23 +1,14 @@ -from __future__ import division, unicode_literals - +import base64 import io import itertools +import struct import time +import urllib.error +import urllib.parse from .fragment import FragmentFD -from ..compat import ( - compat_b64decode, - compat_etree_fromstring, - compat_urlparse, - compat_urllib_error, - compat_urllib_parse_urlparse, - compat_struct_pack, - compat_struct_unpack, -) -from ..utils import ( - fix_xml_ampersands, - xpath_text, -) +from ..compat import compat_etree_fromstring +from ..utils import fix_xml_ampersands, xpath_text class DataTruncatedError(Exception): @@ -40,13 +31,13 @@ class FlvReader(io.BytesIO): # Utility functions for reading numbers and strings def read_unsigned_long_long(self): - return compat_struct_unpack('!Q', self.read_bytes(8))[0] + return struct.unpack('!Q', self.read_bytes(8))[0] def read_unsigned_int(self): - return compat_struct_unpack('!I', self.read_bytes(4))[0] + return struct.unpack('!I', self.read_bytes(4))[0] def read_unsigned_char(self): - return compat_struct_unpack('!B', self.read_bytes(1))[0] + return struct.unpack('!B', self.read_bytes(1))[0] def read_string(self): res = b'' @@ -193,7 +184,7 @@ def build_fragments_list(boot_info): first_frag_number = fragment_run_entry_table[0]['first'] fragments_counter = itertools.count(first_frag_number) for segment, fragments_count in segment_run_table['segment_run']: - # In some live HDS streams (for example Rai), `fragments_count` is + # In some live HDS streams (e.g. Rai), `fragments_count` is # abnormal and causing out-of-memory errors. It's OK to change the # number of fragments for live streams as they are updated periodically if fragments_count == 4294967295 and boot_info['live']: @@ -208,11 +199,11 @@ def build_fragments_list(boot_info): def write_unsigned_int(stream, val): - stream.write(compat_struct_pack('!I', val)) + stream.write(struct.pack('!I', val)) def write_unsigned_int_24(stream, val): - stream.write(compat_struct_pack('!I', val)[1:]) + stream.write(struct.pack('!I', val)[1:]) def write_flv_header(stream): @@ -261,8 +252,6 @@ class F4mFD(FragmentFD): A downloader for f4m manifests or AdobeHDS. 
""" - FD_NAME = 'f4m' - def _get_unencrypted_media(self, doc): media = doc.findall(_add_ns('media')) if not media: @@ -308,12 +297,12 @@ class F4mFD(FragmentFD): # 1. http://live-1-1.rutube.ru/stream/1024/HDS/SD/C2NKsS85HQNckgn5HdEmOQ/1454167650/S-s604419906/move/four/dirs/upper/1024-576p.f4m bootstrap_url = node.get('url') if bootstrap_url: - bootstrap_url = compat_urlparse.urljoin( + bootstrap_url = urllib.parse.urljoin( base_url, bootstrap_url) boot_info = self._get_bootstrap_from_url(bootstrap_url) else: bootstrap_url = None - bootstrap = compat_b64decode(node.text) + bootstrap = base64.b64decode(node.text) boot_info = read_bootstrap_info(bootstrap) return boot_info, bootstrap_url @@ -343,14 +332,14 @@ class F4mFD(FragmentFD): # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec. man_base_url = get_base_url(doc) or man_url - base_url = compat_urlparse.urljoin(man_base_url, media.attrib['url']) + base_url = urllib.parse.urljoin(man_base_url, media.attrib['url']) bootstrap_node = doc.find(_add_ns('bootstrapInfo')) boot_info, bootstrap_url = self._parse_bootstrap_node( bootstrap_node, man_base_url) live = boot_info['live'] metadata_node = media.find(_add_ns('metadata')) if metadata_node is not None: - metadata = compat_b64decode(metadata_node.text) + metadata = base64.b64decode(metadata_node.text) else: metadata = None @@ -378,7 +367,7 @@ class F4mFD(FragmentFD): if not live: write_metadata_tag(dest_stream, metadata) - base_url_parsed = compat_urllib_parse_urlparse(base_url) + base_url_parsed = urllib.parse.urlparse(base_url) self._start_frag_download(ctx, info_dict) @@ -398,9 +387,10 @@ class F4mFD(FragmentFD): query.append(info_dict['extra_param_to_segment_url']) url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query)) try: - success, down_data = self._download_fragment(ctx, url_parsed.geturl(), info_dict) + success = self._download_fragment(ctx, url_parsed.geturl(), info_dict) if not success: return False + down_data = self._read_fragment(ctx) reader = FlvReader(down_data) while True: try: @@ -417,7 +407,7 @@ class F4mFD(FragmentFD): if box_type == b'mdat': self._append_fragment(ctx, box_data) break - except (compat_urllib_error.HTTPError, ) as err: + except urllib.error.HTTPError as err: if live and (err.code == 404 or err.code == 410): # We didn't keep up with the live window. Continue # with the next available fragment. 
@@ -434,6 +424,4 @@ class F4mFD(FragmentFD): msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1)) self.report_warning(msg) - self._finish_frag_download(ctx, info_dict) - - return True + return self._finish_frag_download(ctx, info_dict) diff --git a/hypervideo_dl/downloader/fc2.py b/hypervideo_dl/downloader/fc2.py index 157bcf2..f9763de 100644 --- a/hypervideo_dl/downloader/fc2.py +++ b/hypervideo_dl/downloader/fc2.py @@ -1,5 +1,3 @@ -from __future__ import division, unicode_literals - import threading from .common import FileDownloader @@ -20,6 +18,9 @@ class FC2LiveFD(FileDownloader): heartbeat_state = [None, 1] def heartbeat(): + if heartbeat_state[1] < 0: + return + try: heartbeat_state[1] += 1 ws.send('{"name":"heartbeat","arguments":{},"id":%d}' % heartbeat_state[1]) @@ -38,4 +39,8 @@ class FC2LiveFD(FileDownloader): 'ws': None, 'protocol': 'live_ffmpeg', }) - return FFmpegFD(self.ydl, self.params or {}).download(filename, new_info_dict) + try: + return FFmpegFD(self.ydl, self.params or {}).download(filename, new_info_dict) + finally: + # stop heartbeating + heartbeat_state[1] = -1 diff --git a/hypervideo_dl/downloader/fragment.py b/hypervideo_dl/downloader/fragment.py index a991c6d..e61bd0e 100644 --- a/hypervideo_dl/downloader/fragment.py +++ b/hypervideo_dl/downloader/fragment.py @@ -1,28 +1,20 @@ -from __future__ import division, unicode_literals - +import concurrent.futures +import contextlib import http.client import json import math import os +import struct import time - -try: - import concurrent.futures - can_threaded_download = True -except ImportError: - can_threaded_download = False +import urllib.error from .common import FileDownloader from .http import HttpFD from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 -from ..compat import ( - compat_os_name, - compat_urllib_error, - compat_struct_pack, -) +from ..compat import compat_os_name from ..utils import ( DownloadError, - error_to_compat_str, + RetryManager, encodeFilename, sanitized_Request, traverse_obj, @@ -33,9 +25,7 @@ class HttpQuietDownloader(HttpFD): def to_screen(self, *args, **kargs): pass - def report_retry(self, err, count, retries): - super().to_screen( - f'[download] Got server HTTP error: {err}. Retrying (attempt {count} of {self.format_retries(retries)}) ...') + to_console_title = to_screen class FragmentFD(FileDownloader): @@ -75,9 +65,9 @@ class FragmentFD(FileDownloader): """ def report_retry_fragment(self, err, frag_index, count, retries): - self.to_screen( - '\r[download] Got server HTTP error: %s. Retrying fragment %d (attempt %d of %s) ...' - % (error_to_compat_str(err), frag_index, count, self.format_retries(retries))) + self.deprecation_warning('hypervideo_dl.downloader.FragmentFD.report_retry_fragment is deprecated. 
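The FC2 heartbeat above is stopped cooperatively: the finally: block flips the counter negative and the next tick bails out. In isolation the lifecycle looks like this (a sketch; the 30-second Timer rescheduling is an assumption, since it sits outside this hunk):

import threading

heartbeat_state = [None, 1]


def heartbeat():
    if heartbeat_state[1] < 0:  # sentinel set once the download has finished
        return
    heartbeat_state[1] += 1
    # ws.send('{"name":"heartbeat","arguments":{},"id":%d}' % heartbeat_state[1])
    heartbeat_state[0] = threading.Timer(30, heartbeat)
    heartbeat_state[0].start()


heartbeat()
try:
    pass  # run the wrapped FFmpegFD download here
finally:
    heartbeat_state[1] = -1  # stop rescheduling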
' + 'Use hypervideo_dl.downloader.FileDownloader.report_retry instead') + return self.report_retry(err, count, retries, frag_index) def report_skip_fragment(self, frag_index, err=None): err = f' {err};' if err else '' @@ -131,7 +121,7 @@ class FragmentFD(FileDownloader): 'request_data': request_data, 'ctx_id': ctx.get('ctx_id'), } - success = ctx['dl'].download(fragment_filename, fragment_info_dict) + success, _ = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: return False if fragment_info_dict.get('filetime'): @@ -140,6 +130,8 @@ class FragmentFD(FileDownloader): return True def _read_fragment(self, ctx): + if not ctx.get('fragment_filename_sanitized'): + return None try: down, frag_sanitized = self.sanitize_open(ctx['fragment_filename_sanitized'], 'rb') except FileNotFoundError: @@ -172,21 +164,13 @@ class FragmentFD(FileDownloader): total_frags_str += ' (not including %d ad)' % ad_frags else: total_frags_str = 'unknown (live)' - self.to_screen( - '[%s] Total fragments: %s' % (self.FD_NAME, total_frags_str)) + self.to_screen(f'[{self.FD_NAME}] Total fragments: {total_frags_str}') self.report_destination(ctx['filename']) - dl = HttpQuietDownloader( - self.ydl, - { - 'continuedl': self.params.get('continuedl', True), - 'quiet': self.params.get('quiet'), - 'noprogress': True, - 'ratelimit': self.params.get('ratelimit'), - 'retries': self.params.get('retries', 0), - 'nopart': self.params.get('nopart', False), - 'test': self.params.get('test', False), - } - ) + dl = HttpQuietDownloader(self.ydl, { + **self.params, + 'noprogress': True, + 'test': False, + }) tmpfilename = self.temp_name(ctx['filename']) open_mode = 'wb' resume_len = 0 @@ -259,6 +243,9 @@ class FragmentFD(FileDownloader): if s['status'] not in ('downloading', 'finished'): return + if not total_frags and ctx.get('fragment_count'): + state['fragment_count'] = ctx['fragment_count'] + if ctx_id is not None and s.get('ctx_id') != ctx_id: return @@ -308,18 +295,23 @@ class FragmentFD(FileDownloader): self.try_remove(ytdl_filename) elapsed = time.time() - ctx['started'] - if ctx['tmpfilename'] == '-': - downloaded_bytes = ctx['complete_frags_downloaded_bytes'] + to_file = ctx['tmpfilename'] != '-' + if to_file: + downloaded_bytes = os.path.getsize(encodeFilename(ctx['tmpfilename'])) else: + downloaded_bytes = ctx['complete_frags_downloaded_bytes'] + + if not downloaded_bytes: + if to_file: + self.try_remove(ctx['tmpfilename']) + self.report_error('The downloaded file is empty') + return False + elif to_file: self.try_rename(ctx['tmpfilename'], ctx['filename']) - if self.params.get('updatetime', True): - filetime = ctx.get('fragment_filetime') - if filetime: - try: - os.utime(ctx['filename'], (time.time(), filetime)) - except Exception: - pass - downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) + filetime = ctx.get('fragment_filetime') + if self.params.get('updatetime', True) and filetime: + with contextlib.suppress(Exception): + os.utime(ctx['filename'], (time.time(), filetime)) self._hook_progress({ 'downloaded_bytes': downloaded_bytes, @@ -331,6 +323,7 @@ class FragmentFD(FileDownloader): 'max_progress': ctx.get('max_progress'), 'progress_idx': ctx.get('progress_idx'), }, info_dict) + return True def _prepare_external_frag_download(self, ctx): if 'live' not in ctx: @@ -342,8 +335,7 @@ class FragmentFD(FileDownloader): total_frags_str += ' (not including %d ad)' % ad_frags else: total_frags_str = 'unknown (live)' - self.to_screen( - '[%s] Total fragments: %s' % (self.FD_NAME, 
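When an HLS playlist omits an explicit IV, the default mandated by RFC 8216 is the 16-byte big-endian media sequence number, which is exactly what the struct.pack('>8xq', ...) call above builds:

import struct

iv = struct.pack('>8xq', 42)
# b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00*'
assert len(iv) == 16  # eight pad bytes ('8x') plus a big-endian signed 64-bit ('q')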
total_frags_str)) + self.to_screen(f'[{self.FD_NAME}] Total fragments: {total_frags_str}') tmpfilename = self.temp_name(ctx['filename']) @@ -362,10 +354,12 @@ class FragmentFD(FileDownloader): return _key_cache[url] def decrypt_fragment(fragment, frag_content): + if frag_content is None: + return decrypt_info = fragment.get('decrypt_info') if not decrypt_info or decrypt_info['METHOD'] != 'AES-128': return frag_content - iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', fragment['media_sequence']) + iv = decrypt_info.get('IV') or struct.pack('>8xq', fragment['media_sequence']) decrypt_info['KEY'] = decrypt_info.get('KEY') or _get_key(info_dict.get('_decryption_key_url') or decrypt_info['URI']) # Don't decrypt the content in tests since the data is explicitly truncated and it's not to a valid block # size (see https://github.com/ytdl-org/youtube-dl/pull/27660). Tests only care that the correct data downloaded, @@ -376,7 +370,7 @@ class FragmentFD(FileDownloader): return decrypt_fragment - def download_and_append_fragments_multiple(self, *args, pack_func=None, finish_func=None): + def download_and_append_fragments_multiple(self, *args, **kwargs): ''' @params (ctx1, fragments1, info_dict1), (ctx2, fragments2, info_dict2), ... all args must be either tuple or list @@ -384,7 +378,7 @@ class FragmentFD(FileDownloader): interrupt_trigger = [True] max_progress = len(args) if max_progress == 1: - return self.download_and_append_fragments(*args[0], pack_func=pack_func, finish_func=finish_func) + return self.download_and_append_fragments(*args[0], **kwargs) max_workers = self.params.get('concurrent_fragment_downloads', 1) if max_progress > 1: self._prepare_multiline_status(max_progress) @@ -394,8 +388,7 @@ class FragmentFD(FileDownloader): ctx['max_progress'] = max_progress ctx['progress_idx'] = idx return self.download_and_append_fragments( - ctx, fragments, info_dict, pack_func=pack_func, finish_func=finish_func, - tpe=tpe, interrupt_trigger=interrupt_trigger) + ctx, fragments, info_dict, **kwargs, tpe=tpe, interrupt_trigger=interrupt_trigger) class FTPE(concurrent.futures.ThreadPoolExecutor): # has to stop this or it's going to wait on the worker thread itself @@ -442,18 +435,12 @@ class FragmentFD(FileDownloader): return result def download_and_append_fragments( - self, ctx, fragments, info_dict, *, pack_func=None, finish_func=None, - tpe=None, interrupt_trigger=None): - if not interrupt_trigger: - interrupt_trigger = (True, ) - - fragment_retries = self.params.get('fragment_retries', 0) - is_fatal = ( - ((lambda _: False) if info_dict.get('is_live') else (lambda idx: idx == 0)) - if self.params.get('skip_unavailable_fragments', True) else (lambda _: True)) + self, ctx, fragments, info_dict, *, is_fatal=(lambda idx: False), + pack_func=(lambda content, idx: content), finish_func=None, + tpe=None, interrupt_trigger=(True, )): - if not pack_func: - pack_func = lambda frag_content, _: frag_content + if not self.params.get('skip_unavailable_fragments', True): + is_fatal = lambda _: True def download_fragment(fragment, ctx): if not interrupt_trigger[0]: @@ -467,31 +454,25 @@ class FragmentFD(FileDownloader): headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'] - 1) # Never skip the first fragment - fatal, count = is_fatal(fragment.get('index') or (frag_index - 1)), 0 - while count <= fragment_retries: - try: - if self._download_fragment(ctx, fragment['url'], info_dict, headers): - break - return - except (compat_urllib_error.HTTPError, http.client.IncompleteRead) as err: - 
# Unavailable (possibly temporary) fragments may be served. - # First we try to retry then either skip or abort. - # See https://github.com/ytdl-org/youtube-dl/issues/10165, - # https://github.com/ytdl-org/youtube-dl/issues/10448). - count += 1 - ctx['last_error'] = err - if count <= fragment_retries: - self.report_retry_fragment(err, frag_index, count, fragment_retries) - except DownloadError: - # Don't retry fragment if error occurred during HTTP downloading - # itself since it has own retry settings - if not fatal: - break - raise + fatal = is_fatal(fragment.get('index') or (frag_index - 1)) - if count > fragment_retries and fatal: - ctx['dest_stream'].close() - self.report_error('Giving up after %s fragment retries' % fragment_retries) + def error_callback(err, count, retries): + if fatal and count > retries: + ctx['dest_stream'].close() + self.report_retry(err, count, retries, frag_index, fatal) + ctx['last_error'] = err + + for retry in RetryManager(self.params.get('fragment_retries'), error_callback): + try: + ctx['fragment_count'] = fragment.get('fragment_count') + if not self._download_fragment(ctx, fragment['url'], info_dict, headers): + return + except (urllib.error.HTTPError, http.client.IncompleteRead) as err: + retry.error = err + continue + except DownloadError: # has own retry settings + if fatal: + raise def append_fragment(frag_content, frag_index, ctx): if frag_content: @@ -508,8 +489,7 @@ class FragmentFD(FileDownloader): max_workers = math.ceil( self.params.get('concurrent_fragment_downloads', 1) / ctx.get('max_progress', 1)) - if can_threaded_download and max_workers > 1: - + if max_workers > 1: def _download_fragment(fragment): ctx_copy = ctx.copy() download_fragment(fragment, ctx_copy) @@ -517,23 +497,36 @@ class FragmentFD(FileDownloader): self.report_warning('The download speed shown is only of one thread. This is a known issue and patches are welcome') with tpe or concurrent.futures.ThreadPoolExecutor(max_workers) as pool: - for fragment, frag_index, frag_filename in pool.map(_download_fragment, fragments): - ctx['fragment_filename_sanitized'] = frag_filename - ctx['fragment_index'] = frag_index - result = append_fragment(decrypt_fragment(fragment, self._read_fragment(ctx)), frag_index, ctx) - if not result: - return False + try: + for fragment, frag_index, frag_filename in pool.map(_download_fragment, fragments): + ctx.update({ + 'fragment_filename_sanitized': frag_filename, + 'fragment_index': frag_index, + }) + if not append_fragment(decrypt_fragment(fragment, self._read_fragment(ctx)), frag_index, ctx): + return False + except KeyboardInterrupt: + self._finish_multiline_status() + self.report_error( + 'Interrupted by user. 
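The conversion to `for retry in RetryManager(...)` above (repeated in http.py, ism.py and youtube_live_chat.py later in this patch) centralises the old count/while bookkeeping. A simplified model of the manager, inferred from these call sites rather than copied from ..utils:

    import functools

    class RetryManager:
        _NOTHING = object()  # "no error recorded during this attempt"

        def __init__(self, retries, error_callback, **kwargs):
            self.retries = retries or 0
            self.attempt = 0
            self._error = None
            self.error_callback = functools.partial(error_callback, **kwargs)

        @property
        def error(self):
            return None if self._error is self._NOTHING else self._error

        @error.setter
        def error(self, value):
            self._error = value

        def __iter__(self):
            # Keep going while the previous attempt recorded something (a real
            # error, or an explicit None meaning "retry without reporting", as
            # the NextFragment handling in http.py does) and attempts remain.
            while self._error is not self._NOTHING and self.attempt <= self.retries:
                self._error = self._NOTHING
                self.attempt += 1
                yield self  # the for-loop body runs here and may set .error
                if self.error:
                    self.error_callback(self.error, self.attempt, self.retries)

On the final failed attempt the callback is invoked with count > retries, which is how error_callback in the hunk above knows it is time to close the stream.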
Waiting for all threads to shutdown...', is_error=False, tb=False) + pool.shutdown(wait=False) + raise else: for fragment in fragments: if not interrupt_trigger[0]: break - download_fragment(fragment, ctx) - result = append_fragment(decrypt_fragment(fragment, self._read_fragment(ctx)), fragment['frag_index'], ctx) + try: + download_fragment(fragment, ctx) + result = append_fragment( + decrypt_fragment(fragment, self._read_fragment(ctx)), fragment['frag_index'], ctx) + except KeyboardInterrupt: + if info_dict.get('is_live'): + break + raise if not result: return False if finish_func is not None: ctx['dest_stream'].write(finish_func()) ctx['dest_stream'].flush() - self._finish_frag_download(ctx, info_dict) - return True + return self._finish_frag_download(ctx, info_dict) diff --git a/hypervideo_dl/downloader/hls.py b/hypervideo_dl/downloader/hls.py index f3f32b5..4520edc 100644 --- a/hypervideo_dl/downloader/hls.py +++ b/hypervideo_dl/downloader/hls.py @@ -1,23 +1,14 @@ -from __future__ import unicode_literals - -import re -import io import binascii +import io +import re +import urllib.parse -from ..downloader import get_suitable_downloader -from .fragment import FragmentFD +from . import get_suitable_downloader from .external import FFmpegFD - -from ..compat import ( - compat_pycrypto_AES, - compat_urlparse, -) -from ..utils import ( - parse_m3u8_attributes, - update_url_query, - bug_reports_message, -) +from .fragment import FragmentFD from .. import webvtt +from ..dependencies import Cryptodome_AES +from ..utils import bug_reports_message, parse_m3u8_attributes, update_url_query class HlsFD(FragmentFD): @@ -70,12 +61,18 @@ class HlsFD(FragmentFD): s = urlh.read().decode('utf-8', 'ignore') can_download, message = self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')), None - if can_download and not compat_pycrypto_AES and '#EXT-X-KEY:METHOD=AES-128' in s: - if FFmpegFD.available(): + if can_download: + has_ffmpeg = FFmpegFD.available() + no_crypto = not Cryptodome_AES and '#EXT-X-KEY:METHOD=AES-128' in s + if no_crypto and has_ffmpeg: can_download, message = False, 'The stream has AES-128 encryption and pycryptodome is not available' - else: + elif no_crypto: message = ('The stream has AES-128 encryption and neither ffmpeg nor pycryptodome are available; ' 'Decryption will be performed natively, but will be extremely slow') + elif info_dict.get('extractor_key') == 'Generic' and re.search(r'(?m)#EXT-X-MEDIA-SEQUENCE:(?!0$)', s): + install_ffmpeg = '' if has_ffmpeg else 'install ffmpeg and ' + message = ('Live HLS streams are not supported by the native downloader. 
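About the FTPE executor subclass and the new KeyboardInterrupt handler above: the default ThreadPoolExecutor.__exit__ joins all worker threads, which would stall Ctrl+C. The FTPE body is elided in this hunk; an override with the same effect might look like this (an assumption, not the patch's code):

    import concurrent.futures

    class NonWaitingTPE(concurrent.futures.ThreadPoolExecutor):
        # Default __exit__ calls shutdown(wait=True); skip the join so a
        # KeyboardInterrupt raised in the main thread propagates promptly.
        def __exit__(self, exc_type, exc_val, exc_tb):
            self.shutdown(wait=False)
            return False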
If this is a livestream, ' + f'please {install_ffmpeg}add "--downloader ffmpeg --hls-use-mpegts" to your command') if not can_download: has_drm = re.search('|'.join([ r'#EXT-X-FAXS-CM:', # Adobe Flash Access @@ -102,8 +99,7 @@ class HlsFD(FragmentFD): if real_downloader and not real_downloader.supports_manifest(s): real_downloader = None if real_downloader: - self.to_screen( - '[%s] Fragment downloads will be delegated to %s' % (self.FD_NAME, real_downloader.get_basename())) + self.to_screen(f'[{self.FD_NAME}] Fragment downloads will be delegated to {real_downloader.get_basename()}') def is_ad_fragment_start(s): return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s @@ -150,7 +146,7 @@ class HlsFD(FragmentFD): extra_query = None extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') if extra_param_to_segment_url: - extra_query = compat_urlparse.parse_qs(extra_param_to_segment_url) + extra_query = urllib.parse.parse_qs(extra_param_to_segment_url) i = 0 media_sequence = 0 decrypt_info = {'METHOD': 'NONE'} @@ -172,7 +168,7 @@ class HlsFD(FragmentFD): frag_url = ( line if re.match(r'^https?://', line) - else compat_urlparse.urljoin(man_url, line)) + else urllib.parse.urljoin(man_url, line)) if extra_query: frag_url = update_url_query(frag_url, extra_query) @@ -197,10 +193,18 @@ class HlsFD(FragmentFD): frag_url = ( map_info.get('URI') if re.match(r'^https?://', map_info.get('URI')) - else compat_urlparse.urljoin(man_url, map_info.get('URI'))) + else urllib.parse.urljoin(man_url, map_info.get('URI'))) if extra_query: frag_url = update_url_query(frag_url, extra_query) + if map_info.get('BYTERANGE'): + splitted_byte_range = map_info.get('BYTERANGE').split('@') + sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end'] + byte_range = { + 'start': sub_range_start, + 'end': sub_range_start + int(splitted_byte_range[0]), + } + fragments.append({ 'frag_index': frag_index, 'url': frag_url, @@ -210,14 +214,6 @@ class HlsFD(FragmentFD): }) media_sequence += 1 - if map_info.get('BYTERANGE'): - splitted_byte_range = map_info.get('BYTERANGE').split('@') - sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end'] - byte_range = { - 'start': sub_range_start, - 'end': sub_range_start + int(splitted_byte_range[0]), - } - elif line.startswith('#EXT-X-KEY'): decrypt_url = decrypt_info.get('URI') decrypt_info = parse_m3u8_attributes(line[11:]) @@ -225,7 +221,7 @@ class HlsFD(FragmentFD): if 'IV' in decrypt_info: decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:].zfill(32)) if not re.match(r'^https?://', decrypt_info['URI']): - decrypt_info['URI'] = compat_urlparse.urljoin( + decrypt_info['URI'] = urllib.parse.urljoin( man_url, decrypt_info['URI']) if extra_query: decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query) @@ -339,7 +335,7 @@ class HlsFD(FragmentFD): continue block.write_into(output) - return output.getvalue().encode('utf-8') + return output.getvalue().encode() def fin_fragments(): dedup_window = extra_state.get('webvtt_dedup_window') @@ -350,7 +346,7 @@ class HlsFD(FragmentFD): for cue in dedup_window: webvtt.CueBlock.from_json(cue).write_into(output) - return output.getvalue().encode('utf-8') + return output.getvalue().encode() self.download_and_append_fragments( ctx, fragments, info_dict, pack_func=pack_fragment, finish_func=fin_fragments) diff --git a/hypervideo_dl/downloader/http.py b/hypervideo_dl/downloader/http.py index 591a9b0..95c870e 100644 --- 
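The EXT-X-MAP BYTERANGE handling moved above follows the usual HLS sub-range rule: "<length>[@<offset>]", with an omitted offset meaning "continue where the previous range ended" (RFC 8216, section 4.3.2.2). The same logic as a standalone sketch:

    def parse_byterange(value, previous_end=0):
        length, _, offset = value.partition('@')
        start = int(offset) if offset else previous_end
        return {'start': start, 'end': start + int(length)}

    assert parse_byterange('500@1000') == {'start': 1000, 'end': 1500}
    assert parse_byterange('500', previous_end=1500) == {'start': 1500, 'end': 2000}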
a/hypervideo_dl/downloader/http.py +++ b/hypervideo_dl/downloader/http.py @@ -1,29 +1,33 @@ -from __future__ import unicode_literals - +import http.client import os +import random +import socket import ssl import time -import random +import urllib.error from .common import FileDownloader -from ..compat import ( - compat_urllib_error, - compat_http_client -) from ..utils import ( ContentTooShortError, + RetryManager, + ThrottledDownload, + XAttrMetadataError, + XAttrUnavailableError, encodeFilename, int_or_none, parse_http_range, sanitized_Request, - ThrottledDownload, try_call, write_xattr, - XAttrMetadataError, - XAttrUnavailableError, ) -RESPONSE_READ_EXCEPTIONS = (TimeoutError, ConnectionError, ssl.SSLError, compat_http_client.HTTPException) +RESPONSE_READ_EXCEPTIONS = ( + TimeoutError, + socket.timeout, # compat: py < 3.10 + ConnectionError, + ssl.SSLError, + http.client.HTTPException +) class HttpFD(FileDownloader): @@ -69,9 +73,6 @@ class HttpFD(FileDownloader): ctx.is_resume = ctx.resume_len > 0 - count = 0 - retries = self.params.get('retries', 0) - class SucceedDownload(Exception): pass @@ -134,19 +135,18 @@ class HttpFD(FileDownloader): if has_range: content_range = ctx.data.headers.get('Content-Range') content_range_start, content_range_end, content_len = parse_http_range(content_range) - if content_range_start is not None and range_start == content_range_start: - # Content-Range is present and matches requested Range, resume is possible - accept_content_len = ( + # Content-Range is present and matches requested Range, resume is possible + if range_start == content_range_start and ( # Non-chunked download not ctx.chunk_size # Chunked download and requested piece or # its part is promised to be served or content_range_end == range_end - or content_len < range_end) - if accept_content_len: - ctx.content_len = content_len - ctx.data_len = min(content_len, req_end or content_len) - (req_start or 0) - return + or content_len < range_end): + ctx.content_len = content_len + if content_len or req_end: + ctx.data_len = min(content_len or req_end, req_end or content_len) - (req_start or 0) + return # Content-Range is either not present or invalid. 
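Why the new RESPONSE_READ_EXCEPTIONS tuple lists socket.timeout next to TimeoutError: the two classes were only merged in Python 3.10, so on 3.8/3.9 a read timeout raises a distinct class that must be caught explicitly.

    import socket
    import sys

    if sys.version_info >= (3, 10):
        assert socket.timeout is TimeoutError       # merged in 3.10
    else:
        assert issubclass(socket.timeout, OSError)  # separate class before 3.10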
Assuming remote webserver is # trying to send the whole file, resume is not possible, so wiping the local file # and performing entire redownload @@ -154,7 +154,7 @@ class HttpFD(FileDownloader): ctx.resume_len = 0 ctx.open_mode = 'wb' ctx.data_len = ctx.content_len = int_or_none(ctx.data.info().get('Content-length', None)) - except (compat_urllib_error.HTTPError, ) as err: + except urllib.error.HTTPError as err: if err.code == 416: # Unable to resume (requested range not satisfiable) try: @@ -162,7 +162,7 @@ class HttpFD(FileDownloader): ctx.data = self.ydl.urlopen( sanitized_Request(url, request_data, headers)) content_length = ctx.data.info()['Content-Length'] - except (compat_urllib_error.HTTPError, ) as err: + except urllib.error.HTTPError as err: if err.code < 500 or err.code >= 600: raise else: @@ -195,7 +195,7 @@ class HttpFD(FileDownloader): # Unexpected HTTP error raise raise RetryDownload(err) - except compat_urllib_error.URLError as err: + except urllib.error.URLError as err: if isinstance(err.reason, ssl.CertificateError): raise raise RetryDownload(err) @@ -204,6 +204,12 @@ class HttpFD(FileDownloader): except RESPONSE_READ_EXCEPTIONS as err: raise RetryDownload(err) + def close_stream(): + if ctx.stream is not None: + if not ctx.tmpfilename == '-': + ctx.stream.close() + ctx.stream = None + def download(): data_len = ctx.data.info().get('Content-length', None) @@ -220,10 +226,12 @@ class HttpFD(FileDownloader): min_data_len = self.params.get('min_filesize') max_data_len = self.params.get('max_filesize') if min_data_len is not None and data_len < min_data_len: - self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) + self.to_screen( + f'\r[download] File is smaller than min-filesize ({data_len} bytes < {min_data_len} bytes). Aborting.') return False if max_data_len is not None and data_len > max_data_len: - self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len)) + self.to_screen( + f'\r[download] File is larger than max-filesize ({data_len} bytes > {max_data_len} bytes). 
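The tightened resume check above leans on parse_http_range from ..utils (see the import hunk earlier in this file). The helper's body is not part of this patch; a plausible sketch of its contract, just for reading the logic:

    import re

    def parse_http_range(content_range):
        # "bytes 500-999/1234" -> (500, 999, 1234); None for any missing part.
        m = re.fullmatch(r'bytes (\d+)-(\d+)?(?:/(\d+))?', content_range or '')
        if not m:
            return None, None, None
        return tuple(int(g) if g is not None else None for g in m.groups())

    assert parse_http_range('bytes 500-999/1234') == (500, 999, 1234)
    assert parse_http_range(None) == (None, None, None)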
Aborting.') return False byte_counter = 0 + ctx.resume_len @@ -235,12 +243,9 @@ class HttpFD(FileDownloader): before = start # start measuring def retry(e): - to_stdout = ctx.tmpfilename == '-' - if ctx.stream is not None: - if not to_stdout: - ctx.stream.close() - ctx.stream = None - ctx.resume_len = byte_counter if to_stdout else os.path.getsize(encodeFilename(ctx.tmpfilename)) + close_stream() + ctx.resume_len = (byte_counter if ctx.tmpfilename == '-' + else os.path.getsize(encodeFilename(ctx.tmpfilename))) raise RetryDownload(e) while True: @@ -264,19 +269,19 @@ class HttpFD(FileDownloader): assert ctx.stream is not None ctx.filename = self.undo_temp_name(ctx.tmpfilename) self.report_destination(ctx.filename) - except (OSError, IOError) as err: + except OSError as err: self.report_error('unable to open for writing: %s' % str(err)) return False if self.params.get('xattr_set_filesize', False) and data_len is not None: try: - write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8')) + write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode()) except (XAttrUnavailableError, XAttrMetadataError) as err: self.report_error('unable to set filesize xattr: %s' % str(err)) try: ctx.stream.write(data_block) - except (IOError, OSError) as err: + except OSError as err: self.to_stderr('\n') self.report_error('unable to write data: %s' % str(err)) return False @@ -342,9 +347,7 @@ class HttpFD(FileDownloader): if data_len is not None and byte_counter != data_len: err = ContentTooShortError(byte_counter, int(data_len)) - if count <= retries: - retry(err) - raise err + retry(err) self.try_rename(ctx.tmpfilename, ctx.filename) @@ -363,21 +366,20 @@ class HttpFD(FileDownloader): return True - while count <= retries: + for retry in RetryManager(self.params.get('retries'), self.report_retry): try: establish_connection() return download() - except RetryDownload as e: - count += 1 - if count <= retries: - self.report_retry(e.source_error, count, retries) - else: - self.to_screen(f'[download] Got server HTTP error: {e.source_error}') + except RetryDownload as err: + retry.error = err.source_error continue except NextFragment: + retry.error = None + retry.attempt -= 1 continue except SucceedDownload: return True - - self.report_error('giving up after %s retries' % retries) + except: # noqa: E722 + close_stream() + raise return False diff --git a/hypervideo_dl/downloader/ism.py b/hypervideo_dl/downloader/ism.py index 4d5618c..a157a8a 100644 --- a/hypervideo_dl/downloader/ism.py +++ b/hypervideo_dl/downloader/ism.py @@ -1,27 +1,23 @@ -from __future__ import unicode_literals - -import time import binascii import io +import struct +import time +import urllib.error from .fragment import FragmentFD -from ..compat import ( - compat_Struct, - compat_urllib_error, -) +from ..utils import RetryManager +u8 = struct.Struct('>B') +u88 = struct.Struct('>Bx') +u16 = struct.Struct('>H') +u1616 = struct.Struct('>Hxx') +u32 = struct.Struct('>I') +u64 = struct.Struct('>Q') -u8 = compat_Struct('>B') -u88 = compat_Struct('>Bx') -u16 = compat_Struct('>H') -u1616 = compat_Struct('>Hxx') -u32 = compat_Struct('>I') -u64 = compat_Struct('>Q') - -s88 = compat_Struct('>bx') -s16 = compat_Struct('>h') -s1616 = compat_Struct('>hxx') -s32 = compat_Struct('>i') +s88 = struct.Struct('>bx') +s16 = struct.Struct('>h') +s1616 = struct.Struct('>hxx') +s32 = struct.Struct('>i') unity_matrix = (s32.pack(0x10000) + s32.pack(0) * 3) * 2 + s32.pack(0x40000000) @@ -142,6 +138,8 @@ def write_piff_header(stream, 
params): if fourcc == 'AACL': sample_entry_box = box(b'mp4a', sample_entry_payload) + if fourcc == 'EC-3': + sample_entry_box = box(b'ec-3', sample_entry_payload) elif stream_type == 'video': sample_entry_payload += u16.pack(0) # pre defined sample_entry_payload += u16.pack(0) # reserved @@ -156,7 +154,7 @@ def write_piff_header(stream, params): sample_entry_payload += u16.pack(0x18) # depth sample_entry_payload += s16.pack(-1) # pre defined - codec_private_data = binascii.unhexlify(params['codec_private_data'].encode('utf-8')) + codec_private_data = binascii.unhexlify(params['codec_private_data'].encode()) if fourcc in ('H264', 'AVC1'): sps, pps = codec_private_data.split(u32.pack(1))[1:] avcc_payload = u8.pack(1) # configuration version @@ -235,8 +233,6 @@ class IsmFD(FragmentFD): Download segments in a ISM manifest """ - FD_NAME = 'ism' - def real_download(self, filename, info_dict): segments = info_dict['fragments'][:1] if self.params.get( 'test', False) else info_dict['fragments'] @@ -252,7 +248,6 @@ class IsmFD(FragmentFD): 'ism_track_written': False, }) - fragment_retries = self.params.get('fragment_retries', 0) skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) frag_index = 0 @@ -260,8 +255,10 @@ class IsmFD(FragmentFD): frag_index += 1 if frag_index <= ctx['fragment_index']: continue - count = 0 - while count <= fragment_retries: + + retry_manager = RetryManager(self.params.get('fragment_retries'), self.report_retry, + frag_index=frag_index, fatal=not skip_unavailable_fragments) + for retry in retry_manager: try: success = self._download_fragment(ctx, segment['url'], info_dict) if not success: @@ -274,18 +271,13 @@ class IsmFD(FragmentFD): write_piff_header(ctx['dest_stream'], info_dict['_download_params']) extra_state['ism_track_written'] = True self._append_fragment(ctx, frag_content) - break - except compat_urllib_error.HTTPError as err: - count += 1 - if count <= fragment_retries: - self.report_retry_fragment(err, frag_index, count, fragment_retries) - if count > fragment_retries: - if skip_unavailable_fragments: - self.report_skip_fragment(frag_index) + except urllib.error.HTTPError as err: + retry.error = err continue - self.report_error('giving up after %s fragment retries' % fragment_retries) - return False - self._finish_frag_download(ctx, info_dict) + if retry_manager.error: + if not skip_unavailable_fragments: + return False + self.report_skip_fragment(frag_index) - return True + return self._finish_frag_download(ctx, info_dict) diff --git a/hypervideo_dl/downloader/mhtml.py b/hypervideo_dl/downloader/mhtml.py index c8332c0..170a78d 100644 --- a/hypervideo_dl/downloader/mhtml.py +++ b/hypervideo_dl/downloader/mhtml.py @@ -1,24 +1,15 @@ -# coding: utf-8 -from __future__ import unicode_literals - import io import quopri import re import uuid from .fragment import FragmentFD -from ..utils import ( - escapeHTML, - formatSeconds, - srt_subtitles_timecode, - urljoin, -) +from ..compat import imghdr +from ..utils import escapeHTML, formatSeconds, srt_subtitles_timecode, urljoin from ..version import __version__ as YT_DLP_VERSION class MhtmlFD(FragmentFD): - FD_NAME = 'mhtml' - _STYLESHEET = """\ html, body { margin: 0; @@ -62,7 +53,7 @@ body > figure > img { def _escape_mime(s): return '=?utf-8?Q?' 
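For the new EC-3 branch above: box() is defined elsewhere in ism.py and not shown in this hunk, but for ISO BMFF it is essentially a size-prefixed FourCC container. A sketch under that assumption:

    import struct

    u32 = struct.Struct('>I')

    def box(box_type, payload):
        # ISO BMFF box: 4-byte big-endian size (header included) + FourCC + payload
        return u32.pack(8 + len(payload)) + box_type + payload

    assert box(b'free', b'') == b'\x00\x00\x00\x08free'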
+ (b''.join( bytes((b,)) if b >= 0x20 else b'=%02X' % b - for b in quopri.encodestring(s.encode('utf-8'), header=True) + for b in quopri.encodestring(s.encode(), header=True) )).decode('us-ascii') + '?=' def _gen_cid(self, i, fragment, frag_boundary): @@ -159,7 +150,7 @@ body > figure > img { length=len(stub), title=self._escape_mime(title), stub=stub - ).encode('utf-8')) + ).encode()) extra_state['header_written'] = True for i, fragment in enumerate(fragments): @@ -176,21 +167,13 @@ body > figure > img { continue frag_content = self._read_fragment(ctx) - mime_type = b'image/jpeg' - if frag_content.startswith(b'\x89PNG\r\n\x1a\n'): - mime_type = b'image/png' - if frag_content.startswith((b'GIF87a', b'GIF89a')): - mime_type = b'image/gif' - if frag_content.startswith(b'RIFF') and frag_content[8:12] == 'WEBP': - mime_type = b'image/webp' - frag_header = io.BytesIO() frag_header.write( b'--%b\r\n' % frag_boundary.encode('us-ascii')) frag_header.write( b'Content-ID: <%b>\r\n' % self._gen_cid(i, fragment, frag_boundary).encode('us-ascii')) frag_header.write( - b'Content-type: %b\r\n' % mime_type) + b'Content-type: %b\r\n' % f'image/{imghdr.what(h=frag_content) or "jpeg"}'.encode()) frag_header.write( b'Content-length: %u\r\n' % len(frag_content)) frag_header.write( @@ -203,5 +186,4 @@ body > figure > img { ctx['dest_stream'].write( b'--%b--\r\n\r\n' % frag_boundary.encode('us-ascii')) - self._finish_frag_download(ctx, info_dict) - return True + return self._finish_frag_download(ctx, info_dict) diff --git a/hypervideo_dl/downloader/niconico.py b/hypervideo_dl/downloader/niconico.py index 521dfec..77ed39e 100644 --- a/hypervideo_dl/downloader/niconico.py +++ b/hypervideo_dl/downloader/niconico.py @@ -1,22 +1,17 @@ -# coding: utf-8 -from __future__ import unicode_literals - import threading +from . 
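Worth noting on the mhtml.py hunk above: the removed WEBP check compared a bytes slice against the str 'WEBP', which is always False on Python 3, so the switch to imghdr also fixes a latent bug. The corrected manual equivalent, for comparison:

    def sniff_image_mime(data):
        if data.startswith(b'\x89PNG\r\n\x1a\n'):
            return 'image/png'
        if data.startswith((b'GIF87a', b'GIF89a')):
            return 'image/gif'
        if data.startswith(b'RIFF') and data[8:12] == b'WEBP':  # bytes, not str
            return 'image/webp'
        return 'image/jpeg'  # the fallback used above

    assert sniff_image_mime(b'RIFF\x00\x00\x00\x00WEBPVP8 ') == 'image/webp'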
import get_suitable_downloader from .common import FileDownloader -from ..downloader import get_suitable_downloader -from ..extractor.niconico import NiconicoIE from ..utils import sanitized_Request class NiconicoDmcFD(FileDownloader): """ Downloading niconico douga from DMC with heartbeat """ - FD_NAME = 'niconico_dmc' - def real_download(self, filename, info_dict): - self.to_screen('[%s] Downloading from DMC' % self.FD_NAME) + from ..extractor.niconico import NiconicoIE + self.to_screen('[%s] Downloading from DMC' % self.FD_NAME) ie = NiconicoIE(self.ydl) info_dict, heartbeat_info_dict = ie._get_heartbeat_info(info_dict) @@ -54,4 +49,4 @@ class NiconicoDmcFD(FileDownloader): with heartbeat_lock: timer[0].cancel() download_complete = True - return success + return success diff --git a/hypervideo_dl/downloader/rtmp.py b/hypervideo_dl/downloader/rtmp.py index 90f1acf..0e09525 100644 --- a/hypervideo_dl/downloader/rtmp.py +++ b/hypervideo_dl/downloader/rtmp.py @@ -1,18 +1,15 @@ -from __future__ import unicode_literals - import os import re import subprocess import time from .common import FileDownloader -from ..compat import compat_str from ..utils import ( + Popen, check_executable, - encodeFilename, encodeArgument, + encodeFilename, get_exe_version, - Popen, ) @@ -94,8 +91,7 @@ class RtmpFD(FileDownloader): self.to_screen('') return proc.wait() except BaseException: # Including KeyboardInterrupt - proc.kill() - proc.wait() + proc.kill(timeout=None) raise url = info_dict['url'] @@ -146,7 +142,7 @@ class RtmpFD(FileDownloader): if isinstance(conn, list): for entry in conn: basic_args += ['--conn', entry] - elif isinstance(conn, compat_str): + elif isinstance(conn, str): basic_args += ['--conn', conn] if protocol is not None: basic_args += ['--protocol', protocol] diff --git a/hypervideo_dl/downloader/rtsp.py b/hypervideo_dl/downloader/rtsp.py index 7815d59..e89269f 100644 --- a/hypervideo_dl/downloader/rtsp.py +++ b/hypervideo_dl/downloader/rtsp.py @@ -1,13 +1,8 @@ -from __future__ import unicode_literals - import os import subprocess from .common import FileDownloader -from ..utils import ( - check_executable, - encodeFilename, -) +from ..utils import check_executable, encodeFilename class RtspFD(FileDownloader): @@ -32,7 +27,7 @@ class RtspFD(FileDownloader): retval = subprocess.call(args) if retval == 0: fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('\r[%s] %s bytes' % (args[0], fsize)) + self.to_screen(f'\r[{args[0]}] {fsize} bytes') self.try_rename(tmpfilename, filename) self._hook_progress({ 'downloaded_bytes': fsize, diff --git a/hypervideo_dl/downloader/websocket.py b/hypervideo_dl/downloader/websocket.py index 58e2bce..6837ff1 100644 --- a/hypervideo_dl/downloader/websocket.py +++ b/hypervideo_dl/downloader/websocket.py @@ -1,19 +1,12 @@ +import asyncio +import contextlib import os import signal -import asyncio import threading -try: - import websockets -except (ImportError, SyntaxError): - # websockets 3.10 on python 3.6 causes SyntaxError - # See https://github.com/hypervideo/hypervideo/issues/2633 - has_websockets = False -else: - has_websockets = True - from .common import FileDownloader from .external import FFmpegFD +from ..dependencies import websockets class FFmpegSinkFD(FileDownloader): @@ -26,14 +19,12 @@ class FFmpegSinkFD(FileDownloader): async def call_conn(proc, stdin): try: await self.real_connection(stdin, info_dict) - except (BrokenPipeError, OSError): + except OSError: pass finally: - try: + with contextlib.suppress(OSError): stdin.flush() 
stdin.close() - except OSError: - pass os.kill(os.getpid(), signal.SIGINT) class FFmpegStdinFD(FFmpegFD): diff --git a/hypervideo_dl/downloader/youtube_live_chat.py b/hypervideo_dl/downloader/youtube_live_chat.py index dd21ac8..dfd290a 100644 --- a/hypervideo_dl/downloader/youtube_live_chat.py +++ b/hypervideo_dl/downloader/youtube_live_chat.py @@ -1,24 +1,20 @@ -from __future__ import division, unicode_literals - import json import time +import urllib.error from .fragment import FragmentFD -from ..compat import compat_urllib_error from ..utils import ( - try_get, + RegexNotFoundError, + RetryManager, dict_get, int_or_none, - RegexNotFoundError, + try_get, ) -from ..extractor.youtube import YoutubeBaseInfoExtractor as YT_BaseIE class YoutubeLiveChatFD(FragmentFD): """ Downloads YouTube live chats fragment by fragment """ - FD_NAME = 'youtube_live_chat' - def real_download(self, filename, info_dict): video_id = info_dict['video_id'] self.to_screen('[%s] Downloading live chat' % self.FD_NAME) @@ -26,7 +22,6 @@ class YoutubeLiveChatFD(FragmentFD): self.report_warning('Live chat download runs until the livestream ends. ' 'If you wish to download the video simultaneously, run a separate hypervideo instance') - fragment_retries = self.params.get('fragment_retries', 0) test = self.params.get('test', False) ctx = { @@ -35,7 +30,9 @@ class YoutubeLiveChatFD(FragmentFD): 'total_frags': None, } - ie = YT_BaseIE(self.ydl) + from ..extractor.youtube import YoutubeBaseInfoExtractor + + ie = YoutubeBaseInfoExtractor(self.ydl) start_time = int(time.time() * 1000) @@ -54,7 +51,7 @@ class YoutubeLiveChatFD(FragmentFD): replay_chat_item_action = action['replayChatItemAction'] offset = int(replay_chat_item_action['videoOffsetTimeMsec']) processed_fragment.extend( - json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n') + json.dumps(action, ensure_ascii=False).encode() + b'\n') if offset is not None: continuation = try_get( live_chat_continuation, @@ -96,7 +93,7 @@ class YoutubeLiveChatFD(FragmentFD): 'isLive': True, } processed_fragment.extend( - json.dumps(pseudo_action, ensure_ascii=False).encode('utf-8') + b'\n') + json.dumps(pseudo_action, ensure_ascii=False).encode() + b'\n') continuation_data_getters = [ lambda x: x['continuations'][0]['invalidationContinuationData'], lambda x: x['continuations'][0]['timedContinuationData'], @@ -112,8 +109,7 @@ class YoutubeLiveChatFD(FragmentFD): return continuation_id, live_offset, click_tracking_params def download_and_parse_fragment(url, frag_index, request_data=None, headers=None): - count = 0 - while count <= fragment_retries: + for retry in RetryManager(self.params.get('fragment_retries'), self.report_retry, frag_index=frag_index): try: success = dl_fragment(url, request_data, headers) if not success: @@ -128,21 +124,15 @@ class YoutubeLiveChatFD(FragmentFD): live_chat_continuation = try_get( data, lambda x: x['continuationContents']['liveChatContinuation'], dict) or {} - if info_dict['protocol'] == 'youtube_live_chat_replay': - if frag_index == 1: - continuation_id, offset, click_tracking_params = try_refresh_replay_beginning(live_chat_continuation) - else: - continuation_id, offset, click_tracking_params = parse_actions_replay(live_chat_continuation) - elif info_dict['protocol'] == 'youtube_live_chat': - continuation_id, offset, click_tracking_params = parse_actions_live(live_chat_continuation) - return True, continuation_id, offset, click_tracking_params - except compat_urllib_error.HTTPError as err: - count += 1 - if count <= fragment_retries: - 
self.report_retry_fragment(err, frag_index, count, fragment_retries) - if count > fragment_retries: - self.report_error('giving up after %s fragment retries' % fragment_retries) - return False, None, None, None + + func = (info_dict['protocol'] == 'youtube_live_chat' and parse_actions_live + or frag_index == 1 and try_refresh_replay_beginning + or parse_actions_replay) + return (True, *func(live_chat_continuation)) + except urllib.error.HTTPError as err: + retry.error = err + continue + return False, None, None, None self._prepare_and_start_frag_download(ctx, info_dict) @@ -190,7 +180,7 @@ class YoutubeLiveChatFD(FragmentFD): request_data['context']['clickTracking'] = {'clickTrackingParams': click_tracking_params} headers = ie.generate_api_headers(ytcfg=ytcfg, visitor_data=visitor_data) headers.update({'content-type': 'application/json'}) - fragment_request_data = json.dumps(request_data, ensure_ascii=False).encode('utf-8') + b'\n' + fragment_request_data = json.dumps(request_data, ensure_ascii=False).encode() + b'\n' success, continuation_id, offset, click_tracking_params = download_and_parse_fragment( url, frag_index, fragment_request_data, headers) else: @@ -201,8 +191,7 @@ class YoutubeLiveChatFD(FragmentFD): if test: break - self._finish_frag_download(ctx, info_dict) - return True + return self._finish_frag_download(ctx, info_dict) @staticmethod def parse_live_timestamp(action): diff --git a/hypervideo_dl/extractor/__init__.py b/hypervideo_dl/extractor/__init__.py index b354842..6bfa4bd 100644 --- a/hypervideo_dl/extractor/__init__.py +++ b/hypervideo_dl/extractor/__init__.py @@ -1,33 +1,15 @@ -import os +from ..compat.compat_utils import passthrough_module -from ..utils import load_plugins - -_LAZY_LOADER = False -if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): - try: - from .lazy_extractors import * - from .lazy_extractors import _ALL_CLASSES - _LAZY_LOADER = True - except ImportError: - pass - -if not _LAZY_LOADER: - from .extractors import * - _ALL_CLASSES = [ - klass - for name, klass in globals().items() - if name.endswith('IE') and name != 'GenericIE' - ] - _ALL_CLASSES.append(GenericIE) - -_PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals()) -_ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES +passthrough_module(__name__, '.extractors') +del passthrough_module def gen_extractor_classes(): """ Return a list of supported extractors. The order does matter; the first extractor matched is the one handling the URL. """ + from .extractors import _ALL_CLASSES + return _ALL_CLASSES @@ -38,17 +20,23 @@ def gen_extractors(): return [klass() for klass in gen_extractor_classes()] -def list_extractors(age_limit): - """ - Return a list of extractors that are suitable for the given age, - sorted by extractor ID. 
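The parser selection in the youtube_live_chat hunk above compresses an if/elif/else into an and/or chain. The idiom in isolation, with placeholder values (it only works because every selected value there, a function, is truthy):

    def pick(a_cond, b_cond):
        return a_cond and 'live' or b_cond and 'refresh' or 'replay'

    assert pick(True, False) == 'live'
    assert pick(False, True) == 'refresh'
    assert pick(False, False) == 'replay'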
- """ +def list_extractor_classes(age_limit=None): + """Return a list of extractors that are suitable for the given age, sorted by extractor name""" + from .generic import GenericIE + + yield from sorted(filter( + lambda ie: ie.is_suitable(age_limit) and ie != GenericIE, + gen_extractor_classes()), key=lambda ie: ie.IE_NAME.lower()) + yield GenericIE - return sorted( - filter(lambda ie: ie.is_suitable(age_limit), gen_extractors()), - key=lambda ie: ie.IE_NAME.lower()) + +def list_extractors(age_limit=None): + """Return a list of extractor instances that are suitable for the given age, sorted by extractor name""" + return [ie() for ie in list_extractor_classes(age_limit)] def get_info_extractor(ie_name): """Returns the info extractor class with the given ie_name""" - return globals()[ie_name + 'IE'] + from . import extractors + + return getattr(extractors, f'{ie_name}IE') diff --git a/hypervideo_dl/extractor/_extractors.py b/hypervideo_dl/extractor/_extractors.py new file mode 100644 index 0000000..2fe15f6 --- /dev/null +++ b/hypervideo_dl/extractor/_extractors.py @@ -0,0 +1,2354 @@ +# flake8: noqa: F401 + +from .youtube import ( # Youtube is moved to the top to improve performance + YoutubeIE, + YoutubeClipIE, + YoutubeFavouritesIE, + YoutubeNotificationsIE, + YoutubeHistoryIE, + YoutubeTabIE, + YoutubeLivestreamEmbedIE, + YoutubePlaylistIE, + YoutubeRecommendedIE, + YoutubeSearchDateIE, + YoutubeSearchIE, + YoutubeSearchURLIE, + YoutubeMusicSearchURLIE, + YoutubeSubscriptionsIE, + YoutubeStoriesIE, + YoutubeTruncatedIDIE, + YoutubeTruncatedURLIE, + YoutubeYtBeIE, + YoutubeYtUserIE, + YoutubeWatchLaterIE, + YoutubeShortsAudioPivotIE +) + +from .abc import ( + ABCIE, + ABCIViewIE, + ABCIViewShowSeriesIE, +) +from .abcnews import ( + AbcNewsIE, + AbcNewsVideoIE, +) +from .abcotvs import ( + ABCOTVSIE, + ABCOTVSClipsIE, +) +from .abematv import ( + AbemaTVIE, + AbemaTVTitleIE, +) +from .academicearth import AcademicEarthCourseIE +from .acast import ( + ACastIE, + ACastChannelIE, +) +from .acfun import AcFunVideoIE, AcFunBangumiIE +from .adn import ADNIE +from .adobeconnect import AdobeConnectIE +from .adobetv import ( + AdobeTVEmbedIE, + AdobeTVIE, + AdobeTVShowIE, + AdobeTVChannelIE, + AdobeTVVideoIE, +) +from .adultswim import AdultSwimIE +from .aenetworks import ( + AENetworksIE, + AENetworksCollectionIE, + AENetworksShowIE, + HistoryTopicIE, + HistoryPlayerIE, + BiographyIE, +) +from .aeonco import AeonCoIE +from .afreecatv import ( + AfreecaTVIE, + AfreecaTVLiveIE, + AfreecaTVUserIE, +) +from .agora import ( + TokFMAuditionIE, + TokFMPodcastIE, + WyborczaPodcastIE, + WyborczaVideoIE, +) +from .airmozilla import AirMozillaIE +from .aljazeera import AlJazeeraIE +from .alphaporno import AlphaPornoIE +from .amara import AmaraIE +from .alura import ( + AluraIE, + AluraCourseIE +) +from .amcnetworks import AMCNetworksIE +from .amazon import AmazonStoreIE +from .amazonminitv import ( + AmazonMiniTVIE, + AmazonMiniTVSeasonIE, + AmazonMiniTVSeriesIE, +) +from .americastestkitchen import ( + AmericasTestKitchenIE, + AmericasTestKitchenSeasonIE, +) +from .angel import AngelIE +from .anvato import AnvatoIE +from .aol import AolIE +from .allocine import AllocineIE +from .aliexpress import AliExpressLiveIE +from .alsace20tv import ( + Alsace20TVIE, + Alsace20TVEmbedIE, +) +from .apa import APAIE +from .aparat import AparatIE +from .appleconnect import AppleConnectIE +from .appletrailers import ( + AppleTrailersIE, + AppleTrailersSectionIE, +) +from .applepodcasts import ApplePodcastsIE +from 
.archiveorg import ( + ArchiveOrgIE, + YoutubeWebArchiveIE, +) +from .arcpublishing import ArcPublishingIE +from .arkena import ArkenaIE +from .ard import ( + ARDBetaMediathekIE, + ARDIE, + ARDMediathekIE, +) +from .arte import ( + ArteTVIE, + ArteTVEmbedIE, + ArteTVPlaylistIE, + ArteTVCategoryIE, +) +from .arnes import ArnesIE +from .asiancrush import ( + AsianCrushIE, + AsianCrushPlaylistIE, +) +from .atresplayer import AtresPlayerIE +from .atscaleconf import AtScaleConfEventIE +from .atttechchannel import ATTTechChannelIE +from .atvat import ATVAtIE +from .audimedia import AudiMediaIE +from .audioboom import AudioBoomIE +from .audiodraft import ( + AudiodraftCustomIE, + AudiodraftGenericIE, +) +from .audiomack import AudiomackIE, AudiomackAlbumIE +from .audius import ( + AudiusIE, + AudiusTrackIE, + AudiusPlaylistIE, + AudiusProfileIE, +) +from .awaan import ( + AWAANIE, + AWAANVideoIE, + AWAANLiveIE, + AWAANSeasonIE, +) +from .azmedien import AZMedienIE +from .baidu import BaiduVideoIE +from .banbye import ( + BanByeIE, + BanByeChannelIE, +) +from .bandaichannel import BandaiChannelIE +from .bandcamp import ( + BandcampIE, + BandcampAlbumIE, + BandcampWeeklyIE, + BandcampUserIE, +) +from .bannedvideo import BannedVideoIE +from .bbc import ( + BBCCoUkIE, + BBCCoUkArticleIE, + BBCCoUkIPlayerEpisodesIE, + BBCCoUkIPlayerGroupIE, + BBCCoUkPlaylistIE, + BBCIE, +) +from .beeg import BeegIE +from .behindkink import BehindKinkIE +from .bellmedia import BellMediaIE +from .beatport import BeatportIE +from .berufetv import BerufeTVIE +from .bet import BetIE +from .bfi import BFIPlayerIE +from .bfmtv import ( + BFMTVIE, + BFMTVLiveIE, + BFMTVArticleIE, +) +from .bibeltv import BibelTVIE +from .bigflix import BigflixIE +from .bigo import BigoIE +from .bild import BildIE +from .bilibili import ( + BiliBiliIE, + BiliBiliBangumiIE, + BiliBiliBangumiMediaIE, + BiliBiliSearchIE, + BilibiliCategoryIE, + BilibiliAudioIE, + BilibiliAudioAlbumIE, + BiliBiliPlayerIE, + BilibiliSpaceVideoIE, + BilibiliSpaceAudioIE, + BilibiliSpacePlaylistIE, + BiliIntlIE, + BiliIntlSeriesIE, + BiliLiveIE, +) +from .biobiochiletv import BioBioChileTVIE +from .bitchute import ( + BitChuteIE, + BitChuteChannelIE, +) +from .bitwave import ( + BitwaveReplayIE, + BitwaveStreamIE, +) +from .biqle import BIQLEIE +from .blackboardcollaborate import BlackboardCollaborateIE +from .bleacherreport import ( + BleacherReportIE, + BleacherReportCMSIE, +) +from .blogger import BloggerIE +from .bloomberg import BloombergIE +from .bokecc import BokeCCIE +from .bongacams import BongaCamsIE +from .bostonglobe import BostonGlobeIE +from .box import BoxIE +from .booyah import BooyahClipsIE +from .bpb import BpbIE +from .br import ( + BRIE, + BRMediathekIE, +) +from .bravotv import BravoTVIE +from .breakcom import BreakIE +from .breitbart import BreitBartIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) +from .businessinsider import BusinessInsiderIE +from .bundesliga import BundesligaIE +from .buzzfeed import BuzzFeedIE +from .byutv import BYUtvIE +from .c56 import C56IE +from .cableav import CableAVIE +from .callin import CallinIE +from .caltrans import CaltransIE +from .cam4 import CAM4IE +from .camdemy import ( + CamdemyIE, + CamdemyFolderIE +) +from .cammodels import CamModelsIE +from .camsoda import CamsodaIE +from .camtasia import CamtasiaEmbedIE +from .camwithher import CamWithHerIE +from .canalalpha import CanalAlphaIE +from .canalplus import CanalplusIE +from .canalc2 import Canalc2IE +from .canvas import ( + 
CanvasIE, + CanvasEenIE, + VrtNUIE, + DagelijkseKostIE, +) +from .carambatv import ( + CarambaTVIE, + CarambaTVPageIE, +) +from .cartoonnetwork import CartoonNetworkIE +from .cbc import ( + CBCIE, + CBCPlayerIE, + CBCGemIE, + CBCGemPlaylistIE, + CBCGemLiveIE, +) +from .cbs import CBSIE +from .cbslocal import ( + CBSLocalIE, + CBSLocalArticleIE, +) +from .cbsinteractive import CBSInteractiveIE +from .cbsnews import ( + CBSNewsEmbedIE, + CBSNewsIE, + CBSNewsLiveVideoIE, +) +from .cbssports import ( + CBSSportsEmbedIE, + CBSSportsIE, + TwentyFourSevenSportsIE, +) +from .ccc import ( + CCCIE, + CCCPlaylistIE, +) +from .ccma import CCMAIE +from .cctv import CCTVIE +from .cda import CDAIE +from .cellebrite import CellebriteIE +from .ceskatelevize import CeskaTelevizeIE +from .cgtn import CGTNIE +from .channel9 import Channel9IE +from .charlierose import CharlieRoseIE +from .chaturbate import ChaturbateIE +from .chilloutzone import ChilloutzoneIE +from .chingari import ( + ChingariIE, + ChingariUserIE, +) +from .chirbit import ( + ChirbitIE, + ChirbitProfileIE, +) +from .cinchcast import CinchcastIE +from .cinemax import CinemaxIE +from .cinetecamilano import CinetecaMilanoIE +from .ciscolive import ( + CiscoLiveSessionIE, + CiscoLiveSearchIE, +) +from .ciscowebex import CiscoWebexIE +from .cjsw import CJSWIE +from .cliphunter import CliphunterIE +from .clippit import ClippitIE +from .cliprs import ClipRsIE +from .clipsyndicate import ClipsyndicateIE +from .closertotruth import CloserToTruthIE +from .cloudflarestream import CloudflareStreamIE +from .cloudy import CloudyIE +from .clubic import ClubicIE +from .clyp import ClypIE +from .cmt import CMTIE +from .cnbc import ( + CNBCIE, + CNBCVideoIE, +) +from .cnn import ( + CNNIE, + CNNBlogsIE, + CNNArticleIE, + CNNIndonesiaIE, +) +from .coub import CoubIE +from .comedycentral import ( + ComedyCentralIE, + ComedyCentralTVIE, +) +from .commonmistakes import CommonMistakesIE, UnicodeBOMIE +from .commonprotocols import ( + MmsIE, + RtmpIE, + ViewSourceIE, +) +from .condenast import CondeNastIE +from .contv import CONtvIE +from .corus import CorusIE +from .cpac import ( + CPACIE, + CPACPlaylistIE, +) +from .cozytv import CozyTVIE +from .cracked import CrackedIE +from .crackle import CrackleIE +from .craftsy import CraftsyIE +from .crooksandliars import CrooksAndLiarsIE +from .crowdbunker import ( + CrowdBunkerIE, + CrowdBunkerChannelIE, +) +from .crunchyroll import ( + CrunchyrollBetaIE, + CrunchyrollBetaShowIE, +) +from .cspan import CSpanIE, CSpanCongressIE +from .ctsnews import CtsNewsIE +from .ctv import CTVIE +from .ctvnews import CTVNewsIE +from .cultureunplugged import CultureUnpluggedIE +from .curiositystream import ( + CuriosityStreamIE, + CuriosityStreamCollectionsIE, + CuriosityStreamSeriesIE, +) +from .cwtv import CWTVIE +from .cybrary import ( + CybraryIE, + CybraryCourseIE +) +from .daftsex import DaftsexIE +from .dailymail import DailyMailIE +from .dailymotion import ( + DailymotionIE, + DailymotionPlaylistIE, + DailymotionUserIE, +) +from .dailywire import ( + DailyWireIE, + DailyWirePodcastIE, +) +from .damtomo import ( + DamtomoRecordIE, + DamtomoVideoIE, +) +from .daum import ( + DaumIE, + DaumClipIE, + DaumPlaylistIE, + DaumUserIE, +) +from .daystar import DaystarClipIE +from .dbtv import DBTVIE +from .dctp import DctpTvIE +from .deezer import ( + DeezerPlaylistIE, + DeezerAlbumIE, +) +from .democracynow import DemocracynowIE +from .detik import DetikEmbedIE +from .dfb import DFBIE +from .dhm import DHMIE +from .digg import DiggIE 
+from .dotsub import DotsubIE +from .douyutv import ( + DouyuShowIE, + DouyuTVIE, +) +from .dplay import ( + DPlayIE, + DiscoveryPlusIE, + HGTVDeIE, + GoDiscoveryIE, + TravelChannelIE, + CookingChannelIE, + HGTVUsaIE, + FoodNetworkIE, + InvestigationDiscoveryIE, + DestinationAmericaIE, + AmHistoryChannelIE, + ScienceChannelIE, + DIYNetworkIE, + DiscoveryLifeIE, + AnimalPlanetIE, + TLCIE, + MotorTrendIE, + MotorTrendOnDemandIE, + DiscoveryPlusIndiaIE, + DiscoveryNetworksDeIE, + DiscoveryPlusItalyIE, + DiscoveryPlusItalyShowIE, + DiscoveryPlusIndiaShowIE, +) +from .dreisat import DreiSatIE +from .drbonanza import DRBonanzaIE +from .drtuber import DrTuberIE +from .drtv import ( + DRTVIE, + DRTVLiveIE, +) +from .dtube import DTubeIE +from .dvtv import DVTVIE +from .duboku import ( + DubokuIE, + DubokuPlaylistIE +) +from .dumpert import DumpertIE +from .defense import DefenseGouvFrIE +from .deuxm import ( + DeuxMIE, + DeuxMNewsIE +) +from .digitalconcerthall import DigitalConcertHallIE +from .discovery import DiscoveryIE +from .disney import DisneyIE +from .dispeak import DigitallySpeakingIE +from .dropbox import DropboxIE +from .dropout import ( + DropoutSeasonIE, + DropoutIE +) +from .dw import ( + DWIE, + DWArticleIE, +) +from .eagleplatform import EaglePlatformIE, ClipYouEmbedIE +from .ebaumsworld import EbaumsWorldIE +from .echomsk import EchoMskIE +from .egghead import ( + EggheadCourseIE, + EggheadLessonIE, +) +from .ehow import EHowIE +from .eighttracks import EightTracksIE +from .einthusan import EinthusanIE +from .eitb import EitbIE +from .ellentube import ( + EllenTubeIE, + EllenTubeVideoIE, + EllenTubePlaylistIE, +) +from .elonet import ElonetIE +from .elpais import ElPaisIE +from .embedly import EmbedlyIE +from .engadget import EngadgetIE +from .epicon import ( + EpiconIE, + EpiconSeriesIE, +) +from .epoch import EpochIE +from .eporner import EpornerIE +from .eroprofile import ( + EroProfileIE, + EroProfileAlbumIE, +) +from .ertgr import ( + ERTFlixCodenameIE, + ERTFlixIE, + ERTWebtvEmbedIE, +) +from .escapist import EscapistIE +from .espn import ( + ESPNIE, + WatchESPNIE, + ESPNArticleIE, + FiveThirtyEightIE, + ESPNCricInfoIE, +) +from .esri import EsriVideoIE +from .europa import EuropaIE +from .europeantour import EuropeanTourIE +from .eurosport import EurosportIE +from .euscreen import EUScreenIE +from .expotv import ExpoTVIE +from .expressen import ExpressenIE +from .extremetube import ExtremeTubeIE +from .eyedotv import EyedoTVIE +from .facebook import ( + FacebookIE, + FacebookPluginsVideoIE, + FacebookRedirectURLIE, + FacebookReelIE, +) +from .fancode import ( + FancodeVodIE, + FancodeLiveIE +) + +from .faz import FazIE +from .fc2 import ( + FC2IE, + FC2EmbedIE, + FC2LiveIE, +) +from .fczenit import FczenitIE +from .fifa import FifaIE +from .filmmodu import FilmmoduIE +from .filmon import ( + FilmOnIE, + FilmOnChannelIE, +) +from .filmweb import FilmwebIE +from .firsttv import FirstTVIE +from .fivetv import FiveTVIE +from .flickr import FlickrIE +from .folketinget import FolketingetIE +from .footyroom import FootyRoomIE +from .formula1 import Formula1IE +from .fourtube import ( + FourTubeIE, + PornTubeIE, + PornerBrosIE, + FuxIE, +) +from .fourzerostudio import ( + FourZeroStudioArchiveIE, + FourZeroStudioClipIE, +) +from .fox import FOXIE +from .fox9 import ( + FOX9IE, + FOX9NewsIE, +) +from .foxgay import FoxgayIE +from .foxnews import ( + FoxNewsIE, + FoxNewsArticleIE, + FoxNewsVideoIE, +) +from .foxsports import FoxSportsIE +from .fptplay import FptplayIE +from 
.franceinter import FranceInterIE +from .francetv import ( + FranceTVIE, + FranceTVSiteIE, + FranceTVInfoIE, +) +from .freesound import FreesoundIE +from .freespeech import FreespeechIE +from .frontendmasters import ( + FrontendMastersIE, + FrontendMastersLessonIE, + FrontendMastersCourseIE +) +from .freetv import ( + FreeTvIE, + FreeTvMoviesIE, +) +from .fujitv import FujiTVFODPlus7IE +from .funimation import ( + FunimationIE, + FunimationPageIE, + FunimationShowIE, +) +from .funk import FunkIE +from .fusion import FusionIE +from .fuyintv import FuyinTVIE +from .gab import ( + GabTVIE, + GabIE, +) +from .gaia import GaiaIE +from .gameinformer import GameInformerIE +from .gamejolt import ( + GameJoltIE, + GameJoltUserIE, + GameJoltGameIE, + GameJoltGameSoundtrackIE, + GameJoltCommunityIE, + GameJoltSearchIE, +) +from .gamespot import GameSpotIE +from .gamestar import GameStarIE +from .gaskrank import GaskrankIE +from .gazeta import GazetaIE +from .gdcvault import GDCVaultIE +from .gedidigital import GediDigitalIE +from .generic import GenericIE +from .genius import ( + GeniusIE, + GeniusLyricsIE, +) +from .gettr import ( + GettrIE, + GettrStreamingIE, +) +from .gfycat import GfycatIE +from .giantbomb import GiantBombIE +from .giga import GigaIE +from .glide import GlideIE +from .globo import ( + GloboIE, + GloboArticleIE, +) +from .go import GoIE +from .godtube import GodTubeIE +from .gofile import GofileIE +from .golem import GolemIE +from .goodgame import GoodGameIE +from .googledrive import ( + GoogleDriveIE, + GoogleDriveFolderIE, +) +from .googlepodcasts import ( + GooglePodcastsIE, + GooglePodcastsFeedIE, +) +from .googlesearch import GoogleSearchIE +from .gopro import GoProIE +from .goplay import GoPlayIE +from .goshgay import GoshgayIE +from .gotostage import GoToStageIE +from .gputechconf import GPUTechConfIE +from .gronkh import ( + GronkhIE, + GronkhFeedIE, + GronkhVodsIE +) +from .groupon import GrouponIE +from .harpodeon import HarpodeonIE +from .hbo import HBOIE +from .hearthisat import HearThisAtIE +from .heise import HeiseIE +from .hellporno import HellPornoIE +from .helsinki import HelsinkiIE +from .hentaistigma import HentaiStigmaIE +from .hgtv import HGTVComShowIE +from .hketv import HKETVIE +from .hidive import HiDiveIE +from .historicfilms import HistoricFilmsIE +from .hitbox import HitboxIE, HitboxLiveIE +from .hitrecord import HitRecordIE +from .holodex import HolodexIE +from .hotnewhiphop import HotNewHipHopIE +from .hotstar import ( + HotStarIE, + HotStarPrefixIE, + HotStarPlaylistIE, + HotStarSeasonIE, + HotStarSeriesIE, +) +from .howcast import HowcastIE +from .howstuffworks import HowStuffWorksIE +from .hrfensehen import HRFernsehenIE +from .hrti import ( + HRTiIE, + HRTiPlaylistIE, +) +from .hse import ( + HSEShowIE, + HSEProductIE, +) +from .genericembeds import ( + HTML5MediaEmbedIE, + QuotedHTMLIE, +) +from .huajiao import HuajiaoIE +from .huya import HuyaLiveIE +from .huffpost import HuffPostIE +from .hungama import ( + HungamaIE, + HungamaSongIE, + HungamaAlbumPlaylistIE, +) +from .hypem import HypemIE +from .hytale import HytaleIE +from .icareus import IcareusIE +from .ichinanalive import ( + IchinanaLiveIE, + IchinanaLiveClipIE, +) +from .ign import ( + IGNIE, + IGNVideoIE, + IGNArticleIE, +) +from .iheart import ( + IHeartRadioIE, + IHeartRadioPodcastIE, +) +from .iltalehti import IltalehtiIE +from .imdb import ( + ImdbIE, + ImdbListIE +) +from .imgur import ( + ImgurIE, + ImgurAlbumIE, + ImgurGalleryIE, +) +from .ina import InaIE +from .inc import 
IncIE +from .indavideo import IndavideoEmbedIE +from .infoq import InfoQIE +from .instagram import ( + InstagramIE, + InstagramIOSIE, + InstagramUserIE, + InstagramTagIE, + InstagramStoryIE, +) +from .internazionale import InternazionaleIE +from .internetvideoarchive import InternetVideoArchiveIE +from .iprima import ( + IPrimaIE, + IPrimaCNNIE +) +from .iqiyi import ( + IqiyiIE, + IqIE, + IqAlbumIE +) +from .islamchannel import ( + IslamChannelIE, + IslamChannelSeriesIE, +) +from .israelnationalnews import IsraelNationalNewsIE +from .itprotv import ( + ITProTVIE, + ITProTVCourseIE +) +from .itv import ( + ITVIE, + ITVBTCCIE, +) +from .ivi import ( + IviIE, + IviCompilationIE +) +from .ivideon import IvideonIE +from .iwara import ( + IwaraIE, + IwaraPlaylistIE, + IwaraUserIE, +) +from .ixigua import IxiguaIE +from .izlesene import IzleseneIE +from .jable import ( + JableIE, + JablePlaylistIE, +) +from .jamendo import ( + JamendoIE, + JamendoAlbumIE, +) +from .japandiet import ( + ShugiinItvLiveIE, + ShugiinItvLiveRoomIE, + ShugiinItvVodIE, + SangiinInstructionIE, + SangiinIE, +) +from .jeuxvideo import JeuxVideoIE +from .jove import JoveIE +from .joj import JojIE +from .jwplatform import JWPlatformIE +from .kakao import KakaoIE +from .kaltura import KalturaIE +from .kanal2 import Kanal2IE +from .karaoketv import KaraoketvIE +from .karrierevideos import KarriereVideosIE +from .keezmovies import KeezMoviesIE +from .kelbyone import KelbyOneIE +from .ketnet import KetnetIE +from .khanacademy import ( + KhanAcademyIE, + KhanAcademyUnitIE, +) +from .kicker import KickerIE +from .kickstarter import KickStarterIE +from .kinja import KinjaEmbedIE +from .kinopoisk import KinoPoiskIE +from .kompas import KompasVideoIE +from .konserthusetplay import KonserthusetPlayIE +from .koo import KooIE +from .kth import KTHIE +from .krasview import KrasViewIE +from .ku6 import Ku6IE +from .kusi import KUSIIE +from .kuwo import ( + KuwoIE, + KuwoAlbumIE, + KuwoChartIE, + KuwoSingerIE, + KuwoCategoryIE, + KuwoMvIE, +) +from .la7 import ( + LA7IE, + LA7PodcastEpisodeIE, + LA7PodcastIE, +) +from .laola1tv import ( + Laola1TvEmbedIE, + Laola1TvIE, + EHFTVIE, + ITTFIE, +) +from .lastfm import ( + LastFMIE, + LastFMPlaylistIE, + LastFMUserIE, +) +from .lbry import ( + LBRYIE, + LBRYChannelIE, +) +from .lci import LCIIE +from .lcp import ( + LcpPlayIE, + LcpIE, +) +from .lecture2go import Lecture2GoIE +from .lecturio import ( + LecturioIE, + LecturioCourseIE, + LecturioDeCourseIE, +) +from .leeco import ( + LeIE, + LePlaylistIE, + LetvCloudIE, +) +from .lego import LEGOIE +from .lemonde import LemondeIE +from .lenta import LentaIE +from .libraryofcongress import LibraryOfCongressIE +from .libsyn import LibsynIE +from .lifenews import ( + LifeNewsIE, + LifeEmbedIE, +) +from .likee import ( + LikeeIE, + LikeeUserIE +) +from .limelight import ( + LimelightMediaIE, + LimelightChannelIE, + LimelightChannelListIE, +) +from .line import ( + LineLiveIE, + LineLiveChannelIE, +) +from .linkedin import ( + LinkedInIE, + LinkedInLearningIE, + LinkedInLearningCourseIE, +) +from .linuxacademy import LinuxAcademyIE +from .liputan6 import Liputan6IE +from .listennotes import ListenNotesIE +from .litv import LiTVIE +from .livejournal import LiveJournalIE +from .livestream import ( + LivestreamIE, + LivestreamOriginalIE, + LivestreamShortenerIE, +) +from .livestreamfails import LivestreamfailsIE +from .lnkgo import ( + LnkGoIE, + LnkIE, +) +from .localnews8 import LocalNews8IE +from .lovehomeporn import LoveHomePornIE +from .lrt import ( 
+ LRTVODIE, + LRTStreamIE +) +from .lynda import ( + LyndaIE, + LyndaCourseIE +) +from .m6 import M6IE +from .magentamusik360 import MagentaMusik360IE +from .mailru import ( + MailRuIE, + MailRuMusicIE, + MailRuMusicSearchIE, +) +from .mainstreaming import MainStreamingIE +from .malltv import MallTVIE +from .mangomolo import ( + MangomoloVideoIE, + MangomoloLiveIE, +) +from .manoto import ( + ManotoTVIE, + ManotoTVShowIE, + ManotoTVLiveIE, +) +from .manyvids import ManyVidsIE +from .maoritv import MaoriTVIE +from .markiza import ( + MarkizaIE, + MarkizaPageIE, +) +from .massengeschmacktv import MassengeschmackTVIE +from .masters import MastersIE +from .matchtv import MatchTVIE +from .mdr import MDRIE +from .medaltv import MedalTVIE +from .mediaite import MediaiteIE +from .mediaklikk import MediaKlikkIE +from .mediaset import ( + MediasetIE, + MediasetShowIE, +) +from .mediasite import ( + MediasiteIE, + MediasiteCatalogIE, + MediasiteNamedCatalogIE, +) +from .mediaworksnz import MediaWorksNZVODIE +from .medici import MediciIE +from .megaphone import MegaphoneIE +from .meipai import MeipaiIE +from .melonvod import MelonVODIE +from .meta import METAIE +from .metacafe import MetacafeIE +from .metacritic import MetacriticIE +from .mgoon import MgoonIE +from .mgtv import MGTVIE +from .miaopai import MiaoPaiIE +from .microsoftstream import MicrosoftStreamIE +from .microsoftvirtualacademy import ( + MicrosoftVirtualAcademyIE, + MicrosoftVirtualAcademyCourseIE, +) +from .microsoftembed import MicrosoftEmbedIE +from .mildom import ( + MildomIE, + MildomVodIE, + MildomClipIE, + MildomUserVodIE, +) +from .minds import ( + MindsIE, + MindsChannelIE, + MindsGroupIE, +) +from .ministrygrid import MinistryGridIE +from .minoto import MinotoIE +from .miomio import MioMioIE +from .mirrativ import ( + MirrativIE, + MirrativUserIE, +) +from .mirrorcouk import MirrorCoUKIE +from .mit import TechTVMITIE, OCWMITIE +from .mitele import MiTeleIE +from .mixch import ( + MixchIE, + MixchArchiveIE, +) +from .mixcloud import ( + MixcloudIE, + MixcloudUserIE, + MixcloudPlaylistIE, +) +from .mlb import ( + MLBIE, + MLBVideoIE, + MLBTVIE, + MLBArticleIE, +) +from .mlssoccer import MLSSoccerIE +from .mnet import MnetIE +from .mocha import MochaVideoIE +from .moevideo import MoeVideoIE +from .mofosex import ( + MofosexIE, + MofosexEmbedIE, +) +from .mojvideo import MojvideoIE +from .morningstar import MorningstarIE +from .motherless import ( + MotherlessIE, + MotherlessGroupIE +) +from .motorsport import MotorsportIE +from .movieclips import MovieClipsIE +from .moviepilot import MoviepilotIE +from .moview import MoviewPlayIE +from .moviezine import MoviezineIE +from .movingimage import MovingImageIE +from .msn import MSNIE +from .mtv import ( + MTVIE, + MTVVideoIE, + MTVServicesEmbeddedIE, + MTVDEIE, + MTVJapanIE, + MTVItaliaIE, + MTVItaliaProgrammaIE, +) +from .muenchentv import MuenchenTVIE +from .murrtube import MurrtubeIE, MurrtubeUserIE +from .musescore import MuseScoreIE +from .musicdex import ( + MusicdexSongIE, + MusicdexAlbumIE, + MusicdexArtistIE, + MusicdexPlaylistIE, +) +from .mwave import MwaveIE, MwaveMeetGreetIE +from .mxplayer import ( + MxplayerIE, + MxplayerShowIE, +) +from .mychannels import MyChannelsIE +from .myspace import MySpaceIE, MySpaceAlbumIE +from .myspass import MySpassIE +from .myvi import ( + MyviIE, + MyviEmbedIE, +) +from .myvideoge import MyVideoGeIE +from .myvidster import MyVidsterIE +from .n1 import ( + N1InfoAssetIE, + N1InfoIIE, +) +from .nate import ( + NateIE, + NateProgramIE, +) 
+from .nationalgeographic import ( + NationalGeographicVideoIE, + NationalGeographicTVIE, +) +from .naver import ( + NaverIE, + NaverLiveIE, + NaverNowIE, +) +from .nba import ( + NBAWatchEmbedIE, + NBAWatchIE, + NBAWatchCollectionIE, + NBAEmbedIE, + NBAIE, + NBAChannelIE, +) +from .nbc import ( + NBCIE, + NBCNewsIE, + NBCOlympicsIE, + NBCOlympicsStreamIE, + NBCSportsIE, + NBCSportsStreamIE, + NBCSportsVPlayerIE, + NBCStationsIE, +) +from .ndr import ( + NDRIE, + NJoyIE, + NDREmbedBaseIE, + NDREmbedIE, + NJoyEmbedIE, +) +from .ndtv import NDTVIE +from .nebula import ( + NebulaIE, + NebulaSubscriptionsIE, + NebulaChannelIE, +) +from .nerdcubed import NerdCubedFeedIE +from .netzkino import NetzkinoIE +from .neteasemusic import ( + NetEaseMusicIE, + NetEaseMusicAlbumIE, + NetEaseMusicSingerIE, + NetEaseMusicListIE, + NetEaseMusicMvIE, + NetEaseMusicProgramIE, + NetEaseMusicDjRadioIE, +) +from .netverse import ( + NetverseIE, + NetversePlaylistIE, +) +from .newgrounds import ( + NewgroundsIE, + NewgroundsPlaylistIE, + NewgroundsUserIE, +) +from .newspicks import NewsPicksIE +from .newstube import NewstubeIE +from .newsy import NewsyIE +from .nextmedia import ( + NextMediaIE, + NextMediaActionNewsIE, + AppleDailyIE, + NextTVIE, +) +from .nexx import ( + NexxIE, + NexxEmbedIE, +) +from .nfb import NFBIE +from .nfhsnetwork import NFHSNetworkIE +from .nfl import ( + NFLIE, + NFLArticleIE, +) +from .nhk import ( + NhkVodIE, + NhkVodProgramIE, + NhkForSchoolBangumiIE, + NhkForSchoolSubjectIE, + NhkForSchoolProgramListIE, +) +from .nhl import NHLIE +from .nick import ( + NickIE, + NickBrIE, + NickDeIE, + NickNightIE, + NickRuIE, +) +from .niconico import ( + NiconicoIE, + NiconicoPlaylistIE, + NiconicoUserIE, + NiconicoSeriesIE, + NiconicoHistoryIE, + NicovideoSearchDateIE, + NicovideoSearchIE, + NicovideoSearchURLIE, + NicovideoTagURLIE, +) +from .ninecninemedia import ( + NineCNineMediaIE, + CPTwentyFourIE, +) +from .ninegag import NineGagIE +from .ninenow import NineNowIE +from .nintendo import NintendoIE +from .nitter import NitterIE +from .njpwworld import NJPWWorldIE +from .nobelprize import NobelPrizeIE +from .nonktube import NonkTubeIE +from .noodlemagazine import NoodleMagazineIE +from .noovo import NoovoIE +from .normalboots import NormalbootsIE +from .nosvideo import NosVideoIE +from .nosnl import NOSNLArticleIE +from .nova import ( + NovaEmbedIE, + NovaIE, +) +from .novaplay import NovaPlayIE +from .nowness import ( + NownessIE, + NownessPlaylistIE, + NownessSeriesIE, +) +from .noz import NozIE +from .npo import ( + AndereTijdenIE, + NPOIE, + NPOLiveIE, + NPORadioIE, + NPORadioFragmentIE, + SchoolTVIE, + HetKlokhuisIE, + VPROIE, + WNLIE, +) +from .npr import NprIE +from .nrk import ( + NRKIE, + NRKPlaylistIE, + NRKSkoleIE, + NRKTVIE, + NRKTVDirekteIE, + NRKRadioPodkastIE, + NRKTVEpisodeIE, + NRKTVEpisodesIE, + NRKTVSeasonIE, + NRKTVSeriesIE, +) +from .nrl import NRLTVIE +from .ntvcojp import NTVCoJpCUIE +from .ntvde import NTVDeIE +from .ntvru import NTVRuIE +from .nytimes import ( + NYTimesIE, + NYTimesArticleIE, + NYTimesCookingIE, +) +from .nuvid import NuvidIE +from .nzherald import NZHeraldIE +from .nzz import NZZIE +from .odatv import OdaTVIE +from .odnoklassniki import OdnoklassnikiIE +from .oftv import ( + OfTVIE, + OfTVPlaylistIE +) +from .oktoberfesttv import OktoberfestTVIE +from .olympics import OlympicsReplayIE +from .on24 import On24IE +from .ondemandkorea import OnDemandKoreaIE +from .onefootball import OneFootballIE +from .onenewsnz import OneNewsNZIE +from .onet import ( 
+ OnetIE, + OnetChannelIE, + OnetMVPIE, + OnetPlIE, +) +from .onionstudios import OnionStudiosIE +from .ooyala import ( + OoyalaIE, + OoyalaExternalIE, +) +from .opencast import ( + OpencastIE, + OpencastPlaylistIE, +) +from .openrec import ( + OpenRecIE, + OpenRecCaptureIE, + OpenRecMovieIE, +) +from .ora import OraTVIE +from .orf import ( + ORFTVthekIE, + ORFFM4StoryIE, + ORFRadioIE, + ORFIPTVIE, +) +from .outsidetv import OutsideTVIE +from .packtpub import ( + PacktPubIE, + PacktPubCourseIE, +) +from .palcomp3 import ( + PalcoMP3IE, + PalcoMP3ArtistIE, + PalcoMP3VideoIE, +) +from .pandoratv import PandoraTVIE +from .panopto import ( + PanoptoIE, + PanoptoListIE, + PanoptoPlaylistIE +) +from .paramountplus import ( + ParamountPlusIE, + ParamountPlusSeriesIE, +) +from .parler import ParlerIE +from .parlview import ParlviewIE +from .patreon import ( + PatreonIE, + PatreonCampaignIE +) +from .pbs import PBSIE +from .pearvideo import PearVideoIE +from .peekvids import PeekVidsIE, PlayVidsIE +from .peertube import ( + PeerTubeIE, + PeerTubePlaylistIE, +) +from .peertv import PeerTVIE +from .peloton import ( + PelotonIE, + PelotonLiveIE +) +from .people import PeopleIE +from .performgroup import PerformGroupIE +from .periscope import ( + PeriscopeIE, + PeriscopeUserIE, +) +from .philharmoniedeparis import PhilharmonieDeParisIE +from .phoenix import PhoenixIE +from .photobucket import PhotobucketIE +from .piapro import PiaproIE +from .picarto import ( + PicartoIE, + PicartoVodIE, +) +from .piksel import PikselIE +from .pinkbike import PinkbikeIE +from .pinterest import ( + PinterestIE, + PinterestCollectionIE, +) +from .pixivsketch import ( + PixivSketchIE, + PixivSketchUserIE, +) +from .pladform import PladformIE +from .planetmarathi import PlanetMarathiIE +from .platzi import ( + PlatziIE, + PlatziCourseIE, +) +from .playfm import PlayFMIE +from .playplustv import PlayPlusTVIE +from .plays import PlaysTVIE +from .playstuff import PlayStuffIE +from .playsuisse import PlaySuisseIE +from .playtvak import PlaytvakIE +from .playvid import PlayvidIE +from .playwire import PlaywireIE +from .plutotv import PlutoTVIE +from .pluralsight import ( + PluralsightIE, + PluralsightCourseIE, +) +from .podbayfm import PodbayFMIE, PodbayFMChannelIE +from .podchaser import PodchaserIE +from .podomatic import PodomaticIE +from .pokemon import ( + PokemonIE, + PokemonWatchIE, +) +from .pokergo import ( + PokerGoIE, + PokerGoCollectionIE, +) +from .polsatgo import PolsatGoIE +from .polskieradio import ( + PolskieRadioIE, + PolskieRadioCategoryIE, + PolskieRadioPlayerIE, + PolskieRadioPodcastIE, + PolskieRadioPodcastListIE, + PolskieRadioRadioKierowcowIE, +) +from .popcorntimes import PopcorntimesIE +from .popcorntv import PopcornTVIE +from .porn91 import Porn91IE +from .porncom import PornComIE +from .pornflip import PornFlipIE +from .pornhd import PornHdIE +from .pornhub import ( + PornHubIE, + PornHubUserIE, + PornHubPlaylistIE, + PornHubPagedVideoListIE, + PornHubUserVideosUploadIE, +) +from .pornotube import PornotubeIE +from .pornovoisines import PornoVoisinesIE +from .pornoxo import PornoXOIE +from .pornez import PornezIE +from .puhutv import ( + PuhuTVIE, + PuhuTVSerieIE, +) +from .prankcast import PrankCastIE +from .premiershiprugby import PremiershipRugbyIE +from .presstv import PressTVIE +from .projectveritas import ProjectVeritasIE +from .prosiebensat1 import ProSiebenSat1IE +from .prx import ( + PRXStoryIE, + PRXSeriesIE, + PRXAccountIE, + PRXStoriesSearchIE, + PRXSeriesSearchIE +) +from .puls4 import 
Puls4IE +from .pyvideo import PyvideoIE +from .qingting import QingTingIE +from .qqmusic import ( + QQMusicIE, + QQMusicSingerIE, + QQMusicAlbumIE, + QQMusicToplistIE, + QQMusicPlaylistIE, +) +from .r7 import ( + R7IE, + R7ArticleIE, +) +from .radiko import RadikoIE, RadikoRadioIE +from .radiocanada import ( + RadioCanadaIE, + RadioCanadaAudioVideoIE, +) +from .radiode import RadioDeIE +from .radiojavan import RadioJavanIE +from .radiobremen import RadioBremenIE +from .radiofrance import FranceCultureIE, RadioFranceIE +from .radiozet import RadioZetPodcastIE +from .radiokapital import ( + RadioKapitalIE, + RadioKapitalShowIE, +) +from .radlive import ( + RadLiveIE, + RadLiveChannelIE, + RadLiveSeasonIE, +) +from .rai import ( + RaiPlayIE, + RaiPlayLiveIE, + RaiPlayPlaylistIE, + RaiPlaySoundIE, + RaiPlaySoundLiveIE, + RaiPlaySoundPlaylistIE, + RaiNewsIE, + RaiSudtirolIE, + RaiIE, +) +from .raywenderlich import ( + RayWenderlichIE, + RayWenderlichCourseIE, +) +from .rbmaradio import RBMARadioIE +from .rcs import ( + RCSIE, + RCSEmbedsIE, + RCSVariousIE, +) +from .rcti import ( + RCTIPlusIE, + RCTIPlusSeriesIE, + RCTIPlusTVIE, +) +from .rds import RDSIE +from .redbee import ParliamentLiveUKIE, RTBFIE +from .redbulltv import ( + RedBullTVIE, + RedBullEmbedIE, + RedBullTVRrnContentIE, + RedBullIE, +) +from .reddit import RedditIE +from .redgifs import ( + RedGifsIE, + RedGifsSearchIE, + RedGifsUserIE, +) +from .redtube import RedTubeIE +from .regiotv import RegioTVIE +from .rentv import ( + RENTVIE, + RENTVArticleIE, +) +from .restudy import RestudyIE +from .reuters import ReutersIE +from .reverbnation import ReverbNationIE +from .rice import RICEIE +from .rmcdecouverte import RMCDecouverteIE +from .rockstargames import RockstarGamesIE +from .rokfin import ( + RokfinIE, + RokfinStackIE, + RokfinChannelIE, + RokfinSearchIE, +) +from .roosterteeth import RoosterTeethIE, RoosterTeethSeriesIE +from .rottentomatoes import RottenTomatoesIE +from .rozhlas import RozhlasIE +from .rte import RteIE, RteRadioIE +from .rtlnl import ( + RtlNlIE, + RTLLuTeleVODIE, + RTLLuArticleIE, + RTLLuLiveIE, + RTLLuRadioIE, +) +from .rtl2 import ( + RTL2IE, + RTL2YouIE, + RTL2YouSeriesIE, +) +from .rtnews import ( + RTNewsIE, + RTDocumentryIE, + RTDocumentryPlaylistIE, + RuptlyIE, +) +from .rtp import RTPIE +from .rtrfm import RTRFMIE +from .rts import RTSIE +from .rtve import ( + RTVEALaCartaIE, + RTVEAudioIE, + RTVELiveIE, + RTVEInfantilIE, + RTVETelevisionIE, +) +from .rtvnh import RTVNHIE +from .rtvs import RTVSIE +from .rtvslo import RTVSLOIE +from .ruhd import RUHDIE +from .rule34video import Rule34VideoIE +from .rumble import ( + RumbleEmbedIE, + RumbleChannelIE, +) +from .rutube import ( + RutubeIE, + RutubeChannelIE, + RutubeEmbedIE, + RutubeMovieIE, + RutubePersonIE, + RutubePlaylistIE, + RutubeTagsIE, +) +from .glomex import ( + GlomexIE, + GlomexEmbedIE, +) +from .megatvcom import ( + MegaTVComIE, + MegaTVComEmbedIE, +) +from .ant1newsgr import ( + Ant1NewsGrWatchIE, + Ant1NewsGrArticleIE, + Ant1NewsGrEmbedIE, +) +from .rutv import RUTVIE +from .ruutu import RuutuIE +from .ruv import ( + RuvIE, + RuvSpilaIE +) +from .safari import ( + SafariIE, + SafariApiIE, + SafariCourseIE, +) +from .saitosan import SaitosanIE +from .samplefocus import SampleFocusIE +from .sapo import SapoIE +from .savefrom import SaveFromIE +from .sbs import SBSIE +from .screen9 import Screen9IE +from .screencast import ScreencastIE +from .screencastify import ScreencastifyIE +from .screencastomatic import ScreencastOMaticIE +from 
.scrippsnetworks import ( + ScrippsNetworksWatchIE, + ScrippsNetworksIE, +) +from .scte import ( + SCTEIE, + SCTECourseIE, +) +from .scrolller import ScrolllerIE +from .seeker import SeekerIE +from .senategov import SenateISVPIE, SenateGovIE +from .sendtonews import SendtoNewsIE +from .servus import ServusIE +from .sevenplus import SevenPlusIE +from .sexu import SexuIE +from .seznamzpravy import ( + SeznamZpravyIE, + SeznamZpravyArticleIE, +) +from .shahid import ( + ShahidIE, + ShahidShowIE, +) +from .shared import ( + SharedIE, + VivoIE, +) +from .sharevideos import ShareVideosEmbedIE +from .shemaroome import ShemarooMeIE +from .showroomlive import ShowRoomLiveIE +from .simplecast import ( + SimplecastIE, + SimplecastEpisodeIE, + SimplecastPodcastIE, +) +from .sina import SinaIE +from .sixplay import SixPlayIE +from .skeb import SkebIE +from .skyit import ( + SkyItPlayerIE, + SkyItVideoIE, + SkyItVideoLiveIE, + SkyItIE, + SkyItArteIE, + CieloTVItIE, + TV8ItIE, +) +from .skylinewebcams import SkylineWebcamsIE +from .skynewsarabia import ( + SkyNewsArabiaIE, + SkyNewsArabiaArticleIE, +) +from .skynewsau import SkyNewsAUIE +from .sky import ( + SkyNewsIE, + SkyNewsStoryIE, + SkySportsIE, + SkySportsNewsIE, +) +from .slideshare import SlideshareIE +from .slideslive import SlidesLiveIE +from .slutload import SlutloadIE +from .smotrim import SmotrimIE +from .snotr import SnotrIE +from .sohu import SohuIE +from .sonyliv import ( + SonyLIVIE, + SonyLIVSeriesIE, +) +from .soundcloud import ( + SoundcloudEmbedIE, + SoundcloudIE, + SoundcloudSetIE, + SoundcloudRelatedIE, + SoundcloudUserIE, + SoundcloudTrackStationIE, + SoundcloudPlaylistIE, + SoundcloudSearchIE, +) +from .soundgasm import ( + SoundgasmIE, + SoundgasmProfileIE +) +from .southpark import ( + SouthParkIE, + SouthParkDeIE, + SouthParkDkIE, + SouthParkEsIE, + SouthParkLatIE, + SouthParkNlIE +) +from .sovietscloset import ( + SovietsClosetIE, + SovietsClosetPlaylistIE +) +from .spankbang import ( + SpankBangIE, + SpankBangPlaylistIE, +) +from .spankwire import SpankwireIE +from .spiegel import SpiegelIE +from .spike import ( + BellatorIE, + ParamountNetworkIE, +) +from .startrek import StarTrekIE +from .stitcher import ( + StitcherIE, + StitcherShowIE, +) +from .sport5 import Sport5IE +from .sportbox import SportBoxIE +from .sportdeutschland import SportDeutschlandIE +from .spotify import ( + SpotifyIE, + SpotifyShowIE, +) +from .spreaker import ( + SpreakerIE, + SpreakerPageIE, + SpreakerShowIE, + SpreakerShowPageIE, +) +from .springboardplatform import SpringboardPlatformIE +from .sprout import SproutIE +from .srgssr import ( + SRGSSRIE, + SRGSSRPlayIE, +) +from .srmediathek import SRMediathekIE +from .stanfordoc import StanfordOpenClassroomIE +from .startv import StarTVIE +from .steam import ( + SteamIE, + SteamCommunityBroadcastIE, +) +from .storyfire import ( + StoryFireIE, + StoryFireUserIE, + StoryFireSeriesIE, +) +from .streamable import StreamableIE +from .streamanity import StreamanityIE +from .streamcloud import StreamcloudIE +from .streamcz import StreamCZIE +from .streamff import StreamFFIE +from .streetvoice import StreetVoiceIE +from .stretchinternet import StretchInternetIE +from .stripchat import StripchatIE +from .stv import STVPlayerIE +from .substack import SubstackIE +from .sunporno import SunPornoIE +from .sverigesradio import ( + SverigesRadioEpisodeIE, + SverigesRadioPublicationIE, +) +from .svt import ( + SVTIE, + SVTPageIE, + SVTPlayIE, + SVTSeriesIE, +) +from .swearnet import SwearnetEpisodeIE +from 
.swrmediathek import SWRMediathekIE +from .syvdk import SYVDKIE +from .syfy import SyfyIE +from .sztvhu import SztvHuIE +from .tagesschau import TagesschauIE +from .tass import TassIE +from .tbs import TBSIE +from .tdslifeway import TDSLifewayIE +from .teachable import ( + TeachableIE, + TeachableCourseIE, +) +from .teachertube import ( + TeacherTubeIE, + TeacherTubeUserIE, +) +from .teachingchannel import TeachingChannelIE +from .teamcoco import TeamcocoIE +from .teamtreehouse import TeamTreeHouseIE +from .techtalks import TechTalksIE +from .ted import ( + TedEmbedIE, + TedPlaylistIE, + TedSeriesIE, + TedTalkIE, +) +from .tele5 import Tele5IE +from .tele13 import Tele13IE +from .telebruxelles import TeleBruxellesIE +from .telecinco import TelecincoIE +from .telegraaf import TelegraafIE +from .telegram import TelegramEmbedIE +from .telemb import TeleMBIE +from .telemundo import TelemundoIE +from .telequebec import ( + TeleQuebecIE, + TeleQuebecSquatIE, + TeleQuebecEmissionIE, + TeleQuebecLiveIE, + TeleQuebecVideoIE, +) +from .teletask import TeleTaskIE +from .telewebion import TelewebionIE +from .tempo import TempoIE +from .tencent import ( + IflixEpisodeIE, + IflixSeriesIE, + VQQSeriesIE, + VQQVideoIE, + WeTvEpisodeIE, + WeTvSeriesIE, +) +from .tennistv import TennisTVIE +from .tenplay import TenPlayIE +from .testurl import TestURLIE +from .tf1 import TF1IE +from .tfo import TFOIE +from .theholetv import TheHoleTvIE +from .theintercept import TheInterceptIE +from .theplatform import ( + ThePlatformIE, + ThePlatformFeedIE, +) +from .thestar import TheStarIE +from .thesun import TheSunIE +from .theta import ( + ThetaVideoIE, + ThetaStreamIE, +) +from .theweatherchannel import TheWeatherChannelIE +from .thisamericanlife import ThisAmericanLifeIE +from .thisav import ThisAVIE +from .thisoldhouse import ThisOldHouseIE +from .threespeak import ( + ThreeSpeakIE, + ThreeSpeakUserIE, +) +from .threeqsdn import ThreeQSDNIE +from .tiktok import ( + TikTokIE, + TikTokUserIE, + TikTokSoundIE, + TikTokEffectIE, + TikTokTagIE, + TikTokVMIE, + DouyinIE, +) +from .tinypic import TinyPicIE +from .tmz import TMZIE +from .tnaflix import ( + TNAFlixNetworkEmbedIE, + TNAFlixIE, + EMPFlixIE, + MovieFapIE, +) +from .toggle import ( + ToggleIE, + MeWatchIE, +) +from .toggo import ( + ToggoIE, +) +from .tokentube import ( + TokentubeIE, + TokentubeChannelIE +) +from .tonline import TOnlineIE +from .toongoggles import ToonGogglesIE +from .toutv import TouTvIE +from .toypics import ToypicsUserIE, ToypicsIE +from .traileraddict import TrailerAddictIE +from .triller import ( + TrillerIE, + TrillerUserIE, +) +from .trilulilu import TriluliluIE +from .trovo import ( + TrovoIE, + TrovoVodIE, + TrovoChannelVodIE, + TrovoChannelClipIE, +) +from .trueid import TrueIDIE +from .trunews import TruNewsIE +from .truth import TruthIE +from .trutv import TruTVIE +from .tube8 import Tube8IE +from .tubetugraz import TubeTuGrazIE, TubeTuGrazSeriesIE +from .tubitv import ( + TubiTvIE, + TubiTvShowIE, +) +from .tumblr import TumblrIE +from .tunein import ( + TuneInClipIE, + TuneInStationIE, + TuneInProgramIE, + TuneInTopicIE, + TuneInShortenerIE, +) +from .tunepk import TunePkIE +from .turbo import TurboIE +from .tv2 import ( + TV2IE, + TV2ArticleIE, + KatsomoIE, + MTVUutisetArticleIE, +) +from .tv24ua import ( + TV24UAVideoIE, +) +from .tv2dk import ( + TV2DKIE, + TV2DKBornholmPlayIE, +) +from .tv2hu import ( + TV2HuIE, + TV2HuSeriesIE, +) +from .tv4 import TV4IE +from .tv5mondeplus import TV5MondePlusIE +from .tv5unis import ( + 
TV5UnisVideoIE, + TV5UnisIE, +) +from .tva import ( + TVAIE, + QubIE, +) +from .tvanouvelles import ( + TVANouvellesIE, + TVANouvellesArticleIE, +) +from .tvc import ( + TVCIE, + TVCArticleIE, +) +from .tver import TVerIE +from .tvigle import TvigleIE +from .tviplayer import TVIPlayerIE +from .tvland import TVLandIE +from .tvn24 import TVN24IE +from .tvnet import TVNetIE +from .tvnoe import TVNoeIE +from .tvnow import ( + TVNowIE, + TVNowFilmIE, + TVNowNewIE, + TVNowSeasonIE, + TVNowAnnualIE, + TVNowShowIE, +) +from .tvopengr import ( + TVOpenGrWatchIE, + TVOpenGrEmbedIE, +) +from .tvp import ( + TVPEmbedIE, + TVPIE, + TVPStreamIE, + TVPVODSeriesIE, + TVPVODVideoIE, +) +from .tvplay import ( + TVPlayIE, + ViafreeIE, + TVPlayHomeIE, +) +from .tvplayer import TVPlayerIE +from .tweakers import TweakersIE +from .twentyfourvideo import TwentyFourVideoIE +from .twentymin import TwentyMinutenIE +from .twentythreevideo import TwentyThreeVideoIE +from .twitcasting import ( + TwitCastingIE, + TwitCastingLiveIE, + TwitCastingUserIE, +) +from .twitch import ( + TwitchVodIE, + TwitchCollectionIE, + TwitchVideosIE, + TwitchVideosClipsIE, + TwitchVideosCollectionsIE, + TwitchStreamIE, + TwitchClipsIE, +) +from .twitter import ( + TwitterCardIE, + TwitterIE, + TwitterAmplifyIE, + TwitterBroadcastIE, + TwitterSpacesIE, + TwitterShortenerIE, +) +from .udemy import ( + UdemyIE, + UdemyCourseIE +) +from .udn import UDNEmbedIE +from .ufctv import ( + UFCTVIE, + UFCArabiaIE, +) +from .ukcolumn import UkColumnIE +from .uktvplay import UKTVPlayIE +from .digiteka import DigitekaIE +from .dlive import ( + DLiveVODIE, + DLiveStreamIE, +) +from .drooble import DroobleIE +from .umg import UMGDeIE +from .unistra import UnistraIE +from .unity import UnityIE +from .unscripted import UnscriptedNewsVideoIE +from .unsupported import KnownDRMIE, KnownPiracyIE +from .uol import UOLIE +from .uplynk import ( + UplynkIE, + UplynkPreplayIE, +) +from .urort import UrortIE +from .urplay import URPlayIE +from .usanetwork import USANetworkIE +from .usatoday import USATodayIE +from .ustream import UstreamIE, UstreamChannelIE +from .ustudio import ( + UstudioIE, + UstudioEmbedIE, +) +from .utreon import UtreonIE +from .varzesh3 import Varzesh3IE +from .vbox7 import Vbox7IE +from .veehd import VeeHDIE +from .veo import VeoIE +from .veoh import ( + VeohIE, + VeohUserIE +) +from .vesti import VestiIE +from .vevo import ( + VevoIE, + VevoPlaylistIE, +) +from .vgtv import ( + BTArticleIE, + BTVestlendingenIE, + VGTVIE, +) +from .vh1 import VH1IE +from .vice import ( + ViceIE, + ViceArticleIE, + ViceShowIE, +) +from .vidbit import VidbitIE +from .viddler import ViddlerIE +from .videa import VideaIE +from .videocampus_sachsen import ( + VideocampusSachsenIE, + ViMPPlaylistIE, +) +from .videodetective import VideoDetectiveIE +from .videofyme import VideofyMeIE +from .videomore import ( + VideomoreIE, + VideomoreVideoIE, + VideomoreSeasonIE, +) +from .videopress import VideoPressIE +from .vidio import ( + VidioIE, + VidioPremierIE, + VidioLiveIE +) +from .vidlii import VidLiiIE +from .viewlift import ( + ViewLiftIE, + ViewLiftEmbedIE, +) +from .viidea import ViideaIE +from .vimeo import ( + VimeoIE, + VimeoAlbumIE, + VimeoChannelIE, + VimeoGroupsIE, + VimeoLikesIE, + VimeoOndemandIE, + VimeoProIE, + VimeoReviewIE, + VimeoUserIE, + VimeoWatchLaterIE, + VHXEmbedIE, +) +from .vimm import ( + VimmIE, + VimmRecordingIE, +) +from .vimple import VimpleIE +from .vine import ( + VineIE, + VineUserIE, +) +from .viki import ( + VikiIE, + VikiChannelIE, +) 
+from .viqeo import ViqeoIE +from .viu import ( + ViuIE, + ViuPlaylistIE, + ViuOTTIE, +) +from .vk import ( + VKIE, + VKUserVideosIE, + VKWallPostIE, +) +from .vlive import ( + VLiveIE, + VLivePostIE, + VLiveChannelIE, +) +from .vodlocker import VodlockerIE +from .vodpl import VODPlIE +from .vodplatform import VODPlatformIE +from .voicerepublic import VoiceRepublicIE +from .voicy import ( + VoicyIE, + VoicyChannelIE, +) +from .voot import ( + VootIE, + VootSeriesIE, +) +from .voxmedia import ( + VoxMediaVolumeIE, + VoxMediaIE, +) +from .vrt import VRTIE +from .vrak import VrakIE +from .vrv import ( + VRVIE, + VRVSeriesIE, +) +from .vshare import VShareIE +from .vtm import VTMIE +from .medialaan import MedialaanIE +from .vuclip import VuClipIE +from .vupload import VuploadIE +from .vvvvid import ( + VVVVIDIE, + VVVVIDShowIE, +) +from .vyborymos import VyboryMosIE +from .vzaar import VzaarIE +from .wakanim import WakanimIE +from .walla import WallaIE +from .washingtonpost import ( + WashingtonPostIE, + WashingtonPostArticleIE, +) +from .wasdtv import ( + WASDTVStreamIE, + WASDTVRecordIE, + WASDTVClipIE, +) +from .wat import WatIE +from .watchbox import WatchBoxIE +from .watchindianporn import WatchIndianPornIE +from .wdr import ( + WDRIE, + WDRPageIE, + WDRElefantIE, + WDRMobileIE, +) +from .webcaster import ( + WebcasterIE, + WebcasterFeedIE, +) +from .webofstories import ( + WebOfStoriesIE, + WebOfStoriesPlaylistIE, +) +from .weibo import ( + WeiboIE, + WeiboMobileIE +) +from .weiqitv import WeiqiTVIE +from .wikimedia import WikimediaIE +from .willow import WillowIE +from .wimtv import WimTVIE +from .whowatch import WhoWatchIE +from .wistia import ( + WistiaIE, + WistiaPlaylistIE, + WistiaChannelIE, +) +from .wordpress import ( + WordpressPlaylistEmbedIE, + WordpressMiniAudioPlayerEmbedIE, +) +from .worldstarhiphop import WorldStarHipHopIE +from .wppilot import ( + WPPilotIE, + WPPilotChannelsIE, +) +from .wsj import ( + WSJIE, + WSJArticleIE, +) +from .wwe import WWEIE +from .xbef import XBefIE +from .xboxclips import XboxClipsIE +from .xfileshare import XFileShareIE +from .xhamster import ( + XHamsterIE, + XHamsterEmbedIE, + XHamsterUserIE, +) +from .xiami import ( + XiamiSongIE, + XiamiAlbumIE, + XiamiArtistIE, + XiamiCollectionIE +) +from .ximalaya import ( + XimalayaIE, + XimalayaAlbumIE +) +from .xinpianchang import XinpianchangIE +from .xminus import XMinusIE +from .xnxx import XNXXIE +from .xstream import XstreamIE +from .xtube import XTubeUserIE, XTubeIE +from .xuite import XuiteIE +from .xvideos import XVideosIE +from .xxxymovies import XXXYMoviesIE +from .yahoo import ( + YahooIE, + YahooSearchIE, + YahooGyaOPlayerIE, + YahooGyaOIE, + YahooJapanNewsIE, +) +from .yandexdisk import YandexDiskIE +from .yandexmusic import ( + YandexMusicTrackIE, + YandexMusicAlbumIE, + YandexMusicPlaylistIE, + YandexMusicArtistTracksIE, + YandexMusicArtistAlbumsIE, +) +from .yandexvideo import ( + YandexVideoIE, + YandexVideoPreviewIE, + ZenYandexIE, + ZenYandexChannelIE, +) +from .yapfiles import YapFilesIE +from .yesjapan import YesJapanIE +from .yinyuetai import YinYueTaiIE +from .yle_areena import YleAreenaIE +from .ynet import YnetIE +from .youjizz import YouJizzIE +from .youku import ( + YoukuIE, + YoukuShowIE, +) +from .younow import ( + YouNowLiveIE, + YouNowChannelIE, + YouNowMomentIE, +) +from .youporn import YouPornIE +from .yourporn import YourPornIE +from .yourupload import YourUploadIE +from .zapiks import ZapiksIE +from .zattoo import ( + BBVTVIE, + BBVTVLiveIE, + BBVTVRecordingsIE, + 
EinsUndEinsTVIE, + EinsUndEinsTVLiveIE, + EinsUndEinsTVRecordingsIE, + EWETVIE, + EWETVLiveIE, + EWETVRecordingsIE, + GlattvisionTVIE, + GlattvisionTVLiveIE, + GlattvisionTVRecordingsIE, + MNetTVIE, + MNetTVLiveIE, + MNetTVRecordingsIE, + NetPlusTVIE, + NetPlusTVLiveIE, + NetPlusTVRecordingsIE, + OsnatelTVIE, + OsnatelTVLiveIE, + OsnatelTVRecordingsIE, + QuantumTVIE, + QuantumTVLiveIE, + QuantumTVRecordingsIE, + SaltTVIE, + SaltTVLiveIE, + SaltTVRecordingsIE, + SAKTVIE, + SAKTVLiveIE, + SAKTVRecordingsIE, + VTXTVIE, + VTXTVLiveIE, + VTXTVRecordingsIE, + WalyTVIE, + WalyTVLiveIE, + WalyTVRecordingsIE, + ZattooIE, + ZattooLiveIE, + ZattooMoviesIE, + ZattooRecordingsIE, +) +from .zdf import ZDFIE, ZDFChannelIE +from .zee5 import ( + Zee5IE, + Zee5SeriesIE, +) +from .zeenews import ZeeNewsIE +from .zhihu import ZhihuIE +from .zingmp3 import ( + ZingMp3IE, + ZingMp3AlbumIE, + ZingMp3ChartHomeIE, + ZingMp3WeekChartIE, + ZingMp3ChartMusicVideoIE, + ZingMp3UserIE, +) +from .zoom import ZoomIE +from .zype import ZypeIE diff --git a/hypervideo_dl/extractor/abc.py b/hypervideo_dl/extractor/abc.py index 6fe195e..0ca76b8 100644 --- a/hypervideo_dl/extractor/abc.py +++ b/hypervideo_dl/extractor/abc.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import hashlib import hmac import re @@ -157,8 +155,6 @@ class ABCIE(InfoExtractor): 'format_id': format_id }) - self._sort_formats(formats) - return { 'id': video_id, 'title': self._og_search_title(webpage), @@ -223,7 +219,6 @@ class ABCIViewIE(InfoExtractor): entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) if formats: break - self._sort_formats(formats) subtitles = {} src_vtt = stream.get('captions', {}).get('src-vtt') diff --git a/hypervideo_dl/extractor/abcnews.py b/hypervideo_dl/extractor/abcnews.py index 296b8ce..a57295b 100644 --- a/hypervideo_dl/extractor/abcnews.py +++ b/hypervideo_dl/extractor/abcnews.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .amp import AMPIE from .common import InfoExtractor from ..utils import ( diff --git a/hypervideo_dl/extractor/abcotvs.py b/hypervideo_dl/extractor/abcotvs.py index 5bff466..6dca19d 100644 --- a/hypervideo_dl/extractor/abcotvs.py +++ b/hypervideo_dl/extractor/abcotvs.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -82,7 +78,6 @@ class ABCOTVSIE(InfoExtractor): 'url': mp4_url, 'width': 640, }) - self._sort_formats(formats) image = video.get('image') or {} @@ -123,7 +118,6 @@ class ABCOTVSClipsIE(InfoExtractor): title = video_data['title'] formats = self._extract_m3u8_formats( video_data['videoURL'].split('?')[0], video_id, 'mp4') - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/abematv.py b/hypervideo_dl/extractor/abematv.py index 27b7d86..80046af 100644 --- a/hypervideo_dl/extractor/abematv.py +++ b/hypervideo_dl/extractor/abematv.py @@ -1,42 +1,41 @@ -import io -import json -import time +import base64 +import binascii +import functools import hashlib import hmac +import io +import json import re import struct -from base64 import urlsafe_b64encode -from binascii import unhexlify +import time +import urllib.parse +import urllib.request +import urllib.response +import uuid from .common import InfoExtractor from ..aes import aes_ecb_decrypt -from ..compat import ( - compat_urllib_response, - compat_urllib_parse_urlparse, - compat_urllib_request, -) from ..utils import ( ExtractorError, 
-    decode_base,
+    bytes_to_intlist,
+    decode_base_n,
     int_or_none,
-    random_uuidv4,
+    intlist_to_bytes,
+    OnDemandPagedList,
     request_to_url,
     time_seconds,
-    update_url_query,
     traverse_obj,
-    intlist_to_bytes,
-    bytes_to_intlist,
-    urljoin,
+    update_url_query,
 )
-
 # NOTE: network handler related code is a temporary thing until network stack overhaul PRs are merged (#2861/#2862)
+
 def add_opener(ydl, handler):
     ''' Add a handler for opening URLs, like _download_webpage '''
     # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
     # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
-    assert isinstance(ydl._opener, compat_urllib_request.OpenerDirector)
+    assert isinstance(ydl._opener, urllib.request.OpenerDirector)
     ydl._opener.add_handler(handler)
@@ -49,7 +48,7 @@ def remove_opener(ydl, handler):
     '''
     Remove handler(s) for opening URLs
     @param handler Either handler object itself or handler type.
     Specifying a handler type will remove all handlers for which isinstance returns True.
     '''
     # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
     # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
     opener = ydl._opener
-    assert isinstance(ydl._opener, compat_urllib_request.OpenerDirector)
+    assert isinstance(ydl._opener, urllib.request.OpenerDirector)
     if isinstance(handler, (type, tuple)):
         find_cp = lambda x: isinstance(x, handler)
     else:
@@ -99,20 +98,20 @@ def remove_opener(ydl, handler):
     opener.handlers[:] = [x for x in opener.handlers if not find_cp(x)]

-class AbemaLicenseHandler(compat_urllib_request.BaseHandler):
+class AbemaLicenseHandler(urllib.request.BaseHandler):
     handler_order = 499
     STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
     HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E'

     def __init__(self, ie: 'AbemaTVIE'):
-        # the protcol that this should really handle is 'abematv-license://'
+        # the protocol that this should really handle is 'abematv-license://'
         # abematv_license_open is just a placeholder for development purposes
         # ref. https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/urllib/request.py#L510
         setattr(self, 'abematv-license_open', getattr(self, 'abematv_license_open'))
         self.ie = ie

     def _get_videokey_from_ticket(self, ticket):
-        to_show = self.ie._downloader.params.get('verbose', False)
+        to_show = self.ie.get_param('verbose', False)
         media_token = self.ie._get_media_token(to_show=to_show)

         license_response = self.ie._download_json(
@@ -126,11 +125,11 @@ class AbemaLicenseHandler(compat_urllib_request.BaseHandler):
                 'Content-Type': 'application/json',
             })

-        res = decode_base(license_response['k'], self.STRTABLE)
+        res = decode_base_n(license_response['k'], table=self.STRTABLE)
         encvideokey = bytes_to_intlist(struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff))

         h = hmac.new(
-            unhexlify(self.HKEY),
+            binascii.unhexlify(self.HKEY),
             (license_response['cid'] + self.ie._DEVICE_ID).encode('utf-8'),
             digestmod=hashlib.sha256)
         enckey = bytes_to_intlist(h.digest())
@@ -139,84 +138,22 @@ class AbemaLicenseHandler(compat_urllib_request.BaseHandler):
     def abematv_license_open(self, url):
         url = request_to_url(url)
-        ticket = compat_urllib_parse_urlparse(url).netloc
+        ticket = urllib.parse.urlparse(url).netloc
         response_data = self._get_videokey_from_ticket(ticket)
-        return compat_urllib_response.addinfourl(io.BytesIO(response_data), headers={
+        return urllib.response.addinfourl(io.BytesIO(response_data), headers={
             'Content-Length': len(response_data),
         }, url=url, code=200)

 class AbemaTVBaseIE(InfoExtractor):
-    def _extract_breadcrumb_list(self, webpage, video_id):
-        for jld in re.finditer(
-                r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
-                webpage):
-            jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
-            if jsonld:
-                if jsonld.get('@type') != 'BreadcrumbList':
-                    continue
-                trav = traverse_obj(jsonld, ('itemListElement', ..., 'name'))
-                if trav:
-                    return trav
-        return []
-
-
-class AbemaTVIE(AbemaTVBaseIE):
-    _VALID_URL = r'https?://abema\.tv/(?P<type>now-on-air|video/episode|channels/.+?/slots)/(?P<id>[^?/]+)'
-    _NETRC_MACHINE = 'abematv'
-    _TESTS = [{
-        'url': 'https://abema.tv/video/episode/194-25_s2_p1',
-        'info_dict': {
-            'id': '194-25_s2_p1',
-            'title': '第1話 「チーズケーキ」 「モーニング再び」',
-            'series': '異世界食堂2',
-            'series_number': 2,
-            'episode': '第1話 「チーズケーキ」 「モーニング再び」',
-            'episode_number': 1,
-        },
-        'skip': 'expired',
-    }, {
-        'url': 'https://abema.tv/channels/anime-live2/slots/E8tvAnMJ7a9a5d',
-        'info_dict': {
-            'id': 'E8tvAnMJ7a9a5d',
-            'title': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
-            'series': 'ゆるキャン△ SEASON2',
-            'episode': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
-            'series_number': 2,
-            'episode_number': 1,
-            'description': 'md5:9c5a3172ae763278f9303922f0ea5b17',
-        },
-        'skip': 'expired',
-    }, {
-        'url': 'https://abema.tv/video/episode/87-877_s1282_p31047',
-        'info_dict': {
-            'id': 'E8tvAnMJ7a9a5d',
-            'title': '第5話『光射す』',
-            'description': 'md5:56d4fc1b4f7769ded5f923c55bb4695d',
-            'thumbnail': r're:https://hayabusa\.io/.+',
-            'series': '相棒',
-            'episode': '第5話『光射す』',
-        },
-        'skip': 'expired',
-    }, {
-        'url': 'https://abema.tv/now-on-air/abema-anime',
-        'info_dict': {
-            'id': 'abema-anime',
-            # this varies
-            # 'title': '女子高生の無駄づかい 全話一挙【無料ビデオ72時間】',
-            'description': 'md5:55f2e61f46a17e9230802d7bcc913d5f',
-            'is_live': True,
-        },
-        'skip': 'Not supported until hypervideo implements native live downloader OR AbemaTV can start a local HTTP server',
-    }]
     _USERTOKEN = None
     _DEVICE_ID = None
-    _TIMETABLE = None
     _MEDIATOKEN = None

     _SECRETKEY = b'v+Gjs=25Aw5erR!J8ZuvRrCx*rGswhB&qdHd_SYerEWdU&a?3DzN9BRbp5KwY4hEmcj5#fykMjJ=AuWz5GSMY-d@H7DMEh3M@9n2G552Us$$k9cD=3TxwWe86!x#Zyhe'

-    def _generate_aks(self, deviceid):
+    @classmethod
+    def _generate_aks(cls, deviceid):
         deviceid = deviceid.encode('utf-8')
         # add 1 hour and then drop minute and secs
         ts_1hour = int((time_seconds(hours=9) // 3600 + 1) * 3600)
@@ -227,7 +164,7 @@ class AbemaTVIE(AbemaTVBaseIE):
         def mix_once(nonce):
             nonlocal tmp
-            h = hmac.new(self._SECRETKEY, digestmod=hashlib.sha256)
+            h = hmac.new(cls._SECRETKEY, digestmod=hashlib.sha256)
             h.update(nonce)
             tmp = h.digest()
@@ -238,22 +175,22 @@ class AbemaTVIE(AbemaTVBaseIE):
         def mix_twist(nonce):
             nonlocal tmp
-            mix_once(urlsafe_b64encode(tmp).rstrip(b'=') + nonce)
+            mix_once(base64.urlsafe_b64encode(tmp).rstrip(b'=') + nonce)

-        mix_once(self._SECRETKEY)
+        mix_once(cls._SECRETKEY)
         mix_tmp(time_struct.tm_mon)
         mix_twist(deviceid)
         mix_tmp(time_struct.tm_mday % 5)
         mix_twist(ts_1hour_str)
         mix_tmp(time_struct.tm_hour % 5)

-        return urlsafe_b64encode(tmp).rstrip(b'=').decode('utf-8')
+        return base64.urlsafe_b64encode(tmp).rstrip(b'=').decode('utf-8')

     def _get_device_token(self):
         if self._USERTOKEN:
             return self._USERTOKEN

-        self._DEVICE_ID = random_uuidv4()
+        AbemaTVBaseIE._DEVICE_ID = str(uuid.uuid4())
         aks = self._generate_aks(self._DEVICE_ID)
         user_data = self._download_json(
             'https://api.abema.io/v1/users', None, note='Authorizing',
@@ -264,7 +201,7 @@ class AbemaTVIE(AbemaTVBaseIE):
             headers={
                 'Content-Type': 'application/json',
             })
-        self._USERTOKEN = user_data['token']
+        AbemaTVBaseIE._USERTOKEN = user_data['token']

         # don't allow adding it 2 times or more, though it's guarded
         remove_opener(self._downloader, AbemaLicenseHandler)
@@ -276,7 +213,7 @@ class AbemaTVIE(AbemaTVBaseIE):
         if not invalidate and self._MEDIATOKEN:
             return self._MEDIATOKEN

-        self._MEDIATOKEN = self._download_json(
+        AbemaTVBaseIE._MEDIATOKEN = self._download_json(
             'https://api.abema.io/v1/media/token', None, note='Fetching media token' if to_show else False,
             query={
                 'osName': 'android',
@@ -286,11 +223,82 @@ class AbemaTVIE(AbemaTVBaseIE):
                 'appId': 'tv.abema',
                 'appVersion': '3.27.1'
             }, headers={
-                'Authorization': 'bearer ' + self._get_device_token()
+                'Authorization': f'bearer {self._get_device_token()}',
             })['token']

         return self._MEDIATOKEN

+    def _call_api(self, endpoint, video_id, query=None, note='Downloading JSON metadata'):
+        return self._download_json(
+            f'https://api.abema.io/{endpoint}', video_id, query=query or {},
+            note=note,
+            headers={
+                'Authorization': f'bearer {self._get_device_token()}',
+            })
+
+    def _extract_breadcrumb_list(self, webpage, video_id):
+        for jld in re.finditer(
+                r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
+                webpage):
+            jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
+            if traverse_obj(jsonld, '@type') != 'BreadcrumbList':
+                continue
+            items = traverse_obj(jsonld, ('itemListElement', ..., 'name'))
+            if items:
+                return items
+        return []
+
+
+class AbemaTVIE(AbemaTVBaseIE):
+    _VALID_URL = r'https?://abema\.tv/(?P<type>now-on-air|video/episode|channels/.+?/slots)/(?P<id>[^?/]+)'
+    _NETRC_MACHINE = 'abematv'
+    _TESTS = [{
+        'url': 'https://abema.tv/video/episode/194-25_s2_p1',
+        'info_dict': {
+            'id': '194-25_s2_p1',
+            'title': '第1話 「チーズケーキ」 「モーニング再び」',
+            'series': '異世界食堂2',
+            'series_number': 2,
+            'episode': '第1話 「チーズケーキ」 「モーニング再び」',
+            'episode_number': 1,
+        },
+        'skip': 'expired',
+    }, {
+        'url': 'https://abema.tv/channels/anime-live2/slots/E8tvAnMJ7a9a5d',
+        'info_dict': {
+            'id': 'E8tvAnMJ7a9a5d',
+            'title': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
+            'series': 'ゆるキャン△ SEASON2',
+            'episode': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
+            'series_number': 2,
+            'episode_number': 1,
+            'description': 'md5:9c5a3172ae763278f9303922f0ea5b17',
+        },
+        'skip': 'expired',
+    }, {
+        'url': 'https://abema.tv/video/episode/87-877_s1282_p31047',
+        'info_dict': {
+            'id': 'E8tvAnMJ7a9a5d',
+            'title': '第5話『光射す』',
+            'description': 'md5:56d4fc1b4f7769ded5f923c55bb4695d',
+            'thumbnail': r're:https://hayabusa\.io/.+',
+            'series': '相棒',
+            'episode': '第5話『光射す』',
+        },
+        'skip': 'expired',
+    }, {
+        'url': 'https://abema.tv/now-on-air/abema-anime',
+        'info_dict': {
+            'id': 'abema-anime',
+            # this varies
+            # 'title': '女子高生の無駄づかい 全話一挙【無料ビデオ72時間】',
+            'description': 'md5:55f2e61f46a17e9230802d7bcc913d5f',
+            'is_live': True,
+        },
+        'skip': 'Not supported until hypervideo implements native live downloader OR AbemaTV can start a local HTTP server',
+    }]
+    _TIMETABLE = None
+
     def _perform_login(self, username, password):
         if '@' in username:  # don't strictly check if it's email address or not
             ep, method = 'user/email', 'email'
@@ -303,18 +311,18 @@ class AbemaTVIE(AbemaTVBaseIE):
                 method: username,
                 'password': password
             }).encode('utf-8'), headers={
-                'Authorization': 'bearer ' + self._get_device_token(),
+                'Authorization': f'bearer {self._get_device_token()}',
                 'Origin': 'https://abema.tv',
                 'Referer': 'https://abema.tv/',
                 'Content-Type': 'application/json',
             })

-        self._USERTOKEN = login_response['token']
+        AbemaTVBaseIE._USERTOKEN = login_response['token']
         self._get_media_token(True)

     def _real_extract(self, url):
         # starting download using infojson from this extractor is undefined behavior,
-        # and never be fixed in the future; you must trigger downloads by directly specifing URL.
+        # and will never be fixed in the future; you must trigger downloads by directly specifying the URL.
         # (unless there's a way to hook before downloading by extractor)
         video_id, video_type = self._match_valid_url(url).group('id', 'type')
         headers = {
@@ -357,7 +365,7 @@ class AbemaTVIE(AbemaTVBaseIE):
         # read breadcrumb on top of page
         breadcrumb = self._extract_breadcrumb_list(webpage, video_id)
         if breadcrumb:
-            # breadcrumb list translates to: (example is 1st test for this IE)
+            # breadcrumb list translates to: (e.g. 1st test for this IE)
             # Home > Anime (genre) > Isekai Shokudo 2 (series name) > Episode 1 "Cheese cakes" "Morning again" (episode title)
             # hence this works
             info['series'] = breadcrumb[-2]
@@ -444,6 +452,7 @@ class AbemaTVIE(AbemaTVBaseIE):

 class AbemaTVTitleIE(AbemaTVBaseIE):
     _VALID_URL = r'https?://abema\.tv/video/title/(?P<id>[^?/]+)'
+    _PAGE_SIZE = 25

     _TESTS = [{
         'url': 'https://abema.tv/video/title/90-1597',
@@ -459,18 +468,39 @@ class AbemaTVTitleIE(AbemaTVBaseIE):
             'title': '真心が届く~僕とスターのオフィス・ラブ!?~',
         },
         'playlist_mincount': 16,
+    }, {
+        'url': 'https://abema.tv/video/title/25-102',
+        'info_dict': {
+            'id': '25-102',
+            'title': 'ソードアート・オンライン アリシゼーション',
+        },
+        'playlist_mincount': 24,
     }]

-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+    def _fetch_page(self, playlist_id, series_version, page):
+        programs = self._call_api(
+            f'v1/video/series/{playlist_id}/programs', playlist_id,
+            note=f'Downloading page {page + 1}',
+            query={
+                'seriesVersion': series_version,
+                'offset': str(page * self._PAGE_SIZE),
+                'order': 'seq',
+                'limit': str(self._PAGE_SIZE),
+            })
+        yield from (
+            self.url_result(f'https://abema.tv/video/episode/{x}')
+            for x in traverse_obj(programs, ('programs', ..., 'id'), default=[]))

-        playlist_title, breadcrumb = None, self._extract_breadcrumb_list(webpage, video_id)
-        if breadcrumb:
-            playlist_title = breadcrumb[-1]
+    def _entries(self, playlist_id, series_version):
+        return OnDemandPagedList(
+            functools.partial(self._fetch_page, playlist_id, series_version),
+            self._PAGE_SIZE)

-        playlist = [
-            self.url_result(urljoin('https://abema.tv/', mobj.group(1)))
-            for mobj in re.finditer(r'[_\d]+)'
+
+    _TESTS = [{
+        'url': 'https://www.acfun.cn/v/ac35457073',
+        'info_dict': {
+            'id': '35457073',
+            'ext': 'mp4',
+            'duration': 174.208,
+            'timestamp': 1656403967,
+            'title': '1 8 岁 现 状',
+            'description': '“赶紧回去!班主任查班了!”',
+            'uploader': '锤子game',
+            'uploader_id': '51246077',
+            'thumbnail': r're:^https?://.*\.(jpg|jpeg)',
+            'upload_date': '20220628',
+            'like_count': int,
+            'view_count': int,
+            'comment_count': int,
+            'tags': list,
+        },
+    }, {
+        # example for len(video_list) > 1
+        'url': 'https://www.acfun.cn/v/ac35468952_2',
+        'info_dict': {
+            'id': '35468952_2',
+            'ext': 'mp4',
+            'title': '【动画剧集】Rocket & Groot Season 1(2022)/火箭浣熊与格鲁特第1季 P02 S01E02 十拿九穩',
+            'duration': 90.459,
+            'uploader': '比令',
+            'uploader_id': '37259967',
+            'upload_date': '20220629',
+            'timestamp': 1656479962,
+            'tags': list,
+            'like_count': int,
+            'view_count': int,
+            'comment_count': int,
+            'thumbnail': r're:^https?://.*\.(jpg|jpeg)',
+            'description': 'md5:67583aaf3a0f933bd606bc8a2d3ebb17',
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        json_all = self._search_json(r'window.videoInfo\s*=', webpage, 'videoInfo', video_id)
+
+        title = json_all.get('title')
+        video_list = json_all.get('videoList') or []
+        video_internal_id = traverse_obj(json_all, ('currentVideoInfo', 'id'))
+        if video_internal_id and len(video_list) > 1:
+            part_idx, part_video_info = next(
+                (idx + 1, v) for (idx, v) in enumerate(video_list)
+                if v['id'] == video_internal_id)
+            title = f'{title} P{part_idx:02d} {part_video_info["title"]}'
+
+        return {
+            **self._extract_metadata(video_id, json_all['currentVideoInfo']),
+            'title': title,
+            'thumbnail': json_all.get('coverUrl'),
+            'description': json_all.get('description'),
+            'uploader': traverse_obj(json_all, ('user', 'name')),
+            'uploader_id': traverse_obj(json_all, ('user', 'href')),
+            'tags': traverse_obj(json_all, ('tagList', ..., 'name')),
+            'view_count': int_or_none(json_all.get('viewCount')),
+            'like_count': int_or_none(json_all.get('likeCountShow')),
+            'comment_count': int_or_none(json_all.get('commentCountShow')),
+        }
+
+
+class AcFunBangumiIE(AcFunVideoBaseIE):
+    _VALID_URL = r'https?://www\.acfun\.cn/bangumi/(?P<id>aa[_\d]+)'
+
+    _TESTS = [{
+        'url': 'https://www.acfun.cn/bangumi/aa6002917_36188_1745457?ac=2',
+        'info_dict': {
+            'id': 'aa6002917_36188_1745457__2',
+            'ext': 'mp4',
+            'title': '【7月】租借女友 水原千鹤角色曲『DATE』特别PV',
+            'upload_date': '20200916',
+            'timestamp': 1600243813,
+            'duration': 92.091,
+        },
+    }, {
+        'url': 'https://www.acfun.cn/bangumi/aa5023171_36188_1750645',
+        'info_dict': {
+            'id': 'aa5023171_36188_1750645',
+            'ext': 'mp4',
+            'title': '红孩儿之趴趴蛙寻石记 第5话 ',
+            'duration': 760.0,
+            'season': '红孩儿之趴趴蛙寻石记',
+            'season_id': 5023171,
+            'season_number': 1,  # series has only 1 season
+            'episode': 'Episode 5',
+            'episode_number': 5,
+            'upload_date': '20181223',
+            'timestamp': 1545552185,
+            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)',
+            'comment_count': int,
+        },
+    }, {
+        'url': 'https://www.acfun.cn/bangumi/aa6065485_36188_1885061',
+        'info_dict': {
+            'id': 'aa6065485_36188_1885061',
+            'ext': 'mp4',
+            'title': '叽歪老表(第二季) 第5话 坚不可摧',
+            'season': '叽歪老表(第二季)',
+            'season_number': 2,
+            'season_id': 6065485,
+            'episode': '坚不可摧',
+            'episode_number': 5,
+            'upload_date': '20220324',
+            'timestamp': 1648082786,
+            'duration': 105.002,
+            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)',
+            'comment_count': int,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        ac_idx = parse_qs(url).get('ac', [None])[-1]
+        video_id = f'{video_id}{format_field(ac_idx, None, "__%s")}'
+
+        webpage = self._download_webpage(url, video_id)
+        json_bangumi_data = self._search_json(r'window.bangumiData\s*=', webpage, 'bangumiData', video_id)
+
+        if ac_idx:
+            video_info = json_bangumi_data['hlVideoInfo']
+            return {
+                **self._extract_metadata(video_id, video_info),
+                'title': video_info.get('title'),
+            }
+
+        video_info = json_bangumi_data['currentVideoInfo']
+
+        season_id = json_bangumi_data.get('bangumiId')
+        season_number = season_id and next((
+            idx for idx, v in enumerate(json_bangumi_data.get('relatedBangumis') or [], 1)
+            if v.get('id') == season_id), 1)
+
+        json_bangumi_list = self._search_json(
+            r'window\.bangumiList\s*=', webpage, 'bangumiList', video_id, fatal=False)
+        video_internal_id = int_or_none(traverse_obj(json_bangumi_data, ('currentVideoInfo', 'id')))
+        episode_number = video_internal_id and next((
+            idx for idx, v in enumerate(json_bangumi_list.get('items') or [], 1)
+            if v.get('videoId') == video_internal_id), None)

+        return {
+            **self._extract_metadata(video_id, video_info),
+            'title': json_bangumi_data.get('showTitle'),
+            'thumbnail': json_bangumi_data.get('image'),
+            'season': json_bangumi_data.get('bangumiTitle'),
+            'season_id': season_id,
+            'season_number': season_number,
+            'episode': json_bangumi_data.get('title'),
+            'episode_number': episode_number,
+            'comment_count': int_or_none(json_bangumi_data.get('commentCount')),
+        }
diff --git a/hypervideo_dl/extractor/adn.py b/hypervideo_dl/extractor/adn.py
index fca6e60..e0c18c8 100644
--- a/hypervideo_dl/extractor/adn.py
+++ b/hypervideo_dl/extractor/adn.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import base64
 import binascii
 import json
@@ -31,30 +28,34 @@ from ..utils import (

 class ADNIE(InfoExtractor):
-    IE_DESC = 'Anime Digital Network'
-    _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P<id>\d+)'
-    _TEST = {
-        'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites',
-        'md5': '0319c99885ff5547565cacb4f3f9348d',
+    IE_DESC = 'Animation Digital Network'
+    _VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.fr/video/[^/]+/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://animationdigitalnetwork.fr/video/fruits-basket/9841-episode-1-a-ce-soir',
+        'md5': '1c9ef066ceb302c86f80c2b371615261',
         'info_dict': {
-            'id': '7778',
+            'id': '9841',
             'ext': 'mp4',
-            'title': 'Blue Exorcist - Kyôto Saga - Episode 1',
-            'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5',
-            'series': 'Blue Exorcist - Kyôto Saga',
-            'duration': 1467,
-            'release_date': '20170106',
+            'title': 'Fruits Basket - Episode 1',
+            'description': 'md5:14be2f72c3c96809b0ca424b0097d336',
+            'series': 'Fruits Basket',
+            'duration': 1437,
+            'release_date': '20190405',
             'comment_count': int,
             'average_rating': float,
-            'season_number': 2,
-            'episode': 'Début des hostilités',
+            'season_number': 1,
+            'episode': 'À ce soir !',
             'episode_number': 1,
-        }
-    }
+        },
+        'skip': 'Only available in region (FR, ...)',
+    }, {
+        'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites',
+        'only_matching': True,
+    }]

-    _NETRC_MACHINE = 'animedigitalnetwork'
-    _BASE_URL = 'http://animedigitalnetwork.fr'
-    _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/'
+    _NETRC_MACHINE = 'animationdigitalnetwork'
+    _BASE = 'animationdigitalnetwork.fr'
+    _API_BASE_URL = 'https://gw.api.' + _BASE + '/'
     _PLAYER_BASE_URL = _API_BASE_URL + 'player/'
     _HEADERS = {}
     _LOGIN_ERR_MESSAGE = 'Unable to log in'
@@ -78,14 +79,14 @@ class ADNIE(InfoExtractor):
         if subtitle_location:
             enc_subtitles = self._download_webpage(
                 subtitle_location, video_id, 'Downloading subtitles data',
-                fatal=False, headers={'Origin': 'https://animedigitalnetwork.fr'})
+                fatal=False, headers={'Origin': 'https://' + self._BASE})
         if not enc_subtitles:
             return None

-        # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
+        # http://animationdigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
         dec_subtitles = unpad_pkcs7(aes_cbc_decrypt_bytes(
             compat_b64decode(enc_subtitles[24:]),
-            binascii.unhexlify(self._K + 'ab9f52f5baae7c72'),
+            binascii.unhexlify(self._K + '7fac1178830cfe0c'),
             compat_b64decode(enc_subtitles[:24])))
         subtitles_json = self._parse_json(dec_subtitles.decode(), None, fatal=False)
         if not subtitles_json:
@@ -234,7 +235,6 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
             for f in m3u8_formats:
                 f['language'] = 'fr'
             formats.extend(m3u8_formats)
-        self._sort_formats(formats)

         video = (self._download_json(
             self._API_BASE_URL + 'video/%s' % video_id, video_id,
diff --git a/hypervideo_dl/extractor/adobeconnect.py b/hypervideo_dl/extractor/adobeconnect.py
index e2e6f93..8963b12 100644
--- a/hypervideo_dl/extractor/adobeconnect.py
+++ b/hypervideo_dl/extractor/adobeconnect.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from .common import InfoExtractor
 from ..compat import (
     compat_parse_qs,
diff --git a/hypervideo_dl/extractor/adobepass.py b/hypervideo_dl/extractor/adobepass.py
index 5d98301..e5944f7 100644
--- a/hypervideo_dl/extractor/adobepass.py
+++ b/hypervideo_dl/extractor/adobepass.py
@@ -1,26 +1,20 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
+import getpass
 import json
import re import time +import urllib.error import xml.etree.ElementTree as etree from .common import InfoExtractor -from ..compat import ( - compat_kwargs, - compat_urlparse, - compat_getpass -) +from ..compat import compat_urlparse from ..utils import ( + NO_DEFAULT, + ExtractorError, unescapeHTML, - urlencode_postdata, unified_timestamp, - ExtractorError, - NO_DEFAULT, + urlencode_postdata, ) - MSO_INFO = { 'DTV': { 'name': 'DIRECTV', @@ -1350,10 +1344,15 @@ MSO_INFO = { 'username_field': 'username', 'password_field': 'password', }, + 'AlticeOne': { + 'name': 'Optimum TV', + 'username_field': 'j_username', + 'password_field': 'j_password', + }, } -class AdobePassIE(InfoExtractor): +class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' _MVPD_CACHE = 'ap-mvpd' @@ -1365,7 +1364,7 @@ class AdobePassIE(InfoExtractor): headers.update(kwargs.get('headers', {})) kwargs['headers'] = headers return super(AdobePassIE, self)._download_webpage_handle( - *args, **compat_kwargs(kwargs)) + *args, **kwargs) @staticmethod def _get_mvpd_resource(provider_id, title, guid, rating): @@ -1434,32 +1433,34 @@ class AdobePassIE(InfoExtractor): guid = xml_text(resource, 'guid') if '<' in resource else resource count = 0 while count < 2: - requestor_info = self._downloader.cache.load(self._MVPD_CACHE, requestor_id) or {} + requestor_info = self.cache.load(self._MVPD_CACHE, requestor_id) or {} authn_token = requestor_info.get('authn_token') if authn_token and is_expired(authn_token, 'simpleTokenExpires'): authn_token = None if not authn_token: - # TODO add support for other TV Providers mso_id = self.get_param('ap_mso') - if not mso_id: - raise_mvpd_required() - username, password = self._get_login_info('ap_username', 'ap_password', mso_id) - if not username or not password: - raise_mvpd_required() - mso_info = MSO_INFO[mso_id] + if mso_id: + username, password = self._get_login_info('ap_username', 'ap_password', mso_id) + if not username or not password: + raise_mvpd_required() + mso_info = MSO_INFO[mso_id] - provider_redirect_page_res = self._download_webpage_handle( - self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, - 'Downloading Provider Redirect Page', query={ - 'noflash': 'true', - 'mso_id': mso_id, - 'requestor_id': requestor_id, - 'no_iframe': 'false', - 'domain_name': 'adobe.com', - 'redirect_url': url, - }) + provider_redirect_page_res = self._download_webpage_handle( + self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, + 'Downloading Provider Redirect Page', query={ + 'noflash': 'true', + 'mso_id': mso_id, + 'requestor_id': requestor_id, + 'no_iframe': 'false', + 'domain_name': 'adobe.com', + 'redirect_url': url, + }) + elif not self._cookies_passed: + raise_mvpd_required() - if mso_id == 'Comcast_SSO': + if not mso_id: + pass + elif mso_id == 'Comcast_SSO': # Comcast page flow varies by video site and whether you # are on Comcast's network. 
provider_redirect_page, urlh = provider_redirect_page_res @@ -1507,7 +1508,7 @@ class AdobePassIE(InfoExtractor): 'send_confirm_link': False, 'send_token': True })) - philo_code = compat_getpass('Type auth code you have received [Return]: ') + philo_code = getpass.getpass('Type auth code you have received [Return]: ') self._download_webpage( 'https://idp.philo.com/auth/update/login_code', video_id, 'Submitting token', data=urlencode_postdata({ 'token': philo_code @@ -1709,25 +1710,30 @@ class AdobePassIE(InfoExtractor): mso_info.get('username_field', 'username'): username, mso_info.get('password_field', 'password'): password } - if mso_id == 'Cablevision': + if mso_id in ('Cablevision', 'AlticeOne'): form_data['_eventId_proceed'] = '' mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', form_data) if mso_id != 'Rogers': post_form(mvpd_confirm_page_res, 'Confirming Login') - session = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, - 'Retrieving Session', data=urlencode_postdata({ - '_method': 'GET', - 'requestor_id': requestor_id, - }), headers=mvpd_headers) + try: + session = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, + 'Retrieving Session', data=urlencode_postdata({ + '_method': 'GET', + 'requestor_id': requestor_id, + }), headers=mvpd_headers) + except ExtractorError as e: + if not mso_id and isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: + raise_mvpd_required() + raise if '\d+)' + _EMBED_REGEX = [r']+src=[\'"](?P(?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]'] _TEST = { # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners @@ -270,7 +268,6 @@ class AdobeTVVideoIE(AdobeTVBaseIE): 'width': int_or_none(source.get('width') or None), 'url': source_src, }) - self._sort_formats(formats) # For both metadata and downloaded files the duration varies among # formats. I just pick the max one diff --git a/hypervideo_dl/extractor/adultswim.py b/hypervideo_dl/extractor/adultswim.py index c97cfc1..bd29eb4 100644 --- a/hypervideo_dl/extractor/adultswim.py +++ b/hypervideo_dl/extractor/adultswim.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .turner import TurnerBaseIE @@ -183,7 +180,6 @@ class AdultSwimIE(TurnerBaseIE): info['subtitles'].setdefault('en', []).append({ 'url': asset_url, }) - self._sort_formats(info['formats']) return info else: diff --git a/hypervideo_dl/extractor/aenetworks.py b/hypervideo_dl/extractor/aenetworks.py index 8025de5..d7c4010 100644 --- a/hypervideo_dl/extractor/aenetworks.py +++ b/hypervideo_dl/extractor/aenetworks.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .theplatform import ThePlatformIE from ..utils import ( ExtractorError, @@ -12,7 +8,7 @@ from ..utils import ( ) -class AENetworksBaseIE(ThePlatformIE): +class AENetworksBaseIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _BASE_URL_REGEX = r'''(?x)https?:// (?:(?:www|play|watch)\.)? 
(?P @@ -32,14 +28,17 @@ class AENetworksBaseIE(ThePlatformIE): } def _extract_aen_smil(self, smil_url, video_id, auth=None): - query = {'mbr': 'true'} + query = { + 'mbr': 'true', + 'formats': 'M3U+none,MPEG-DASH+none,MPEG4,MP3', + } if auth: query['auth'] = auth TP_SMIL_QUERY = [{ 'assetTypes': 'high_video_ak', - 'switch': 'hls_high_ak' + 'switch': 'hls_high_ak', }, { - 'assetTypes': 'high_video_s3' + 'assetTypes': 'high_video_s3', }, { 'assetTypes': 'high_video_s3', 'switch': 'hls_high_fastly', @@ -63,7 +62,6 @@ class AENetworksBaseIE(ThePlatformIE): subtitles = self._merge_subtitles(subtitles, tp_subtitles) if last_e and not formats: raise last_e - self._sort_formats(formats) return { 'id': video_id, 'formats': formats, @@ -305,7 +303,6 @@ class HistoryTopicIE(AENetworksBaseIE): class HistoryPlayerIE(AENetworksBaseIE): IE_NAME = 'history:player' _VALID_URL = r'https?://(?:www\.)?(?P(?:history|biography)\.com)/player/(?P\d+)' - _TESTS = [] def _real_extract(self, url): domain, video_id = self._match_valid_url(url).groups() diff --git a/hypervideo_dl/extractor/aeonco.py b/hypervideo_dl/extractor/aeonco.py new file mode 100644 index 0000000..4655862 --- /dev/null +++ b/hypervideo_dl/extractor/aeonco.py @@ -0,0 +1,40 @@ +from .common import InfoExtractor +from .vimeo import VimeoIE + + +class AeonCoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?aeon\.co/videos/(?P[^/?]+)' + _TESTS = [{ + 'url': 'https://aeon.co/videos/raw-solar-storm-footage-is-the-punk-rock-antidote-to-sleek-james-webb-imagery', + 'md5': 'e5884d80552c9b6ea8d268a258753362', + 'info_dict': { + 'id': '1284717', + 'ext': 'mp4', + 'title': 'Brilliant Noise', + 'thumbnail': 'https://i.vimeocdn.com/video/21006315-1a1e49da8b07fd908384a982b4ba9ff0268c509a474576ebdf7b1392f4acae3b-d_960', + 'uploader': 'Semiconductor', + 'uploader_id': 'semiconductor', + 'uploader_url': 'https://vimeo.com/semiconductor', + 'duration': 348 + } + }, { + 'url': 'https://aeon.co/videos/dazzling-timelapse-shows-how-microbes-spoil-our-food-and-sometimes-enrich-it', + 'md5': '4e5f3dad9dbda0dbfa2da41a851e631e', + 'info_dict': { + 'id': '728595228', + 'ext': 'mp4', + 'title': 'Wrought', + 'thumbnail': 'https://i.vimeocdn.com/video/1484618528-c91452611f9a4e4497735a533da60d45b2fe472deb0c880f0afaab0cd2efb22a-d_1280', + 'uploader': 'Biofilm Productions', + 'uploader_id': 'user140352216', + 'uploader_url': 'https://vimeo.com/user140352216', + 'duration': 1344 + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + vimeo_id = self._search_regex(r'hosterId":\s*"(?P[0-9]+)', webpage, 'vimeo id') + vimeo_url = VimeoIE._smuggle_referrer(f'https://player.vimeo.com/video/{vimeo_id}', 'https://aeon.co') + return self.url_result(vimeo_url, VimeoIE) diff --git a/hypervideo_dl/extractor/afreecatv.py b/hypervideo_dl/extractor/afreecatv.py index 77f0e3c..9276fe7 100644 --- a/hypervideo_dl/extractor/afreecatv.py +++ b/hypervideo_dl/extractor/afreecatv.py @@ -1,14 +1,12 @@ -# coding: utf-8 -from __future__ import unicode_literals - +import functools import re from .common import InfoExtractor -from ..compat import compat_xpath from ..utils import ( + ExtractorError, + OnDemandPagedList, date_from_str, determine_ext, - ExtractorError, int_or_none, qualities, traverse_obj, @@ -280,7 +278,7 @@ class AfreecaTVIE(InfoExtractor): else: raise ExtractorError('Unable to download video info') - video_element = video_xml.findall(compat_xpath('./track/video'))[-1] + video_element = 
if video_element is None or video_element.text is None: raise ExtractorError( 'Video %s does not exist' % video_id, expected=True) @@ -310,7 +308,7 @@ class AfreecaTVIE(InfoExtractor): if not video_url: entries = [] - file_elements = video_element.findall(compat_xpath('./file')) + file_elements = video_element.findall('./file') one = len(file_elements) == 1 for file_num, file_element in enumerate(file_elements, start=1): file_url = url_or_none(file_element.text) @@ -340,7 +338,6 @@ class AfreecaTVIE(InfoExtractor): }] if not formats and not self.get_param('ignore_no_formats'): continue - self._sort_formats(formats) file_info = common_entry.copy() file_info.update({ 'id': format_id, @@ -382,7 +379,7 @@ class AfreecaTVIE(InfoExtractor): return info -class AfreecaTVLiveIE(AfreecaTVIE): +class AfreecaTVLiveIE(AfreecaTVIE): # XXX: Do not subclass from concrete IE IE_NAME = 'afreecatv:live' _VALID_URL = r'https?://play\.afreeca(?:tv)?\.com/(?P<id>[^/]+)(?:/(?P<bno>\d+))?' @@ -466,8 +463,6 @@ class AfreecaTVLiveIE(AfreecaTVIE): 'quality': quality_key(quality_str), }) - self._sort_formats(formats) - station_info = self._download_json( 'https://st.afreecatv.com/api/get_station_status.php', broadcast_no, query={'szBjId': broadcaster_id}, fatal=False, @@ -482,3 +477,57 @@ class AfreecaTVLiveIE(AfreecaTVIE): 'formats': formats, 'is_live': True, } + + +class AfreecaTVUserIE(InfoExtractor): + IE_NAME = 'afreecatv:user' + _VALID_URL = r'https?://bj\.afreeca(?:tv)?\.com/(?P<id>[^/]+)/vods/?(?P<slug_type>[^/]+)?' + _TESTS = [{ + 'url': 'https://bj.afreecatv.com/ryuryu24/vods/review', + 'info_dict': { + '_type': 'playlist', + 'id': 'ryuryu24', + 'title': 'ryuryu24 - review', + }, + 'playlist_count': 218, + }, { + 'url': 'https://bj.afreecatv.com/parang1995/vods/highlight', + 'info_dict': { + '_type': 'playlist', + 'id': 'parang1995', + 'title': 'parang1995 - highlight', + }, + 'playlist_count': 997, + }, { + 'url': 'https://bj.afreecatv.com/ryuryu24/vods', + 'info_dict': { + '_type': 'playlist', + 'id': 'ryuryu24', + 'title': 'ryuryu24 - all', + }, + 'playlist_count': 221, + }, { + 'url': 'https://bj.afreecatv.com/ryuryu24/vods/balloonclip', + 'info_dict': { + '_type': 'playlist', + 'id': 'ryuryu24', + 'title': 'ryuryu24 - balloonclip', + }, + 'playlist_count': 0, + }] + _PER_PAGE = 60 + + def _fetch_page(self, user_id, user_type, page): + page += 1 + info = self._download_json(f'https://bjapi.afreecatv.com/api/{user_id}/vods/{user_type}', user_id, + query={'page': page, 'per_page': self._PER_PAGE, 'orderby': 'reg_date'}, + note=f'Downloading {user_type} video page {page}') + for item in info['data']: + yield self.url_result( + f'https://vod.afreecatv.com/player/{item["title_no"]}/', AfreecaTVIE, item['title_no']) + + def _real_extract(self, url): + user_id, user_type = self._match_valid_url(url).group('id', 'slug_type') + user_type = user_type or 'all' + entries = OnDemandPagedList(functools.partial(self._fetch_page, user_id, user_type), self._PER_PAGE) + return self.playlist_result(entries, user_id, f'{user_id} - {user_type}') diff --git a/hypervideo_dl/extractor/agora.py b/hypervideo_dl/extractor/agora.py new file mode 100644 index 0000000..abb2d3f --- /dev/null +++ b/hypervideo_dl/extractor/agora.py @@ -0,0 +1,251 @@ +import functools +import uuid + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + OnDemandPagedList, + int_or_none, + month_by_name, + parse_duration, + try_call, +) + + +class WyborczaVideoIE(InfoExtractor): + # this id is not an article id, it has to be extracted from the article
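+ # (the 'wyborcza:video:' prefix in _VALID_URL accepts such an already-extracted id directly)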
+ _VALID_URL = r'(?:wyborcza:video:|https?://wyborcza\.pl/(?:api-)?video/)(?P<id>\d+)' + IE_NAME = 'wyborcza:video' + _TESTS = [{ + 'url': 'wyborcza:video:26207634', + 'info_dict': { + 'id': '26207634', + 'ext': 'mp4', + 'title': '- Polska w 2020 r. jest innym państwem niż w 2015 r. Nie zmieniła się konstytucja, ale jest to już inny ustrój - mówi Adam Bodnar', + 'description': ' ', + 'uploader': 'Dorota Roman', + 'duration': 2474, + 'thumbnail': r're:https://.+\.jpg', + }, + }, { + 'url': 'https://wyborcza.pl/video/26207634', + 'only_matching': True, + }, { + 'url': 'https://wyborcza.pl/api-video/26207634', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + meta = self._download_json(f'https://wyborcza.pl/api-video/{video_id}', video_id) + + formats = [] + base_url = meta['redirector'].replace('http://', 'https://') + meta['basePath'] + for quality in ('standard', 'high'): + if not meta['files'].get(quality): + continue + formats.append({ + 'url': base_url + meta['files'][quality], + 'height': int_or_none( + self._search_regex( + r'p(\d+)[a-z]+\.mp4$', meta['files'][quality], + 'mp4 video height', default=None)), + 'format_id': quality, + }) + if meta['files'].get('dash'): + formats.extend(self._extract_mpd_formats(base_url + meta['files']['dash'], video_id)) + + return { + 'id': video_id, + 'formats': formats, + 'title': meta.get('title'), + 'description': meta.get('lead'), + 'uploader': meta.get('signature'), + 'thumbnail': meta.get('imageUrl'), + 'duration': meta.get('duration'), + } + + +class WyborczaPodcastIE(InfoExtractor): + _VALID_URL = r'''(?x) https?://(?:www\.)?(?: wyborcza\.pl/podcast(?:/0,172673\.html)?| wysokieobcasy\.pl/wysokie-obcasy/0,176631\.html )(?:\?(?:[^&#]+?&)*podcast=(?P<id>\d+))? ''' + _TESTS = [{ 'url': 'https://wyborcza.pl/podcast/0,172673.html?podcast=100720#S.main_topic-K.C-B.6-L.1.podcast', 'info_dict': { 'id': '100720', 'ext': 'mp3', 'title': 'Cyfrodziewczyny. 
Kim były pionierki polskiej informatyki ', + 'uploader': 'Michał Nogaś ', + 'upload_date': '20210117', + 'description': 'md5:49f0a06ffc4c1931210d3ab1416a651d', + 'duration': 3684.0, + 'thumbnail': r're:https://.+\.jpg', + }, + }, { + 'url': 'https://www.wysokieobcasy.pl/wysokie-obcasy/0,176631.html?podcast=100673', + 'info_dict': { + 'id': '100673', + 'ext': 'mp3', + 'title': 'Czym jest ubóstwo menstruacyjne i dlaczego dotyczy każdej i każdego z nas?', + 'uploader': 'Agnieszka Urazińska ', + 'upload_date': '20210115', + 'description': 'md5:c161dc035f8dbb60077011fc41274899', + 'duration': 1803.0, + 'thumbnail': r're:https://.+\.jpg', + }, + }, { + 'url': 'https://wyborcza.pl/podcast', + 'info_dict': { + 'id': '334', + 'title': 'Gościnnie: Wyborcza, 8:10', + 'series': 'Gościnnie: Wyborcza, 8:10', + }, + 'playlist_mincount': 370, + }, { + 'url': 'https://www.wysokieobcasy.pl/wysokie-obcasy/0,176631.html', + 'info_dict': { + 'id': '395', + 'title': 'Gościnnie: Wysokie Obcasy', + 'series': 'Gościnnie: Wysokie Obcasy', + }, + 'playlist_mincount': 12, + }] + + def _real_extract(self, url): + podcast_id = self._match_id(url) + + if not podcast_id: # playlist + podcast_id = '395' if 'wysokieobcasy.pl/' in url else '334' + return self.url_result(TokFMAuditionIE._create_url(podcast_id), TokFMAuditionIE, podcast_id) + + meta = self._download_json('https://wyborcza.pl/api/podcast', podcast_id, + query={'guid': podcast_id, 'type': 'wo' if 'wysokieobcasy.pl/' in url else None}) + + day, month, year = self._search_regex(r'^(\d\d?) (\w+) (\d{4})$', meta.get('publishedDate'), + 'upload date', group=(1, 2, 3), default=(None, None, None)) + return { + 'id': podcast_id, + 'url': meta['url'], + 'title': meta.get('title'), + 'description': meta.get('description'), + 'thumbnail': meta.get('imageUrl'), + 'duration': parse_duration(meta.get('duration')), + 'uploader': meta.get('author'), + 'upload_date': try_call(lambda: f'{year}{month_by_name(month, lang="pl"):0>2}{day:0>2}'), + } + + +class TokFMPodcastIE(InfoExtractor): + _VALID_URL = r'(?:https?://audycje\.tokfm\.pl/podcast/|tokfm:podcast:)(?P<id>\d+),?'
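+ # NB: the trailing ',?' tolerates the comma that separates the numeric id from the URL slug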
+ IE_NAME = 'tokfm:podcast' + _TESTS = [{ + 'url': 'https://audycje.tokfm.pl/podcast/91275,-Systemowy-rasizm-Czy-zamieszki-w-USA-po-morderstwie-w-Minneapolis-doprowadza-do-zmian-w-sluzbach-panstwowych', + 'info_dict': { + 'id': '91275', + 'ext': 'aac', + 'title': 'md5:a9b15488009065556900169fb8061cce', + 'episode': 'md5:a9b15488009065556900169fb8061cce', + 'series': 'Analizy', + }, + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + + # in case this endpoint breaks, see the API below (though it returns a lot of useless data) + # https://api.podcast.radioagora.pl/api4/getPodcasts?podcast_id=100091&with_guests=true&with_leaders_for_mobile=true + metadata = self._download_json( + f'https://audycje.tokfm.pl/getp/3{media_id}', media_id, 'Downloading podcast metadata') + if not metadata: + raise ExtractorError('No such podcast', expected=True) + metadata = metadata[0] + + formats = [] + for ext in ('aac', 'mp3'): + url_data = self._download_json( + f'https://api.podcast.radioagora.pl/api4/getSongUrl?podcast_id={media_id}&device_id={uuid.uuid4()}&ppre=false&audio={ext}', + media_id, 'Downloading podcast %s URL' % ext) + # prevents inserting the mp3 (default) multiple times + if 'link_ssl' in url_data and f'.{ext}' in url_data['link_ssl']: + formats.append({ + 'url': url_data['link_ssl'], + 'ext': ext, + 'vcodec': 'none', + 'acodec': ext, + }) + + return { + 'id': media_id, + 'formats': formats, + 'title': metadata.get('podcast_name'), + 'series': metadata.get('series_name'), + 'episode': metadata.get('podcast_name'), + } + + +class TokFMAuditionIE(InfoExtractor): + _VALID_URL = r'(?:https?://audycje\.tokfm\.pl/audycja/|tokfm:audition:)(?P<id>\d+),?' + IE_NAME = 'tokfm:audition' + _TESTS = [{ + 'url': 'https://audycje.tokfm.pl/audycja/218,Analizy', + 'info_dict': { + 'id': '218', + 'title': 'Analizy', + 'series': 'Analizy', + }, + 'playlist_count': 1635, + }] + + _PAGE_SIZE = 30 + _HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Linux; Android 9; Redmi 3S Build/PQ3A.190801.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/87.0.4280.101 Mobile Safari/537.36', + } + + @staticmethod + def _create_url(id): + return f'https://audycje.tokfm.pl/audycja/{id}' + + def _real_extract(self, url): + audition_id = self._match_id(url) + + data = self._download_json( + f'https://api.podcast.radioagora.pl/api4/getSeries?series_id={audition_id}', + audition_id, 'Downloading audition metadata', headers=self._HEADERS) + if not data: + raise ExtractorError('No such audition', expected=True) + data = data[0] + + entries = OnDemandPagedList(functools.partial( + self._fetch_page, audition_id, data), self._PAGE_SIZE) + + return { + '_type': 'playlist', + 'id': audition_id, + 'title': data.get('series_name'), + 'series': data.get('series_name'), + 'entries': entries, + } + + def _fetch_page(self, audition_id, data, page): + for retry in self.RetryManager(): + podcast_page = self._download_json( + f'https://api.podcast.radioagora.pl/api4/getPodcasts?series_id={audition_id}&limit=30&offset={page}&with_guests=true&with_leaders_for_mobile=true', + audition_id, f'Downloading podcast list page {page + 1}', headers=self._HEADERS) + if not podcast_page: + retry.error = ExtractorError('Agora returned empty page', expected=True) + + for podcast in podcast_page: + yield { + '_type': 'url_transparent', + 'url': podcast['podcast_sharing_url'], + 'ie_key': TokFMPodcastIE.ie_key(), + 'title': podcast.get('podcast_name'), + 'episode': podcast.get('podcast_name'), + 'description': podcast.get('podcast_description'), + 'timestamp': 
int_or_none(podcast.get('podcast_timestamp')), + 'series': data.get('series_name'), + } diff --git a/hypervideo_dl/extractor/airmozilla.py b/hypervideo_dl/extractor/airmozilla.py index 9e38136..669556b 100644 --- a/hypervideo_dl/extractor/airmozilla.py +++ b/hypervideo_dl/extractor/airmozilla.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/aliexpress.py b/hypervideo_dl/extractor/aliexpress.py index 9722fe9..2e83f2e 100644 --- a/hypervideo_dl/extractor/aliexpress.py +++ b/hypervideo_dl/extractor/aliexpress.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/aljazeera.py b/hypervideo_dl/extractor/aljazeera.py index 7bcdb7a..124bab0 100644 --- a/hypervideo_dl/extractor/aljazeera.py +++ b/hypervideo_dl/extractor/aljazeera.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/allocine.py b/hypervideo_dl/extractor/allocine.py index 403a277..2d342cf 100644 --- a/hypervideo_dl/extractor/allocine.py +++ b/hypervideo_dl/extractor/allocine.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -115,8 +112,6 @@ class AllocineIE(InfoExtractor): }) duration, view_count, timestamp = [None] * 3 - self._sort_formats(formats) - return { 'id': video_id, 'display_id': display_id, diff --git a/hypervideo_dl/extractor/alphaporno.py b/hypervideo_dl/extractor/alphaporno.py index 3a6d99f..8d5b472 100644 --- a/hypervideo_dl/extractor/alphaporno.py +++ b/hypervideo_dl/extractor/alphaporno.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( parse_iso8601, diff --git a/hypervideo_dl/extractor/alsace20tv.py b/hypervideo_dl/extractor/alsace20tv.py index 4aae6fe..ea3332e 100644 --- a/hypervideo_dl/extractor/alsace20tv.py +++ b/hypervideo_dl/extractor/alsace20tv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( clean_html, @@ -25,7 +22,6 @@ class Alsace20TVBaseIE(InfoExtractor): self._extract_smil_formats(fmt_url, video_id, fatal=False) if '/smil:_' in fmt_url else self._extract_mpd_formats(fmt_url, video_id, mpd_id=res, fatal=False)) - self._sort_formats(formats) webpage = (url and self._download_webpage(url, video_id, fatal=False)) or '' thumbnail = url_or_none(dict_get(info, ('image', 'preview', )) or self._og_search_thumbnail(webpage)) diff --git a/hypervideo_dl/extractor/alura.py b/hypervideo_dl/extractor/alura.py index d2e2df2..bfe066b 100644 --- a/hypervideo_dl/extractor/alura.py +++ b/hypervideo_dl/extractor/alura.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -66,8 +63,6 @@ class AluraIE(InfoExtractor): f['height'] = int('720' if m.group('res') == 'hd' else '480') formats.extend(video_format) - self._sort_formats(formats) - return { 'id': video_id, 'title': video_title, @@ -116,7 +111,7 @@ class AluraIE(InfoExtractor): raise ExtractorError('Unable to log in') -class AluraCourseIE(AluraIE): +class AluraCourseIE(AluraIE): # XXX: Do not subclass from concrete IE _VALID_URL = 
r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<id>[^/]+)' _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm' diff --git a/hypervideo_dl/extractor/amara.py b/hypervideo_dl/extractor/amara.py index 61d4695..5018710 100644 --- a/hypervideo_dl/extractor/amara.py +++ b/hypervideo_dl/extractor/amara.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from .youtube import YoutubeIE from .vimeo import VimeoIE diff --git a/hypervideo_dl/extractor/amazon.py b/hypervideo_dl/extractor/amazon.py index 07b1b18..4d31706 100644 --- a/hypervideo_dl/extractor/amazon.py +++ b/hypervideo_dl/extractor/amazon.py @@ -1,6 +1,5 @@ -# coding: utf-8 from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ExtractorError, int_or_none class AmazonStoreIE(InfoExtractor): @@ -10,7 +9,7 @@ 'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/', 'info_dict': { 'id': 'B098XNCHLD', - 'title': 'md5:5f3194dbf75a8dcfc83079bd63a2abed', + 'title': 'md5:dae240564cbb2642170c02f7f0d7e472', }, 'playlist_mincount': 1, 'playlist': [{ @@ -19,28 +18,44 @@ 'ext': 'mp4', 'title': 'mcdodo usb c cable 100W 5a', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 34, }, }] }, { 'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3', 'info_dict': { 'id': 'B0863TXGM3', - 'title': 'md5:b0bde4881d3cfd40d63af19f7898b8ff', + 'title': 'md5:d1d3352428f8f015706c84b31e132169', }, 'playlist_mincount': 4, }, { 'url': 'https://www.amazon.com/dp/B0845NXCXF/', 'info_dict': { 'id': 'B0845NXCXF', - 'title': 'md5:2145cd4e3c7782f1ee73649a3cff1171', + 'title': 'md5:f3fa12779bf62ddb6a6ec86a360a858e', }, 'playlist-mincount': 1, + }, { + 'url': 'https://www.amazon.es/Samsung-Smartphone-s-AMOLED-Quad-c%C3%A1mara-espa%C3%B1ola/dp/B08WX337PQ', + 'info_dict': { + 'id': 'B08WX337PQ', + 'title': 'md5:f3fa12779bf62ddb6a6ec86a360a858e', + }, + 'playlist_mincount': 1, }] def _real_extract(self, url): id = self._match_id(url) - webpage = self._download_webpage(url, id) - data_json = self._parse_json(self._html_search_regex(r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'(.*)\'\)', webpage, 'data'), id) + + for retry in self.RetryManager(): + webpage = self._download_webpage(url, id) + try: + data_json = self._search_json( + r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'', webpage, 'data', id, + transform_source=lambda x: x.replace(R'\\u', R'\u')) + except ExtractorError as e: + retry.error = e + entries = [{ 'id': video['marketPlaceID'], 'url': video['url'], @@ -50,4 +65,4 @@ 'height': int_or_none(video.get('videoHeight')), 'width': int_or_none(video.get('videoWidth')), } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')] - return self.playlist_result(entries, playlist_id=id, playlist_title=data_json['title']) + return self.playlist_result(entries, playlist_id=id, playlist_title=data_json.get('title')) diff --git a/hypervideo_dl/extractor/amazonminitv.py b/hypervideo_dl/extractor/amazonminitv.py new file mode 100644 index 0000000..7309968 --- /dev/null +++ b/hypervideo_dl/extractor/amazonminitv.py @@ -0,0 +1,290 @@ +import json + +from .common import InfoExtractor +from ..utils import ExtractorError, int_or_none, traverse_obj, try_get + + +class AmazonMiniTVBaseIE(InfoExtractor): + def _real_initialize(self): + self._download_webpage( + 'https://www.amazon.in/minitv', None, + note='Fetching guest session cookies')
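+ # cache the guest session id on the base class so the season/series extractors below reuse it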
+ AmazonMiniTVBaseIE.session_id = self._get_cookies('https://www.amazon.in')['session-id'].value + + def _call_api(self, asin, data=None, note=None): + device = {'clientId': 'ATVIN', 'deviceLocale': 'en_GB'} + if data: + data['variables'].update({ + 'contentType': 'VOD', + 'sessionIdToken': self.session_id, + **device, + }) + + resp = self._download_json( + f'https://www.amazon.in/minitv/api/web/{"graphql" if data else "prs"}', + asin, note=note, headers={'Content-Type': 'application/json'}, + data=json.dumps(data).encode() if data else None, + query=None if data else { + 'deviceType': 'A1WMMUXPCUJL4N', + 'contentId': asin, + **device, + }) + + if resp.get('errors'): + raise ExtractorError(f'MiniTV said: {resp["errors"][0]["message"]}') + elif not data: + return resp + return resp['data'][data['operationName']] + + +class AmazonMiniTVIE(AmazonMiniTVBaseIE): + _VALID_URL = r'(?:https?://(?:www\.)?amazon\.in/minitv/tp/|amazonminitv:(?:amzn1\.dv\.gti\.)?)(?P<id>[a-f0-9-]+)' + _TESTS = [{ + 'url': 'https://www.amazon.in/minitv/tp/75fe3a75-b8fe-4499-8100-5c9424344840?referrer=https%3A%2F%2Fwww.amazon.in%2Fminitv', + 'info_dict': { + 'id': 'amzn1.dv.gti.75fe3a75-b8fe-4499-8100-5c9424344840', + 'ext': 'mp4', + 'title': 'May I Kiss You?', + 'language': 'Hindi', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:a549bfc747973e04feb707833474e59d', + 'release_timestamp': 1644710400, + 'release_date': '20220213', + 'duration': 846, + 'chapters': 'count:2', + 'series': 'Couple Goals', + 'series_id': 'amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0', + 'season': 'Season 3', + 'season_number': 3, + 'season_id': 'amzn1.dv.gti.20331016-d9b9-4968-b991-c89fa4927a36', + 'episode': 'May I Kiss You?', + 'episode_number': 2, + 'episode_id': 'amzn1.dv.gti.75fe3a75-b8fe-4499-8100-5c9424344840', + }, + }, { + 'url': 'https://www.amazon.in/minitv/tp/280d2564-584f-452f-9c98-7baf906e01ab?referrer=https%3A%2F%2Fwww.amazon.in%2Fminitv', + 'info_dict': { + 'id': 'amzn1.dv.gti.280d2564-584f-452f-9c98-7baf906e01ab', + 'ext': 'mp4', + 'title': 'Jahaan', + 'language': 'Hindi', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:05eb765a77bf703f322f120ec6867339', + 'release_timestamp': 1647475200, + 'release_date': '20220317', + 'duration': 783, + 'chapters': [], + }, + }, { + 'url': 'https://www.amazon.in/minitv/tp/280d2564-584f-452f-9c98-7baf906e01ab', + 'only_matching': True, + }, { + 'url': 'amazonminitv:amzn1.dv.gti.280d2564-584f-452f-9c98-7baf906e01ab', + 'only_matching': True, + }, { + 'url': 'amazonminitv:280d2564-584f-452f-9c98-7baf906e01ab', + 'only_matching': True, + }] + + _GRAPHQL_QUERY_CONTENT = ''' +query content($sessionIdToken: String!, $deviceLocale: String, $contentId: ID!, $contentType: ContentType!, $clientId: String) { + content( + applicationContextInput: {deviceLocale: $deviceLocale, sessionIdToken: $sessionIdToken, clientId: $clientId} + contentId: $contentId + contentType: $contentType + ) { + contentId + name + ... on Episode { + contentId + vodType + name + images + description { + synopsis + contentLengthInSeconds + } + publicReleaseDateUTC + audioTracks + seasonId + seriesId + seriesName + seasonNumber + episodeNumber + timecode { + endCreditsTime + } + } + ... on MovieContent { + contentId + vodType + name + description { + synopsis + contentLengthInSeconds + } + images + publicReleaseDateUTC + audioTracks + } + } +}'''
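+ # 'prs' serves the playback manifests (HLS/DASH); episode/movie metadata comes from the GraphQL query above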
+ + def _real_extract(self, url): + asin = f'amzn1.dv.gti.{self._match_id(url)}' + prs = self._call_api(asin, note='Downloading playback info') + + formats, subtitles = [], {} + for type_, asset in prs['playbackAssets'].items(): + if not traverse_obj(asset, 'manifestUrl'): + continue + if type_ == 'hls': + m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + asset['manifestUrl'], asin, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=type_, fatal=False) + formats.extend(m3u8_fmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + elif type_ == 'dash': + mpd_fmts, mpd_subs = self._extract_mpd_formats_and_subtitles( + asset['manifestUrl'], asin, mpd_id=type_, fatal=False) + formats.extend(mpd_fmts) + subtitles = self._merge_subtitles(subtitles, mpd_subs) + else: + self.report_warning(f'Unknown asset type: {type_}') + + title_info = self._call_api( + asin, note='Downloading title info', data={ + 'operationName': 'content', + 'variables': {'contentId': asin}, + 'query': self._GRAPHQL_QUERY_CONTENT, + }) + credits_time = try_get(title_info, lambda x: x['timecode']['endCreditsTime'] / 1000) + is_episode = title_info.get('vodType') == 'EPISODE' + + return { + 'id': asin, + 'title': title_info.get('name'), + 'formats': formats, + 'subtitles': subtitles, + 'language': traverse_obj(title_info, ('audioTracks', 0)), + 'thumbnails': [{ + 'id': type_, + 'url': url, + } for type_, url in (title_info.get('images') or {}).items()], + 'description': traverse_obj(title_info, ('description', 'synopsis')), + 'release_timestamp': int_or_none(try_get(title_info, lambda x: x['publicReleaseDateUTC'] / 1000)), + 'duration': traverse_obj(title_info, ('description', 'contentLengthInSeconds')), + 'chapters': [{ + 'start_time': credits_time, + 'title': 'End Credits', + }] if credits_time else [], + 'series': title_info.get('seriesName'), + 'series_id': title_info.get('seriesId'), + 'season_number': title_info.get('seasonNumber'), + 'season_id': title_info.get('seasonId'), + 'episode': title_info.get('name') if is_episode else None, + 'episode_number': title_info.get('episodeNumber'), + 'episode_id': asin if is_episode else None, + } + + +class AmazonMiniTVSeasonIE(AmazonMiniTVBaseIE): + IE_NAME = 'amazonminitv:season' + _VALID_URL = r'amazonminitv:season:(?:amzn1\.dv\.gti\.)?(?P<id>[a-f0-9-]+)' + IE_DESC = 'Amazon MiniTV Series, "minitv:season:" prefix' + _TESTS = [{ + 'url': 'amazonminitv:season:amzn1.dv.gti.0aa996eb-6a1b-4886-a342-387fbd2f1db0', + 'playlist_mincount': 6, + 'info_dict': { + 'id': 'amzn1.dv.gti.0aa996eb-6a1b-4886-a342-387fbd2f1db0', + }, + }, { + 'url': 'amazonminitv:season:0aa996eb-6a1b-4886-a342-387fbd2f1db0', + 'only_matching': True, + }] + + _GRAPHQL_QUERY = ''' +query getEpisodes($sessionIdToken: String!, $clientId: String, $episodeOrSeasonId: ID!, $deviceLocale: String) { + getEpisodes( + applicationContextInput: {sessionIdToken: $sessionIdToken, deviceLocale: $deviceLocale, clientId: $clientId} + episodeOrSeasonId: $episodeOrSeasonId + ) { + episodes { + ... on Episode { + contentId + name + images + seriesName + seasonId + seriesId + seasonNumber + episodeNumber + description { + synopsis + contentLengthInSeconds + } + publicReleaseDateUTC + } + } + } +} +'''
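+ # getEpisodes appears to return the season's full episode list in one response, so no paging is done here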
+ + def _entries(self, asin): + season_info = self._call_api( + asin, note='Downloading season info', data={ + 'operationName': 'getEpisodes', + 'variables': {'episodeOrSeasonId': asin}, + 'query': self._GRAPHQL_QUERY, + }) + + for episode in season_info['episodes']: + yield self.url_result( + f'amazonminitv:{episode["contentId"]}', AmazonMiniTVIE, episode['contentId']) + + def _real_extract(self, url): + asin = f'amzn1.dv.gti.{self._match_id(url)}' + return self.playlist_result(self._entries(asin), asin) + + +class AmazonMiniTVSeriesIE(AmazonMiniTVBaseIE): + IE_NAME = 'amazonminitv:series' + _VALID_URL = r'amazonminitv:series:(?:amzn1\.dv\.gti\.)?(?P<id>[a-f0-9-]+)' + _TESTS = [{ + 'url': 'amazonminitv:series:amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0', + 'playlist_mincount': 3, + 'info_dict': { + 'id': 'amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0', + }, + }, { + 'url': 'amazonminitv:series:56521d46-b040-4fd5-872e-3e70476a04b0', + 'only_matching': True, + }] + + _GRAPHQL_QUERY = ''' +query getSeasons($sessionIdToken: String!, $deviceLocale: String, $episodeOrSeasonOrSeriesId: ID!, $clientId: String) { + getSeasons( + applicationContextInput: {deviceLocale: $deviceLocale, sessionIdToken: $sessionIdToken, clientId: $clientId} + episodeOrSeasonOrSeriesId: $episodeOrSeasonOrSeriesId + ) { + seasons { + seasonId + } + } +} +''' + + def _entries(self, asin): + season_info = self._call_api( + asin, note='Downloading series info', data={ + 'operationName': 'getSeasons', + 'variables': {'episodeOrSeasonOrSeriesId': asin}, + 'query': self._GRAPHQL_QUERY, + }) + + for season in season_info['seasons']: + yield self.url_result(f'amazonminitv:season:{season["seasonId"]}', AmazonMiniTVSeasonIE, season['seasonId']) + + def _real_extract(self, url): + asin = f'amzn1.dv.gti.{self._match_id(url)}' + return self.playlist_result(self._entries(asin), asin) diff --git a/hypervideo_dl/extractor/amcnetworks.py b/hypervideo_dl/extractor/amcnetworks.py index e38e215..c58bc7b 100644 --- a/hypervideo_dl/extractor/amcnetworks.py +++ b/hypervideo_dl/extractor/amcnetworks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .theplatform import ThePlatformIE @@ -12,7 +9,7 @@ from ..utils import ( ) -class AMCNetworksIE(ThePlatformIE): +class AMCNetworksIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?(?P<site>amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P<id>(?:movies|shows(?:/[^/]+)+)/[^/?#&]+)' _TESTS = [{ 'url': 'https://www.bbcamerica.com/shows/the-graham-norton-show/videos/tina-feys-adorable-airline-themed-family-dinner--51631', @@ -109,7 +106,6 @@ class AMCNetworksIE(ThePlatformIE): media_url = update_url_query(media_url, query) formats, subtitles = self._extract_theplatform_smil( media_url, video_id) - self._sort_formats(formats) thumbnails = [] thumbnail_urls = [properties.get('imageDesktop')] diff --git a/hypervideo_dl/extractor/americastestkitchen.py b/hypervideo_dl/extractor/americastestkitchen.py index 6e6099a..abda55d 100644 --- a/hypervideo_dl/extractor/americastestkitchen.py +++ b/hypervideo_dl/extractor/americastestkitchen.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -14,7 +11,7 @@ from ..utils import ( class 
AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?P<resource_type>episode|videos)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:cooks(?:country|illustrated)/)?(?P<resource_type>episode|videos)/(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', 'md5': 'b861c3e365ac38ad319cfd509c30577f', 'info_dict': { 'id': '5b400b9ee338f922cb06450c', 'title': 'Japanese Suppers', 'ext': 'mp4', + 'display_id': 'weeknight-japanese-suppers', 'description': 'md5:64e606bfee910627efc4b5f050de92b3', - 'thumbnail': r're:^https?://', - 'timestamp': 1523318400, - 'upload_date': '20180410', - 'release_date': '20180410', - 'series': "America's Test Kitchen", - 'season_number': 18, + 'timestamp': 1523304000, + 'upload_date': '20180409', + 'release_date': '20180409', + 'series': 'America\'s Test Kitchen', + 'season': 'Season 18', 'episode': 'Japanese Suppers', + 'season_number': 18, 'episode_number': 15, + 'duration': 1376, + 'thumbnail': r're:^https?://', + 'average_rating': 0, + 'view_count': int, }, 'params': { 'skip_download': True, @@ -43,15 +45,20 @@ 'id': '5fbe8c61bda2010001c6763b', 'title': 'Simple Chicken Dinner', 'ext': 'mp4', + 'display_id': 'atktv_2103_simple-chicken-dinner_full-episode_web-mp4', 'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7', - 'thumbnail': r're:^https?://', - 'timestamp': 1610755200, - 'upload_date': '20210116', - 'release_date': '20210116', - 'series': "America's Test Kitchen", - 'season_number': 21, + 'timestamp': 1610737200, + 'upload_date': '20210115', + 'release_date': '20210115', + 'series': 'America\'s Test Kitchen', + 'season': 'Season 21', 'episode': 'Simple Chicken Dinner', + 'season_number': 21, 'episode_number': 3, + 'duration': 1397, + 'thumbnail': r're:^https?://', + 'view_count': int, + 'average_rating': 0, }, 'params': { 'skip_download': True, @@ -60,10 +67,10 @@ 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', 'only_matching': True, }, { - 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do', + 'url': 'https://www.americastestkitchen.com/cookscountry/episode/564-when-only-chocolate-will-do', 'only_matching': True, }, { - 'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington', + 'url': 'https://www.americastestkitchen.com/cooksillustrated/videos/4478-beef-wellington', 'only_matching': True, }] @@ -93,7 +100,7 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?P<show>americastestkitchen|cookscountry)\.com/episodes/browse/season_(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com(?P<show>/cookscountry)?/episodes/browse/season_(?P<id>\d+)' _TESTS = [{ # ATK Season 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', 'info_dict': { 'id': 'season_1', 'title': 'Season 1', }, 'playlist_count': 13, }, { # Cooks Country Season - 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'url': 'https://www.americastestkitchen.com/cookscountry/episodes/browse/season_12', 'info_dict': { 'id': 'season_12', 'title': 'Season 12', @@ -113,17 +120,17 @@ }] def _real_extract(self, url): - show_name, season_number = self._match_valid_url(url).groups() + show_path, season_number = self._match_valid_url(url).group('show', 'id')
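+ # 'show' holds the optional '/cookscountry' path prefix; it is None for plain ATK URLs (hence the `show_path or ""` below)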
season_number = int(season_number) - slug = 'atk' if show_name == 'americastestkitchen' else 'cco' + slug = 'cco' if show_path == '/cookscountry' else 'atk' season = 'Season %d' % season_number season_search = self._download_json( 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, season, headers={ - 'Origin': 'https://www.%s.com' % show_name, + 'Origin': 'https://www.americastestkitchen.com', 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', 'X-Algolia-Application-Id': 'Y1FNZXUI30', }, query={ @@ -139,12 +146,12 @@ def entries(): for episode in (season_search.get('hits') or []): - search_url = episode.get('search_url') + search_url = episode.get('search_url') # always formatted like '/episode/123-title-of-episode' if not search_url: continue yield { '_type': 'url', - 'url': 'https://www.%s.com%s' % (show_name, search_url), + 'url': f'https://www.americastestkitchen.com{show_path or ""}{search_url}', 'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]), 'title': episode.get('title'), 'description': episode.get('description'), diff --git a/hypervideo_dl/extractor/amp.py b/hypervideo_dl/extractor/amp.py index 24c684c..b0cbd77 100644 --- a/hypervideo_dl/extractor/amp.py +++ b/hypervideo_dl/extractor/amp.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -13,7 +10,7 @@ from ..utils import ( ) -class AMPIE(InfoExtractor): +class AMPIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor # parse Akamai Adaptive Media Player feed def _extract_feed_info(self, url): feed = self._download_json( @@ -87,8 +84,6 @@ class AMPIE(InfoExtractor): 'ext': ext, }) - self._sort_formats(formats) - timestamp = unified_timestamp(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) return { diff --git a/hypervideo_dl/extractor/angel.py b/hypervideo_dl/extractor/angel.py new file mode 100644 index 0000000..306b365 --- /dev/null +++ b/hypervideo_dl/extractor/angel.py @@ -0,0 +1,56 @@ +import re + +from .common import InfoExtractor +from ..utils import url_or_none, merge_dicts + + +class AngelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?angel\.com/watch/(?P<series>[^/?#]+)/episode/(?P<id>[\w-]+)/season-(?P<season_number>\d+)/episode-(?P<episode_number>\d+)/(?P<title>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.angel.com/watch/tuttle-twins/episode/2f3d0382-ea82-4cdc-958e-84fbadadc710/season-1/episode-1/when-laws-give-you-lemons', + 'md5': '4734e5cfdd64a568e837246aa3eaa524', + 'info_dict': { + 'id': '2f3d0382-ea82-4cdc-958e-84fbadadc710', + 'ext': 'mp4', + 'title': 'Tuttle Twins Season 1, Episode 1: When Laws Give You Lemons', + 'description': 'md5:73b704897c20ab59c433a9c0a8202d5e', + 'thumbnail': r're:^https?://images.angelstudios.com/image/upload/angel-app/.*$', + 'duration': 1359.0 + } + }, { + 'url': 'https://www.angel.com/watch/the-chosen/episode/8dfb714d-bca5-4812-8125-24fb9514cd10/season-1/episode-1/i-have-called-you-by-name', + 'md5': 'e4774bad0a5f0ad2e90d175cafdb797d', + 'info_dict': { + 'id': '8dfb714d-bca5-4812-8125-24fb9514cd10', + 'ext': 'mp4', + 'title': 'The Chosen Season 1, Episode 1: I Have Called You By Name', + 'description': 'md5:aadfb4827a94415de5ff6426e6dee3be', + 'thumbnail': r're:^https?://images.angelstudios.com/image/upload/angel-app/.*$', + 'duration': 3276.0 + } + }] + + def _real_extract(self, url): + video_id = 
self._match_id(url) + webpage = self._download_webpage(url, video_id) + + json_ld = self._search_json_ld(webpage, video_id) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + json_ld.pop('url'), video_id, note='Downloading HD m3u8 information') + + info_dict = { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'formats': formats, + 'subtitles': subtitles + } + + # Angel uses cloudinary in the background and supports image transformations. + # We remove these transformations and return the source file + base_thumbnail_url = url_or_none(self._og_search_thumbnail(webpage)) or json_ld.pop('thumbnails') + if base_thumbnail_url: + info_dict['thumbnail'] = re.sub(r'(/upload)/.+(/angel-app/.+)$', r'\1\2', base_thumbnail_url) + + return merge_dicts(info_dict, json_ld) diff --git a/hypervideo_dl/extractor/animelab.py b/hypervideo_dl/extractor/animelab.py deleted file mode 100644 index 1c2cc47..0000000 --- a/hypervideo_dl/extractor/animelab.py +++ /dev/null @@ -1,278 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - -from ..utils import ( - ExtractorError, - urlencode_postdata, - int_or_none, - str_or_none, - determine_ext, -) - -from ..compat import compat_HTTPError - - -class AnimeLabBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.animelab.com/login' - _NETRC_MACHINE = 'animelab' - _LOGGED_IN = False - - def _is_logged_in(self, login_page=None): - if not self._LOGGED_IN: - if not login_page: - login_page = self._download_webpage(self._LOGIN_URL, None, 'Downloading login page') - AnimeLabBaseIE._LOGGED_IN = 'Sign In' not in login_page - return self._LOGGED_IN - - def _perform_login(self, username, password): - if self._is_logged_in(): - return - - login_form = { - 'email': username, - 'password': password, - } - - try: - response = self._download_webpage( - self._LOGIN_URL, None, 'Logging in', 'Wrong login info', - data=urlencode_postdata(login_form), - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - raise ExtractorError('Unable to log in (wrong credentials?)', expected=True) - raise - - if not self._is_logged_in(response): - raise ExtractorError('Unable to login (cannot verify if logged in)') - - def _real_initialize(self): - if not self._is_logged_in(): - self.raise_login_required('Login is required to access any AnimeLab content') - - -class AnimeLabIE(AnimeLabBaseIE): - _VALID_URL = r'https?://(?:www\.)?animelab\.com/player/(?P<id>[^/]+)' - - # the following tests require authentication, but a free account will suffice - # just set 'usenetrc' to true in test/local_parameters.json if you use a .netrc file - # or you can set 'username' and 'password' there - # the tests also select a specific format so that the same video is downloaded - # regardless of whether the user is premium or not (needs testing on a premium account) - _TEST = { - 'url': 'https://www.animelab.com/player/fullmetal-alchemist-brotherhood-episode-42', - 'md5': '05bde4b91a5d1ff46ef5b94df05b0f7f', - 'info_dict': { - 'id': '383', - 'ext': 'mp4', - 'display_id': 'fullmetal-alchemist-brotherhood-episode-42', - 'title': 'Fullmetal Alchemist: Brotherhood - Episode 42 - Signs of a Counteroffensive', - 'description': 'md5:103eb61dd0a56d3dfc5dbf748e5e83f4', - 'series': 'Fullmetal Alchemist: Brotherhood', - 'episode': 'Signs of a Counteroffensive', - 'episode_number': 42, - 
'duration': 1469, - 'season': 'Season 1', - 'season_number': 1, - 'season_id': '38', - }, - 'params': { - 'format': '[format_id=21711_yeshardsubbed_ja-JP][height=480]', - }, - 'skip': 'All AnimeLab content requires authentication', - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - # unfortunately we can get different URLs for the same formats - # e.g. if we are using a "free" account so no dubs available - # (so _remove_duplicate_formats is not effective) - # so we use a dictionary as a workaround - formats = {} - for language_option_url in ('https://www.animelab.com/player/%s/subtitles', - 'https://www.animelab.com/player/%s/dubbed'): - actual_url = language_option_url % display_id - webpage = self._download_webpage(actual_url, display_id, 'Downloading URL ' + actual_url) - - video_collection = self._parse_json(self._search_regex(r'new\s+?AnimeLabApp\.VideoCollection\s*?\((.*?)\);', webpage, 'AnimeLab VideoCollection'), display_id) - position = int_or_none(self._search_regex(r'playlistPosition\s*?=\s*?(\d+)', webpage, 'Playlist Position')) - - raw_data = video_collection[position]['videoEntry'] - - video_id = str_or_none(raw_data['id']) - - # create a title from many sources (while grabbing other info) - # TODO use more fallback sources to get some of these - series = raw_data.get('showTitle') - video_type = raw_data.get('videoEntryType', {}).get('name') - episode_number = raw_data.get('episodeNumber') - episode_name = raw_data.get('name') - - title_parts = (series, video_type, episode_number, episode_name) - if None not in title_parts: - title = '%s - %s %s - %s' % title_parts - else: - title = episode_name - - description = raw_data.get('synopsis') or self._og_search_description(webpage, default=None) - - duration = int_or_none(raw_data.get('duration')) - - thumbnail_data = raw_data.get('images', []) - thumbnails = [] - for thumbnail in thumbnail_data: - for instance in thumbnail['imageInstances']: - image_data = instance.get('imageInfo', {}) - thumbnails.append({ - 'id': str_or_none(image_data.get('id')), - 'url': image_data.get('fullPath'), - 'width': image_data.get('width'), - 'height': image_data.get('height'), - }) - - season_data = raw_data.get('season', {}) or {} - season = str_or_none(season_data.get('name')) - season_number = int_or_none(season_data.get('seasonNumber')) - season_id = str_or_none(season_data.get('id')) - - for video_data in raw_data['videoList']: - current_video_list = {} - current_video_list['language'] = video_data.get('language', {}).get('languageCode') - - is_hardsubbed = video_data.get('hardSubbed') - - for video_instance in video_data['videoInstances']: - httpurl = video_instance.get('httpUrl') - url = httpurl if httpurl else video_instance.get('rtmpUrl') - if url is None: - # this video format is unavailable to the user (not premium etc.) 
- continue - - current_format = current_video_list.copy() - - format_id_parts = [] - - format_id_parts.append(str_or_none(video_instance.get('id'))) - - if is_hardsubbed is not None: - if is_hardsubbed: - format_id_parts.append('yeshardsubbed') - else: - format_id_parts.append('nothardsubbed') - - format_id_parts.append(current_format['language']) - - format_id = '_'.join([x for x in format_id_parts if x is not None]) - - ext = determine_ext(url) - if ext == 'm3u8': - for format_ in self._extract_m3u8_formats( - url, video_id, m3u8_id=format_id, fatal=False): - formats[format_['format_id']] = format_ - continue - elif ext == 'mpd': - for format_ in self._extract_mpd_formats( - url, video_id, mpd_id=format_id, fatal=False): - formats[format_['format_id']] = format_ - continue - - current_format['url'] = url - quality_data = video_instance.get('videoQuality') - if quality_data: - quality = quality_data.get('name') or quality_data.get('description') - else: - quality = None - - height = None - if quality: - height = int_or_none(self._search_regex(r'(\d+)p?$', quality, 'Video format height', default=None)) - - if height is None: - self.report_warning('Could not get height of video') - else: - current_format['height'] = height - current_format['format_id'] = format_id - - formats[current_format['format_id']] = current_format - - formats = list(formats.values()) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'series': series, - 'episode': episode_name, - 'episode_number': int_or_none(episode_number), - 'thumbnails': thumbnails, - 'duration': duration, - 'formats': formats, - 'season': season, - 'season_number': season_number, - 'season_id': season_id, - } - - -class AnimeLabShowsIE(AnimeLabBaseIE): - _VALID_URL = r'https?://(?:www\.)?animelab\.com/shows/(?P<id>[^/]+)' - - _TEST = { - 'url': 'https://www.animelab.com/shows/attack-on-titan', - 'info_dict': { - 'id': '45', - 'title': 'Attack on Titan', - 'description': 'md5:989d95a2677e9309368d5cf39ba91469', - }, - 'playlist_count': 59, - 'skip': 'All AnimeLab content requires authentication', - } - - def _real_extract(self, url): - _BASE_URL = 'http://www.animelab.com' - _SHOWS_API_URL = '/api/videoentries/show/videos/' - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id, 'Downloading requested URL') - - show_data_str = self._search_regex(r'({"id":.*}),\svideoEntry', webpage, 'AnimeLab show data') - show_data = self._parse_json(show_data_str, display_id) - - show_id = str_or_none(show_data.get('id')) - title = show_data.get('name') - description = show_data.get('shortSynopsis') or show_data.get('longSynopsis') - - entries = [] - for season in show_data['seasons']: - season_id = season['id'] - get_data = urlencode_postdata({ - 'seasonId': season_id, - 'limit': 1000, - }) - # despite using urlencode_postdata, we are sending a GET request - target_url = _BASE_URL + _SHOWS_API_URL + show_id + "?" 
+ get_data.decode('utf-8') - response = self._download_webpage( - target_url, - None, 'Season id %s' % season_id) - - season_data = self._parse_json(response, display_id) - - for video_data in season_data['list']: - entries.append(self.url_result( - _BASE_URL + '/player/' + video_data['slug'], 'AnimeLab', - str_or_none(video_data.get('id')), video_data.get('name') - )) - - return { - '_type': 'playlist', - 'id': show_id, - 'title': title, - 'description': description, - 'entries': entries, - } - -# TODO implement myqueue diff --git a/hypervideo_dl/extractor/animeondemand.py b/hypervideo_dl/extractor/animeondemand.py deleted file mode 100644 index 2e674d5..0000000 --- a/hypervideo_dl/extractor/animeondemand.py +++ /dev/null @@ -1,284 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - determine_ext, - extract_attributes, - ExtractorError, - join_nonempty, - url_or_none, - urlencode_postdata, - urljoin, -) - - -class AnimeOnDemandIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?anime-on-demand\.de/anime/(?P<id>\d+)' - _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in' - _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' - _NETRC_MACHINE = 'animeondemand' - # German-speaking countries of Europe - _GEO_COUNTRIES = ['AT', 'CH', 'DE', 'LI', 'LU'] - _TESTS = [{ - # jap, OmU - 'url': 'https://www.anime-on-demand.de/anime/161', - 'info_dict': { - 'id': '161', - 'title': 'Grimgar, Ashes and Illusions (OmU)', - 'description': 'md5:6681ce3c07c7189d255ac6ab23812d31', - }, - 'playlist_mincount': 4, - }, { - # Film wording is used instead of Episode, ger/jap, Dub/OmU - 'url': 'https://www.anime-on-demand.de/anime/39', - 'only_matching': True, - }, { - # Episodes without titles, jap, OmU - 'url': 'https://www.anime-on-demand.de/anime/162', - 'only_matching': True, - }, { - # ger/jap, Dub/OmU, account required - 'url': 'https://www.anime-on-demand.de/anime/169', - 'only_matching': True, - }, { - # Full length film, non-series, ger/jap, Dub/OmU, account required - 'url': 'https://www.anime-on-demand.de/anime/185', - 'only_matching': True, - }, { - # Flash videos - 'url': 'https://www.anime-on-demand.de/anime/12', - 'only_matching': True, - }] - - def _perform_login(self, username, password): - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - if '>Our licensing terms allow the distribution of animes only to German-speaking countries of Europe' in login_page: - self.raise_geo_restricted( - '%s is only available in German-speaking countries of Europe' % self.IE_NAME) - - login_form = self._form_hidden_inputs('new_user', login_page) - - login_form.update({ - 'user[login]': username, - 'user[password]': password, - }) - - post_url = self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, - 'post url', default=self._LOGIN_URL, group='url') - - if not post_url.startswith('http'): - post_url = urljoin(self._LOGIN_URL, post_url) - - response = self._download_webpage( - post_url, None, 'Logging in', - data=urlencode_postdata(login_form), headers={ - 'Referer': self._LOGIN_URL, - }) - - if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')): - error = self._search_regex( - r'<p[^>]+\bclass=(["\'])(?:(?!\1).)*\balert\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</p>', - response, 'error', default=None, group='error') - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - raise 
ExtractorError('Unable to log in') - - def _real_extract(self, url): - anime_id = self._match_id(url) - - webpage = self._download_webpage(url, anime_id) - - if 'data-playlist=' not in webpage: - self._download_webpage( - self._APPLY_HTML5_URL, anime_id, - 'Activating HTML5 beta', 'Unable to apply HTML5 beta') - webpage = self._download_webpage(url, anime_id) - - csrf_token = self._html_search_meta( - 'csrf-token', webpage, 'csrf token', fatal=True) - - anime_title = self._html_search_regex( - r'(?s)<h1[^>]+itemprop="name"[^>]*>(.+?)</h1>', - webpage, 'anime name') - anime_description = self._html_search_regex( - r'(?s)<div[^>]+itemprop="description"[^>]*>(.+?)</div>', - webpage, 'anime description', default=None) - - def extract_info(html, video_id, num=None): - title, description = [None] * 2 - formats = [] - - for input_ in re.findall( - r'<input[^>]+class=["\'].*?streamstarter[^>]+>', html): - attributes = extract_attributes(input_) - title = attributes.get('data-dialog-header') - playlist_urls = [] - for playlist_key in ('data-playlist', 'data-otherplaylist', 'data-stream'): - playlist_url = attributes.get(playlist_key) - if isinstance(playlist_url, compat_str) and re.match( - r'/?[\da-zA-Z]+', playlist_url): - playlist_urls.append(attributes[playlist_key]) - if not playlist_urls: - continue - - lang = attributes.get('data-lang') - lang_note = attributes.get('value') - - for playlist_url in playlist_urls: - kind = self._search_regex( - r'videomaterialurl/\d+/([^/]+)/', - playlist_url, 'media kind', default=None) - format_id = join_nonempty(lang, kind) if lang or kind else str(num) - format_note = join_nonempty(kind, lang_note, delim=', ') - item_id_list = [] - if format_id: - item_id_list.append(format_id) - item_id_list.append('videomaterial') - playlist = self._download_json( - urljoin(url, playlist_url), video_id, - 'Downloading %s JSON' % ' '.join(item_id_list), - headers={ - 'X-Requested-With': 'XMLHttpRequest', - 'X-CSRF-Token': csrf_token, - 'Referer': url, - 'Accept': 'application/json, text/javascript, */*; q=0.01', - }, fatal=False) - if not playlist: - continue - stream_url = url_or_none(playlist.get('streamurl')) - if stream_url: - rtmp = re.search( - r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)', - stream_url) - if rtmp: - formats.append({ - 'url': rtmp.group('url'), - 'app': rtmp.group('app'), - 'play_path': rtmp.group('playpath'), - 'page_url': url, - 'player_url': 'https://www.anime-on-demand.de/assets/jwplayer.flash-55abfb34080700304d49125ce9ffb4a6.swf', - 'rtmp_real_time': True, - 'format_id': 'rtmp', - 'ext': 'flv', - }) - continue - start_video = playlist.get('startvideo', 0) - playlist = playlist.get('playlist') - if not playlist or not isinstance(playlist, list): - continue - playlist = playlist[start_video] - title = playlist.get('title') - if not title: - continue - description = playlist.get('description') - for source in playlist.get('sources', []): - file_ = source.get('file') - if not file_: - continue - ext = determine_ext(file_) - format_id = join_nonempty( - lang, kind, - 'hls' if ext == 'm3u8' else None, - 'dash' if source.get('type') == 'video/dash' or ext == 'mpd' else None) - if ext == 'm3u8': - file_formats = self._extract_m3u8_formats( - file_, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) - elif source.get('type') == 'video/dash' or ext == 'mpd': - continue - file_formats = self._extract_mpd_formats( - file_, video_id, mpd_id=format_id, fatal=False) - else: - continue - for f in 
file_formats: - f.update({ - 'language': lang, - 'format_note': format_note, - }) - formats.extend(file_formats) - - return { - 'title': title, - 'description': description, - 'formats': formats, - } - - def extract_entries(html, video_id, common_info, num=None): - info = extract_info(html, video_id, num) - - if info['formats']: - self._sort_formats(info['formats']) - f = common_info.copy() - f.update(info) - yield f - - # Extract teaser/trailer only when full episode is not available - if not info['formats']: - m = re.search( - r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>(?P<kind>Teaser|Trailer)<', - html) - if m: - f = common_info.copy() - f.update({ - 'id': '%s-%s' % (f['id'], m.group('kind').lower()), - 'title': m.group('title'), - 'url': urljoin(url, m.group('href')), - }) - yield f - - def extract_episodes(html): - for num, episode_html in enumerate(re.findall( - r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', html), 1): - episodebox_title = self._search_regex( - (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', - r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), - episode_html, 'episodebox title', default=None, group='title') - if not episodebox_title: - continue - - episode_number = int(self._search_regex( - r'(?:Episode|Film)\s*(\d+)', - episodebox_title, 'episode number', default=num)) - episode_title = self._search_regex( - r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', - episodebox_title, 'episode title', default=None) - - video_id = 'episode-%d' % episode_number - - common_info = { - 'id': video_id, - 'series': anime_title, - 'episode': episode_title, - 'episode_number': episode_number, - } - - for e in extract_entries(episode_html, video_id, common_info): - yield e - - def extract_film(html, video_id): - common_info = { - 'id': anime_id, - 'title': anime_title, - 'description': anime_description, - } - for e in extract_entries(html, video_id, common_info): - yield e - - def entries(): - has_episodes = False - for e in extract_episodes(webpage): - has_episodes = True - yield e - - if not has_episodes: - for e in extract_film(webpage, anime_id): - yield e - - return self.playlist_result( - entries(), anime_id, anime_title, anime_description) diff --git a/hypervideo_dl/extractor/ant1newsgr.py b/hypervideo_dl/extractor/ant1newsgr.py index 1075b46..7b384b2 100644 --- a/hypervideo_dl/extractor/ant1newsgr.py +++ b/hypervideo_dl/extractor/ant1newsgr.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re import urllib.parse from .common import InfoExtractor @@ -10,7 +6,6 @@ from ..utils import ( ExtractorError, determine_ext, scale_thumbnails_to_max_format_width, - unescapeHTML, ) @@ -24,7 +19,6 @@ class Ant1NewsGrBaseIE(InfoExtractor): raise ExtractorError('no source found for %s' % video_id) formats, subs = (self._extract_m3u8_formats_and_subtitles(source, video_id, 'mp4') if determine_ext(source) == 'm3u8' else ([{'url': source}], {})) - self._sort_formats(formats) thumbnails = scale_thumbnails_to_max_format_width( formats, [{'url': info['thumb']}], r'(?<=/imgHandler/)\d+') return { @@ -94,7 +88,7 @@ class Ant1NewsGrArticleIE(Ant1NewsGrBaseIE): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle') - embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage)) + embed_urls = list(Ant1NewsGrEmbedIE._extract_embed_urls(url, webpage)) if not embed_urls: raise ExtractorError('no videos found for %s' 
% video_id, expected=True) return self.playlist_from_matches( @@ -107,6 +101,7 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE): IE_DESC = 'ant1news.gr embedded videos' _BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player' _VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)' + _EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)'] _API_PATH = '/news/templates/data/jsonPlayer' _TESTS = [{ @@ -120,16 +115,6 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE): }, }] - @classmethod - def _extract_urls(cls, webpage): - _EMBED_URL_RE = rf'{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+' - _EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_EMBED_URL_RE})(?P=_q1)' - for mobj in re.finditer(_EMBED_RE, webpage): - url = unescapeHTML(mobj.group('url')) - if not cls.suitable(url): - continue - yield url - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/hypervideo_dl/extractor/anvato.py b/hypervideo_dl/extractor/anvato.py index 686d453..79bfe41 100644 --- a/hypervideo_dl/extractor/anvato.py +++ b/hypervideo_dl/extractor/anvato.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 import hashlib import json @@ -10,38 +7,68 @@ import time from .common import InfoExtractor from ..aes import aes_encrypt -from ..compat import compat_str from ..utils import ( bytes_to_intlist, determine_ext, - intlist_to_bytes, int_or_none, + intlist_to_bytes, join_nonempty, + smuggle_url, strip_jsonp, + traverse_obj, unescapeHTML, unsmuggle_url, ) -# This import causes a ModuleNotFoundError on some systems for unknown reason. -# See issues: -# https://github.com/hypervideo/hypervideo/issues/35 -# https://github.com/ytdl-org/youtube-dl/issues/27449 -# https://github.com/animelover1984/youtube-dl/issues/17 -try: - from .anvato_token_generator import NFLTokenGenerator -except ImportError: - NFLTokenGenerator = None - def md5_text(s): - if not isinstance(s, compat_str): - s = compat_str(s) - return hashlib.md5(s.encode('utf-8')).hexdigest() + return hashlib.md5(str(s).encode()).hexdigest() class AnvatoIE(InfoExtractor): _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)' + _API_BASE_URL = 'https://tkx.mp.lura.live/rest/v2' + _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1' + _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' # from anvplayer.min.js + + _TESTS = [{ + # from https://www.nfl.com/videos/baker-mayfield-s-game-changing-plays-from-3-td-game-week-14 + 'url': 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:899441', + 'md5': '921919dab3cd0b849ff3d624831ae3e2', + 'info_dict': { + 'id': '899441', + 'ext': 'mp4', + 'title': 'Baker Mayfield\'s game-changing plays from 3-TD game Week 14', + 'description': 'md5:85e05a3cc163f8c344340f220521136d', + 'upload_date': '20201215', + 'timestamp': 1608009755, + 'thumbnail': r're:^https?://.*\.jpg', + 'uploader': 'NFL', + 'tags': ['Baltimore Ravens at Cleveland Browns (2020-REG-14)', 'Baker Mayfield', 'Game Highlights', + 'Player Highlights', 'Cleveland Browns', 'league'], + 'duration': 157, + 'categories': ['Entertainment', 'Game', 'Highlights'], + }, + }, { + # from https://ktla.com/news/99-year-old-woman-learns-to-fly-in-torrance-checks-off-bucket-list-dream/ + 'url': 'anvato:X8POa4zpGZMmeiq0wqiO8IP5rMqQM9VN:8032455', + 'md5': '837718bcfb3a7778d022f857f7a9b19e', + 'info_dict': { + 'id': '8032455', + 'ext': 'mp4', + 'title': '99-year-old woman learns to fly plane in Torrance, 
checks off bucket list dream', + 'description': 'md5:0a12bab8159445e78f52a297a35c6609', + 'upload_date': '20220928', + 'timestamp': 1664408881, + 'thumbnail': r're:^https?://.*\.jpg', + 'uploader': 'LIN', + 'tags': ['video', 'news', '5live'], + 'duration': 155, + 'categories': ['News'], + }, + }] + # Copied from anvplayer.min.js _ANVACK_TABLE = { 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', @@ -214,86 +241,74 @@ class AnvatoIE(InfoExtractor): 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582' } - _TOKEN_GENERATORS = { - 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator, + def _generate_nfl_token(self, anvack, mcp_id): + reroute = self._download_json( + 'https://api.nfl.com/v1/reroute', mcp_id, data=b'grant_type=client_credentials', + headers={'X-Domain-Id': 100}, note='Fetching token info') + token_type = reroute.get('token_type') or 'Bearer' + auth_token = f'{token_type} {reroute["access_token"]}' + response = self._download_json( + 'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({ + 'query': '''{ + viewer { + mediaToken(anvack: "%s", id: %s) { + token } + } +}''' % (anvack, mcp_id), + }).encode(), headers={ + 'Authorization': auth_token, + 'Content-Type': 'application/json', + }, note='Fetching NFL API token') + return traverse_obj(response, ('data', 'viewer', 'mediaToken', 'token')) - _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA' - - _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1' - _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' - - _TESTS = [{ - # from https://www.boston25news.com/news/watch-humpback-whale-breaches-right-next-to-fishing-boat-near-nh/817484874 - 'url': 'anvato:8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496', - 'info_dict': { - 'id': '4465496', - 'ext': 'mp4', - 'title': 'VIDEO: Humpback whale breaches right next to NH boat', - 'description': 'VIDEO: Humpback whale breaches right next to NH boat. 
Footage courtesy: Zach Fahey.', - 'duration': 22, - 'timestamp': 1534855680, - 'upload_date': '20180821', - 'uploader': 'ANV', - }, - 'params': { - 'skip_download': True, - }, - }, { - # from https://sanfrancisco.cbslocal.com/2016/06/17/source-oakland-cop-on-leave-for-having-girlfriend-help-with-police-reports/ - 'url': 'anvato:DVzl9QRzox3ZZsP9bNu5Li3X7obQOnqP:3417601', - 'only_matching': True, - }] - - def __init__(self, *args, **kwargs): - super(AnvatoIE, self).__init__(*args, **kwargs) - self.__server_time = None + _TOKEN_GENERATORS = { + 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': _generate_nfl_token, + } def _server_time(self, access_key, video_id): - if self.__server_time is not None: - return self.__server_time - - self.__server_time = int(self._download_json( - self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id, - note='Fetching server time')['server_time']) + return int_or_none(traverse_obj(self._download_json( + f'{self._API_BASE_URL}/server_time', video_id, query={'anvack': access_key}, + note='Fetching server time', fatal=False), 'server_time')) or int(time.time()) - return self.__server_time - - def _api_prefix(self, access_key): - return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage') - - def _get_video_json(self, access_key, video_id): + def _get_video_json(self, access_key, video_id, extracted_token): # See et() in anvplayer.min.js, which is an alias of getVideoJSON() - video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key) + video_data_url = f'{self._API_BASE_URL}/mcp/video/{video_id}?anvack={access_key}' server_time = self._server_time(access_key, video_id) - input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time)) + input_data = f'{server_time}~{md5_text(video_data_url)}~{md5_text(server_time)}' auth_secret = intlist_to_bytes(aes_encrypt( bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY))) - - video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii') + query = { + 'X-Anvato-Adst-Auth': base64.b64encode(auth_secret).decode('ascii'), + 'rtyp': 'fp', + } anvrid = md5_text(time.time() * 1000 * random.random())[:30] api = { 'anvrid': anvrid, 'anvts': server_time, } - if self._TOKEN_GENERATORS.get(access_key) is not None: - api['anvstk2'] = self._TOKEN_GENERATORS[access_key].generate(self, access_key, video_id) + if extracted_token is not None: + api['anvstk2'] = extracted_token + elif self._TOKEN_GENERATORS.get(access_key) is not None: + api['anvstk2'] = self._TOKEN_GENERATORS[access_key](self, access_key, video_id) + elif self._ANVACK_TABLE.get(access_key) is not None: + api['anvstk'] = md5_text(f'{access_key}|{anvrid}|{server_time}|{self._ANVACK_TABLE[access_key]}') else: - api['anvstk'] = md5_text('%s|%s|%d|%s' % ( - access_key, anvrid, server_time, - self._ANVACK_TABLE.get(access_key, self._API_KEY))) + api['anvstk2'] = 'default' return self._download_json( - video_data_url, video_id, transform_source=strip_jsonp, - data=json.dumps({'api': api}).encode('utf-8')) + video_data_url, video_id, transform_source=strip_jsonp, query=query, + data=json.dumps({'api': api}, separators=(',', ':')).encode('utf-8')) - def _get_anvato_videos(self, access_key, video_id): - video_data = self._get_video_json(access_key, video_id) + def _get_anvato_videos(self, access_key, video_id, token): + video_data = self._get_video_json(access_key, video_id, token) formats = [] for published_url in 
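# The _TOKEN_GENERATORS table above stores a plain function object (at
# class-body time, _generate_nfl_token is not yet a bound method), which is
# why the call site passes self explicitly. A minimal sketch of the pattern:
class Demo:
    def _gen_token(self, key, item_id):
        return f'token:{key}:{item_id}'

    _GENERATORS = {'some-anvack': _gen_token}  # stores the raw function

    def fetch(self, key, item_id):
        gen = self._GENERATORS.get(key)
        return gen(self, key, item_id) if gen else None  # note the explicit self

assert Demo().fetch('some-anvack', 7) == 'token:some-anvack:7'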
video_data['published_urls']: - video_url = published_url['embed_url'] + video_url = published_url.get('embed_url') + if not video_url: + continue media_format = published_url.get('format') ext = determine_ext(video_url) @@ -308,15 +323,27 @@ class AnvatoIE(InfoExtractor): 'tbr': tbr or None, } - if media_format == 'm3u8' and tbr is not None: + vtt_subs, hls_subs = {}, {} + if media_format == 'vtt': + _, vtt_subs = self._extract_m3u8_formats_and_subtitles( + video_url, video_id, m3u8_id='vtt', fatal=False) + continue + elif media_format == 'm3u8' and tbr is not None: a_format.update({ 'format_id': join_nonempty('hls', tbr), 'ext': 'mp4', }) elif media_format == 'm3u8-variant' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + # For some videos the initial m3u8 URL returns JSON instead + manifest_json = self._download_json( + video_url, video_id, note='Downloading manifest JSON', errnote=False) + if manifest_json: + video_url = manifest_json.get('master_m3u8') + if not video_url: + continue + hls_fmts, hls_subs = self._extract_m3u8_formats_and_subtitles( + video_url, video_id, ext='mp4', m3u8_id='hls', fatal=False) + formats.extend(hls_fmts) continue elif ext == 'mp3' or media_format == 'mp3': a_format['vcodec'] = 'none' @@ -327,8 +354,6 @@ class AnvatoIE(InfoExtractor): }) formats.append(a_format) - self._sort_formats(formats) - subtitles = {} for caption in video_data.get('captions', []): a_caption = { @@ -336,6 +361,7 @@ class AnvatoIE(InfoExtractor): 'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None } subtitles.setdefault(caption['language'], []).append(a_caption) + subtitles = self._merge_subtitles(subtitles, hls_subs, vtt_subs) return { 'id': video_id, @@ -352,30 +378,19 @@ class AnvatoIE(InfoExtractor): 'subtitles': subtitles, } - @staticmethod - def _extract_urls(ie, webpage, video_id): - entries = [] - for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage): - anvplayer_data = ie._parse_json( - mobj.group('anvp'), video_id, transform_source=unescapeHTML, - fatal=False) - if not anvplayer_data: - continue - video = anvplayer_data.get('video') - if not isinstance(video, compat_str) or not video.isdigit(): - continue - access_key = anvplayer_data.get('accessKey') - if not access_key: - mcp = anvplayer_data.get('mcp') - if mcp: - access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get( - mcp.lower()) + @classmethod + def _extract_from_webpage(cls, url, webpage): + for mobj in re.finditer(cls._ANVP_RE, webpage): + anvplayer_data = unescapeHTML(json.loads(mobj.group('anvp'))) or {} + video_id, access_key = anvplayer_data.get('video'), anvplayer_data.get('accessKey') if not access_key: + access_key = cls._MCP_TO_ACCESS_KEY_TABLE.get((anvplayer_data.get('mcp') or '').lower()) + if not (video_id or '').isdigit() or not access_key: continue - entries.append(ie.url_result( - 'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(), - video_id=video)) - return entries + url = f'anvato:{access_key}:{video_id}' + if anvplayer_data.get('token'): + url = smuggle_url(url, {'token': anvplayer_data['token']}) + yield cls.url_result(url, AnvatoIE, video_id) def _extract_anvato_videos(self, webpage, video_id): anvplayer_data = self._parse_json( @@ -383,7 +398,7 @@ class AnvatoIE(InfoExtractor): self._ANVP_RE, webpage, 'Anvato player data', group='anvp'), video_id) return self._get_anvato_videos( - anvplayer_data['accessKey'], anvplayer_data['video']) + anvplayer_data['accessKey'], 
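# Sketch of the manifest probe above: some Anvato playback URLs answer with
# a small JSON body pointing at the real playlist ('master_m3u8') instead of
# an M3U8 document. The generic shape of that dance, stdlib only:
import json
import urllib.request

def resolve_manifest(url):
    with urllib.request.urlopen(url) as resp:
        body = resp.read().decode('utf-8', 'replace')
    try:
        # JSON answer: the real playlist URL hides under 'master_m3u8'
        # (None if absent -- the extractor above skips such entries)
        return json.loads(body).get('master_m3u8')
    except ValueError:
        return url  # plain M3U8 body: use the URL as-is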
anvplayer_data['video'], 'default') # cbslocal token = 'default' def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -391,9 +406,7 @@ class AnvatoIE(InfoExtractor): 'countries': smuggled_data.get('geo_countries'), }) - mobj = self._match_valid_url(url) - access_key, video_id = mobj.group('access_key_or_mcp', 'id') + access_key, video_id = self._match_valid_url(url).group('access_key_or_mcp', 'id') if access_key not in self._ANVACK_TABLE: - access_key = self._MCP_TO_ACCESS_KEY_TABLE.get( - access_key) or access_key - return self._get_anvato_videos(access_key, video_id) + access_key = self._MCP_TO_ACCESS_KEY_TABLE.get(access_key) or access_key + return self._get_anvato_videos(access_key, video_id, smuggled_data.get('token')) diff --git a/hypervideo_dl/extractor/anvato_token_generator/__init__.py b/hypervideo_dl/extractor/anvato_token_generator/__init__.py deleted file mode 100644 index 6e223db..0000000 --- a/hypervideo_dl/extractor/anvato_token_generator/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from __future__ import unicode_literals - -from .nfl import NFLTokenGenerator - -__all__ = [ - 'NFLTokenGenerator', -] diff --git a/hypervideo_dl/extractor/anvato_token_generator/common.py b/hypervideo_dl/extractor/anvato_token_generator/common.py deleted file mode 100644 index b959a90..0000000 --- a/hypervideo_dl/extractor/anvato_token_generator/common.py +++ /dev/null @@ -1,6 +0,0 @@ -from __future__ import unicode_literals - - -class TokenGenerator: - def generate(self, anvack, mcp_id): - raise NotImplementedError('This method must be implemented by subclasses') diff --git a/hypervideo_dl/extractor/anvato_token_generator/nfl.py b/hypervideo_dl/extractor/anvato_token_generator/nfl.py deleted file mode 100644 index 97a2b24..0000000 --- a/hypervideo_dl/extractor/anvato_token_generator/nfl.py +++ /dev/null @@ -1,30 +0,0 @@ -from __future__ import unicode_literals - -import json - -from .common import TokenGenerator - - -class NFLTokenGenerator(TokenGenerator): - _AUTHORIZATION = None - - def generate(ie, anvack, mcp_id): - if not NFLTokenGenerator._AUTHORIZATION: - reroute = ie._download_json( - 'https://api.nfl.com/v1/reroute', mcp_id, - data=b'grant_type=client_credentials', - headers={'X-Domain-Id': 100}) - NFLTokenGenerator._AUTHORIZATION = '%s %s' % (reroute.get('token_type') or 'Bearer', reroute['access_token']) - return ie._download_json( - 'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({ - 'query': '''{ - viewer { - mediaToken(anvack: "%s", id: %s) { - token - } - } -}''' % (anvack, mcp_id), - }).encode(), headers={ - 'Authorization': NFLTokenGenerator._AUTHORIZATION, - 'Content-Type': 'application/json', - })['data']['viewer']['mediaToken']['token'] diff --git a/hypervideo_dl/extractor/aol.py b/hypervideo_dl/extractor/aol.py index 4766a2c..6949ca9 100644 --- a/hypervideo_dl/extractor/aol.py +++ b/hypervideo_dl/extractor/aol.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .yahoo import YahooIE @@ -12,7 +9,7 @@ from ..utils import ( ) -class AolIE(YahooIE): +class AolIE(YahooIE): # XXX: Do not subclass from concrete IE IE_NAME = 'aol.com' _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>\d{9}|[0-9a-f]{24}|[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})' @@ -122,7 +119,6 @@ class AolIE(YahooIE): 'height': int_or_none(qs.get('h', [None])[0]), }) formats.append(f) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/apa.py 
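# The token found in data-anvp is carried from _extract_from_webpage to
# _real_extract via URL smuggling, per the anvato hunks above. A round-trip
# sketch (smuggle_url/unsmuggle_url are imported from ..utils in this patch):
from hypervideo_dl.utils import smuggle_url, unsmuggle_url

url = smuggle_url('anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:899441', {'token': 'abc123'})
url, smuggled_data = unsmuggle_url(url, {})
assert url == 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:899441'
assert smuggled_data.get('token') == 'abc123'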
b/hypervideo_dl/extractor/apa.py index 1736cdf..1ea0b1d 100644 --- a/hypervideo_dl/extractor/apa.py +++ b/hypervideo_dl/extractor/apa.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -13,6 +8,7 @@ from ..utils import ( class APAIE(InfoExtractor): _VALID_URL = r'(?P<base_url>https?://[^/]+\.apa\.at)/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1'] _TESTS = [{ 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', 'md5': '2b12292faeb0a7d930c778c7a5b4759b', @@ -33,14 +29,6 @@ class APAIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1', - webpage)] - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id, base_url = mobj.group('id', 'base_url') @@ -84,7 +72,6 @@ class APAIE(InfoExtractor): 'format_id': format_id, 'height': height, }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/aparat.py b/hypervideo_dl/extractor/aparat.py index 1057233..4a989d8 100644 --- a/hypervideo_dl/extractor/aparat.py +++ b/hypervideo_dl/extractor/aparat.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( get_element_by_id, @@ -13,6 +10,7 @@ from ..utils import ( class AparatIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)' + _EMBED_REGEX = [r'<iframe .*?src="(?P<url>http://www\.aparat\.com/video/[^"]+)"'] _TESTS = [{ 'url': 'http://www.aparat.com/v/wP8On', @@ -75,7 +73,6 @@ class AparatIE(InfoExtractor): r'(\d+)[pP]', label or '', 'height', default=None)), }) - self._sort_formats(formats) info = self._search_json_ld(webpage, video_id, default={}) diff --git a/hypervideo_dl/extractor/appleconnect.py b/hypervideo_dl/extractor/appleconnect.py index 494f833..d00b0f9 100644 --- a/hypervideo_dl/extractor/appleconnect.py +++ b/hypervideo_dl/extractor/appleconnect.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( str_to_int, diff --git a/hypervideo_dl/extractor/applepodcasts.py b/hypervideo_dl/extractor/applepodcasts.py index 9139ff7..49bbeab 100644 --- a/hypervideo_dl/extractor/applepodcasts.py +++ b/hypervideo_dl/extractor/applepodcasts.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( clean_html, diff --git a/hypervideo_dl/extractor/appletrailers.py b/hypervideo_dl/extractor/appletrailers.py index 0abfb43..a5abb55 100644 --- a/hypervideo_dl/extractor/appletrailers.py +++ b/hypervideo_dl/extractor/appletrailers.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re import json @@ -122,7 +120,6 @@ class AppleTrailersIE(InfoExtractor): 'height': int_or_none(size_data.get('height')), 'language': version[:2], }) - self._sort_formats(formats) entries.append({ 'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(), @@ -187,8 +184,6 @@ class AppleTrailersIE(InfoExtractor): 'height': 
int_or_none(format['height']), }) - self._sort_formats(formats) - playlist.append({ '_type': 'video', 'id': video_id, diff --git a/hypervideo_dl/extractor/archiveorg.py b/hypervideo_dl/extractor/archiveorg.py index 2ab3c1b..90dda9f 100644 --- a/hypervideo_dl/extractor/archiveorg.py +++ b/hypervideo_dl/extractor/archiveorg.py @@ -1,39 +1,35 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re import json +import re +import urllib.parse + from .common import InfoExtractor -from .youtube import YoutubeIE, YoutubeBaseInfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, - compat_HTTPError -) +from .youtube import YoutubeBaseInfoExtractor, YoutubeIE +from ..compat import compat_HTTPError, compat_urllib_parse_unquote from ..utils import ( + KNOWN_EXTENSIONS, + ExtractorError, + HEADRequest, bug_reports_message, clean_html, dict_get, extract_attributes, - ExtractorError, get_element_by_id, - HEADRequest, int_or_none, join_nonempty, - KNOWN_EXTENSIONS, + js_to_json, merge_dicts, mimetype2ext, orderedSet, parse_duration, parse_qs, - str_to_int, str_or_none, + str_to_int, traverse_obj, try_get, unified_strdate, unified_timestamp, + url_or_none, urlhandle_detect_ext, - url_or_none ) @@ -54,6 +50,11 @@ class ArchiveOrgIE(InfoExtractor): 'upload_date': '20100315', 'creator': 'SRI International', 'uploader': 'laura@archive.org', + 'thumbnail': r're:https://archive\.org/download/.*\.jpg', + 'release_year': 1968, + 'display_id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect.cdr', + 'track': 'XD300-23 68HighlightsAResearchCntAugHumanIntellect', + }, }, { 'url': 'https://archive.org/details/Cops1922', @@ -62,33 +63,43 @@ class ArchiveOrgIE(InfoExtractor): 'id': 'Cops1922', 'ext': 'mp4', 'title': 'Buster Keaton\'s "Cops" (1922)', - 'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c', + 'description': 'md5:cd6f9910c35aedd5fc237dbc3957e2ca', 'uploader': 'yorkmba99@hotmail.com', 'timestamp': 1387699629, 'upload_date': '20131222', + 'display_id': 'Cops-v2.mp4', + 'thumbnail': r're:https://archive\.org/download/.*\.jpg', + 'duration': 1091.96, }, }, { 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'only_matching': True, }, { 'url': 'https://archive.org/details/Election_Ads', - 'md5': '284180e857160cf866358700bab668a3', + 'md5': 'eec5cddebd4793c6a653b69c3b11f2e6', 'info_dict': { 'id': 'Election_Ads/Commercial-JFK1960ElectionAdCampaignJingle.mpg', 'title': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg', - 'ext': 'mp4', + 'ext': 'mpg', + 'thumbnail': r're:https://archive\.org/download/.*\.jpg', + 'duration': 59.77, + 'display_id': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg', }, }, { 'url': 'https://archive.org/details/Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg', - 'md5': '7915213ef02559b5501fe630e1a53f59', + 'md5': 'ea1eed8234e7d4165f38c8c769edef38', 'info_dict': { 'id': 'Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg', 'title': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg', - 'ext': 'mp4', + 'ext': 'mpg', 'timestamp': 1205588045, 'uploader': 'mikedavisstripmaster@yahoo.com', 'description': '1960 Presidential Campaign Election Commercials John F Kennedy, Richard M Nixon', 'upload_date': '20080315', + 'display_id': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg', + 'duration': 59.51, + 'license': 'http://creativecommons.org/licenses/publicdomain/', + 'thumbnail': r're:https://archive\.org/download/.*\.jpg', }, }, { 'url': 
'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16', @@ -97,6 +108,12 @@ class ArchiveOrgIE(InfoExtractor): 'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t01.flac', 'title': 'Turning', 'ext': 'flac', + 'track': 'Turning', + 'creator': 'Grateful Dead', + 'display_id': 'gd1977-05-08d01t01.flac', + 'track_number': 1, + 'album': '1977-05-08 - Barton Hall - Cornell University', + 'duration': 39.8, }, }, { 'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac', @@ -107,11 +124,20 @@ class ArchiveOrgIE(InfoExtractor): 'ext': 'flac', 'timestamp': 1205895624, 'uploader': 'mvernon54@yahoo.com', - 'description': 'md5:6a31f1996db0aa0fc9da6d6e708a1bb0', + 'description': 'md5:6c921464414814720c6593810a5c7e3d', 'upload_date': '20080319', 'location': 'Barton Hall - Cornell University', + 'duration': 438.68, + 'track': 'Deal', + 'creator': 'Grateful Dead', + 'album': '1977-05-08 - Barton Hall - Cornell University', + 'release_date': '19770508', + 'display_id': 'gd1977-05-08d01t07.flac', + 'release_year': 1977, + 'track_number': 7, }, }, { + # FIXME: give a better error message than just IndexError when all available formats are restricted 'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik', 'md5': '7cb019baa9b332e82ea7c10403acd180', 'info_dict': { @@ -119,6 +145,7 @@ class ArchiveOrgIE(InfoExtractor): 'title': 'Bells Of Rostov', 'ext': 'mp3', }, + 'skip': 'restricted' }, { 'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02.+Song+And+Chorus+In+The+Polovetsian+Camp+From+%22Prince+Igor%22+(Act+2%2C+Scene+1).mp3', 'md5': '1d0aabe03edca83ca58d9ed3b493a3c3', @@ -131,6 +158,52 @@ class ArchiveOrgIE(InfoExtractor): 'description': 'md5:012b2d668ae753be36896f343d12a236', 'upload_date': '20190928', }, + 'skip': 'restricted' + }, { + # Original formats are private + 'url': 'https://archive.org/details/irelandthemakingofarepublic', + 'info_dict': { + 'id': 'irelandthemakingofarepublic', + 'title': 'Ireland: The Making of a Republic', + 'upload_date': '20160610', + 'description': 'md5:f70956a156645a658a0dc9513d9e78b7', + 'uploader': 'dimitrios@archive.org', + 'creator': ['British Broadcasting Corporation', 'Time-Life Films'], + 'timestamp': 1465594947, + }, + 'playlist': [ + { + 'md5': '0b211261b26590d49df968f71b90690d', + 'info_dict': { + 'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel1_01.mov', + 'ext': 'mp4', + 'title': 'irelandthemakingofarepublicreel1_01.mov', + 'duration': 130.46, + 'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel1_01_000117.jpg', + 'display_id': 'irelandthemakingofarepublicreel1_01.mov', + }, + }, { + 'md5': '67335ee3b23a0da930841981c1e79b02', + 'info_dict': { + 'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel1_02.mov', + 'ext': 'mp4', + 'duration': 1395.13, + 'title': 'irelandthemakingofarepublicreel1_02.mov', + 'display_id': 'irelandthemakingofarepublicreel1_02.mov', + 'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel1_02_001374.jpg', + }, + }, { + 'md5': 'e470e86787893603f4a341a16c281eb5', + 'info_dict': { + 'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel2.mov', + 'ext': 'mp4', + 'duration': 1602.67, + 'title': 'irelandthemakingofarepublicreel2.mov', + 
'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel2_001554.jpg', + 'display_id': 'irelandthemakingofarepublicreel2.mov', + }, + } + ] }] @staticmethod @@ -146,7 +219,7 @@ class ArchiveOrgIE(InfoExtractor): return json.loads(extract_attributes(element)['value']) def _real_extract(self, url): - video_id = compat_urllib_parse_unquote_plus(self._match_id(url)) + video_id = urllib.parse.unquote_plus(self._match_id(url)) identifier, entry_id = (video_id.split('/', 1) + [None])[:2] # Archive.org metadata API doesn't clearly demarcate playlist entries @@ -221,17 +294,25 @@ class ArchiveOrgIE(InfoExtractor): 'filesize': int_or_none(f.get('size'))}) extension = (f['name'].rsplit('.', 1) + [None])[1] - if extension in KNOWN_EXTENSIONS: + + # We don't want to skip private formats if the user has access to them, + # however without access to an account with such privileges we can't implement/test this. + # For now to be safe, we will only skip them if there is no user logged in. + is_logged_in = bool(self._get_cookies('https://archive.org').get('logged-in-sig')) + if extension in KNOWN_EXTENSIONS and (not f.get('private') or is_logged_in): entry['formats'].append({ 'url': 'https://archive.org/download/' + identifier + '/' + f['name'], 'format': f.get('format'), 'width': int_or_none(f.get('width')), 'height': int_or_none(f.get('height')), 'filesize': int_or_none(f.get('size')), - 'protocol': 'https'}) + 'protocol': 'https', + 'source_preference': 0 if f.get('source') == 'original' else -1, + 'format_note': f.get('source') + }) for entry in entries.values(): - self._sort_formats(entry['formats']) + entry['_format_sort_fields'] = ('source', ) if len(entries) == 1: # If there's only one item, use it as the main info dict @@ -287,7 +368,9 @@ class YoutubeWebArchiveIE(InfoExtractor): 'channel_id': 'UCukCyHaD-bK3in_pKpfH9Eg', 'duration': 32, 'uploader_id': 'Zeurel', - 'uploader_url': 'http://www.youtube.com/user/Zeurel' + 'uploader_url': 'https://www.youtube.com/user/Zeurel', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'channel_url': 'https://www.youtube.com/channel/UCukCyHaD-bK3in_pKpfH9Eg', } }, { # Internal link @@ -302,7 +385,9 @@ class YoutubeWebArchiveIE(InfoExtractor): 'channel_id': 'UCHnyfMqiRRG1u-2MsSQLbXA', 'duration': 771, 'uploader_id': '1veritasium', - 'uploader_url': 'http://www.youtube.com/user/1veritasium' + 'uploader_url': 'https://www.youtube.com/user/1veritasium', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'channel_url': 'https://www.youtube.com/channel/UCHnyfMqiRRG1u-2MsSQLbXA', } }, { # Video from 2012, webm format itag 45. Newest capture is deleted video, with an invalid description. @@ -316,7 +401,9 @@ class YoutubeWebArchiveIE(InfoExtractor): 'duration': 398, 'description': 'md5:ff4de6a7980cb65d951c2f6966a4f2f3', 'uploader_id': 'machinima', - 'uploader_url': 'http://www.youtube.com/user/machinima' + 'uploader_url': 'https://www.youtube.com/user/machinima', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader': 'machinima' } }, { # FLV video. 
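# Effect of the new 'source' sort field set above: originals
# (source_preference 0) outrank derived files (-1) before any other
# criterion is considered. A toy sort mirroring what
# _format_sort_fields = ('source', ) asks the core to do first:
formats = [
    {'format_id': 'deriv-mp4', 'source_preference': -1, 'height': 720},
    {'format_id': 'original-mov', 'source_preference': 0, 'height': 480},
]
formats.sort(key=lambda f: f['source_preference'], reverse=True)
assert formats[0]['format_id'] == 'original-mov'  # despite the lower height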
Video file URL does not provide itag information @@ -330,7 +417,10 @@ class YoutubeWebArchiveIE(InfoExtractor): 'duration': 19, 'description': 'md5:10436b12e07ac43ff8df65287a56efb4', 'uploader_id': 'jawed', - 'uploader_url': 'http://www.youtube.com/user/jawed' + 'uploader_url': 'https://www.youtube.com/user/jawed', + 'channel_url': 'https://www.youtube.com/channel/UC4QobU6STFB0P71PMvOGN5A', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader': 'jawed', } }, { 'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA', @@ -344,7 +434,9 @@ class YoutubeWebArchiveIE(InfoExtractor): 'duration': 204, 'description': 'md5:f7535343b6eda34a314eff8b85444680', 'uploader_id': 'itsmadeon', - 'uploader_url': 'http://www.youtube.com/user/itsmadeon' + 'uploader_url': 'https://www.youtube.com/user/itsmadeon', + 'channel_url': 'https://www.youtube.com/channel/UCqMDNf3Pn5L7pcNkuSEeO3w', + 'thumbnail': r're:https?://.*\.(jpg|webp)', } }, { # First capture is of dead video, second is the oldest from CDX response. @@ -355,10 +447,13 @@ class YoutubeWebArchiveIE(InfoExtractor): 'title': 'Fake Teen Doctor Strikes AGAIN! - Weekly Weird News', 'upload_date': '20160218', 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA', - 'duration': 1236, + 'duration': 1235, 'description': 'md5:21032bae736421e89c2edf36d1936947', 'uploader_id': 'MachinimaETC', - 'uploader_url': 'http://www.youtube.com/user/MachinimaETC' + 'uploader_url': 'https://www.youtube.com/user/MachinimaETC', + 'channel_url': 'https://www.youtube.com/channel/UCdIaNUarhzLSXGoItz7BHVA', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader': 'ETC News', } }, { # First capture of dead video, capture date in link links to dead capture. @@ -369,10 +464,13 @@ class YoutubeWebArchiveIE(InfoExtractor): 'title': 'WTF: Video Games Still Launch BROKEN?! 
- T.U.G.S.', 'upload_date': '20160219', 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA', - 'duration': 798, + 'duration': 797, 'description': 'md5:a1dbf12d9a3bd7cb4c5e33b27d77ffe7', 'uploader_id': 'MachinimaETC', - 'uploader_url': 'http://www.youtube.com/user/MachinimaETC' + 'uploader_url': 'https://www.youtube.com/user/MachinimaETC', + 'channel_url': 'https://www.youtube.com/channel/UCdIaNUarhzLSXGoItz7BHVA', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader': 'ETC News', }, 'expected_warnings': [ r'unable to download capture webpage \(it may not be archived\)' @@ -392,12 +490,11 @@ class YoutubeWebArchiveIE(InfoExtractor): 'title': 'It\'s Bootleg AirPods Time.', 'upload_date': '20211021', 'channel_id': 'UC7Jwj9fkrf1adN4fMmTkpug', - 'channel_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug', + 'channel_url': 'https://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug', 'duration': 810, 'description': 'md5:7b567f898d8237b256f36c1a07d6d7bc', + 'thumbnail': r're:https?://.*\.(jpg|webp)', 'uploader': 'DankPods', - 'uploader_id': 'UC7Jwj9fkrf1adN4fMmTkpug', - 'uploader_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug' } }, { # player response contains '};' See: https://github.com/ytdl-org/youtube-dl/issues/27093 @@ -408,12 +505,135 @@ class YoutubeWebArchiveIE(InfoExtractor): 'title': 'bitch lasagna', 'upload_date': '20181005', 'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', - 'channel_url': 'http://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'channel_url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', 'duration': 135, 'description': 'md5:2dbe4051feeff2dab5f41f82bb6d11d0', 'uploader': 'PewDiePie', 'uploader_id': 'PewDiePie', - 'uploader_url': 'http://www.youtube.com/user/PewDiePie' + 'uploader_url': 'https://www.youtube.com/user/PewDiePie', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + } + }, { + # ~June 2010 Capture. swfconfig + 'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=8XeW5ilk-9Y', + 'info_dict': { + 'id': '8XeW5ilk-9Y', + 'ext': 'flv', + 'title': 'Story of Stuff, The Critique Part 4 of 4', + 'duration': 541, + 'description': 'md5:28157da06f2c5e94c97f7f3072509972', + 'uploader': 'HowTheWorldWorks', + 'uploader_id': 'HowTheWorldWorks', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader_url': 'https://www.youtube.com/user/HowTheWorldWorks', + 'upload_date': '20090520', + } + }, { + # Jan 2011: watch-video-date/eow-date surrounded by whitespace + 'url': 'https://web.archive.org/web/20110126141719/http://www.youtube.com/watch?v=Q_yjX80U7Yc', + 'info_dict': { + 'id': 'Q_yjX80U7Yc', + 'ext': 'flv', + 'title': 'Spray Paint Art by Clay Butler: Purple Fantasy Forest', + 'uploader_id': 'claybutlermusic', + 'description': 'md5:4595264559e3d0a0ceb3f011f6334543', + 'upload_date': '20090803', + 'uploader': 'claybutlermusic', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'duration': 132, + 'uploader_url': 'https://www.youtube.com/user/claybutlermusic', + } + }, { + # ~May 2009 swfArgs. 
ytcfg is spread out over various vars + 'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=c5uJgG05xUY', + 'info_dict': { + 'id': 'c5uJgG05xUY', + 'ext': 'webm', + 'title': 'Story of Stuff, The Critique Part 1 of 4', + 'uploader_id': 'HowTheWorldWorks', + 'uploader': 'HowTheWorldWorks', + 'uploader_url': 'https://www.youtube.com/user/HowTheWorldWorks', + 'upload_date': '20090513', + 'description': 'md5:4ca77d79538064e41e4cc464e93f44f0', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'duration': 754, + } + }, { + # ~June 2012. Upload date is in another lang so cannot extract. + 'url': 'https://web.archive.org/web/20120607174520/http://www.youtube.com/watch?v=xWTLLl-dQaA', + 'info_dict': { + 'id': 'xWTLLl-dQaA', + 'ext': 'mp4', + 'title': 'Black Nerd eHarmony Video Bio Parody (SPOOF)', + 'uploader_url': 'https://www.youtube.com/user/BlackNerdComedy', + 'description': 'md5:e25f0133aaf9e6793fb81c18021d193e', + 'uploader_id': 'BlackNerdComedy', + 'uploader': 'BlackNerdComedy', + 'duration': 182, + 'thumbnail': r're:https?://.*\.(jpg|webp)', + } + }, { + # ~July 2013 + 'url': 'https://web.archive.org/web/*/https://www.youtube.com/watch?v=9eO1aasHyTM', + 'info_dict': { + 'id': '9eO1aasHyTM', + 'ext': 'mp4', + 'title': 'Polar-oid', + 'description': 'Cameras and bears are dangerous!', + 'uploader_url': 'https://www.youtube.com/user/punkybird', + 'uploader_id': 'punkybird', + 'duration': 202, + 'channel_id': 'UC62R2cBezNBOqxSerfb1nMQ', + 'channel_url': 'https://www.youtube.com/channel/UC62R2cBezNBOqxSerfb1nMQ', + 'upload_date': '20060428', + 'uploader': 'punkybird', + } + }, { + # April 2020: Player response in player config + 'url': 'https://web.archive.org/web/20200416034815/https://www.youtube.com/watch?v=Cf7vS8jc7dY&gl=US&hl=en', + 'info_dict': { + 'id': 'Cf7vS8jc7dY', + 'ext': 'mp4', + 'title': 'A Dramatic Pool Story (by Jamie Spicer-Lewis) - Game Grumps Animated', + 'duration': 64, + 'upload_date': '20200408', + 'uploader_id': 'GameGrumps', + 'uploader': 'GameGrumps', + 'channel_url': 'https://www.youtube.com/channel/UC9CuvdOVfMPvKCiwdGKL3cQ', + 'channel_id': 'UC9CuvdOVfMPvKCiwdGKL3cQ', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'description': 'md5:c625bb3c02c4f5fb4205971e468fa341', + 'uploader_url': 'https://www.youtube.com/user/GameGrumps', + } + }, { + # watch7-user-header with yt-user-info + 'url': 'ytarchive:kbh4T_b4Ixw:20160307085057', + 'info_dict': { + 'id': 'kbh4T_b4Ixw', + 'ext': 'mp4', + 'title': 'Shovel Knight OST - Strike the Earth! 
Plains of Passage 16 bit SNES style remake / remix', + 'channel_url': 'https://www.youtube.com/channel/UCnTaGvsHmMy792DWeT6HbGA', + 'uploader': 'Nelward music', + 'duration': 213, + 'description': 'md5:804b4a9ce37b050a5fefdbb23aeba54d', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'upload_date': '20150503', + 'channel_id': 'UCnTaGvsHmMy792DWeT6HbGA', + } + }, { + # April 2012 + 'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=SOm7mPoPskU', + 'info_dict': { + 'id': 'SOm7mPoPskU', + 'ext': 'mp4', + 'title': 'Boyfriend - Justin Bieber Parody', + 'uploader_url': 'https://www.youtube.com/user/thecomputernerd01', + 'uploader': 'thecomputernerd01', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'description': 'md5:dd7fa635519c2a5b4d566beaecad7491', + 'duration': 200, + 'upload_date': '20120407', + 'uploader_id': 'thecomputernerd01', } }, { 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw', @@ -445,9 +665,11 @@ class YoutubeWebArchiveIE(InfoExtractor): 'only_matching': True }, ] - _YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE - _YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE - _YT_INITIAL_BOUNDARY_RE = r'(?:(?:var\s+meta|</script|\n)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_BOUNDARY_RE + _YT_INITIAL_DATA_RE = YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE + _YT_INITIAL_PLAYER_RESPONSE_RE = fr'''(?x: + (?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*| + {YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE} + )''' _YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com'] # thumbnails most likely archived on these servers _YT_ALL_THUMB_SERVERS = orderedSet( @@ -477,11 +699,6 @@ class YoutubeWebArchiveIE(InfoExtractor): elif not isinstance(res, list) or len(res) != 0: self.report_warning('Error while parsing CDX API response' + bug_reports_message()) - def _extract_yt_initial_variable(self, webpage, regex, video_id, name): - return self._parse_json(self._search_regex( - (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), - regex), webpage, name, default='{}'), video_id, fatal=False) - def _extract_webpage_title(self, webpage): page_title = self._html_extract_title(webpage, default='') # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix. 
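# The _extract_yt_initial_variable/boundary-regex machinery removed below is
# replaced by _search_json. The core idea as a self-contained toy (the real
# helper is more elaborate): find the start pattern, then let a JSON decoder
# consume exactly one object, so no trailing-boundary regex is needed:
import json
import re

def search_json(start_re, text):
    mobj = re.search(start_re, text)
    if not mobj:
        return None
    try:
        obj, _ = json.JSONDecoder().raw_decode(text[mobj.end():])
    except ValueError:
        return None
    return obj

page = 'var ytInitialPlayerResponse = {"videoDetails": {"videoId": "aYAGB11YrSs"}};</script>'
assert search_json(r'ytInitialPlayerResponse\s*=\s*', page) == {'videoDetails': {'videoId': 'aYAGB11YrSs'}}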
@@ -491,10 +708,32 @@ class YoutubeWebArchiveIE(InfoExtractor): def _extract_metadata(self, video_id, webpage): search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None)) - player_response = self._extract_yt_initial_variable( - webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {} - initial_data = self._extract_yt_initial_variable( - webpage, self._YT_INITIAL_DATA_RE, video_id, 'initial player response') or {} + player_response = self._search_json( + self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', + video_id, default={}) + initial_data = self._search_json( + self._YT_INITIAL_DATA_RE, webpage, 'initial data', video_id, default={}) + + ytcfg = {} + for j in re.findall(r'yt\.setConfig\(\s*(?P<json>{\s*(?s:.+?)\s*})\s*\);', webpage): # ~June 2010 + ytcfg.update(self._parse_json(j, video_id, fatal=False, ignore_extra=True, transform_source=js_to_json, errnote='') or {}) + + # XXX: this also may contain a 'ptchn' key + player_config = ( + self._search_json( + r'(?:yt\.playerConfig|ytplayer\.config|swfConfig)\s*=', + webpage, 'player config', video_id, default=None) + or ytcfg.get('PLAYER_CONFIG') or {}) + + # XXX: this may also contain a 'creator' key. + swf_args = self._search_json(r'swfArgs\s*=', webpage, 'swf config', video_id, default={}) + if swf_args and not traverse_obj(player_config, ('args',)): + player_config['args'] = swf_args + + if not player_response: + # April 2020 + player_response = self._parse_json( + traverse_obj(player_config, ('args', 'player_response')) or '{}', video_id, fatal=False) initial_data_video = traverse_obj( initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'), @@ -510,21 +749,64 @@ class YoutubeWebArchiveIE(InfoExtractor): video_details.get('title') or YoutubeBaseInfoExtractor._get_text(microformats, 'title') or YoutubeBaseInfoExtractor._get_text(initial_data_video, 'title') + or traverse_obj(player_config, ('args', 'title')) or self._extract_webpage_title(webpage) or search_meta(['og:title', 'twitter:title', 'title'])) + def id_from_url(url, type_): + return self._search_regex( + rf'(?:{type_})/([^/#&?]+)', url or '', f'{type_} id', default=None) + + # XXX: would the get_elements_by_... functions be better suited here? + _CHANNEL_URL_HREF_RE = r'href="[^"]*(?P<url>https?://www\.youtube\.com/(?:user|channel)/[^"]+)"' + uploader_or_channel_url = self._search_regex( + [fr'<(?:link\s*itemprop=\"url\"|a\s*id=\"watch-username\").*?\b{_CHANNEL_URL_HREF_RE}>', # @fd05024 + fr'<div\s*id=\"(?:watch-channel-stats|watch-headline-user-info)\"[^>]*>\s*<a[^>]*\b{_CHANNEL_URL_HREF_RE}'], # ~ May 2009, ~June 2012 + webpage, 'uploader or channel url', default=None) + + owner_profile_url = url_or_none(microformats.get('ownerProfileUrl')) # @a6211d2 + + # Uploader refers to the /user/ id ONLY + uploader_id = ( + id_from_url(owner_profile_url, 'user') + or id_from_url(uploader_or_channel_url, 'user') + or ytcfg.get('VIDEO_USERNAME')) + uploader_url = f'https://www.youtube.com/user/{uploader_id}' if uploader_id else None + + # XXX: do we want to differentiate uploader and channel? 
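# Plain-re equivalent of the id_from_url helper defined above, checked
# against uploader/channel URLs that appear in the tests in this file:
import re

def id_from_url(url, type_):
    mobj = re.search(rf'(?:{type_})/([^/#&?]+)', url or '')
    return mobj.group(1) if mobj else None

assert id_from_url('https://www.youtube.com/user/Zeurel', 'user') == 'Zeurel'
assert id_from_url('https://www.youtube.com/channel/UCukCyHaD-bK3in_pKpfH9Eg', 'channel') == 'UCukCyHaD-bK3in_pKpfH9Eg'
assert id_from_url(None, 'user') is None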
+ uploader = ( + self._search_regex( + [r'<a\s*id="watch-username"[^>]*>\s*<strong>([^<]+)</strong>', # June 2010 + r'var\s*watchUsername\s*=\s*\'(.+?)\';', # ~May 2009 + r'<div\s*\bid=\"watch-channel-stats"[^>]*>\s*<a[^>]*>\s*(.+?)\s*</a', # ~May 2009 + r'<a\s*id="watch-userbanner"[^>]*title="\s*(.+?)\s*"'], # ~June 2012 + webpage, 'uploader', default=None) + or self._html_search_regex( + [r'(?s)<div\s*class="yt-user-info".*?<a[^>]*[^>]*>\s*(.*?)\s*</a', # March 2016 + r'(?s)<a[^>]*yt-user-name[^>]*>\s*(.*?)\s*</a'], # july 2013 + get_element_by_id('watch7-user-header', webpage), 'uploader', default=None) + or self._html_search_regex( + r'<button\s*href="/user/[^>]*>\s*<span[^>]*>\s*(.+?)\s*<', # April 2012 + get_element_by_id('watch-headline-user-info', webpage), 'uploader', default=None) + or traverse_obj(player_config, ('args', 'creator')) + or video_details.get('author')) + channel_id = str_or_none( video_details.get('channelId') or microformats.get('externalChannelId') or search_meta('channelId') or self._search_regex( r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1', # @b45a9e6 - webpage, 'channel id', default=None, group='id')) - channel_url = f'http://www.youtube.com/channel/{channel_id}' if channel_id else None + webpage, 'channel id', default=None, group='id') + or id_from_url(owner_profile_url, 'channel') + or id_from_url(uploader_or_channel_url, 'channel') + or traverse_obj(player_config, ('args', 'ucid'))) + channel_url = f'https://www.youtube.com/channel/{channel_id}' if channel_id else None duration = int_or_none( video_details.get('lengthSeconds') or microformats.get('lengthSeconds') + or traverse_obj(player_config, ('args', ('length_seconds', 'l')), get_all=False) or parse_duration(search_meta('duration'))) description = ( video_details.get('shortDescription') @@ -532,26 +814,13 @@ class YoutubeWebArchiveIE(InfoExtractor): or clean_html(get_element_by_id('eow-description', webpage)) # @9e6dd23 or search_meta(['description', 'og:description', 'twitter:description'])) - uploader = video_details.get('author') - - # Uploader ID and URL - uploader_mobj = re.search( - r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', # @fd05024 - webpage) - if uploader_mobj is not None: - uploader_id, uploader_url = uploader_mobj.group('uploader_id'), uploader_mobj.group('uploader_url') - else: - # @a6211d2 - uploader_url = url_or_none(microformats.get('ownerProfileUrl')) - uploader_id = self._search_regex( - r'(?:user|channel)/([^/]+)', uploader_url or '', 'uploader id', default=None) - upload_date = unified_strdate( dict_get(microformats, ('uploadDate', 'publishDate')) or search_meta(['uploadDate', 'datePublished']) or self._search_regex( - [r'(?s)id="eow-date.*?>(.*?)</span>', - r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], # @7998520 + [r'(?s)id="eow-date.*?>\s*(.*?)\s*</span>', + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']', # @7998520 + r'class\s*=\s*"(?:watch-video-date|watch-video-added post-date)"[^>]*>\s*([^<]+?)\s*<'], # ~June 2010, ~Jan 2009 (respectively) webpage, 'upload date', default=None)) return { @@ -597,7 +866,7 @@ class YoutubeWebArchiveIE(InfoExtractor): response = self._call_cdx_api( video_id, f'https://www.youtube.com/watch?v={video_id}', filters=['mimetype:text/html'], collapse=['timestamp:6', 'digest'], query={'matchType': 
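# What the _call_cdx_api invocation at this point asks for, expressed as a
# direct request (endpoint and parameter names per the public Wayback CDX
# API; a sketch only -- the extractor goes through _download_json):
import json
import urllib.parse
import urllib.request

params = urllib.parse.urlencode({
    'url': 'https://www.youtube.com/watch?v=aYAGB11YrSs',
    'output': 'json',
    'filter': 'mimetype:text/html',  # only HTML captures
    'collapse': 'timestamp:6',       # at most one capture per YYYYMM
    'matchType': 'prefix',
})
with urllib.request.urlopen(f'https://web.archive.org/cdx/search/cdx?{params}') as resp:
    rows = json.load(resp)  # rows[0] is the field-name header row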
'prefix'}) or [] - all_captures = sorted([int_or_none(r['timestamp']) for r in response if int_or_none(r['timestamp']) is not None]) + all_captures = sorted(int_or_none(r['timestamp']) for r in response if int_or_none(r['timestamp']) is not None) # Prefer the new polymer UI captures as we support extracting more metadata from them # WBM captures seem to all switch to this layout ~July 2020 @@ -620,18 +889,22 @@ class YoutubeWebArchiveIE(InfoExtractor): url_date = url_date or url_date_2 urlh = None - try: - urlh = self._request_webpage( - HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id), - video_id, note='Fetching archived video file url', expected_status=True) - except ExtractorError as e: - # HTTP Error 404 is expected if the video is not saved. - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: - self.raise_no_formats( - 'The requested video is not archived, indexed, or there is an issue with web.archive.org', - expected=True) - else: - raise + retry_manager = self.RetryManager(fatal=False) + for retry in retry_manager: + try: + urlh = self._request_webpage( + HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id), + video_id, note='Fetching archived video file url', expected_status=True) + except ExtractorError as e: + # HTTP Error 404 is expected if the video is not saved. + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + self.raise_no_formats( + 'The requested video is not archived, indexed, or there is an issue with web.archive.org (try again later)', expected=True) + else: + retry.error = e + + if retry_manager.error: + self.raise_no_formats(retry_manager.error, expected=True, video_id=video_id) capture_dates = self._get_capture_dates(video_id, int_or_none(url_date)) self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', ')) diff --git a/hypervideo_dl/extractor/arcpublishing.py b/hypervideo_dl/extractor/arcpublishing.py index 8880e5c..febd3d2 100644 --- a/hypervideo_dl/extractor/arcpublishing.py +++ b/hypervideo_dl/extractor/arcpublishing.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -73,8 +70,8 @@ class ArcPublishingIE(InfoExtractor): ], 'video-api-cdn.%s.arcpublishing.com/api'), ] - @staticmethod - def _extract_urls(webpage): + @classmethod + def _extract_embed_urls(cls, url, webpage): entries = [] # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview for powa_el in re.findall(r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage): @@ -147,7 +144,6 @@ class ArcPublishingIE(InfoExtractor): 'url': s_url, 'quality': -10, }) - self._sort_formats(formats) subtitles = {} for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []): diff --git a/hypervideo_dl/extractor/ard.py b/hypervideo_dl/extractor/ard.py index 7ea339b..0a8a874 100644 --- a/hypervideo_dl/extractor/ard.py +++ b/hypervideo_dl/extractor/ard.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re @@ -43,8 +40,6 @@ class ARDMediathekBaseIE(InfoExtractor): 'This video is not available due to geoblocking', countries=self._GEO_COUNTRIES, metadata_available=True) - self._sort_formats(formats) - subtitles = {} subtitle_url = media_info.get('_subtitleUrl') if subtitle_url: @@ -265,7 +260,6 @@ class ARDMediathekIE(ARDMediathekBaseIE): 'format_id': fid, 'url': furl, }) - 
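# The recurring '- self._sort_formats(formats)' deletions throughout this
# patch are one upstream change: format sorting moved into the core, so
# extractors now return formats unsorted. Sketch of the new contract
# (names illustrative):
def build_info_dict(video_id, formats):
    return {
        'id': video_id,
        'formats': formats,  # left unsorted; the core sorts per --format-sort
        # optional per-entry override, as archiveorg.py sets above:
        # '_format_sort_fields': ('source', ),
    }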
self._sort_formats(formats) info = { 'formats': formats, } @@ -374,7 +368,6 @@ class ARDIE(InfoExtractor): continue f['url'] = format_url formats.append(f) - self._sort_formats(formats) _SUB_FORMATS = ( ('./dataTimedText', 'ttml'), diff --git a/hypervideo_dl/extractor/arkena.py b/hypervideo_dl/extractor/arkena.py index 4f4f457..de36ec8 100644 --- a/hypervideo_dl/extractor/arkena.py +++ b/hypervideo_dl/extractor/arkena.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -22,6 +17,8 @@ class ArkenaIE(InfoExtractor): play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+) ) ''' + # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1'] _TESTS = [{ 'url': 'https://video.qbrick.com/play2/embed/player?accountId=1034090&mediaId=d8ab4607-00090107-aab86310', 'md5': '97f117754e5f3c020f5f26da4a44ebaf', @@ -53,15 +50,6 @@ class ArkenaIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1', - webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') @@ -148,7 +136,6 @@ class ArkenaIE(InfoExtractor): elif mime_type == 'application/vnd.ms-sstr+xml': formats.extend(self._extract_ism_formats( href, video_id, ism_id='mss', fatal=False)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/arnes.py b/hypervideo_dl/extractor/arnes.py index 050c252..a493714 100644 --- a/hypervideo_dl/extractor/arnes.py +++ b/hypervideo_dl/extractor/arnes.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_parse_qs, @@ -76,7 +73,6 @@ class ArnesIE(InfoExtractor): 'width': int_or_none(media.get('width')), 'height': int_or_none(media.get('height')), }) - self._sort_formats(formats) channel = video.get('channel') or {} channel_id = channel.get('url') @@ -93,7 +89,7 @@ class ArnesIE(InfoExtractor): 'timestamp': parse_iso8601(video.get('creationTime')), 'channel': channel.get('name'), 'channel_id': channel_id, - 'channel_url': format_field(channel_id, template=f'{self._BASE_URL}/?channel=%s'), + 'channel_url': format_field(channel_id, None, f'{self._BASE_URL}/?channel=%s'), 'duration': float_or_none(video.get('duration'), 1000), 'view_count': int_or_none(video.get('views')), 'tags': video.get('hashtags'), diff --git a/hypervideo_dl/extractor/arte.py b/hypervideo_dl/extractor/arte.py index c2f2c1b..54e4d2d 100644 --- a/hypervideo_dl/extractor/arte.py +++ b/hypervideo_dl/extractor/arte.py @@ -1,193 +1,216 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor -from ..compat import ( - compat_str, -) from ..utils import ( ExtractorError, + GeoRestrictedError, int_or_none, + parse_iso8601, parse_qs, - qualities, strip_or_none, - try_get, - unified_strdate, + traverse_obj, url_or_none, ) class ArteTVBaseIE(InfoExtractor): _ARTE_LANGUAGES = 'fr|de|en|es|it|pl' - _API_BASE = 'https://api.arte.tv/api/player/v1' + _API_BASE = 'https://api.arte.tv/api/player/v2' class ArteTVIE(ArteTVBaseIE): _VALID_URL = 
r'''(?x) - https?:// + (?:https?:// (?: (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos| api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s) ) - /(?P<id>\d{6}-\d{3}-[AF]) + |arte://program) + /(?P<id>\d{6}-\d{3}-[AF]|LIVE) ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES} _TESTS = [{ 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', + 'only_matching': True, + }, { + 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/', 'info_dict': { - 'id': '088501-000-A', + 'id': '100103-000-A', + 'title': 'USA: Dyskryminacja na porodówce', + 'description': 'md5:242017b7cce59ffae340a54baefcafb1', + 'alt_title': 'ARTE Reportage', + 'upload_date': '20201103', + 'duration': 554, + 'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530', + 'timestamp': 1604417980, 'ext': 'mp4', - 'title': 'Mexico: Stealing Petrol to Survive', - 'upload_date': '20190628', }, + 'params': {'skip_download': 'm3u8'} }, { - 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/', - 'only_matching': True, + 'note': 'No alt_title', + 'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/', + 'info_dict': { + 'id': '110371-000-A', + 'ext': 'mp4', + 'upload_date': '20220718', + 'duration': 154, + 'timestamp': 1658162460, + 'description': 'md5:5890f36fe7dccfadb8b7c0891de54786', + 'title': 'La chaleur, supplice des arbres de rue', + 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530', + }, + 'params': {'skip_download': 'm3u8'} }, { 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A', 'only_matching': True, + }, { + 'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE', + 'only_matching': True, }] + _GEO_BYPASS = True + + _LANG_MAP = { # ISO639 -> French abbreviations + 'fr': 'F', + 'de': 'A', + 'en': 'E[ANG]', + 'es': 'E[ESP]', + 'it': 'E[ITA]', + 'pl': 'E[POL]', + # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/> + # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed) + 'mul': 'EU', + } + + _VERSION_CODE_RE = re.compile(r'''(?x) + V + (?P<original_voice>O?) + (?P<vlang>[FA]|E\[[A-Z]+\]|EU)? + (?P<audio_desc>AUD|) + (?: + (?P<has_sub>-ST) + (?P<sdh_sub>M?) + (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU) + )? 
+ ''') + + # all obtained by exhaustive testing + _COUNTRIES_MAP = { + 'DE_FR': ( + 'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC', + 'PF', 'PM', 'RE', 'WF', 'YT', + ), + # with both of the below 'BE' sometimes works, sometimes doesn't + 'EUR_DE_FR': ( + 'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI', + 'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF', + 'YT', + ), + 'SAT': ( + 'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ', + 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF', + 'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI', + 'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC', + 'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO', + 'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT', + ), + } + def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') lang = mobj.group('lang') or mobj.group('lang_2') - - info = self._download_json( - '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id) - player_info = info['videoJsonPlayer'] - - vsr = try_get(player_info, lambda x: x['VSR'], dict) - if not vsr: - error = None - if try_get(player_info, lambda x: x['custom_msg']['type']) == 'error': - error = try_get( - player_info, lambda x: x['custom_msg']['msg'], compat_str) - if not error: - error = 'Video %s is not available' % player_info.get('VID') or video_id - raise ExtractorError(error, expected=True) - - upload_date_str = player_info.get('shootingDate') - if not upload_date_str: - upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] - - title = (player_info.get('VTI') or player_info['VID']).strip() - subtitle = player_info.get('VSU', '').strip() - if subtitle: - title += ' - %s' % subtitle - - qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ']) - - LANGS = { - 'fr': 'F', - 'de': 'A', - 'en': 'E[ANG]', - 'es': 'E[ESP]', - 'it': 'E[ITA]', - 'pl': 'E[POL]', - } - - langcode = LANGS.get(lang, lang) - - formats = [] - for format_id, format_dict in vsr.items(): - f = dict(format_dict) - format_url = url_or_none(f.get('url')) - streamer = f.get('streamer') - if not format_url and not streamer: - continue - versionCode = f.get('versionCode') - l = re.escape(langcode) - - # Language preference from most to least priority - # Reference: section 6.8 of - # https://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-07-1.pdf - PREFERENCES = ( - # original version in requested language, without subtitles - r'VO{0}$'.format(l), - # original version in requested language, with partial subtitles in requested language - r'VO{0}-ST{0}$'.format(l), - # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language - r'VO{0}-STM{0}$'.format(l), - # non-original (dubbed) version in requested language, without subtitles - r'V{0}$'.format(l), - # non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language - r'V{0}-ST{0}$'.format(l), - # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language - r'V{0}-STM{0}$'.format(l), - # original version in requested language, with partial subtitles in different language - r'VO{0}-ST(?!{0}).+?$'.format(l), - # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language - r'VO{0}-STM(?!{0}).+?$'.format(l), - # original version in different language, with partial subtitles in requested language - r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l), - # original version in different language, with subtitles for the 
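# Worked example of the new version-code scoring (regex copied from the
# _VERSION_CODE_RE added above; 'VOF-STMF' is an original French voice with
# French hard-of-hearing subtitles, per the old comments nearby):
import re

VERSION_CODE_RE = re.compile(r'''(?x)
    V
    (?P<original_voice>O?)
    (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
    (?P<audio_desc>AUD|)
    (?:
        (?P<has_sub>-ST)
        (?P<sdh_sub>M?)
        (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
    )?
''')

language_code = 'F'  # requested language: French
m = VERSION_CODE_RE.match('VOF-STMF')
lang_pref = int(''.join('01'[x] for x in (
    m.group('vlang') == language_code,     # voice in requested language
    not m.group('audio_desc'),             # not the audio-description track
    bool(m.group('original_voice')),       # original voice
    m.group('sub_lang') == language_code,  # subtitles in requested language
    not m.group('has_sub'),                # prefer no subtitles at all
    not m.group('sdh_sub'),                # prefer non-SDH subtitles
)))
# '01'[x] maps False -> '0' and True -> '1'; the joined digit string is read
# as a decimal number, so earlier criteria are strictly more significant.
assert lang_pref == 111100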
deaf and hard-of-hearing in requested language - r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l), - # original version in different language, without subtitles - r'VO(?:(?!{0}))?$'.format(l), - # original version in different language, with partial subtitles in different language - r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l), - # original version in different language, with subtitles for the deaf and hard-of-hearing in different language - r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l), - ) - - for pref, p in enumerate(PREFERENCES): - if re.match(p, versionCode): - lang_pref = len(PREFERENCES) - pref - break - else: - lang_pref = -1 - format_note = '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')) - - media_type = f.get('mediaType') - if media_type == 'hls': - m3u8_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False) - for m3u8_format in m3u8_formats: - m3u8_format.update({ + langauge_code = self._LANG_MAP.get(lang) + + config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id) + + geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {} + if geoblocking.get('restrictedArea'): + raise GeoRestrictedError(f'Video restricted to {geoblocking["code"]!r}', + countries=self._COUNTRIES_MAP.get(geoblocking['code'], ('DE', 'FR'))) + + if not traverse_obj(config, ('data', 'attributes', 'rights')): + # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten + # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23 + raise ExtractorError( + 'Video is not available in this language edition of Arte or broadcast rights expired', expected=True) + + formats, subtitles = [], {} + secondary_formats = [] + for stream in config['data']['attributes']['streams']: + # official player contains code like `e.get("versions")[0].eStat.ml5` + stream_version = stream['versions'][0] + stream_version_code = stream_version['eStat']['ml5'] + + lang_pref = -1 + m = self._VERSION_CODE_RE.match(stream_version_code) + if m: + lang_pref = int(''.join('01'[x] for x in ( + m.group('vlang') == langauge_code, # we prefer voice in the requested language + not m.group('audio_desc'), # and not the audio description version + bool(m.group('original_voice')), # but if voice is not in the requested language, at least choose the original voice + m.group('sub_lang') == langauge_code, # if subtitles are present, we prefer them in the requested language + not m.group('has_sub'), # but we prefer no subtitles otherwise + not m.group('sdh_sub'), # and we prefer not the hard-of-hearing subtitles if there are subtitles + ))) + + short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?') + if stream['protocol'].startswith('HLS'): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False) + for fmt in fmts: + fmt.update({ + 'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]', 'language_preference': lang_pref, - 'format_note': format_note, }) - formats.extend(m3u8_formats) - continue + if any(map(short_label.startswith, ('cc', 'OGsub'))): + secondary_formats.extend(fmts) + else: + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + elif stream['protocol'] in ('HTTPS', 'RTMP'): + formats.append({ + 'format_id': f'{stream["protocol"]}-{stream_version_code}', + 'url': stream['url'], + 'format_note': f'{stream_version.get("label", 
"unknown")} [{short_label}]', + 'language_preference': lang_pref, + # 'ext': 'mp4', # XXX: may or may not be necessary, at least for HTTPS + }) - format = { - 'format_id': format_id, - 'language_preference': lang_pref, - 'format_note': format_note, - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'tbr': int_or_none(f.get('bitrate')), - 'quality': qfunc(f.get('quality')), - } - - if media_type == 'rtmp': - format['url'] = f['streamer'] - format['play_path'] = 'mp4:' + f['url'] - format['ext'] = 'flv' else: - format['url'] = f['url'] + self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}') - formats.append(format) + # TODO: chapters from stream['segments']? + # The JS also looks for chapters in config['data']['attributes']['chapters'], + # but I am yet to find a video having those - # For this extractor, quality only represents the relative quality - # with respect to other formats with the same resolution - self._sort_formats(formats, ('res', 'quality')) + formats.extend(secondary_formats) + self._remove_duplicate_formats(formats) + + metadata = config['data']['attributes']['metadata'] return { - 'id': player_info.get('VID') or video_id, - 'title': title, - 'description': player_info.get('VDE') or player_info.get('V7T'), - 'upload_date': unified_strdate(upload_date_str), - 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), + 'id': metadata['providerId'], + 'webpage_url': traverse_obj(metadata, ('link', 'url')), + 'title': traverse_obj(metadata, 'subtitle', 'title'), + 'alt_title': metadata.get('subtitle') and metadata.get('title'), + 'description': metadata.get('description'), + 'duration': traverse_obj(metadata, ('duration', 'seconds')), + 'language': metadata.get('language'), + 'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601), + 'is_live': config['data']['attributes'].get('live', False), 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': [ + {'url': image['url'], 'id': image.get('caption')} + for image in metadata.get('images') or [] if url_or_none(image.get('url')) + ], } class ArteTVEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+' + _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1'] _TESTS = [{ 'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A', 'info_dict': { @@ -197,17 +220,12 @@ class ArteTVEmbedIE(InfoExtractor): 'description': 'md5:be40b667f45189632b78c1425c7c2ce1', 'upload_date': '20201116', }, + 'skip': 'No video available' }, { 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A', 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1', - webpage)] - def _real_extract(self, url): qs = parse_qs(url) json_url = qs['json_url'][0] @@ -220,44 +238,36 @@ class ArteTVPlaylistIE(ArteTVBaseIE): _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES _TESTS = [{ 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/', - 'info_dict': { - 'id': 'RC-016954', - 
'title': 'Earn a Living', - 'description': 'md5:d322c55011514b3a7241f7fb80d494c2', - }, - 'playlist_mincount': 6, + 'only_matching': True, }, { 'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/', - 'only_matching': True, + 'playlist_mincount': 100, + 'info_dict': { + 'description': 'md5:84e7bf1feda248bc325ebfac818c476e', + 'id': 'RC-014123', + 'title': 'ARTE Reportage - najlepsze reportaże', + }, }] def _real_extract(self, url): - lang, playlist_id = self._match_valid_url(url).groups() - collection = self._download_json( - '%s/collectionData/%s/%s?source=videos' - % (self._API_BASE, lang, playlist_id), playlist_id) - entries = [] - for video in collection['videos']: - if not isinstance(video, dict): - continue - video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl')) - if not video_url: - continue - video_id = video.get('programId') - entries.append({ - '_type': 'url_transparent', - 'url': video_url, - 'id': video_id, - 'title': video.get('title'), - 'alt_title': video.get('subtitle'), - 'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)), - 'duration': int_or_none(video.get('durationSeconds')), - 'view_count': int_or_none(video.get('views')), - 'ie_key': ArteTVIE.ie_key(), - }) - title = collection.get('title') - description = collection.get('shortDescription') or collection.get('teaserText') - return self.playlist_result(entries, playlist_id, title, description) + lang, playlist_id = self._match_valid_url(url).group('lang', 'id') + playlist = self._download_json( + f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)['data']['attributes'] + + entries = [{ + '_type': 'url_transparent', + 'url': video['config']['url'], + 'ie_key': ArteTVIE.ie_key(), + 'id': video.get('providerId'), + 'title': video.get('title'), + 'alt_title': video.get('subtitle'), + 'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))), + 'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))), + } for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))] + + return self.playlist_result(entries, playlist_id, + traverse_obj(playlist, ('metadata', 'title')), + traverse_obj(playlist, ('metadata', 'description'))) class ArteTVCategoryIE(ArteTVBaseIE): @@ -270,14 +280,13 @@ class ArteTVCategoryIE(ArteTVBaseIE): 'description': 'Investigative documentary series, geopolitical analysis, and international commentary', }, 'playlist_mincount': 13, - }, - ] + }] @classmethod def suitable(cls, url): return ( not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, )) - and super(ArteTVCategoryIE, cls).suitable(url)) + and super().suitable(url)) def _real_extract(self, url): lang, playlist_id = self._match_valid_url(url).groups() @@ -293,9 +302,7 @@ class ArteTVCategoryIE(ArteTVBaseIE): if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )): items.append(video) - title = (self._og_search_title(webpage, default=None) - or self._html_search_regex(r'<title\b[^>]*>([^<]+)', default=None)) - title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url) + title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title, description=self._og_search_description(webpage, default=None)) diff --git a/hypervideo_dl/extractor/asiancrush.py b/hypervideo_dl/extractor/asiancrush.py index 7f1940f..23f310e 100644 --- a/hypervideo_dl/extractor/asiancrush.py +++ 
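(A quick aside on the traverse_obj idiom the new playlist extractor above relies on — my sketch, not part of the patch; it assumes hypervideo's vendored traverse_obj matches upstream yt-dlp's, where a callable path element filters (key, value) pairs and an exception inside it counts as a non-match:

    from hypervideo_dl.utils import traverse_obj

    playlist = {'items': [
        {'config': {'url': 'https://api.arte.tv/api/player/v2/config/fr/1'}},  # kept
        {'config': {}},  # KeyError inside the lambda -> silently dropped
    ]}
    entries = traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))
    assert len(entries) == 1

This is why the entry builder can index video['config']['url'] unconditionally: items without a playable URL never reach it.)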
diff --git a/hypervideo_dl/extractor/asiancrush.py b/hypervideo_dl/extractor/asiancrush.py
index 7f1940f..23f310e 100644
--- a/hypervideo_dl/extractor/asiancrush.py
+++ b/hypervideo_dl/extractor/asiancrush.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import functools
 import re
 
diff --git a/hypervideo_dl/extractor/atresplayer.py b/hypervideo_dl/extractor/atresplayer.py
index 465af4e..a20e7f9 100644
--- a/hypervideo_dl/extractor/atresplayer.py
+++ b/hypervideo_dl/extractor/atresplayer.py
@@ -1,7 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-
 from .common import InfoExtractor
 from ..compat import compat_HTTPError
 from ..utils import (
@@ -88,7 +84,6 @@ class AtresPlayerIE(InfoExtractor):
             elif src_type == 'application/dash+xml':
                 formats, subtitles = self._extract_mpd_formats(
                     src, video_id, mpd_id='dash', fatal=False)
-        self._sort_formats(formats)
 
         heartbeat = episode.get('heartbeat') or {}
         omniture = episode.get('omniture') or {}
diff --git a/hypervideo_dl/extractor/atscaleconf.py b/hypervideo_dl/extractor/atscaleconf.py
new file mode 100644
index 0000000..3f7b1e9
--- /dev/null
+++ b/hypervideo_dl/extractor/atscaleconf.py
@@ -0,0 +1,34 @@
+import re
+
+from .common import InfoExtractor
+
+
+class AtScaleConfEventIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?atscaleconference\.com/events/(?P<id>[^/&$?]+)'
+
+    _TESTS = [{
+        'url': 'https://atscaleconference.com/events/data-scale-spring-2022/',
+        'playlist_mincount': 13,
+        'info_dict': {
+            'id': 'data-scale-spring-2022',
+            'title': 'Data @Scale Spring 2022',
+            'description': 'md5:7d7ca1c42ac9c6d8a785092a1aea4b55'
+        },
+    }, {
+        'url': 'https://atscaleconference.com/events/video-scale-2021/',
+        'playlist_mincount': 14,
+        'info_dict': {
+            'id': 'video-scale-2021',
+            'title': 'Video @Scale 2021',
+            'description': 'md5:7d7ca1c42ac9c6d8a785092a1aea4b55'
+        },
+    }]
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        webpage = self._download_webpage(url, id)
+
+        return self.playlist_from_matches(
+            re.findall(r'data-url\s*=\s*"(https?://(?:www\.)?atscaleconference\.com/videos/[^"]+)"', webpage),
+            ie='Generic', playlist_id=id,
+            title=self._og_search_title(webpage), description=self._og_search_description(webpage))
diff --git a/hypervideo_dl/extractor/atttechchannel.py b/hypervideo_dl/extractor/atttechchannel.py
index 8f93fb3..6ff4ec0 100644
--- a/hypervideo_dl/extractor/atttechchannel.py
+++ b/hypervideo_dl/extractor/atttechchannel.py
@@ -1,5 +1,3 @@
-from __future__ import unicode_literals
-
 from .common import InfoExtractor
 from ..utils import unified_strdate
 
diff --git a/hypervideo_dl/extractor/atvat.py b/hypervideo_dl/extractor/atvat.py
index 481a097..d6ed9e4 100644
--- a/hypervideo_dl/extractor/atvat.py
+++ b/hypervideo_dl/extractor/atvat.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import datetime
 
 from .common import InfoExtractor
@@ -52,7 +49,6 @@ class ATVAtIE(InfoExtractor):
                 'url': source_url,
                 'format_id': protocol,
             })
-        self._sort_formats(formats)
 
         return {
             'id': clip_id,
diff --git a/hypervideo_dl/extractor/audimedia.py b/hypervideo_dl/extractor/audimedia.py
index 6bd48ef..35114e5 100644
--- a/hypervideo_dl/extractor/audimedia.py
+++ b/hypervideo_dl/extractor/audimedia.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
@@ -79,7 +76,6 @@ class AudiMediaIE(InfoExtractor):
                     'format_id': 'http-%s' % bitrate,
                 })
                 formats.append(f)
-        self._sort_formats(formats)
 
         return {
             'id': video_id,
diff --git a/hypervideo_dl/extractor/audioboom.py b/hypervideo_dl/extractor/audioboom.py
index c51837b..a23fcd2 100644
--- a/hypervideo_dl/extractor/audioboom.py
+++ b/hypervideo_dl/extractor/audioboom.py
@@ -1,27 +1,33 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from .common import InfoExtractor
-from ..utils import (
-    clean_html,
-    float_or_none,
-)
+from ..utils import clean_html, float_or_none, traverse_obj, unescapeHTML
 
 
 class AudioBoomIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?audioboom\.com/(?:boos|posts)/(?P<id>[0-9]+)'
     _TESTS = [{
         'url': 'https://audioboom.com/posts/7398103-asim-chaudhry',
-        'md5': '7b00192e593ff227e6a315486979a42d',
+        'md5': '4d68be11c9f9daf3dab0778ad1e010c3',
         'info_dict': {
             'id': '7398103',
             'ext': 'mp3',
             'title': 'Asim Chaudhry',
-            'description': 'md5:2f3fef17dacc2595b5362e1d7d3602fc',
+            'description': 'md5:0ed714ae0e81e5d9119cac2f618ad679',
             'duration': 4000.99,
             'uploader': 'Sue Perkins: An hour or so with...',
             'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/perkins',
         }
+    }, {  # Direct mp3-file link
+        'url': 'https://audioboom.com/posts/8128496.mp3',
+        'md5': 'e329edf304d450def95c7f86a9165ee1',
+        'info_dict': {
+            'id': '8128496',
+            'ext': 'mp3',
+            'title': 'TCRNo8 / DAILY 03 - In Control',
+            'description': 'md5:44665f142db74858dfa21c5b34787948',
+            'duration': 1689.7,
+            'uploader': 'Lost Dot Podcast: The Trans Pyrenees and Transcontinental Race',
+            'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channels/5003904',
+        }
     }, {
         'url': 'https://audioboom.com/posts/4279833-3-09-2016-czaban-hour-3?t=0',
         'only_matching': True,
@@ -29,45 +35,23 @@ class AudioBoomIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
+        webpage = self._download_webpage(f'https://audioboom.com/posts/{video_id}', video_id)
 
-        webpage = self._download_webpage(url, video_id)
-
-        clip = None
-
-        clip_store = self._parse_json(
-            self._html_search_regex(
-                r'data-new-clip-store=(["\'])(?P<json>{.+?})\1',
-                webpage, 'clip store', default='{}', group='json'),
-            video_id, fatal=False)
-        if clip_store:
-            clips = clip_store.get('clips')
-            if clips and isinstance(clips, list) and isinstance(clips[0], dict):
-                clip = clips[0]
-
-        def from_clip(field):
-            if clip:
-                return clip.get(field)
-
-        audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property(
-            'audio', webpage, 'audio url')
-        title = from_clip('title') or self._html_search_meta(
-            ['og:title', 'og:audio:title', 'audio_title'], webpage)
-        description = from_clip('description') or clean_html(from_clip('formattedDescription')) or self._og_search_description(webpage)
-
-        duration = float_or_none(from_clip('duration') or self._html_search_meta(
-            'weibo:audio:duration', webpage))
-
-        uploader = from_clip('author') or self._html_search_meta(
-            ['og:audio:artist', 'twitter:audio:artist_name', 'audio_artist'], webpage, 'uploader')
-        uploader_url = from_clip('author_url') or self._html_search_meta(
-            'audioboo:channel', webpage, 'uploader url')
+        clip_store = self._search_json(
+            r'data-react-class="V5DetailPagePlayer"\s*data-react-props=["\']',
+            webpage, 'clip store', video_id, fatal=False, transform_source=unescapeHTML)
+        clip = traverse_obj(clip_store, ('clips', 0), expected_type=dict) or {}
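+        # (Aside, mine, not upstream code: the player JSON lives in an HTML
+        # attribute, e.g. data-react-props="{&quot;clips&quot;:[...]}", so it
+        # arrives entity-encoded; transform_source=unescapeHTML decodes the
+        # entities before the JSON parse, and with fatal=False a miss leaves
+        # clip_store as None, which the `or {}` above turns into a safe dict.)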
 
         return {
             'id': video_id,
-            'url': audio_url,
-            'title': title,
-            'description': description,
-            'duration': duration,
-            'uploader': uploader,
-            'uploader_url': uploader_url,
+            'url': clip.get('clipURLPriorToLoading') or self._og_search_property('audio', webpage, 'audio url'),
+            'title': clip.get('title') or self._html_search_meta(['og:title', 'og:audio:title', 'audio_title'], webpage),
+            'description': (clip.get('description') or clean_html(clip.get('formattedDescription'))
+                            or self._og_search_description(webpage)),
+            'duration': float_or_none(clip.get('duration') or self._html_search_meta('weibo:audio:duration', webpage)),
+            'uploader': clip.get('author') or self._html_search_meta(
+                ['og:audio:artist', 'twitter:audio:artist_name', 'audio_artist'], webpage, 'uploader'),
+            'uploader_url': clip.get('author_url') or self._html_search_regex(
+                r'
\s*\d+)'
+
+    _TESTS = [{
+        'url': 'http://nokiatune.audiodraft.com/entry/5874',
+        'info_dict': {
+            'id': '9485',
+            'ext': 'mp3',
+            'title': 'Hula Hula Calls',
+            'uploader': 'unclemaki',
+            'uploader_id': '13512',
+            'average_rating': 5,
+            'like_count': int,
+        },
+    }, {
+        'url': 'http://vikinggrace.audiodraft.com/entry/501',
+        'info_dict': {
+            'id': '22241',
+            'ext': 'mp3',
+            'title': 'MVG Happy',
+            'uploader': 'frog',
+            'uploader_id': '19142',
+            'average_rating': 5,
+            'like_count': int,
+        },
+    }, {
+        'url': 'http://timferriss.audiodraft.com/entry/765',
+        'info_dict': {
+            'id': '19710',
+            'ext': 'mp3',
+            'title': 'ferris03',
+            'uploader': 'malex',
+            'uploader_id': '17335',
+            'average_rating': 5,
+            'like_count': int,
+        },
+    }]
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        webpage = self._download_webpage(url, id)
+        player_entry_id = self._search_regex(r'playAudio\(\'(player_entry_\d+)\'\);', webpage, id, 'play entry id')
+        return self._audiodraft_extract_from_id(player_entry_id)
+
+
+class AudiodraftGenericIE(AudiodraftBaseIE):
+    IE_NAME = 'Audiodraft:generic'
+    _VALID_URL = r'https?://www\.audiodraft\.com/contests/[^/#]+#entries&eid=(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'https://www.audiodraft.com/contests/570-Score-A-Video-Surprise-Us#entries&eid=30138',
+        'info_dict': {
+            'id': '30138',
+            'ext': 'mp3',
+            'title': 'DROP in sound_V2',
+            'uploader': 'TiagoSilva',
+            'uploader_id': '19452',
+            'average_rating': 4,
+            'like_count': int,
+        },
+    }]
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        return self._audiodraft_extract_from_id(f'player_entry_{id}')
diff --git a/hypervideo_dl/extractor/audiomack.py b/hypervideo_dl/extractor/audiomack.py
index 19775cf..5c4160f 100644
--- a/hypervideo_dl/extractor/audiomack.py
+++ b/hypervideo_dl/extractor/audiomack.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import itertools
 import time
 
diff --git a/hypervideo_dl/extractor/audius.py b/hypervideo_dl/extractor/audius.py
index fa64995..6448b44 100644
--- a/hypervideo_dl/extractor/audius.py
+++ b/hypervideo_dl/extractor/audius.py
@@ -1,11 +1,8 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import random
 
 from .common import InfoExtractor
-from ..utils import ExtractorError, try_get, compat_str, str_or_none
-from ..compat import compat_urllib_parse_unquote
+from ..compat import compat_str, compat_urllib_parse_unquote
+from ..utils import ExtractorError, str_or_none, try_get
 
 
 class AudiusBaseIE(InfoExtractor):
@@ -171,7 +168,7 @@ class AudiusIE(AudiusBaseIE):
     }
 
 
-class AudiusTrackIE(AudiusIE):
+class AudiusTrackIE(AudiusIE):  # XXX: Do not subclass from concrete IE
     _VALID_URL = r'''(?x)(?:audius:)(?:https?://(?:www\.)?.+/v1/tracks/)?(?P<track_id>\w+)'''
     IE_NAME = 'audius:track'
Prepend with "audius:"' @@ -246,7 +243,7 @@ class AudiusPlaylistIE(AudiusBaseIE): playlist_data.get('description')) -class AudiusProfileIE(AudiusPlaylistIE): +class AudiusProfileIE(AudiusPlaylistIE): # XXX: Do not subclass from concrete IE IE_NAME = 'audius:artist' IE_DESC = 'Audius.co profile/artist pages' _VALID_URL = r'https?://(?:www)?audius\.co/(?P[^\/]+)/?(?:[?#]|$)' diff --git a/hypervideo_dl/extractor/awaan.py b/hypervideo_dl/extractor/awaan.py index f5e559c..6fc938d 100644 --- a/hypervideo_dl/extractor/awaan.py +++ b/hypervideo_dl/extractor/awaan.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 from .common import InfoExtractor @@ -44,7 +41,7 @@ class AWAANBaseIE(InfoExtractor): 'id': video_id, 'title': title, 'description': video_data.get('description_en') or video_data.get('description_ar'), - 'thumbnail': format_field(img, template='http://admin.mangomolo.com/analytics/%s'), + 'thumbnail': format_field(img, None, 'http://admin.mangomolo.com/analytics/%s'), 'duration': int_or_none(video_data.get('duration')), 'timestamp': parse_iso8601(video_data.get('create_time'), ' '), 'is_live': is_live, diff --git a/hypervideo_dl/extractor/aws.py b/hypervideo_dl/extractor/aws.py index dccfeaf..eb831a1 100644 --- a/hypervideo_dl/extractor/aws.py +++ b/hypervideo_dl/extractor/aws.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import datetime import hashlib import hmac @@ -9,7 +6,7 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse_urlencode -class AWSIE(InfoExtractor): +class AWSIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor _AWS_ALGORITHM = 'AWS4-HMAC-SHA256' _AWS_REGION = 'us-east-1' diff --git a/hypervideo_dl/extractor/azmedien.py b/hypervideo_dl/extractor/azmedien.py index 0168340..d1686ee 100644 --- a/hypervideo_dl/extractor/azmedien.py +++ b/hypervideo_dl/extractor/azmedien.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/baidu.py b/hypervideo_dl/extractor/baidu.py index 364fd94..8786d67 100644 --- a/hypervideo_dl/extractor/baidu.py +++ b/hypervideo_dl/extractor/baidu.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import unescapeHTML diff --git a/hypervideo_dl/extractor/banbye.py b/hypervideo_dl/extractor/banbye.py index 3d4d36e..c873425 100644 --- a/hypervideo_dl/extractor/banbye.py +++ b/hypervideo_dl/extractor/banbye.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import math from .common import InfoExtractor @@ -83,8 +80,6 @@ class BanByeIE(BanByeBaseIE): 'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.mp4', } for quality in data['quality']] - self._sort_formats(formats) - return { 'id': video_id, 'title': data.get('title'), diff --git a/hypervideo_dl/extractor/bandaichannel.py b/hypervideo_dl/extractor/bandaichannel.py index f1bcdef..d7fcf44 100644 --- a/hypervideo_dl/extractor/bandaichannel.py +++ b/hypervideo_dl/extractor/bandaichannel.py @@ -1,11 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .brightcove import BrightcoveNewIE +from .brightcove import BrightcoveNewBaseIE from ..utils import extract_attributes -class BandaiChannelIE(BrightcoveNewIE): +class BandaiChannelIE(BrightcoveNewBaseIE): IE_NAME = 'bandaichannel' _VALID_URL = r'https?://(?:www\.)?b-ch\.com/titles/(?P\d+/\d+)' 
diff --git a/hypervideo_dl/extractor/aws.py b/hypervideo_dl/extractor/aws.py
index dccfeaf..eb831a1 100644
--- a/hypervideo_dl/extractor/aws.py
+++ b/hypervideo_dl/extractor/aws.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import datetime
 import hashlib
 import hmac
@@ -9,7 +6,7 @@ from .common import InfoExtractor
 from ..compat import compat_urllib_parse_urlencode
 
 
-class AWSIE(InfoExtractor):
+class AWSIE(InfoExtractor):  # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor
     _AWS_ALGORITHM = 'AWS4-HMAC-SHA256'
     _AWS_REGION = 'us-east-1'
diff --git a/hypervideo_dl/extractor/azmedien.py b/hypervideo_dl/extractor/azmedien.py
index 0168340..d1686ee 100644
--- a/hypervideo_dl/extractor/azmedien.py
+++ b/hypervideo_dl/extractor/azmedien.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import json
 
 from .common import InfoExtractor
diff --git a/hypervideo_dl/extractor/baidu.py b/hypervideo_dl/extractor/baidu.py
index 364fd94..8786d67 100644
--- a/hypervideo_dl/extractor/baidu.py
+++ b/hypervideo_dl/extractor/baidu.py
@@ -1,7 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-
 from .common import InfoExtractor
 from ..utils import unescapeHTML
 
diff --git a/hypervideo_dl/extractor/banbye.py b/hypervideo_dl/extractor/banbye.py
index 3d4d36e..c873425 100644
--- a/hypervideo_dl/extractor/banbye.py
+++ b/hypervideo_dl/extractor/banbye.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import math
 
 from .common import InfoExtractor
@@ -83,8 +80,6 @@ class BanByeIE(BanByeBaseIE):
             'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.mp4',
         } for quality in data['quality']]
 
-        self._sort_formats(formats)
-
         return {
             'id': video_id,
             'title': data.get('title'),
diff --git a/hypervideo_dl/extractor/bandaichannel.py b/hypervideo_dl/extractor/bandaichannel.py
index f1bcdef..d7fcf44 100644
--- a/hypervideo_dl/extractor/bandaichannel.py
+++ b/hypervideo_dl/extractor/bandaichannel.py
@@ -1,11 +1,8 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .brightcove import BrightcoveNewIE
+from .brightcove import BrightcoveNewBaseIE
 from ..utils import extract_attributes
 
 
-class BandaiChannelIE(BrightcoveNewIE):
+class BandaiChannelIE(BrightcoveNewBaseIE):
     IE_NAME = 'bandaichannel'
     _VALID_URL = r'https?://(?:www\.)?b-ch\.com/titles/(?P<id>\d+/\d+)'
     _TESTS = [{
diff --git a/hypervideo_dl/extractor/bandcamp.py b/hypervideo_dl/extractor/bandcamp.py
index 745055e..de81e0d 100644
--- a/hypervideo_dl/extractor/bandcamp.py
+++ b/hypervideo_dl/extractor/bandcamp.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import random
 import re
 import time
@@ -8,23 +5,24 @@ import time
 from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
+    KNOWN_EXTENSIONS,
     ExtractorError,
     float_or_none,
     int_or_none,
-    KNOWN_EXTENSIONS,
     parse_filesize,
     str_or_none,
     try_get,
-    update_url_query,
     unified_strdate,
     unified_timestamp,
+    update_url_query,
     url_or_none,
     urljoin,
 )
 
 
 class BandcampIE(InfoExtractor):
-    _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?P<uploader>[^/]+)\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
+    _EMBED_REGEX = [r'<meta property="og:url"[^>]*?content="(?P<url>.*?bandcamp\.com.*?)"']
     _TESTS = [{
         'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
         'md5': 'c557841d5e50261777a6585648adf439',
@@ -87,7 +85,7 @@ class BandcampIE(InfoExtractor):
             attr + ' data', group=2), video_id, fatal=fatal)
 
     def _real_extract(self, url):
-        title = self._match_id(url)
+        title, uploader = self._match_valid_url(url).group('id', 'uploader')
         webpage = self._download_webpage(url, title)
         tralbum = self._extract_data_attr(webpage, title)
         thumbnail = self._og_search_thumbnail(webpage)
@@ -186,8 +184,6 @@ class BandcampIE(InfoExtractor):
                 'acodec': format_id.split('-')[0],
             })
 
-        self._sort_formats(formats)
-
         title = '%s - %s' % (artist, track) if artist else track
 
         if not duration:
@@ -199,6 +195,8 @@ class BandcampIE(InfoExtractor):
             'title': title,
             'thumbnail': thumbnail,
             'uploader': artist,
+            'uploader_id': uploader,
+            'uploader_url': f'https://{uploader}.bandcamp.com',
             'timestamp': timestamp,
             'release_timestamp': unified_timestamp(tralbum.get('album_release_date')),
             'duration': duration,
@@ -211,7 +209,7 @@ class BandcampIE(InfoExtractor):
     }
 
 
-class BandcampAlbumIE(BandcampIE):
+class BandcampAlbumIE(BandcampIE):  # XXX: Do not subclass from concrete IE
     IE_NAME = 'Bandcamp:album'
     _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com/album/(?P<id>[^/?#&]+)'
 
@@ -314,7 +312,7 @@ class BandcampAlbumIE(BandcampIE):
     }
 
 
-class BandcampWeeklyIE(BandcampIE):
+class BandcampWeeklyIE(BandcampIE):  # XXX: Do not subclass from concrete IE
     IE_NAME = 'Bandcamp:weekly'
     _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
     _TESTS = [{
@@ -363,7 +361,6 @@ class BandcampWeeklyIE(BandcampIE):
                 'ext': ext,
                 'vcodec': 'none',
             })
-        self._sort_formats(formats)
 
         title = show.get('audio_title') or 'Bandcamp Weekly'
         subtitle = show.get('subtitle')
@@ -439,7 +436,7 @@ class BandcampUserIE(InfoExtractor):
         uploader = self._match_id(url)
         webpage = self._download_webpage(url, uploader)
 
-        discography_data = (re.findall(r'<li data-item-id=["\'][^>]+>\s*<a href=["\']([^"\']+)', webpage)
+        discography_data = (re.findall(r'<li data-item-id=["\'][^>]+>\s*<a href=["\'](?![^"\'/]*?/merch)([^"\']+)', webpage)
                             or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage))
 
         return self.playlist_from_matches(
diff --git a/hypervideo_dl/extractor/bannedvideo.py b/hypervideo_dl/extractor/bannedvideo.py
index 3db1151..51e7220 100644
--- a/hypervideo_dl/extractor/bannedvideo.py
+++ b/hypervideo_dl/extractor/bannedvideo.py
@@ -1,5 +1,3 @@
-from __future__ import unicode_literals
-
 import json
 
 from .common import InfoExtractor
@@ -137,7 +135,6 @@ query GetCommentReplies($id: String!) {
         formats.extend(self._extract_m3u8_formats(
             video_info.get('streamUrl'), video_id, 'mp4',
             entry_protocol='m3u8_native', m3u8_id='hls', live=True))
-        self._sort_formats(formats)
 
         return {
             'id': video_id,
diff --git a/hypervideo_dl/extractor/bbc.py b/hypervideo_dl/extractor/bbc.py
index 29ad7de..9d28e70 100644
--- a/hypervideo_dl/extractor/bbc.py
+++ b/hypervideo_dl/extractor/bbc.py
@@ -1,19 +1,12 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import functools
 import itertools
 import json
 import re
+import urllib.error
+import xml.etree.ElementTree
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_etree_Element,
-    compat_HTTPError,
-    compat_str,
-    compat_urllib_error,
-    compat_urlparse,
-)
+from ..compat import compat_HTTPError, compat_str, compat_urlparse
 from ..utils import (
     ExtractorError,
     OnDemandPagedList,
@@ -53,6 +46,7 @@ class BBCCoUkIE(InfoExtractor):
                         )
                         (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                     ''' % _ID_REGEX
+    _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']
 
     _LOGIN_URL = 'https://account.bbc.com/signin'
     _NETRC_MACHINE = 'bbc'
@@ -318,7 +312,7 @@ class BBCCoUkIE(InfoExtractor):
                 continue
             captions = self._download_xml(
                 cc_url, programme_id, 'Downloading captions', fatal=False)
-            if not isinstance(captions, compat_etree_Element):
+            if not isinstance(captions, xml.etree.ElementTree.Element):
                 continue
             subtitles['en'] = [
                 {
@@ -394,7 +388,7 @@ class BBCCoUkIE(InfoExtractor):
                         href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                         m3u8_id=format_id, fatal=False)
                 except ExtractorError as e:
-                    if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError)
+                    if not (isinstance(e.exc_info[1], urllib.error.HTTPError)
                             and e.exc_info[1].code in (403, 404)):
                         raise
                     fmts = []
@@ -581,8 +575,6 @@ class BBCCoUkIE(InfoExtractor):
         else:
             programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
 
-        self._sort_formats(formats)
-
         return {
             'id': programme_id,
             'title': title,
@@ -594,10 +586,15 @@ class BBCCoUkIE(InfoExtractor):
         }
 
 
-class BBCIE(BBCCoUkIE):
+class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
     IE_NAME = 'bbc'
     IE_DESC = 'BBC'
-    _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
+    _VALID_URL = r'''(?x)
+        https?://(?:www\.)?(?:
+            bbc\.(?:com|co\.uk)|
+            bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion|
+            bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion
+        )/(?:[^/]+/)+(?P<id>[^/#?]+)'''
 
     _MEDIA_SETS = [
         'pc',
@@ -847,6 +844,12 @@ class BBCIE(BBCCoUkIE):
             'upload_date': '20190604',
             'categories': ['Psychology'],
         },
+    }, {  # onion routes
+        'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681',
+        'only_matching': True,
     }]
 
     @classmethod
@@ -885,7 +888,6 @@ class BBCIE(BBCCoUkIE):
     def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
         programme_id, title, description, duration, formats, subtitles = \
             self._process_legacy_playlist_url(url, playlist_id)
-        self._sort_formats(formats)
         return {
             'id': programme_id,
             'title': title,
@@ -904,12 +906,8 @@ class BBCIE(BBCCoUkIE):
         json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
         timestamp = json_ld_info.get('timestamp')
 
-        playlist_title = json_ld_info.get('title')
-        if not playlist_title:
-            playlist_title = (self._og_search_title(webpage, default=None)
-                              or self._html_extract_title(webpage, 'playlist title', default=None))
-            if playlist_title:
-                playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
+        playlist_title = json_ld_info.get('title') or re.sub(
+            r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None
 
         playlist_description = json_ld_info.get(
             'description') or self._og_search_description(webpage, default=None)
@@ -953,7 +951,6 @@ class BBCIE(BBCCoUkIE):
                     duration = int_or_none(items[0].get('duration'))
                     programme_id = items[0].get('vpid')
                     formats, subtitles = self._download_media_selector(programme_id)
-                    self._sort_formats(formats)
                     entries.append({
                         'id': programme_id,
                         'title': title,
@@ -990,7 +987,6 @@ class BBCIE(BBCCoUkIE):
                         continue
                     raise
                 if entry:
-                    self._sort_formats(entry['formats'])
                     entries.append(entry)
 
         if entries:
@@ -1014,7 +1010,6 @@ class BBCIE(BBCCoUkIE):
 
         if programme_id:
             formats, subtitles = self._download_media_selector(programme_id)
-            self._sort_formats(formats)
             # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
             digital_data = self._parse_json(
                 self._search_regex(
@@ -1046,7 +1041,6 @@ class BBCIE(BBCCoUkIE):
             if version_id:
                 title = smp_data['title']
                 formats, subtitles = self._download_media_selector(version_id)
-                self._sort_formats(formats)
                 image_url = smp_data.get('holdingImageURL')
                 display_date = init_data.get('displayDate')
                 topic_title = init_data.get('topicTitle')
@@ -1088,7 +1082,6 @@ class BBCIE(BBCCoUkIE):
                     continue
                 title = lead_media.get('title') or self._og_search_title(webpage)
                 formats, subtitles = self._download_media_selector(programme_id)
-                self._sort_formats(formats)
                 description = lead_media.get('summary')
                 uploader = lead_media.get('masterBrand')
                 uploader_id = lead_media.get('mid')
@@ -1117,7 +1110,6 @@ class BBCIE(BBCCoUkIE):
             if current_programme and programme_id and current_programme.get('type') == 'playable_item':
                 title = current_programme.get('titles', {}).get('tertiary') or playlist_title
                 formats, subtitles = self._download_media_selector(programme_id)
-                self._sort_formats(formats)
                 synopses = current_programme.get('synopses') or {}
                 network = current_programme.get('network') or {}
                 duration = int_or_none(
@@ -1150,7 +1142,6 @@ class BBCIE(BBCCoUkIE):
             clip_title = clip.get('title')
             if clip_vpid and clip_title:
                 formats, subtitles = self._download_media_selector(clip_vpid)
-                self._sort_formats(formats)
                 return {
                     'id': clip_vpid,
                     'title': clip_title,
@@ -1172,7 +1163,6 @@ class BBCIE(BBCCoUkIE):
             if not programme_id:
                 continue
             formats, subtitles = self._download_media_selector(programme_id)
-            self._sort_formats(formats)
             entries.append({
                 'id': programme_id,
                 'title': playlist_title,
@@ -1204,7 +1194,6 @@ class BBCIE(BBCCoUkIE):
             if not (item_id and item_title):
                 continue
             formats, subtitles = self._download_media_selector(item_id)
-            self._sort_formats(formats)
             item_desc = None
             blocks = try_get(media, lambda x: x['summary']['blocks'], list)
             if blocks:
@@ -1238,7 +1227,7 @@ class BBCIE(BBCCoUkIE):
                 (lambda x: x['data']['blocks'],
                  lambda x: x['data']['content']['model']['blocks'],),
                 list) or []):
-            if block.get('type') != 'media':
+            if block.get('type') not in ['media', 'video']:
                 continue
             parse_media(block.get('model'))
         return self.playlist_result(
@@ -1305,7 +1294,6 @@ class BBCIE(BBCCoUkIE):
             formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
             if not formats and not self.get_param('ignore_no_formats'):
                 continue
-            self._sort_formats(formats)
 
             video_id = media_meta.get('externalId')
             if not video_id:
diff --git a/hypervideo_dl/extractor/beatport.py b/hypervideo_dl/extractor/beatport.py
index e1cf8b4..0aecbd0 100644
--- a/hypervideo_dl/extractor/beatport.py
+++ b/hypervideo_dl/extractor/beatport.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import re
 
 from .common import InfoExtractor
@@ -77,7 +74,6 @@ class BeatportIE(InfoExtractor):
                 fmt['abr'] = 96
                 fmt['asr'] = 44100
             formats.append(fmt)
-        self._sort_formats(formats)
 
         images = []
         for name, info in track['images'].items():
diff --git a/hypervideo_dl/extractor/beeg.py b/hypervideo_dl/extractor/beeg.py
index 717fff3..52ee68e 100644
--- a/hypervideo_dl/extractor/beeg.py
+++ b/hypervideo_dl/extractor/beeg.py
@@ -1,5 +1,3 @@
-from __future__ import unicode_literals
-
 from .common import InfoExtractor
 
 from ..utils import (
@@ -78,8 +76,6 @@ class BeegIE(InfoExtractor):
                     f['height'] = height
                 formats.extend(current_formats)
 
-        self._sort_formats(formats)
-
         return {
             'id': video_id,
             'display_id': first_fact.get('id'),
diff --git a/hypervideo_dl/extractor/behindkink.py b/hypervideo_dl/extractor/behindkink.py
index 2c97f98..ca44981 100644
--- a/hypervideo_dl/extractor/behindkink.py
+++ b/hypervideo_dl/extractor/behindkink.py
@@ -1,7 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-
 from .common import InfoExtractor
 from ..utils import url_basename
 
diff --git a/hypervideo_dl/extractor/bellmedia.py b/hypervideo_dl/extractor/bellmedia.py
index 904c17e..5ae4b91 100644
--- a/hypervideo_dl/extractor/bellmedia.py
+++ b/hypervideo_dl/extractor/bellmedia.py
@@ -1,7 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-
 from .common import InfoExtractor
 
 
@@ -28,7 +24,7 @@ class BellMediaIE(InfoExtractor):
         )/.*?(?:\b(?:vid(?:eoid)?|clipId)=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})'''
     _TESTS = [{
         'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070',
-        'md5': '36d3ef559cfe8af8efe15922cd3ce950',
+        'md5': '3e5b8e38370741d5089da79161646635',
        'info_dict': {
             'id': '1403070',
             'ext': 'flv',
@@ -36,6 +32,14 @@ class BellMediaIE(InfoExtractor):
             'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3',
             'upload_date': '20180525',
             'timestamp': 1527288600,
+            'season_id': 73997,
+            'season': '2018',
+            'thumbnail': 'http://images2.9c9media.com/image_asset/2018_5_25_baf30cbd-b28d-4a18-9903-4bb8713b00f5_PNG_956x536.jpg',
+            'tags': [],
+            'categories': ['ETFs'],
+            'season_number': 8,
+            'duration': 272.038,
+            'series': 'Market Call Tonight',
         },
     }, {
         'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582',
diff --git a/hypervideo_dl/extractor/berufetv.py b/hypervideo_dl/extractor/berufetv.py
new file mode 100644
index 0000000..8160cbd
--- /dev/null
+++ b/hypervideo_dl/extractor/berufetv.py
@@ -0,0 +1,70 @@
+from .common import InfoExtractor
+from ..utils import float_or_none, mimetype2ext, traverse_obj
+
+
+class BerufeTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?web\.arbeitsagentur\.de/berufetv/[^?#]+/film;filmId=(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://web.arbeitsagentur.de/berufetv/studienberufe/wirtschaftswissenschaften/wirtschaftswissenschaften-volkswirtschaft/film;filmId=DvKC3DUpMKvUZ_6fEnfg3u',
+        'md5': '041b6432ec8e6838f84a5c30f31cc795',
+        'info_dict': {
+            'id': 'DvKC3DUpMKvUZ_6fEnfg3u',
+            'ext': 'mp4',
+            'title': 'Volkswirtschaftslehre',
+            'description': 'md5:6bd87d0c63163480a6489a37526ee1c1',
+            'categories': ['Studien­beruf'],
+            'tags': ['Studienfilm'],
+            'duration': 602.440,
+            'thumbnail': r're:^https://asset-out-cdn\.video-cdn\.net/private/videos/DvKC3DUpMKvUZ_6fEnfg3u/thumbnails/793063\?quality=thumbnail&__token__=[^\s]+$',
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        movie_metadata = self._download_json(
+            'https://rest.arbeitsagentur.de/infosysbub/berufetv/pc/v1/film-metadata',
+            video_id, 'Downloading JSON metadata',
+            headers={'X-API-Key': '79089773-4892-4386-86e6-e8503669f426'}, fatal=False)
+
+        meta = traverse_obj(
+            movie_metadata, ('metadaten', lambda _, i: video_id == i['miId']),
+            get_all=False, default={})
+
+        video = self._download_json(
+            f'https://d.video-cdn.net/play/player/8YRzUk6pTzmBdrsLe9Y88W/video/{video_id}',
+            video_id, 'Downloading video JSON')
+
+        formats, subtitles = [], {}
+        for key, source in video['videoSources']['html'].items():
+            if key == 'auto':
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(source[0]['source'], video_id)
+                formats += fmts
+                subtitles = subs
+            else:
+                formats.append({
+                    'url': source[0]['source'],
+                    'ext': mimetype2ext(source[0]['mimeType']),
+                    'format_id': key,
+                })
+
+        for track in video.get('videoTracks') or []:
+            if track.get('type') != 'SUBTITLES':
+                continue
+            subtitles.setdefault(track['language'], []).append({
+                'url': track['source'],
+                'name': track.get('label'),
+                'ext': 'vtt'
+            })
+
+        return {
+            'id': video_id,
+            'title': meta.get('titel') or traverse_obj(video, ('videoMetaData', 'title')),
+            'description': meta.get('beschreibung'),
+            'thumbnail': meta.get('thumbnail') or f'https://asset-out-cdn.video-cdn.net/private/videos/{video_id}/thumbnails/active',
+            'duration': float_or_none(video.get('duration'), scale=1000),
+            'categories': [meta['kategorie']] if meta.get('kategorie') else None,
+            'tags': meta.get('themengebiete'),
+            'subtitles': subtitles,
+            'formats': formats,
+        }
diff --git a/hypervideo_dl/extractor/bet.py b/hypervideo_dl/extractor/bet.py
index 2c71442..6b867d1 100644
--- a/hypervideo_dl/extractor/bet.py
+++ b/hypervideo_dl/extractor/bet.py
@@ -1,5 +1,3 @@
-from __future__ import unicode_literals
-
 from .mtv import MTVServicesInfoExtractor
 from ..utils import unified_strdate
 
diff --git a/hypervideo_dl/extractor/bfi.py b/hypervideo_dl/extractor/bfi.py
index 60c8944..76f0516 100644
--- a/hypervideo_dl/extractor/bfi.py
+++ b/hypervideo_dl/extractor/bfi.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import re
 
 from .common import InfoExtractor
diff --git a/hypervideo_dl/extractor/bfmtv.py b/hypervideo_dl/extractor/bfmtv.py
index 501f69d..d86d283 100644
--- a/hypervideo_dl/extractor/bfmtv.py
+++ b/hypervideo_dl/extractor/bfmtv.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import re
 
 from .common import InfoExtractor
@@ -45,7 +42,7 @@ class BFMTVIE(BFMTVBaseIE):
         return self._brightcove_url_result(video_block['videoid'], video_block)
 
 
-class BFMTVLiveIE(BFMTVIE):
+class BFMTVLiveIE(BFMTVIE):  # XXX: Do not subclass from concrete IE
     IE_NAME = 'bfmtv:live'
     _VALID_URL = BFMTVBaseIE._VALID_URL_BASE + '(?P<id>(?:[^/]+/)?en-direct)'
     _TESTS = [{
diff --git a/hypervideo_dl/extractor/bibeltv.py b/hypervideo_dl/extractor/bibeltv.py
index 56c2bfe..fd20aad 100644
--- a/hypervideo_dl/extractor/bibeltv.py
+++ b/hypervideo_dl/extractor/bibeltv.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from .common import InfoExtractor
 
 
diff --git a/hypervideo_dl/extractor/bigflix.py b/hypervideo_dl/extractor/bigflix.py
index 28e3e59..02d1ba0 100644
--- a/hypervideo_dl/extractor/bigflix.py
+++ b/hypervideo_dl/extractor/bigflix.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import re
 
 from .common import InfoExtractor
@@ -66,8 +63,6 @@ class BigflixIE(InfoExtractor):
                 'url': decode_url(file_url),
             })
 
-        self._sort_formats(formats)
-
         description = self._html_search_meta('description', webpage)
 
         return {
diff --git a/hypervideo_dl/extractor/bigo.py b/hypervideo_dl/extractor/bigo.py
index ddf76ac..1cb6e58 100644
--- a/hypervideo_dl/extractor/bigo.py
+++ b/hypervideo_dl/extractor/bigo.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from .common import InfoExtractor
 from ..utils import ExtractorError, urlencode_postdata
 
@@ -31,7 +28,7 @@ class BigoIE(InfoExtractor):
         user_id = self._match_id(url)
 
         info_raw = self._download_json(
-            'https://bigo.tv/studio/getInternalStudioInfo',
+            'https://ta.bigo.tv/official_website/studio/getInternalStudioInfo',
             user_id, data=urlencode_postdata({'siteId': user_id}))
 
         if not isinstance(info_raw, dict):
@@ -44,14 +41,14 @@ class BigoIE(InfoExtractor):
         if not info.get('alive'):
             raise ExtractorError('This user is offline.', expected=True)
 
+        formats, subs = self._extract_m3u8_formats_and_subtitles(
+            info.get('hls_src'), user_id, 'mp4', 'm3u8')
+
         return {
             'id': info.get('roomId') or user_id,
             'title': info.get('roomTopic') or info.get('nick_name') or user_id,
-            'formats': [{
-                'url': info.get('hls_src'),
-                'ext': 'mp4',
-                'protocol': 'm3u8',
-            }],
+            'formats': formats,
+            'subtitles': subs,
             'thumbnail': info.get('snapshot'),
             'uploader': info.get('nick_name'),
             'uploader_id': user_id,
diff --git a/hypervideo_dl/extractor/bild.py b/hypervideo_dl/extractor/bild.py
index b8dfbd4..f3dea33 100644
--- a/hypervideo_dl/extractor/bild.py
+++ b/hypervideo_dl/extractor/bild.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
diff --git a/hypervideo_dl/extractor/bilibili.py b/hypervideo_dl/extractor/bilibili.py
index 909f7f8..bc04241 100644
--- a/hypervideo_dl/extractor/bilibili.py
+++ b/hypervideo_dl/extractor/bilibili.py
@@ -1,509 +1,561 @@
-# coding: utf-8
-
 import base64
-import hashlib
-import itertools
 import functools
-import re
+import itertools
 import math
+import urllib.error
+import urllib.parse
 
 from .common import InfoExtractor, SearchInfoExtractor
-from ..compat import (
-    compat_parse_qs,
-    compat_urlparse,
-    compat_urllib_parse_urlparse
-)
 from ..utils import (
     ExtractorError,
+    GeoRestrictedError,
+    InAdvancePagedList,
+    OnDemandPagedList,
     filter_dict,
-    int_or_none,
     float_or_none,
+    format_field,
+    int_or_none,
+    make_archive_id,
     mimetype2ext,
-    parse_iso8601,
-    traverse_obj,
     parse_count,
-    smuggle_url,
+    parse_qs,
+    qualities,
     srt_subtitles_timecode,
     str_or_none,
-    strip_jsonp,
-    unified_timestamp,
-    unsmuggle_url,
-    urlencode_postdata,
+    traverse_obj,
     url_or_none,
-    OnDemandPagedList
+    urlencode_postdata,
 )
 
 
-class BiliBiliIE(InfoExtractor):
-    _VALID_URL = r'''(?x)
-                    https?://
-                        (?:(?:www|bangumi)\.)?
-                        bilibili\.(?:tv|com)/
-                        (?:
-                            (?:
-                                video/[aA][vV]|
-                                anime/(?P<anime_id>\d+)/play\#
-                            )(?P<id>\d+)|
-                            (s/)?video/[bB][vV](?P<id_bv>[^/?#&]+)
-                        )
-                        (?:/?\?p=(?P<page>\d+))?
-                    '''
+class BilibiliBaseIE(InfoExtractor):
+    def extract_formats(self, play_info):
+        format_names = {
+            r['quality']: traverse_obj(r, 'new_description', 'display_desc')
+            for r in traverse_obj(play_info, ('support_formats', lambda _, v: v['quality']))
+        }
+
+        audios = traverse_obj(play_info, ('dash', 'audio', ...))
+        flac_audio = traverse_obj(play_info, ('dash', 'flac', 'audio'))
+        if flac_audio:
+            audios.append(flac_audio)
+        formats = [{
+            'url': traverse_obj(audio, 'baseUrl', 'base_url', 'url'),
+            'ext': mimetype2ext(traverse_obj(audio, 'mimeType', 'mime_type')),
+            'acodec': audio.get('codecs'),
+            'vcodec': 'none',
+            'tbr': float_or_none(audio.get('bandwidth'), scale=1000),
+            'filesize': int_or_none(audio.get('size'))
+        } for audio in audios]
+
+        formats.extend({
+            'url': traverse_obj(video, 'baseUrl', 'base_url', 'url'),
+            'ext': mimetype2ext(traverse_obj(video, 'mimeType', 'mime_type')),
+            'fps': float_or_none(traverse_obj(video, 'frameRate', 'frame_rate')),
+            'width': int_or_none(video.get('width')),
+            'height': int_or_none(video.get('height')),
+            'vcodec': video.get('codecs'),
+            'acodec': 'none' if audios else None,
+            'tbr': float_or_none(video.get('bandwidth'), scale=1000),
+            'filesize': int_or_none(video.get('size')),
+            'quality': int_or_none(video.get('id')),
+            'format': format_names.get(video.get('id')),
+        } for video in traverse_obj(play_info, ('dash', 'video', ...)))
+
+        missing_formats = format_names.keys() - set(traverse_obj(formats, (..., 'quality')))
+        if missing_formats:
+            self.to_screen(f'Format(s) {", ".join(format_names[i] for i in missing_formats)} are missing; '
+                           f'you have to login or become premium member to download them. {self._login_hint()}')
+
+        return formats
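+
+        # (Illustrative aside, mine, not upstream code: a DASH manifest lists
+        # audio and video renditions separately, so every dict above is
+        # audio-only or video-only and the download step merges one of each.
+        # format_names maps Bilibili's numeric quality ids to labels
+        # (e.g. 80 -> '1080P'), and the set difference above surfaces ids
+        # that require login or a premium account to fetch.)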
+
+    def json2srt(self, json_data):
+        srt_data = ''
+        for idx, line in enumerate(json_data.get('body') or []):
+            srt_data += (f'{idx + 1}\n'
+                         f'{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n'
+                         f'{line["content"]}\n\n')
+        return srt_data
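+
+    # (Worked example, mine: for {'body': [{'from': 0, 'to': 1.5, 'content': '你好'}]}
+    # json2srt emits:
+    #   1
+    #   00:00:00,000 --> 00:00:01,500
+    #   你好
+    # hence the subtitle entries built from it can declare 'ext': 'srt'.)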
+
+    def _get_subtitles(self, video_id, initial_state, cid):
+        subtitles = {
+            'danmaku': [{
+                'ext': 'xml',
+                'url': f'https://comment.bilibili.com/{cid}.xml',
+            }]
+        }
+
+        for s in traverse_obj(initial_state, ('videoData', 'subtitle', 'list')) or []:
+            subtitles.setdefault(s['lan'], []).append({
+                'ext': 'srt',
+                'data': self.json2srt(self._download_json(s['subtitle_url'], video_id))
+            })
+        return subtitles
+
+    def _get_chapters(self, aid, cid):
+        chapters = aid and cid and self._download_json(
+            'https://api.bilibili.com/x/player/v2', aid, query={'aid': aid, 'cid': cid},
+            note='Extracting chapters', fatal=False)
+        return traverse_obj(chapters, ('data', 'view_points', ..., {
+            'title': 'content',
+            'start_time': 'from',
+            'end_time': 'to',
+        })) or None
+
+    def _get_comments(self, aid):
+        for idx in itertools.count(1):
+            replies = traverse_obj(
+                self._download_json(
+                    f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={aid}&type=1&jsonp=jsonp&sort=2&_=1567227301685',
+                    aid, note=f'Extracting comments from page {idx}', fatal=False),
+                ('data', 'replies'))
+            if not replies:
+                return
+            for children in map(self._get_all_children, replies):
+                yield from children
+
+    def _get_all_children(self, reply):
+        yield {
+            'author': traverse_obj(reply, ('member', 'uname')),
+            'author_id': traverse_obj(reply, ('member', 'mid')),
+            'id': reply.get('rpid'),
+            'text': traverse_obj(reply, ('content', 'message')),
+            'timestamp': reply.get('ctime'),
+            'parent': reply.get('parent') or 'root',
+        }
+        for children in map(self._get_all_children, traverse_obj(reply, ('replies', ...))):
+            yield from children
+
+
+class BiliBiliIE(BilibiliBaseIE):
+    _VALID_URL = r'https?://www\.bilibili\.com/video/[aAbB][vV](?P<id>[^/?#&]+)'
 
     _TESTS = [{
+        'url': 'https://www.bilibili.com/video/BV13x41117TL',
+        'info_dict': {
+            'id': 'BV13x41117TL',
+            'title': '阿滴英文|英文歌分享#6 "Closer',
+            'ext': 'mp4',
+            'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
+            'uploader_id': '65880958',
+            'uploader': '阿滴英文',
+            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
+            'duration': 554.117,
+            'tags': list,
+            'comment_count': int,
+            'upload_date': '20170301',
+            'timestamp': 1488353834,
+            'like_count': int,
+            'view_count': int,
+        },
+    }, {
+        # old av URL version
         'url': 'http://www.bilibili.com/video/av1074402/',
-        'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
         'info_dict': {
-            'id': '1074402_part1',
+            'thumbnail': r're:^https?://.*\.(jpg|jpeg)$',
             'ext': 'mp4',
-            'title': '【金坷垃】金泡沫',
-            'uploader_id': '156160',
             'uploader': '菊子桑',
+            'uploader_id': '156160',
+            'id': 'BV11x411K7CN',
+            'title': '【金坷垃】金泡沫',
+            'duration': 308.36,
             'upload_date': '20140420',
+            'timestamp': 1397983878,
             'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
-            'timestamp': 1398012678,
+            'like_count': int,
+            'comment_count': int,
+            'view_count': int,
+            'tags': list,
         },
+        'params': {'skip_download': True},
     }, {
-        # Tested in BiliBiliBangumiIE
-        'url': 'http://bangumi.bilibili.com/anime/1869/play#40062',
-        'only_matching': True,
+        'note': 'Anthology',
+        'url': 'https://www.bilibili.com/video/BV1bK411W797',
+        'info_dict': {
+            'id': 'BV1bK411W797',
+            'title': '物语中的人物是如何吐槽自己的OP的'
+        },
+        'playlist_count': 18,
+        'playlist': [{
+            'info_dict': {
+                'id': 'BV1bK411W797_p1',
+                'ext': 'mp4',
+                'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川',
+                'tags': 'count:11',
+                'timestamp': 1589601697,
+                'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
+                'uploader': '打牌还是打桩',
+                'uploader_id': '150259984',
+                'like_count': int,
+                'comment_count': int,
+                'upload_date': '20200516',
+                'view_count': int,
+                'description': 'md5:e3c401cf7bc363118d1783dd74068a68',
+                'duration': 90.314,
+            }
+        }]
     }, {
-        # bilibili.tv
-        'url': 'http://www.bilibili.tv/video/av1074402/',
-        'only_matching': True,
+        'note': 'Specific page of Anthology',
+        'url': 'https://www.bilibili.com/video/BV1bK411W797?p=1',
+        'info_dict': {
+            'id': 'BV1bK411W797_p1',
+            'ext': 'mp4',
+            'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川',
+            'tags': 'count:11',
+            'timestamp': 1589601697,
+            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
+            'uploader': '打牌还是打桩',
+            'uploader_id': '150259984',
+            'like_count': int,
+            'comment_count': int,
+            'upload_date': '20200516',
+            'view_count': int,
+            'description': 'md5:e3c401cf7bc363118d1783dd74068a68',
+            'duration': 90.314,
+        }
    }, {
-        'url': 'http://bangumi.bilibili.com/anime/5802/play#100643',
-        'md5': '3f721ad1e75030cc06faf73587cfec57',
+        'note': 'video has subtitles',
+        'url': 'https://www.bilibili.com/video/BV12N4y1M7rh',
         'info_dict': {
-            'id': '100643_part1',
+            'id': 'BV12N4y1M7rh',
             'ext': 'mp4',
-            'title': 'CHAOS;CHILD',
-            'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...',
+            'title': 'md5:96e8bb42c2b432c0d4ce3434a61479c1',
+            'tags': list,
+            'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4',
+            'duration': 313.557,
+            'upload_date': '20220709',
+            'uploader': '小夫Tech',
+            'timestamp': 1657347907,
+            'uploader_id': '1326814124',
+            'comment_count': int,
+            'view_count': int,
+            'like_count': int,
+            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
+            'subtitles': 'count:2'
         },
-        'skip': 'Geo-restricted to China',
+        'params': {'listsubtitles': True},
    }, {
-        'url': 'http://www.bilibili.com/video/av8903802/',
+        'url': 'https://www.bilibili.com/video/av8903802/',
         'info_dict': {
-            'id': '8903802_part1',
+            'id': 'BV13x41117TL',
             'ext': 'mp4',
             'title': '阿滴英文|英文歌分享#6 "Closer',
             'upload_date': '20170301',
-            'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
-            'timestamp': 1488382634,
+            'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
+            'timestamp': 1488353834,
             'uploader_id': '65880958',
             'uploader': '阿滴英文',
+            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
+            'duration': 554.117,
+            'tags': list,
+            'comment_count': int,
+            'view_count': int,
+            'like_count': int,
         },
         'params': {
             'skip_download': True,
         },
     }, {
-        # new BV video id format
-        'url': 'https://www.bilibili.com/video/BV1JE411F741',
-        'only_matching': True,
-    }, {
-        # Anthology
-        'url': 'https://www.bilibili.com/video/BV1bK411W797',
+        'note': 'video has chapter',
+        'url': 'https://www.bilibili.com/video/BV1vL411G7N7/',
         'info_dict': {
-            'id': 'BV1bK411W797',
-            'title': '物语中的人物是如何吐槽自己的OP的'
+            'id': 'BV1vL411G7N7',
+            'ext': 'mp4',
+            'title': '如何为你的B站视频添加进度条分段',
+            'timestamp': 1634554558,
+            'upload_date': '20211018',
+            'description': 'md5:a9a3d6702b3a94518d419b2e9c320a6d',
+            'tags': list,
+            'uploader': '爱喝咖啡的当麻',
+            'duration': 669.482,
+            'uploader_id': '1680903',
+            'chapters': 'count:6',
+            'comment_count': int,
+            'view_count': int,
+            'like_count': int,
+            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
         },
-        'playlist_count': 17,
+        'params': {'skip_download': True},
     }]
 
-    _APP_KEY = 'iVGUTjsxvpLeuDCf'
-    _BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt'
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)
+        play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data']
 
-    def _report_error(self, result):
-        if 'message' in result:
-            raise ExtractorError('%s said: %s' % (self.IE_NAME, result['message']), expected=True)
-        elif 'code' in result:
-            raise ExtractorError('%s returns error %d' % (self.IE_NAME, result['code']), expected=True)
-        else:
-            raise ExtractorError('Can\'t extract Bangumi episode ID')
+        video_data = initial_state['videoData']
+        video_id, title = video_data['bvid'], video_data.get('title')
 
-    def _real_extract(self, url):
-        url, smuggled_data = unsmuggle_url(url, {})
+        # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself.
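+        # (Illustrative, mine: for an anthology such as BV1bK411W797 every part
+        # shares the BV id. The bare URL expands to one entry per part via the
+        # pagelist API below, while .../video/BV1bK411W797?p=2 extracts only
+        # part 2 and yields the entry id 'BV1bK411W797_p2'.)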
+ page_list_json = traverse_obj( + self._download_json( + 'https://api.bilibili.com/x/player/pagelist', video_id, + fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'}, + note='Extracting videos in anthology'), + 'data', expected_type=list) or [] + is_anthology = len(page_list_json) > 1 + + part_id = int_or_none(parse_qs(url).get('p', [None])[-1]) + if is_anthology and not part_id and self._yes_playlist(video_id, video_id): + return self.playlist_from_matches( + page_list_json, video_id, title, ie=BiliBiliIE, + getter=lambda entry: f'https://www.bilibili.com/video/{video_id}?p={entry["page"]}') - mobj = self._match_valid_url(url) - video_id = mobj.group('id_bv') or mobj.group('id') + if is_anthology: + title += f' p{part_id:02d} {traverse_obj(page_list_json, ((part_id or 1) - 1, "part")) or ""}' - av_id, bv_id = self._get_video_id_set(video_id, mobj.group('id_bv') is not None) - video_id = av_id + aid = video_data.get('aid') + old_video_id = format_field(aid, None, f'%s_part{part_id or 1}') - info = {} - anime_id = mobj.group('anime_id') - page_id = mobj.group('page') - webpage = self._download_webpage(url, video_id) + cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid') - # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself. - # If the video has no page argument, check to see if it's an anthology - if page_id is None: - if not self.get_param('noplaylist'): - r = self._extract_anthology_entries(bv_id, video_id, webpage) - if r is not None: - self.to_screen('Downloading anthology %s - add --no-playlist to just download video' % video_id) - return r - else: - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - - if 'anime/' not in url: - cid = self._search_regex( - r'\bcid(?:["\']:|=)(\d+),["\']page(?:["\']:|=)' + str(page_id), webpage, 'cid', - default=None - ) or self._search_regex( - r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid', - default=None - ) or compat_parse_qs(self._search_regex( - [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', - r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)', - r']+src="https://secure\.bilibili\.com/secure,([^"]+)"'], - webpage, 'player parameters'))['cid'][0] - else: - if 'no_bangumi_tip' not in smuggled_data: - self.to_screen('Downloading episode %s. 
To download all videos in anime %s, re-run hypervideo with %s' % ( - video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id))) - headers = { - 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', - 'Referer': url - } - headers.update(self.geo_verification_headers()) - - js = self._download_json( - 'http://bangumi.bilibili.com/web_api/get_source', video_id, - data=urlencode_postdata({'episode_id': video_id}), - headers=headers) - if 'result' not in js: - self._report_error(js) - cid = js['result']['cid'] - - headers = { - 'Accept': 'application/json', - 'Referer': url + return { + 'id': f'{video_id}{format_field(part_id, None, "_p%d")}', + 'formats': self.extract_formats(play_info), + '_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None, + 'title': title, + 'description': traverse_obj(initial_state, ('videoData', 'desc')), + 'view_count': traverse_obj(initial_state, ('videoData', 'stat', 'view')), + 'uploader': traverse_obj(initial_state, ('upData', 'name')), + 'uploader_id': traverse_obj(initial_state, ('upData', 'mid')), + 'like_count': traverse_obj(initial_state, ('videoData', 'stat', 'like')), + 'comment_count': traverse_obj(initial_state, ('videoData', 'stat', 'reply')), + 'tags': traverse_obj(initial_state, ('tags', ..., 'tag_name')), + 'thumbnail': traverse_obj(initial_state, ('videoData', 'pic')), + 'timestamp': traverse_obj(initial_state, ('videoData', 'pubdate')), + 'duration': float_or_none(play_info.get('timelength'), scale=1000), + 'chapters': self._get_chapters(aid, cid), + 'subtitles': self.extract_subtitles(video_id, initial_state, cid), + '__post_extractor': self.extract_comments(aid), + 'http_headers': {'Referer': url}, } - headers.update(self.geo_verification_headers()) - video_info = self._parse_json( - self._search_regex(r'window.__playinfo__\s*=\s*({.+?})', webpage, 'video info', default=None) or '{}', - video_id, fatal=False) - video_info = video_info.get('data') or {} - durl = traverse_obj(video_info, ('dash', 'video')) - audios = traverse_obj(video_info, ('dash', 'audio')) or [] - entries = [] +class BiliBiliBangumiIE(BilibiliBaseIE): + _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/(?P(?:ss|ep)\d+)' - RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4') - for num, rendition in enumerate(RENDITIONS, start=1): - payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition) - sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() - if not video_info: - video_info = self._download_json( - 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign), - video_id, note='Downloading video info page', - headers=headers, fatal=num == len(RENDITIONS)) - if not video_info: - continue - - if not durl and 'durl' not in video_info: - if num < len(RENDITIONS): - continue - self._report_error(video_info) - - formats = [] - for idx, durl in enumerate(durl or video_info['durl']): - formats.append({ - 'url': durl.get('baseUrl') or durl.get('base_url') or durl.get('url'), - 'ext': mimetype2ext(durl.get('mimeType') or durl.get('mime_type')), - 'fps': int_or_none(durl.get('frameRate') or durl.get('frame_rate')), - 'width': int_or_none(durl.get('width')), - 'height': int_or_none(durl.get('height')), - 'vcodec': durl.get('codecs'), - 'acodec': 'none' if audios else None, - 'tbr': float_or_none(durl.get('bandwidth'), scale=1000), - 'filesize': int_or_none(durl.get('size')), - }) - for backup_url in traverse_obj(durl, 'backup_url', 
expected_type=list) or []: - formats.append({ - 'url': backup_url, - 'quality': -2 if 'hd.mp4' in backup_url else -3, - }) - - for audio in audios: - formats.append({ - 'url': audio.get('baseUrl') or audio.get('base_url') or audio.get('url'), - 'ext': mimetype2ext(audio.get('mimeType') or audio.get('mime_type')), - 'fps': int_or_none(audio.get('frameRate') or audio.get('frame_rate')), - 'width': int_or_none(audio.get('width')), - 'height': int_or_none(audio.get('height')), - 'acodec': audio.get('codecs'), - 'vcodec': 'none', - 'tbr': float_or_none(audio.get('bandwidth'), scale=1000), - 'filesize': int_or_none(audio.get('size')) - }) - for backup_url in traverse_obj(audio, 'backup_url', expected_type=list) or []: - formats.append({ - 'url': backup_url, - # backup URLs have lower priorities - 'quality': -3, - }) - - info.update({ - 'id': video_id, - 'duration': float_or_none(durl.get('length'), 1000), - 'formats': formats, - 'http_headers': { - 'Referer': url, - }, - }) - break - - self._sort_formats(formats) - - title = self._html_search_regex(( - r'<h1[^>]+title=(["\'])(?P<content>[^"\']+)', - r'(?s)<h1[^>]*>(?P<content>.+?)</h1>', - self._meta_regex('title') - ), webpage, 'title', group='content', fatal=False) - - # Get part title for anthologies - if page_id is not None: - # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video. - part_info = traverse_obj(self._download_json( - f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp', - video_id, note='Extracting videos in anthology'), 'data', expected_type=list) - title = title if len(part_info) == 1 else traverse_obj(part_info, (int(page_id) - 1, 'part')) or title - - description = self._html_search_meta('description', webpage) - timestamp = unified_timestamp(self._html_search_regex( - r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', - default=None) or self._html_search_meta( - 'uploadDate', webpage, 'timestamp', default=None)) - thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage) - - # TODO 'view_count' requires deobfuscating Javascript - info.update({ - 'id': f'{video_id}_part{page_id or 1}', - 'cid': cid, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'thumbnail': thumbnail, - 'duration': float_or_none(video_info.get('timelength'), scale=1000), - }) - - uploader_mobj = re.search( - r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>\s*(?P<name>[^<]+?)\s*<', - webpage) - if uploader_mobj: - info.update({ - 'uploader': uploader_mobj.group('name').strip(), - 'uploader_id': uploader_mobj.group('id'), - }) + _TESTS = [{ + 'url': 'https://www.bilibili.com/bangumi/play/ss897', + 'info_dict': { + 'id': 'ss897', + 'ext': 'mp4', + 'series': '神的记事本', + 'season': '神的记事本', + 'season_id': 897, + 'season_number': 1, + 'episode': '你与旅行包', + 'episode_number': 2, + 'title': '神的记事本:第2话 你与旅行包', + 'duration': 1428.487, + 'timestamp': 1310809380, + 'upload_date': '20110716', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + }, { + 'url': 'https://www.bilibili.com/bangumi/play/ep508406', + 'only_matching': True, + }] - if not info.get('uploader'): - info['uploader'] = self._html_search_meta( - 'author', webpage, 'uploader', default=None) + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - top_level_info = { - 'tags': traverse_obj(self._download_json( - f'https://api.bilibili.com/x/tag/archive/tags?aid={video_id}', - video_id, fatal=False, note='Downloading tags'), ('data', ..., 'tag_name')), - } + if 
'您所在的地区无法观看本片' in webpage: + raise GeoRestrictedError('This video is restricted') + elif ('开通大会员观看' in webpage and '__playinfo__' not in webpage + or '正在观看预览,大会员免费看全片' in webpage): + self.raise_login_required('This video is for premium members only') - info['subtitles'] = { - 'danmaku': [{ - 'ext': 'xml', - 'url': f'https://comment.bilibili.com/{cid}.xml', - }] - } + play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] + formats = self.extract_formats(play_info) + if (not formats and '成为大会员抢先看' in webpage + and play_info.get('durl') and not play_info.get('dash')): + self.raise_login_required('This video is for premium members only') - r''' - # Requires https://github.com/m13253/danmaku2ass which is licenced under GPL3 - # See https://github.com/animelover1984/youtube-dl + initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) - raw_danmaku = self._download_webpage( - f'https://comment.bilibili.com/{cid}.xml', video_id, fatal=False, note='Downloading danmaku comments') - danmaku = NiconicoIE.CreateDanmaku(raw_danmaku, commentType='Bilibili', x=1024, y=576) - entries[0]['subtitles'] = { - 'danmaku': [{ - 'ext': 'ass', - 'data': danmaku - }] + season_id = traverse_obj(initial_state, ('mediaInfo', 'season_id')) + season_number = season_id and next(( + idx + 1 for idx, e in enumerate( + traverse_obj(initial_state, ('mediaInfo', 'seasons', ...))) + if e.get('season_id') == season_id + ), None) + + return { + 'id': video_id, + 'formats': formats, + 'title': traverse_obj(initial_state, 'h1Title'), + 'episode': traverse_obj(initial_state, ('epInfo', 'long_title')), + 'episode_number': int_or_none(traverse_obj(initial_state, ('epInfo', 'title'))), + 'series': traverse_obj(initial_state, ('mediaInfo', 'series')), + 'season': traverse_obj(initial_state, ('mediaInfo', 'season_title')), + 'season_id': season_id, + 'season_number': season_number, + 'thumbnail': traverse_obj(initial_state, ('epInfo', 'cover')), + 'timestamp': traverse_obj(initial_state, ('epInfo', 'pub_time')), + 'duration': float_or_none(play_info.get('timelength'), scale=1000), + 'subtitles': self.extract_subtitles( + video_id, initial_state, traverse_obj(initial_state, ('epInfo', 'cid'))), + '__post_extractor': self.extract_comments(traverse_obj(initial_state, ('epInfo', 'aid'))), + 'http_headers': {'Referer': url, **self.geo_verification_headers()}, } - ''' - top_level_info['__post_extractor'] = self.extract_comments(video_id) - for entry in entries: - entry.update(info) +class BiliBiliBangumiMediaIE(InfoExtractor): + _VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/bangumi/media/md24097891', + 'info_dict': { + 'id': '24097891', + }, + 'playlist_mincount': 25, + }] - if len(entries) == 1: - entries[0].update(top_level_info) - return entries[0] + def _real_extract(self, url): + media_id = self._match_id(url) + webpage = self._download_webpage(url, media_id) - for idx, entry in enumerate(entries): - entry['id'] = '%s_part%d' % (video_id, (idx + 1)) + initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id) + episode_list = self._download_json( + 'https://api.bilibili.com/pgc/web/season/section', media_id, + query={'season_id': initial_state['mediaInfo']['season_id']}, + note='Downloading season info')['result']['main_section']['episodes'] - return { - 'id': str(video_id), - 'bv_id': bv_id, - 'title': title, - 'description': 
description, - **info, **top_level_info - } + return self.playlist_result(( + self.url_result(entry['share_url'], BiliBiliBangumiIE, entry['aid']) + for entry in episode_list), media_id) - def _extract_anthology_entries(self, bv_id, video_id, webpage): - title = self._html_search_regex( - (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1', - r'(?s)<h1[^>]*>(?P<title>.+?)</h1>', - r'<title>(?P<title>.+?)</title>'), webpage, 'title', - group='title') - json_data = self._download_json( - f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp', - video_id, note='Extracting videos in anthology') - - if json_data['data']: - return self.playlist_from_matches( - json_data['data'], bv_id, title, ie=BiliBiliIE.ie_key(), - getter=lambda entry: 'https://www.bilibili.com/video/%s?p=%d' % (bv_id, entry['page'])) - - def _get_video_id_set(self, id, is_bv): - query = {'bvid': id} if is_bv else {'aid': id} - response = self._download_json( - "http://api.bilibili.cn/x/web-interface/view", - id, query=query, - note='Grabbing original ID via API') - - if response['code'] == -400: - raise ExtractorError('Video ID does not exist', expected=True, video_id=id) - elif response['code'] != 0: - raise ExtractorError(f'Unknown error occurred during API check (code {response["code"]})', - expected=True, video_id=id) - return response['data']['aid'], response['data']['bvid'] - - def _get_comments(self, video_id, commentPageNumber=0): - for idx in itertools.count(1): - replies = traverse_obj( - self._download_json( - f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={video_id}&type=1&jsonp=jsonp&sort=2&_=1567227301685', - video_id, note=f'Extracting comments from page {idx}', fatal=False), - ('data', 'replies')) - if not replies: - return - for children in map(self._get_all_children, replies): - yield from children - def _get_all_children(self, reply): - yield { - 'author': traverse_obj(reply, ('member', 'uname')), - 'author_id': traverse_obj(reply, ('member', 'mid')), - 'id': reply.get('rpid'), - 'text': traverse_obj(reply, ('content', 'message')), - 'timestamp': reply.get('ctime'), - 'parent': reply.get('parent') or 'root', - } - for children in map(self._get_all_children, reply.get('replies') or []): - yield from children +class BilibiliSpaceBaseIE(InfoExtractor): + def _extract_playlist(self, fetch_page, get_metadata, get_entries): + first_page = fetch_page(0) + metadata = get_metadata(first_page) + paged_list = InAdvancePagedList( + lambda idx: get_entries(fetch_page(idx) if idx else first_page), + metadata['page_count'], metadata['page_size']) -class BiliBiliBangumiIE(InfoExtractor): - _VALID_URL = r'https?://bangumi\.bilibili\.com/anime/(?P<id>\d+)' + return metadata, paged_list - IE_NAME = 'bangumi.bilibili.com' - IE_DESC = 'BiliBili番剧' +class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE): + _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)(?P', + ] + if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS): + return 18 return 0 def _media_rating_search(self, html): @@ -1401,27 +1435,25 @@ class InfoExtractor(object): return self._html_search_meta('twitter:player', html, 'twitter card player') - def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): - json_ld_list = list(re.finditer(JSON_LD_RE, html)) - default = kwargs.get('default', NO_DEFAULT) - # JSON-LD may be malformed and thus `fatal` should be respected. - # At the same time `default` may be passed that assumes `fatal=False` - # for _search_regex. Let's simulate the same behavior here as well. 
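Both the removed `_search_json_ld` and its replacement below keep one convention: explicitly passing `default` implies `fatal=False`. A minimal sketch of the resulting call semantics, assuming a typical extractor subclass (read off the code in this hunk, not part of the patch itself):

    info = self._search_json_ld(webpage, video_id, default={})   # never raises; falls back to {}
    info = self._search_json_ld(webpage, video_id, fatal=False)  # returns {} when nothing usable is found
    info = self._search_json_ld(webpage, video_id)               # fatal by default: raises if extraction fails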
- fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False - json_ld = [] - for mobj in json_ld_list: - json_ld_item = self._parse_json( - mobj.group('json_ld'), video_id, fatal=fatal) - if not json_ld_item: - continue - if isinstance(json_ld_item, dict): - json_ld.append(json_ld_item) - elif isinstance(json_ld_item, (list, tuple)): - json_ld.extend(json_ld_item) - if json_ld: - json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) - if json_ld: - return json_ld + def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT): + """Yield all json ld objects in the html""" + if default is not NO_DEFAULT: + fatal = False + for mobj in re.finditer(JSON_LD_RE, html): + json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal) + for json_ld in variadic(json_ld_item): + if isinstance(json_ld, dict): + yield json_ld + + def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT): + """Search for a video in any json ld in the html""" + if default is not NO_DEFAULT: + fatal = False + info = self._json_ld( + list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)), + video_id, fatal=fatal, expected_type=expected_type) + if info: + return info if default is not NO_DEFAULT: return default elif fatal: @@ -1431,15 +1463,11 @@ class InfoExtractor(object): return {} def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): - if isinstance(json_ld, compat_str): + if isinstance(json_ld, str): json_ld = self._parse_json(json_ld, video_id, fatal=fatal) if not json_ld: return {} info = {} - if not isinstance(json_ld, (list, tuple, dict)): - return info - if isinstance(json_ld, dict): - json_ld = [json_ld] INTERACTION_TYPE_MAP = { 'CommentAction': 'comment', @@ -1452,6 +1480,10 @@ class InfoExtractor(object): 'ViewAction': 'view', } + def is_type(e, *expected_types): + type = variadic(traverse_obj(e, '@type')) + return any(x in type for x in expected_types) + def extract_interaction_type(e): interaction_type = e.get('interactionType') if isinstance(interaction_type, dict): @@ -1465,9 +1497,7 @@ class InfoExtractor(object): if not isinstance(interaction_statistic, list): return for is_e in interaction_statistic: - if not isinstance(is_e, dict): - continue - if is_e.get('@type') != 'InteractionCounter': + if not is_type(is_e, 'InteractionCounter'): continue interaction_type = extract_interaction_type(is_e) if not interaction_type: @@ -1504,44 +1534,53 @@ class InfoExtractor(object): info['chapters'] = chapters def extract_video_object(e): - assert e['@type'] == 'VideoObject' author = e.get('author') info.update({ 'url': url_or_none(e.get('contentUrl')), + 'ext': mimetype2ext(e.get('encodingFormat')), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), - 'thumbnails': [{'url': url_or_none(url)} - for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))], + 'thumbnails': [{'url': unescapeHTML(url)} + for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL')) + if url_or_none(url)], 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), # author can be an instance of 'Organization' or 'Person' types. # both types can have 'name' property(inherited from 'Thing' type). [1] # however some websites are using 'Text' type instead. # 1. 
https://schema.org/VideoObject - 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None, - 'filesize': float_or_none(e.get('contentSize')), + 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None, + 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str), + 'filesize': int_or_none(float_or_none(e.get('contentSize'))), 'tbr': int_or_none(e.get('bitrate')), 'width': int_or_none(e.get('width')), 'height': int_or_none(e.get('height')), 'view_count': int_or_none(e.get('interactionCount')), + 'tags': try_call(lambda: e.get('keywords').split(',')), }) + if is_type(e, 'AudioObject'): + info.update({ + 'vcodec': 'none', + 'abr': int_or_none(e.get('bitrate')), + }) extract_interaction_statistic(e) extract_chapter_information(e) def traverse_json_ld(json_ld, at_top_level=True): - for e in json_ld: + for e in variadic(json_ld): + if not isinstance(e, dict): + continue if at_top_level and '@context' not in e: continue if at_top_level and set(e.keys()) == {'@context', '@graph'}: - traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False) - break - item_type = e.get('@type') - if expected_type is not None and expected_type != item_type: + traverse_json_ld(e['@graph'], at_top_level=False) + continue + if expected_type is not None and not is_type(e, expected_type): continue rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none) if rating is not None: info['average_rating'] = rating - if item_type in ('TVEpisode', 'Episode'): + if is_type(e, 'TVEpisode', 'Episode'): episode_name = unescapeHTML(e.get('name')) info.update({ 'episode': episode_name, @@ -1551,44 +1590,46 @@ class InfoExtractor(object): if not info.get('title') and episode_name: info['title'] = episode_name part_of_season = e.get('partOfSeason') - if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'): + if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'): info.update({ 'season': unescapeHTML(part_of_season.get('name')), 'season_number': int_or_none(part_of_season.get('seasonNumber')), }) part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') - if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): + if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'): info['series'] = unescapeHTML(part_of_series.get('name')) - elif item_type == 'Movie': + elif is_type(e, 'Movie'): info.update({ 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('dateCreated')), }) - elif item_type in ('Article', 'NewsArticle'): + elif is_type(e, 'Article', 'NewsArticle'): info.update({ 'timestamp': parse_iso8601(e.get('datePublished')), 'title': unescapeHTML(e.get('headline')), 'description': unescapeHTML(e.get('articleBody') or e.get('description')), }) - if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject': + if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'): extract_video_object(e['video'][0]) - elif item_type == 'VideoObject': + elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'): + extract_video_object(e['subjectOf'][0]) + elif is_type(e, 'VideoObject', 'AudioObject'): extract_video_object(e) if expected_type is None: continue else: break video = e.get('video') - if 
isinstance(video, dict) and video.get('@type') == 'VideoObject': + if is_type(video, 'VideoObject'): extract_video_object(video) if expected_type is None: continue else: break - traverse_json_ld(json_ld) + traverse_json_ld(json_ld) return filter_dict(info) def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw): @@ -1598,15 +1639,16 @@ class InfoExtractor(object): webpage, 'next.js data', fatal=fatal, **kw), video_id, transform_source=transform_source, fatal=fatal) - def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'): - ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. ''' - # not all website do this, but it can be changed - # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source + def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)): + """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function""" rectx = re.escape(context_name) + FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)' js, arg_keys, arg_vals = self._search_regex( - (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)\);?</script>' % rectx, - r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx), - webpage, context_name, group=['js', 'arg_keys', 'arg_vals']) + (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'), + webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), + default=NO_DEFAULT if fatal else (None, None, None)) + if js is None: + return {} args = dict(zip(arg_keys.split(','), arg_vals.split(','))) @@ -1614,7 +1656,8 @@ class InfoExtractor(object): if val in ('undefined', 'void 0'): args[key] = 'null' - return self._parse_json(js_to_json(js, args), video_id)['data'][0] + ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal) + return traverse_obj(ret, traverse) or {} @staticmethod def _hidden_inputs(html): @@ -1638,296 +1681,27 @@ class InfoExtractor(object): html, '%s form' % form_id, group='form') return self._hidden_inputs(form) - class FormatSort: - regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? 
*$' - - default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality', - 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr', - 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases - ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr', - 'height', 'width', 'proto', 'vext', 'abr', 'aext', - 'fps', 'fs_approx', 'source', 'id') - - settings = { - 'vcodec': {'type': 'ordered', 'regex': True, - 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, - 'acodec': {'type': 'ordered', 'regex': True, - 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']}, - 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range', - 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]}, - 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol', - 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']}, - 'vext': {'type': 'ordered', 'field': 'video_ext', - 'order': ('mp4', 'webm', 'flv', '', 'none'), - 'order_free': ('webm', 'mp4', 'flv', '', 'none')}, - 'aext': {'type': 'ordered', 'field': 'audio_ext', - 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'), - 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')}, - 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000}, - 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple', - 'field': ('vcodec', 'acodec'), - 'function': lambda it: int(any(v != 'none' for v in it))}, - 'ie_pref': {'priority': True, 'type': 'extractor'}, - 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)}, - 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)}, - 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1}, - 'quality': {'convert': 'float', 'default': -1}, - 'filesize': {'convert': 'bytes'}, - 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'}, - 'id': {'convert': 'string', 'field': 'format_id'}, - 'height': {'convert': 'float_none'}, - 'width': {'convert': 'float_none'}, - 'fps': {'convert': 'float_none'}, - 'tbr': {'convert': 'float_none'}, - 'vbr': {'convert': 'float_none'}, - 'abr': {'convert': 'float_none'}, - 'asr': {'convert': 'float_none'}, - 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1}, - - 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')}, - 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True}, - 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')}, - 'ext': {'type': 'combined', 'field': ('vext', 'aext')}, - 'res': {'type': 'multiple', 'field': ('height', 'width'), - 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))}, - - # For compatibility with youtube-dl - 'format_id': {'type': 'alias', 'field': 'id'}, - 'preference': {'type': 'alias', 'field': 'ie_pref'}, - 'language_preference': {'type': 'alias', 'field': 'lang'}, - 'source_preference': {'type': 'alias', 'field': 'source'}, - 'protocol': {'type': 'alias', 'field': 'proto'}, - 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'}, - - # Deprecated - 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True}, - 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True}, - 'extension': {'type': 'alias', 'field': 
'ext', 'deprecated': True}, - 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True}, - 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True}, - 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True}, - 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True}, - 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True}, - 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True}, - 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True}, - 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True}, - 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True}, - 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True}, - 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True}, - 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}, - 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}, - 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}, - 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}, - 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}, - 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}, - } + @classproperty(cache=True) + def FormatSort(cls): + class FormatSort(FormatSorter): + def __init__(ie, *args, **kwargs): + super().__init__(ie._downloader, *args, **kwargs) - def __init__(self, ie, field_preference): - self._order = [] - self.ydl = ie._downloader - self.evaluate_params(self.ydl.params, field_preference) - if ie.get_param('verbose'): - self.print_verbose_info(self.ydl.write_debug) - - def _get_field_setting(self, field, key): - if field not in self.settings: - if key in ('forced', 'priority'): - return False - self.ydl.deprecation_warning( - f'Using arbitrary fields ({field}) for format sorting is deprecated ' - 'and may be removed in a future version') - self.settings[field] = {} - propObj = self.settings[field] - if key not in propObj: - type = propObj.get('type') - if key == 'field': - default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field - elif key == 'convert': - default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore' - else: - default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None) - propObj[key] = default - return propObj[key] - - def _resolve_field_value(self, field, value, convertNone=False): - if value is None: - if not convertNone: - return None - else: - value = value.lower() - conversion = self._get_field_setting(field, 'convert') - if conversion == 'ignore': - return None - if conversion == 'string': - return value - elif conversion == 'float_none': - return float_or_none(value) - elif conversion == 'bytes': - return FileDownloader.parse_bytes(value) - elif conversion == 'order': - order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order') - use_regex = self._get_field_setting(field, 'regex') - list_length = len(order_list) - empty_pos = order_list.index('') if '' in order_list else list_length + 1 - if use_regex and value is not None: - for i, regex in enumerate(order_list): - if regex and re.match(regex, value): - return list_length - i - return list_length - empty_pos # not in list - else: # not regex or value = None - return list_length - (order_list.index(value) if value in order_list else empty_pos) - else: - if 
value.isnumeric(): - return float(value) - else: - self.settings[field]['convert'] = 'string' - return value - - def evaluate_params(self, params, sort_extractor): - self._use_free_order = params.get('prefer_free_formats', False) - self._sort_user = params.get('format_sort', []) - self._sort_extractor = sort_extractor - - def add_item(field, reverse, closest, limit_text): - field = field.lower() - if field in self._order: - return - self._order.append(field) - limit = self._resolve_field_value(field, limit_text) - data = { - 'reverse': reverse, - 'closest': False if limit is None else closest, - 'limit_text': limit_text, - 'limit': limit} - if field in self.settings: - self.settings[field].update(data) - else: - self.settings[field] = data - - sort_list = ( - tuple(field for field in self.default if self._get_field_setting(field, 'forced')) - + (tuple() if params.get('format_sort_force', False) - else tuple(field for field in self.default if self._get_field_setting(field, 'priority'))) - + tuple(self._sort_user) + tuple(sort_extractor) + self.default) - - for item in sort_list: - match = re.match(self.regex, item) - if match is None: - raise ExtractorError('Invalid format sort string "%s" given by extractor' % item) - field = match.group('field') - if field is None: - continue - if self._get_field_setting(field, 'type') == 'alias': - alias, field = field, self._get_field_setting(field, 'field') - if self._get_field_setting(alias, 'deprecated'): - self.ydl.deprecation_warning( - f'Format sorting alias {alias} is deprecated ' - f'and may be removed in a future version. Please use {field} instead') - reverse = match.group('reverse') is not None - closest = match.group('separator') == '~' - limit_text = match.group('limit') - - has_limit = limit_text is not None - has_multiple_fields = self._get_field_setting(field, 'type') == 'combined' - has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit') - - fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,) - limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple() - limit_count = len(limits) - for (i, f) in enumerate(fields): - add_item(f, reverse, closest, - limits[i] if i < limit_count - else limits[0] if has_limit and not has_multiple_limits - else None) - - def print_verbose_info(self, write_debug): - if self._sort_user: - write_debug('Sort order given by user: %s' % ', '.join(self._sort_user)) - if self._sort_extractor: - write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor)) - write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % ( - '+' if self._get_field_setting(field, 'reverse') else '', field, - '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':', - self._get_field_setting(field, 'limit_text'), - self._get_field_setting(field, 'limit')) - if self._get_field_setting(field, 'limit_text') is not None else '') - for field in self._order if self._get_field_setting(field, 'visible')])) - - def _calculate_field_preference_from_value(self, format, field, type, value): - reverse = self._get_field_setting(field, 'reverse') - closest = self._get_field_setting(field, 'closest') - limit = self._get_field_setting(field, 'limit') - - if type == 'extractor': - maximum = self._get_field_setting(field, 'max') - if value is None or (maximum is not None and value >= maximum): - value = -1 - elif type == 'boolean': - in_list = self._get_field_setting(field, 'in_list') - not_in_list 
= self._get_field_setting(field, 'not_in_list') - value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1 - elif type == 'ordered': - value = self._resolve_field_value(field, value, True) - - # try to convert to number - val_num = float_or_none(value, default=self._get_field_setting(field, 'default')) - is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None - if is_num: - value = val_num - - return ((-10, 0) if value is None - else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher - else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest - else (0, value, 0) if not reverse and (limit is None or value <= limit) - else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit - else (-1, value, 0)) - - def _calculate_field_preference(self, format, field): - type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple - get_value = lambda f: format.get(self._get_field_setting(f, 'field')) - if type == 'multiple': - type = 'field' # Only 'field' is allowed in multiple for now - actual_fields = self._get_field_setting(field, 'field') - - value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields) - else: - value = get_value(field) - return self._calculate_field_preference_from_value(format, field, type, value) - - def calculate_preference(self, format): - # Determine missing protocol - if not format.get('protocol'): - format['protocol'] = determine_protocol(format) - - # Determine missing ext - if not format.get('ext') and 'url' in format: - format['ext'] = determine_ext(format['url']) - if format.get('vcodec') == 'none': - format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none' - format['video_ext'] = 'none' - else: - format['video_ext'] = format['ext'] - format['audio_ext'] = 'none' - # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported? - # format['preference'] = -1000 - - # Determine missing bitrates - if format.get('tbr') is None: - if format.get('vbr') is not None and format.get('abr') is not None: - format['tbr'] = format.get('vbr', 0) + format.get('abr', 0) - else: - if format.get('vcodec') != 'none' and format.get('vbr') is None: - format['vbr'] = format.get('tbr') - format.get('abr', 0) - if format.get('acodec') != 'none' and format.get('abr') is None: - format['abr'] = format.get('tbr') - format.get('vbr', 0) - - return tuple(self._calculate_field_preference(format, field) for field in self._order) + deprecation_warning( + 'hypervideo_dl.InfoExtractor.FormatSort is deprecated and may be removed in the future. ' + 'Use hypervideo_dl.utils.FormatSorter instead') + return FormatSort def _sort_formats(self, formats, field_preference=[]): - if not formats: + if not field_preference: + self._downloader.deprecation_warning( + 'hypervideo_dl.InfoExtractor._sort_formats is deprecated and is no longer required') return - format_sort = self.FormatSort(self, field_preference) - formats.sort(key=lambda f: format_sort.calculate_preference(f)) + self._downloader.deprecation_warning( + 'hypervideo_dl.InfoExtractor._sort_formats is deprecated and no longer works as expected. 
' + 'Return _format_sort_fields in the info_dict instead') + if formats: + formats[0]['__sort_fields'] = field_preference def _check_formats(self, formats, video_id): if formats: @@ -1969,14 +1743,9 @@ class InfoExtractor(object): else 'https:') def _proto_relative_url(self, url, scheme=None): - if url is None: - return url - if url.startswith('//'): - if scheme is None: - scheme = self.http_scheme() - return scheme + url - else: - return url + scheme = scheme or self.http_scheme() + assert scheme.endswith(':') + return sanitize_url(url, scheme=scheme[:-1]) def _sleep(self, timeout, video_id, msg_template=None): if msg_template is None: @@ -1988,17 +1757,19 @@ class InfoExtractor(object): def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None, transform_source=lambda s: fix_xml_ampersands(s).strip(), fatal=True, m3u8_id=None, data=None, headers={}, query={}): - manifest = self._download_xml( + res = self._download_xml_handle( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', # Some manifests may be malformed, e.g. prosiebensat1 generated manifests # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244) transform_source=transform_source, fatal=fatal, data=data, headers=headers, query=query) - - if manifest is False: + if res is False: return [] + manifest, urlh = res + manifest_url = urlh.geturl() + return self._parse_f4m_formats( manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id, transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id) @@ -2006,7 +1777,7 @@ class InfoExtractor(object): def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None, transform_source=lambda s: fix_xml_ampersands(s).strip(), fatal=True, m3u8_id=None): - if not isinstance(manifest, compat_etree_Element) and not fatal: + if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal: return [] # currently hypervideo cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy @@ -2166,7 +1937,7 @@ class InfoExtractor(object): ]), m3u8_doc) def format_url(url): - return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url) + return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url) if self.get_param('hls_split_discontinuity', False): def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None): @@ -2342,7 +2113,7 @@ class InfoExtractor(object): audio_group_id = last_stream_inf.get('AUDIO') # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which # references a rendition group MUST have a CODECS attribute. - # However, this is not always respected, for example, [2] + # However, this is not always respected. E.g. 
[2] # contains EXT-X-STREAM-INF tag which references AUDIO # rendition group but does not have CODECS and despite # referencing an audio group it represents a complete @@ -2406,12 +2177,14 @@ class InfoExtractor(object): return '/'.join(out) def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): - smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) - - if smil is False: + res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) + if res is False: assert not fatal return [], {} + smil, urlh = res + smil_url = urlh.geturl() + namespace = self._parse_smil_namespace(smil) fmts = self._parse_smil_formats( @@ -2428,13 +2201,17 @@ class InfoExtractor(object): return fmts def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): - smil = self._download_smil(smil_url, video_id, fatal=fatal) - if smil is False: + res = self._download_smil(smil_url, video_id, fatal=fatal) + if res is False: return {} + + smil, urlh = res + smil_url = urlh.geturl() + return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None): - return self._download_xml( + return self._download_xml_handle( smil_url, video_id, 'Downloading SMIL file', 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source) @@ -2533,7 +2310,7 @@ class InfoExtractor(object): }) continue - src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) + src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src) src_url = src_url.strip() if proto == 'm3u8' or src_ext == 'm3u8': @@ -2556,7 +2333,7 @@ class InfoExtractor(object): 'plugin': 'flowplayer-3.2.0.1', } f4m_url += '&' if '?' in f4m_url else '?' - f4m_url += compat_urllib_parse_urlencode(f4m_params) + f4m_url += urllib.parse.urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) elif src_ext == 'mpd': formats.extend(self._extract_mpd_formats( @@ -2613,11 +2390,15 @@ class InfoExtractor(object): return subtitles def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True): - xspf = self._download_xml( + res = self._download_xml_handle( xspf_url, playlist_id, 'Downloading xpsf playlist', 'Unable to download xspf manifest', fatal=fatal) - if xspf is False: + if res is False: return [] + + xspf, urlh = res + xspf_url = urlh.geturl() + return self._parse_xspf( xspf, playlist_id, xspf_url=xspf_url, xspf_base_url=base_url(xspf_url)) @@ -2651,7 +2432,6 @@ class InfoExtractor(object): 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), }) - self._sort_formats(formats) entries.append({ 'id': playlist_id, @@ -2682,7 +2462,10 @@ class InfoExtractor(object): mpd_doc, urlh = res if mpd_doc is None: return [], {} - mpd_base_url = base_url(urlh.geturl()) + + # We could have been redirected to a new url when we retrieved our mpd file. 
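The `_download_*_handle` conversions in the surrounding hunks (f4m, SMIL, XSPF, MPD) all exist for the same reason: the manifest may be served after an HTTP redirect, and relative references inside it must be resolved against the final URL rather than the requested one. A standalone sketch of the pattern using only the standard library (illustrative, not the extractor code itself):

    import urllib.parse
    import urllib.request

    def fetch_manifest(url):
        # urlopen follows redirects; geturl() reports the URL actually served
        with urllib.request.urlopen(url) as resp:
            return resp.read().decode('utf-8'), resp.geturl()

    # a relative segment path found in the manifest is then joined against the final URL:
    # body, final_url = fetch_manifest('https://example.com/stream.mpd')
    # segment_url = urllib.parse.urljoin(final_url, 'seg-1.m4s')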
+ mpd_url = urlh.geturl() + mpd_base_url = base_url(mpd_url) return self._parse_mpd_formats_and_subtitles( mpd_doc, mpd_id, mpd_base_url, mpd_url) @@ -2790,15 +2573,20 @@ class InfoExtractor(object): mime_type = representation_attrib['mimeType'] content_type = representation_attrib.get('contentType', mime_type.split('/')[0]) - codecs = parse_codecs(representation_attrib.get('codecs', '')) + codec_str = representation_attrib.get('codecs', '') + # Some kind of binary subtitle found in some youtube livestreams + if mime_type == 'application/x-rawcc': + codecs = {'scodec': codec_str} + else: + codecs = parse_codecs(codec_str) if content_type not in ('video', 'audio', 'text'): if mime_type == 'image/jpeg': content_type = mime_type - elif codecs['vcodec'] != 'none': + elif codecs.get('vcodec', 'none') != 'none': content_type = 'video' - elif codecs['acodec'] != 'none': + elif codecs.get('acodec', 'none') != 'none': content_type = 'audio' - elif codecs.get('tcodec', 'none') != 'none': + elif codecs.get('scodec', 'none') != 'none': content_type = 'text' elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'): content_type = 'text' @@ -2809,12 +2597,12 @@ class InfoExtractor(object): base_url = '' for element in (representation, adaptation_set, period, mpd_doc): base_url_e = element.find(_add_ns('BaseURL')) - if base_url_e is not None: + if try_call(lambda: base_url_e.text) is not None: base_url = base_url_e.text + base_url if re.match(r'^https?://', base_url): break if mpd_base_url and base_url.startswith('/'): - base_url = compat_urlparse.urljoin(mpd_base_url, base_url) + base_url = urllib.parse.urljoin(mpd_base_url, base_url) elif mpd_base_url and not re.match(r'^https?://', base_url): if not mpd_base_url.endswith('/'): mpd_base_url += '/' @@ -2869,6 +2657,8 @@ class InfoExtractor(object): def prepare_template(template_name, identifiers): tmpl = representation_ms_info[template_name] + if representation_id is not None: + tmpl = tmpl.replace('$RepresentationID$', representation_id) # First of, % characters outside $...$ templates # must be escaped by doubling for proper processing # by % operator string formatting used further (see @@ -2883,8 +2673,6 @@ class InfoExtractor(object): t += c # Next, $...$ templates are translated to their # %(...) counterparts to be used with % operator - if representation_id is not None: - t = t.replace('$RepresentationID$', representation_id) t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) t.replace('$$', '$') @@ -2960,8 +2748,8 @@ class InfoExtractor(object): segment_number += 1 segment_time += segment_d elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info: - # No media template - # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI + # No media template, + # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI # or any YouTube dashsegments video fragments = [] segment_index = 0 @@ -2978,7 +2766,7 @@ class InfoExtractor(object): representation_ms_info['fragments'] = fragments elif 'segment_urls' in representation_ms_info: # Segment URLs with no SegmentTimeline - # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 + # E.g. 
https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 # https://github.com/ytdl-org/youtube-dl/pull/14844 fragments = [] segment_duration = float_or_none( @@ -3070,9 +2858,10 @@ class InfoExtractor(object): stream_name = stream.get('Name') stream_language = stream.get('Language', 'und') for track in stream.findall('QualityLevel'): - fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None) + KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'} + fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag')) # TODO: add support for WVC1 and WMAP - if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'): + if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'): self.report_warning('%s is not a supported codec' % fourcc) continue tbr = int(track.attrib['Bitrate']) // 1000 @@ -3084,7 +2873,7 @@ class InfoExtractor(object): sampling_rate = int_or_none(track.get('SamplingRate')) track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) - track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern) + track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern) fragments = [] fragment_ctx = { @@ -3103,7 +2892,7 @@ class InfoExtractor(object): fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat for _ in range(fragment_repeat): fragments.append({ - 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern), + 'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern), 'duration': fragment_ctx['duration'] / stream_timescale, }) fragment_ctx['time'] += fragment_ctx['duration'] @@ -3171,7 +2960,8 @@ class InfoExtractor(object): return f return {} - def _media_formats(src, cur_media_type, type_info={}): + def _media_formats(src, cur_media_type, type_info=None): + type_info = type_info or {} full_url = absolute_url(src) ext = type_info.get('ext') or determine_ext(full_url) if ext == 'm3u8': @@ -3189,12 +2979,13 @@ class InfoExtractor(object): formats = [{ 'url': full_url, 'vcodec': 'none' if cur_media_type == 'audio' else None, + 'ext': ext, }] return is_plain_url, formats entries = [] # amp-video and amp-audio are very similar to their HTML5 counterparts - # so we wll include them right here (see + # so we will include them right here (see # https://www.ampproject.org/docs/reference/components/amp-video) # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/ _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)' @@ -3204,8 +2995,8 @@ class InfoExtractor(object): media_tags.extend(re.findall( # We only allow video|audio followed by a whitespace or '>'. # Allowing more characters may end up in significant slow down (see - # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL: - # http://www.porntrex.com/maps/videositemap.xml). + # https://github.com/ytdl-org/youtube-dl/issues/11979, + # e.g. http://www.porntrex.com/maps/videositemap.xml). 
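The comment above is about regex backtracking cost: the tag name must be followed by whitespace or `>`, otherwise a pattern for `<video` would also fire inside tokens such as `<videositemap>` and force long scans over sitemap-sized pages. A small self-contained illustration of that constraint (the pattern mirrors `_MEDIA_TAG_NAME_RE` from this hunk; results are what Python's `re` gives):

    import re

    MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'

    for snippet in ('<video src="a.mp4">', '<amp-video width=1>', '<videositemap>'):
        # only whitespace or '>' may follow the tag name
        print(snippet, bool(re.match(rf'<{MEDIA_TAG_NAME_RE}(?:\s|>)', snippet)))
    # -> True, True, False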
r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage)) for media_tag, _, media_type, media_content in media_tags: media_info = { 'formats': [], 'subtitles': {}, } media_attributes = extract_attributes(media_tag) - src = strip_or_none(media_attributes.get('src')) + src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source'))) if src: - _, formats = _media_formats(src, media_type) + f = parse_content_type(media_attributes.get('type')) + _, formats = _media_formats(src, media_type, f) media_info['formats'].extend(formats) media_info['thumbnail'] = absolute_url(media_attributes.get('poster')) if media_content: for source_tag in re.findall(r'<source[^>]+>', media_content): s_attr = extract_attributes(source_tag) # data-video-src and data-src are non standard but seen # several times in the wild - src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src'))) + src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source'))) if not src: continue f = parse_content_type(s_attr.get('type')) @@ -3332,7 +3124,7 @@ class InfoExtractor(object): http_f = f.copy() del http_f['manifest_url'] http_url = re.sub( - REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url']) + REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url']) http_f.update({ 'format_id': http_f['format_id'].replace('hls-', protocol + '-'), 'url': http_url, @@ -3344,7 +3136,7 @@ class InfoExtractor(object): return formats, subtitles def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): - query = compat_urlparse.urlparse(url).query + query = urllib.parse.urlparse(url).query url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) mobj = re.search( r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url) formats = [] def manifest_url(manifest): - m_url = '%s/%s' % (http_base_url, manifest) + m_url = f'{http_base_url}/{manifest}' if query: m_url += '?%s' % query return m_url @@ -3390,7 +3182,7 @@ class InfoExtractor(object): for protocol in ('rtmp', 'rtsp'): if protocol not in skip_protocols: formats.append({ - 'url': '%s:%s' % (protocol, url_base), + 'url': f'{protocol}:{url_base}', 'format_id': protocol, 'protocol': protocol, }) @@ -3450,7 +3242,7 @@ class InfoExtractor(object): if not isinstance(track, dict): continue track_kind = track.get('kind') - if not track_kind or not isinstance(track_kind, compat_str): + if not track_kind or not isinstance(track_kind, str): continue if track_kind.lower() not in ('captions', 'subtitles'): continue @@ -3477,7 +3269,6 @@ class InfoExtractor(object): 'url': formats[0]['url'], }) else: - self._sort_formats(formats) entry['formats'] = formats entries.append(entry) if len(entries) == 1: @@ -3523,13 +3314,14 @@ class InfoExtractor(object): # Often no height is provided but there is a label in # format like "1080p", "720p SD", or 1080. 
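The regex that follows pulls that leading three-or-four-digit height out of the label; a quick standalone check of how it treats the values named in the comment (plain `re.search` here, where the extractor itself goes through `self._search_regex` with `default=None`):

    import re

    def height_from_label(label):
        m = re.search(r'^(\d{3,4})[pP]?(?:\b|$)', str(label or ''))
        return int(m.group(1)) if m else None

    print([height_from_label(x) for x in ('1080p', '720p SD', 1080, 'HD')])
    # -> [1080, 720, 1080, None]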
height = int_or_none(self._search_regex( - r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''), + r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''), 'height', default=None)) a_format = { 'url': source_url, 'width': int_or_none(source.get('width')), 'height': height, - 'tbr': int_or_none(source.get('bitrate')), + 'tbr': int_or_none(source.get('bitrate'), scale=1000), + 'filesize': int_or_none(source.get('filesize')), 'ext': ext, } if source_url.startswith('rtmp'): @@ -3556,7 +3348,7 @@ class InfoExtractor(object): def _int(self, v, name, fatal=False, **kwargs): res = int_or_none(v, **kwargs) if res is None: - msg = 'Failed to extract %s: Could not parse value %r' % (name, v) + msg = f'Failed to extract {name}: Could not parse value {v!r}' if fatal: raise ExtractorError(msg) else: @@ -3566,7 +3358,7 @@ class InfoExtractor(object): def _float(self, v, name, fatal=False, **kwargs): res = float_or_none(v, **kwargs) if res is None: - msg = 'Failed to extract %s: Could not parse value %r' % (name, v) + msg = f'Failed to extract {name}: Could not parse value {v!r}' if fatal: raise ExtractorError(msg) else: @@ -3575,17 +3367,15 @@ class InfoExtractor(object): def _set_cookie(self, domain, name, value, expire_time=None, port=None, path='/', secure=False, discard=False, rest={}, **kwargs): - cookie = compat_cookiejar_Cookie( + cookie = http.cookiejar.Cookie( 0, name, value, port, port is not None, domain, True, domain.startswith('.'), path, True, secure, expire_time, discard, None, None, rest) - self._downloader.cookiejar.set_cookie(cookie) + self.cookiejar.set_cookie(cookie) def _get_cookies(self, url): - """ Return a compat_cookies_SimpleCookie with the cookies for the url """ - req = sanitized_Request(url) - self._downloader.cookiejar.add_cookie_header(req) - return compat_cookies_SimpleCookie(req.get_header('Cookie')) + """ Return a http.cookies.SimpleCookie with the cookies for the url """ + return LenientSimpleCookie(self._downloader._calc_cookies(url)) def _apply_first_set_cookie_header(self, url_handle, cookie): """ @@ -3604,9 +3394,7 @@ class InfoExtractor(object): for header, cookies in url_handle.headers.items(): if header.lower() != 'set-cookie': continue - if sys.version_info[0] >= 3: - cookies = cookies.encode('iso-8859-1') - cookies = cookies.decode('utf-8') + cookies = cookies.encode('iso-8859-1').decode('utf-8') cookie_value = re.search( r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies) if cookie_value: @@ -3614,34 +3402,82 @@ class InfoExtractor(object): self._set_cookie(domain, cookie, value) break - def get_testcases(self, include_onlymatching=False): - t = getattr(self, '_TEST', None) + @classmethod + def get_testcases(cls, include_onlymatching=False): + # Do not look in super classes + t = vars(cls).get('_TEST') if t: - assert not hasattr(self, '_TESTS'), \ - '%s has _TEST and _TESTS' % type(self).__name__ + assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS' tests = [t] else: - tests = getattr(self, '_TESTS', []) + tests = vars(cls).get('_TESTS', []) for t in tests: if not include_onlymatching and t.get('only_matching', False): continue - t['name'] = type(self).__name__[:-len('IE')] + t['name'] = cls.ie_key() yield t - def is_suitable(self, age_limit): - """ Test whether the extractor is generally suitable for the given - age limit (i.e. 
pornographic sites are not, all others usually are) """ - - any_restricted = False - for tc in self.get_testcases(include_onlymatching=False): - if tc.get('playlist', []): - tc = tc['playlist'][0] - is_restricted = age_restricted( - tc.get('info_dict', {}).get('age_limit'), age_limit) - if not is_restricted: - return True - any_restricted = any_restricted or is_restricted - return not any_restricted + @classmethod + def get_webpage_testcases(cls): + tests = vars(cls).get('_WEBPAGE_TESTS', []) + for t in tests: + t['name'] = cls.ie_key() + return tests + + @classproperty(cache=True) + def age_limit(cls): + """Get age limit from the testcases""" + return max(traverse_obj( + (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()), + (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0]) + + @classproperty(cache=True) + def _RETURN_TYPE(cls): + """What the extractor returns: "video", "playlist", "any", or None (Unknown)""" + tests = tuple(cls.get_testcases(include_onlymatching=False)) + if not tests: + return None + elif not any(k.startswith('playlist') for test in tests for k in test): + return 'video' + elif all(any(k.startswith('playlist') for k in test) for test in tests): + return 'playlist' + return 'any' + + @classmethod + def is_single_video(cls, url): + """Returns whether the URL is of a single video, None if unknown""" + assert cls.suitable(url), 'The URL must be suitable for the extractor' + return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE) + + @classmethod + def is_suitable(cls, age_limit): + """Test whether the extractor is generally suitable for the given age limit""" + return not age_restricted(cls.age_limit, age_limit) + + @classmethod + def description(cls, *, markdown=True, search_examples=None): + """Description of the extractor""" + desc = '' + if cls._NETRC_MACHINE: + if markdown: + desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]' + else: + desc += f' [{cls._NETRC_MACHINE}]' + if cls.IE_DESC is False: + desc += ' [HIDDEN]' + elif cls.IE_DESC: + desc += f' {cls.IE_DESC}' + if cls.SEARCH_KEY: + desc += f'; "{cls.SEARCH_KEY}:" prefix' + if search_examples: + _COUNTS = ('', '5', '10', 'all') + desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")' + if not cls.working(): + desc += ' (**Currently broken**)' if markdown else ' (Currently broken)' + + # Escape emojis. Ref: https://github.com/github/markup/issues/1153 + name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME + return f'{name}:{desc}' if desc else name def extract_subtitles(self, *args, **kwargs): if (self.get_param('writesubtitles', False) @@ -3652,6 +3488,9 @@ class InfoExtractor(object): def _get_subtitles(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') + class CommentsDisabled(Exception): + """Raise in _get_comments if comments are disabled for the video""" + def extract_comments(self, *args, **kwargs): if not self.get_param('getcomments'): return None @@ -3667,6 +3506,8 @@ class InfoExtractor(object): interrupted = False except KeyboardInterrupt: self.to_screen('Interrupted by user') + except self.CommentsDisabled: + return {'comments': None, 'comment_count': None} except Exception as e: if self.get_param('ignoreerrors') is not True: raise @@ -3686,7 +3527,7 @@ class InfoExtractor(object): def _merge_subtitle_items(subtitle_list1, subtitle_list2): """ Merge subtitle items for one language. Items with duplicated URLs/data will be dropped. 
""" - list1_data = set((item.get('url'), item.get('data')) for item in subtitle_list1) + list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1} ret = list(subtitle_list1) ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data) return ret @@ -3710,11 +3551,15 @@ class InfoExtractor(object): def _get_automatic_captions(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') + @functools.cached_property + def _cookies_passed(self): + """Whether cookies have been passed to YoutubeDL""" + return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None + def mark_watched(self, *args, **kwargs): if not self.get_param('mark_watched', False): return - if (self.supports_login() and self._get_login_info()[0] is not None - or self.get_param('cookiefile') or self.get_param('cookiesfrombrowser')): + if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed: self._mark_watched(*args, **kwargs) def _mark_watched(self, *args, **kwargs): @@ -3727,11 +3572,15 @@ class InfoExtractor(object): headers['Ytdl-request-proxy'] = geo_verification_proxy return headers - def _generic_id(self, url): - return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) + @staticmethod + def _generic_id(url): + return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) - def _generic_title(self, url): - return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) + def _generic_title(self, url='', webpage='', *, default=None): + return (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, default=None) + or urllib.parse.unquote(os.path.splitext(url_basename(url))[0]) + or default) @staticmethod def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None): @@ -3754,8 +3603,8 @@ class InfoExtractor(object): @param default The default value to return when the key is not present (default: []) @param casesense When false, the values are converted to lower case ''' - val = traverse_obj( - self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key)) + ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key() + val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key)) if val is None: return [] if default is NO_DEFAULT else default return list(val) if casesense else [x.lower() for x in val] @@ -3776,6 +3625,72 @@ class InfoExtractor(object): self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}') return True + def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True): + RetryManager.report_retry( + err, _count or int(fatal), _retries, + info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning, + sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor')) + + def RetryManager(self, **kwargs): + return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs) + + def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs): + display_id = traverse_obj(info_dict, 'display_id', 'id') + self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}') + return self._downloader.get_info_extractor('Generic')._extract_embeds( + smuggle_url(url, 
{'block_ies': [self.ie_key()]}), *args, **kwargs) + + @classmethod + def extract_from_webpage(cls, ydl, url, webpage): + ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType) + else ydl.get_info_extractor(cls.ie_key())) + for info in ie._extract_from_webpage(url, webpage) or []: + # url = None since we do not want to set (webpage/original)_url + ydl.add_default_extra_info(info, ie, None) + yield info + + @classmethod + def _extract_from_webpage(cls, url, webpage): + for embed_url in orderedSet( + cls._extract_embed_urls(url, webpage) or [], lazy=True): + yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls) + + @classmethod + def _extract_embed_urls(cls, url, webpage): + """@returns all the embed urls on the webpage""" + if '_EMBED_URL_RE' not in cls.__dict__: + assert isinstance(cls._EMBED_REGEX, (list, tuple)) + for idx, regex in enumerate(cls._EMBED_REGEX): + assert regex.count('(?P<url>') == 1, \ + f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}' + cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX)) + + for regex in cls._EMBED_URL_RE: + for mobj in regex.finditer(webpage): + embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url'))) + if cls._VALID_URL is False or cls.suitable(embed_url): + yield embed_url + + class StopExtraction(Exception): + pass + + @classmethod + def _extract_url(cls, webpage): # TODO: Remove + """Only for compatibility with some older extractors""" + return next(iter(cls._extract_embed_urls(None, webpage) or []), None) + + @classmethod + def __init_subclass__(cls, *, plugin_name=None, **kwargs): + if plugin_name: + mro = inspect.getmro(cls) + super_class = cls.__wrapped__ = mro[mro.index(cls) + 1] + cls.IE_NAME, cls.ie_key = f'{super_class.IE_NAME}+{plugin_name}', super_class.ie_key + while getattr(super_class, '__wrapped__', None): + super_class = super_class.__wrapped__ + setattr(sys.modules[super_class.__module__], super_class.__name__, cls) + + return super().__init_subclass__(**kwargs) + class SearchInfoExtractor(InfoExtractor): """ @@ -3785,9 +3700,10 @@ class SearchInfoExtractor(InfoExtractor): """ _MAX_RESULTS = float('inf') + _RETURN_TYPE = 'playlist' - @classmethod - def _make_valid_url(cls): + @classproperty + def _VALID_URL(cls): return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY def _real_extract(self, query): @@ -3799,7 +3715,7 @@ class SearchInfoExtractor(InfoExtractor): else: n = int(prefix) if n <= 0: - raise ExtractorError('invalid download number %s for query "%s"' % (n, query)) + raise ExtractorError(f'invalid download number {n} for query "{query}"') elif n > self._MAX_RESULTS: self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n)) n = self._MAX_RESULTS @@ -3816,6 +3732,15 @@ class SearchInfoExtractor(InfoExtractor): """Returns an iterator of search results""" raise NotImplementedError('This method must be implemented by subclasses') - @property - def SEARCH_KEY(self): - return self._SEARCH_KEY + @classproperty + def SEARCH_KEY(cls): + return cls._SEARCH_KEY + + +class UnsupportedURLIE(InfoExtractor): + _VALID_URL = '.*' + _ENABLED = False + IE_DESC = False + + def _real_extract(self, url): + raise UnsupportedError(url) diff --git a/hypervideo_dl/extractor/commonmistakes.py b/hypervideo_dl/extractor/commonmistakes.py index eb76fe5..a4a38cf 100644 --- a/hypervideo_dl/extractor/commonmistakes.py +++ b/hypervideo_dl/extractor/commonmistakes.py @@ -1,16 +1,10 @@ -from __future__ import
unicode_literals - - import sys - from .common import InfoExtractor from ..utils import ExtractorError class CommonMistakesIE(InfoExtractor): IE_DESC = False # Do not list - _VALID_URL = r'''(?x) - (?:url|URL)$ - ''' + _VALID_URL = r'(?:url|URL|hypervideo)$' _TESTS = [{ 'url': 'url', @@ -35,9 +29,7 @@ class UnicodeBOMIE(InfoExtractor): IE_DESC = False _VALID_URL = r'(?P<bom>\ufeff)(?P<id>.*)$' - # Disable test for python 3.2 since BOM is broken in re in this version - # (see https://github.com/ytdl-org/youtube-dl/issues/9751) - _TESTS = [] if (3, 0) < sys.version_info <= (3, 3) else [{ + _TESTS = [{ 'url': '\ufeffhttp://www.youtube.com/watch?v=BaW_jenozKc', 'only_matching': True, }] diff --git a/hypervideo_dl/extractor/commonprotocols.py b/hypervideo_dl/extractor/commonprotocols.py index 3708c6a..2f93e8e 100644 --- a/hypervideo_dl/extractor/commonprotocols.py +++ b/hypervideo_dl/extractor/commonprotocols.py @@ -1,10 +1,6 @@ -from __future__ import unicode_literals - +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_urlparse, -) class RtmpIE(InfoExtractor): @@ -28,7 +24,7 @@ class RtmpIE(InfoExtractor): 'formats': [{ 'url': url, 'ext': 'flv', - 'format_id': compat_urlparse.urlparse(url).scheme, + 'format_id': urllib.parse.urlparse(url).scheme, }], } diff --git a/hypervideo_dl/extractor/condenast.py b/hypervideo_dl/extractor/condenast.py index 54e7af8..3170c29 100644 --- a/hypervideo_dl/extractor/condenast.py +++ b/hypervideo_dl/extractor/condenast.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -61,7 +58,10 @@ class CondeNastIE(InfoExtractor): )''' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) - EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys()) + _EMBED_REGEX = [r'''(?x) + <(?:iframe|script)[^>]+?src=(["\'])(?P<url> + (?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+? + )\1''' % '|'.join(_SITES.keys())] _TESTS = [{ 'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', @@ -197,7 +197,6 @@ class CondeNastIE(InfoExtractor): 'ext': ext, 'quality': 1 if quality == 'high' else 0, }) - self._sort_formats(formats) subtitles = {} for t, caption in video_info.get('captions', {}).items(): diff --git a/hypervideo_dl/extractor/contv.py b/hypervideo_dl/extractor/contv.py index 84b462d..d69e816 100644 --- a/hypervideo_dl/extractor/contv.py +++ b/hypervideo_dl/extractor/contv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( float_or_none, @@ -72,8 +69,6 @@ class CONtvIE(InfoExtractor): 'url': media_mp4_url, }) - self._sort_formats(formats) - subtitles = {} captions = m_details.get('captions') or {} for caption_url in captions.values(): diff --git a/hypervideo_dl/extractor/corus.py b/hypervideo_dl/extractor/corus.py index 1194613..c03d653 100644 --- a/hypervideo_dl/extractor/corus.py +++ b/hypervideo_dl/extractor/corus.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .theplatform import ThePlatformFeedIE from ..utils import ( dict_get, @@ -11,7 +7,7 @@ from ..utils import ( ) -class CorusIE(ThePlatformFeedIE): +class CorusIE(ThePlatformFeedIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'''(?x) https?:// (?:www\.)?
@@ -130,7 +126,6 @@ class CorusIE(ThePlatformFeedIE): smil, smil_url, video_id, namespace)) if not formats and video.get('drm'): self.report_drm(video_id) - self._sort_formats(formats) subtitles = {} for track in video.get('tracks', []): diff --git a/hypervideo_dl/extractor/coub.py b/hypervideo_dl/extractor/coub.py index e90aa19..9bab698 100644 --- a/hypervideo_dl/extractor/coub.py +++ b/hypervideo_dl/extractor/coub.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -107,8 +104,6 @@ class CoubIE(InfoExtractor): 'source_preference': preference_key(MOBILE), }) - self._sort_formats(formats) - thumbnail = coub.get('picture') duration = float_or_none(coub.get('duration')) timestamp = parse_iso8601(coub.get('published_at') or coub.get('created_at')) diff --git a/hypervideo_dl/extractor/cozytv.py b/hypervideo_dl/extractor/cozytv.py index d49f1ca..5ef5afc 100644 --- a/hypervideo_dl/extractor/cozytv.py +++ b/hypervideo_dl/extractor/cozytv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import unified_strdate diff --git a/hypervideo_dl/extractor/cpac.py b/hypervideo_dl/extractor/cpac.py index 2274115..0f23f2b 100644 --- a/hypervideo_dl/extractor/cpac.py +++ b/hypervideo_dl/extractor/cpac.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -12,13 +9,6 @@ from ..utils import ( urljoin, ) -# compat_range -try: - if callable(xrange): - range = xrange -except (NameError, TypeError): - pass - class CPACIE(InfoExtractor): IE_NAME = 'cpac' @@ -64,8 +54,6 @@ class CPACIE(InfoExtractor): else: fmt['language_preference'] = -10 - self._sort_formats(formats) - category = str_or_none(content['details']['category_%s_t' % (url_lang, )]) def is_live(v_type): diff --git a/hypervideo_dl/extractor/cracked.py b/hypervideo_dl/extractor/cracked.py index f77a68e..c6aabcc 100644 --- a/hypervideo_dl/extractor/cracked.py +++ b/hypervideo_dl/extractor/cracked.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/crackle.py b/hypervideo_dl/extractor/crackle.py index db4962c..4610015 100644 --- a/hypervideo_dl/extractor/crackle.py +++ b/hypervideo_dl/extractor/crackle.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals, division - import hashlib import hmac import re @@ -180,7 +177,6 @@ class CrackleIE(InfoExtractor): }) if not formats and has_drm: self.report_drm(video_id) - self._sort_formats(formats) description = media.get('Description') duration = int_or_none(media.get( diff --git a/hypervideo_dl/extractor/craftsy.py b/hypervideo_dl/extractor/craftsy.py index ed2f442..307bfb9 100644 --- a/hypervideo_dl/extractor/craftsy.py +++ b/hypervideo_dl/extractor/craftsy.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .brightcove import BrightcoveNewIE from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/crooksandliars.py b/hypervideo_dl/extractor/crooksandliars.py index 7fb782d..4de7e3d 100644 --- a/hypervideo_dl/extractor/crooksandliars.py +++ b/hypervideo_dl/extractor/crooksandliars.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -9,6 +7,8 @@ from ..utils import ( class 
CrooksAndLiarsIE(InfoExtractor): _VALID_URL = r'https?://embed\.crooksandliars\.com/(?:embed|v)/(?P<id>[A-Za-z0-9]+)' + _EMBED_REGEX = [r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1'] + _TESTS = [{ 'url': 'https://embed.crooksandliars.com/embed/8RUoRhRi', 'info_dict': { @@ -45,7 +45,6 @@ class CrooksAndLiarsIE(InfoExtractor): 'format_id': item['type'], 'quality': quality(item['type']), } for item in manifest['flavors'] if item['mime'].startswith('video/')] - self._sort_formats(formats) return { 'url': url, diff --git a/hypervideo_dl/extractor/crowdbunker.py b/hypervideo_dl/extractor/crowdbunker.py index 72906af..d83c015 100644 --- a/hypervideo_dl/extractor/crowdbunker.py +++ b/hypervideo_dl/extractor/crowdbunker.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools from .common import InfoExtractor @@ -63,7 +60,6 @@ class CrowdBunkerIE(InfoExtractor): 'width': int_or_none(image.get('width')), } for image in video_json.get('thumbnails') or [] if image.get('url')] - self._sort_formats(formats) return { 'id': id, 'title': video_json.get('title'), diff --git a/hypervideo_dl/extractor/crunchyroll.py b/hypervideo_dl/extractor/crunchyroll.py index 7edb645..d226050 100644 --- a/hypervideo_dl/extractor/crunchyroll.py +++ b/hypervideo_dl/extractor/crunchyroll.py @@ -1,44 +1,16 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 -import re -import json -import zlib +import urllib.parse -from hashlib import sha1 -from math import pow, sqrt, floor from .common import InfoExtractor -from .vrv import VRVBaseIE -from ..compat import ( - compat_b64decode, - compat_etree_Element, - compat_etree_fromstring, - compat_str, - compat_urllib_parse_urlencode, - compat_urllib_request, - compat_urlparse, -) from ..utils import ( ExtractorError, - bytes_to_intlist, - extract_attributes, float_or_none, format_field, - intlist_to_bytes, - int_or_none, join_nonempty, - lowercase_escape, - merge_dicts, + parse_iso8601, qualities, - remove_end, - sanitized_Request, traverse_obj, try_get, - xpath_text, -) -from ..aes import ( - aes_cbc_decrypt, ) @@ -46,16 +18,7 @@ class CrunchyrollBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login' _API_BASE = 'https://api.crunchyroll.com' _NETRC_MACHINE = 'crunchyroll' - - def _call_rpc_api(self, method, video_id, note=None, data=None): - data = data or {} - data['req'] = 'RpcApi' + method - data = compat_urllib_parse_urlencode(data).encode('utf-8') - return self._download_xml( - 'https://www.crunchyroll.com/xml/', - video_id, note, fatal=False, data=data, headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }) + params = None def _perform_login(self, username, password): if self._get_cookies(self._LOGIN_URL).get('etp_rt'): @@ -76,7 +39,7 @@ class CrunchyrollBaseIE(InfoExtractor): login_response = self._download_json( f'{self._API_BASE}/login.1.json', None, 'Logging in', - data=compat_urllib_parse_urlencode({ + data=urllib.parse.urlencode({ 'account': username, 'password': password, 'session_id': session_id @@ -86,800 +49,173 @@ class CrunchyrollBaseIE(InfoExtractor): if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): raise ExtractorError('Login succeeded but did not set etp_rt cookie') - # Beta-specific, but needed for redirects - def _get_beta_embedded_json(self, webpage, display_id): + def _get_embedded_json(self, webpage, display_id): initial_state = self._parse_json(self._search_regex(
r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id) app_config = self._parse_json(self._search_regex( r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id) return initial_state, app_config - def _redirect_to_beta(self, webpage, iekey, video_id): - if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): - raise ExtractorError('Received a beta page from non-beta url when not logged in.') - initial_state, app_config = self._get_beta_embedded_json(webpage, video_id) - url = app_config['baseSiteUrl'] + initial_state['router']['locations']['current']['pathname'] - self.to_screen(f'{video_id}: Redirected to beta site - {url}') - return self.url_result(f'{url}', iekey, video_id) - - @staticmethod - def _add_skip_wall(url): - parsed_url = compat_urlparse.urlparse(url) - qs = compat_urlparse.parse_qs(parsed_url.query) - # Always force skip_wall to bypass maturity wall, namely 18+ confirmation message: - # > This content may be inappropriate for some people. - # > Are you sure you want to continue? - # since it's not disabled by default in crunchyroll account's settings. - # See https://github.com/ytdl-org/youtube-dl/issues/7202. - qs['skip_wall'] = ['1'] - return compat_urlparse.urlunparse( - parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) - - -class CrunchyrollIE(CrunchyrollBaseIE, VRVBaseIE): - IE_NAME = 'crunchyroll' - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<id>[0-9]+))(?:[/?&]|$)' - _TESTS = [{ - 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', - 'info_dict': { - 'id': '645513', - 'ext': 'mp4', - 'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!', - 'description': 'md5:2d17137920c64f2f49981a7797d275ef', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Yomiuri Telecasting Corporation (YTV)', - 'upload_date': '20131013', - 'url': 're:(?!.*&amp;)', - }, - 'params': { - # rtmp - 'skip_download': True, - }, - 'skip': 'Video gone', - }, { - 'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1', - 'info_dict': { - 'id': '589804', - 'ext': 'flv', - 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11', - 'description': 'md5:2fbc01f90b87e8e9137296f37b461c12', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Danny Choo Network', - 'upload_date': '20120213', - }, - 'params': { - # rtmp - 'skip_download': True, - }, - 'skip': 'Video gone', - }, { - 'url': 'http://www.crunchyroll.com/rezero-starting-life-in-another-world-/episode-5-the-morning-of-our-promise-is-still-distant-702409', - 'info_dict': { - 'id': '702409', - 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Re:Zero Partners', - 'timestamp': 1462098900, - 'upload_date': '20160501', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.crunchyroll.com/konosuba-gods-blessing-on-this-wonderful-world/episode-1-give-me-deliverance-from-this-judicial-injustice-727589', - 'info_dict': { - 'id': '727589', - 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Kadokawa Pictures Inc.', - 'timestamp': 1484130900, - 'upload_date': '20170111', - 'series': compat_str, - 'season': "KONOSUBA -God's blessing on this wonderful world!
2", - 'season_number': 2, - 'episode': 'Give Me Deliverance From This Judicial Injustice!', - 'episode_number': 1, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', - 'only_matching': True, - }, { - # geo-restricted (US), 18+ maturity wall, non-premium available - 'url': 'http://www.crunchyroll.com/cosplay-complex-ova/episode-1-the-birth-of-the-cosplay-club-565617', - 'only_matching': True, - }, { - # A description with double quotes - 'url': 'http://www.crunchyroll.com/11eyes/episode-1-piros-jszaka-red-night-535080', - 'info_dict': { - 'id': '535080', - 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, - 'uploader': 'Marvelous AQL Inc.', - 'timestamp': 1255512600, - 'upload_date': '20091014', - }, - 'params': { - # Just test metadata extraction - 'skip_download': True, - }, - }, { - # make sure we can extract an uploader name that's not a link - 'url': 'http://www.crunchyroll.com/hakuoki-reimeiroku/episode-1-dawn-of-the-divine-warriors-606899', - 'info_dict': { - 'id': '606899', - 'ext': 'mp4', - 'title': 'Hakuoki Reimeiroku Episode 1 – Dawn of the Divine Warriors', - 'description': 'Ryunosuke was left to die, but Serizawa-san asked him a simple question "Do you want to live?"', - 'uploader': 'Geneon Entertainment', - 'upload_date': '20120717', - }, - 'params': { - # just test metadata extraction - 'skip_download': True, - }, - 'skip': 'Video gone', - }, { - # A video with a vastly different season name compared to the series name - 'url': 'http://www.crunchyroll.com/nyarko-san-another-crawling-chaos/episode-1-test-590532', - 'info_dict': { - 'id': '590532', - 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, - 'uploader': 'TV TOKYO', - 'timestamp': 1330956000, - 'upload_date': '20120305', - 'series': 'Nyarko-san: Another Crawling Chaos', - 'season': 'Haiyoru! Nyaruani (ONA)', - }, - 'params': { - # Just test metadata extraction - 'skip_download': True, - }, - }, { - 'url': 'http://www.crunchyroll.com/media-723735', - 'only_matching': True, - }, { - 'url': 'https://www.crunchyroll.com/en-gb/mob-psycho-100/episode-2-urban-legends-encountering-rumors-780921', - 'only_matching': True, - }] - - _FORMAT_IDS = { - '360': ('60', '106'), - '480': ('61', '106'), - '720': ('62', '106'), - '1080': ('80', '108'), - } - - def _download_webpage(self, url_or_request, *args, **kwargs): - request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) - else sanitized_Request(url_or_request)) - # Accept-Language must be set explicitly to accept any language to avoid issues - # similar to https://github.com/ytdl-org/youtube-dl/issues/6797. - # Along with IP address Crunchyroll uses Accept-Language to guess whether georestriction - # should be imposed or not (from what I can see it just takes the first language - # ignoring the priority and requires it to correspond the IP). By the way this causes - # Crunchyroll to not work in georestriction cases in some browsers that don't place - # the locale lang first in header. However allowing any language seems to workaround the issue. 
- request.add_header('Accept-Language', '*') - return super(CrunchyrollBaseIE, self)._download_webpage(request, *args, **kwargs) - - def _decrypt_subtitles(self, data, iv, id): - data = bytes_to_intlist(compat_b64decode(data)) - iv = bytes_to_intlist(compat_b64decode(iv)) - id = int(id) - - def obfuscate_key_aux(count, modulo, start): - output = list(start) - for _ in range(count): - output.append(output[-1] + output[-2]) - # cut off start values - output = output[2:] - output = list(map(lambda x: x % modulo + 33, output)) - return output - - def obfuscate_key(key): - num1 = int(floor(pow(2, 25) * sqrt(6.9))) - num2 = (num1 ^ key) << 5 - num3 = key ^ num1 - num4 = num3 ^ (num3 >> 3) ^ num2 - prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2))) - shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest()) - # Extend 160 Bit hash to 256 Bit - return shaHash + [0] * 12 - - key = obfuscate_key(id) - - decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv)) - return zlib.decompress(decrypted_data) - - def _convert_subtitles_to_srt(self, sub_root): - output = '' - - for i, event in enumerate(sub_root.findall('./events/event'), 1): - start = event.attrib['start'].replace('.', ',') - end = event.attrib['end'].replace('.', ',') - text = event.attrib['text'].replace('\\N', '\n') - output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text) - return output - - def _convert_subtitles_to_ass(self, sub_root): - output = '' - - def ass_bool(strvalue): - assvalue = '0' - if strvalue == '1': - assvalue = '-1' - return assvalue - - output = '[Script Info]\n' - output += 'Title: %s\n' % sub_root.attrib['title'] - output += 'ScriptType: v4.00+\n' - output += 'WrapStyle: %s\n' % sub_root.attrib['wrap_style'] - output += 'PlayResX: %s\n' % sub_root.attrib['play_res_x'] - output += 'PlayResY: %s\n' % sub_root.attrib['play_res_y'] - output += """ -[V4+ Styles] -Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding -""" - for style in sub_root.findall('./styles/style'): - output += 'Style: ' + style.attrib['name'] - output += ',' + style.attrib['font_name'] - output += ',' + style.attrib['font_size'] - output += ',' + style.attrib['primary_colour'] - output += ',' + style.attrib['secondary_colour'] - output += ',' + style.attrib['outline_colour'] - output += ',' + style.attrib['back_colour'] - output += ',' + ass_bool(style.attrib['bold']) - output += ',' + ass_bool(style.attrib['italic']) - output += ',' + ass_bool(style.attrib['underline']) - output += ',' + ass_bool(style.attrib['strikeout']) - output += ',' + style.attrib['scale_x'] - output += ',' + style.attrib['scale_y'] - output += ',' + style.attrib['spacing'] - output += ',' + style.attrib['angle'] - output += ',' + style.attrib['border_style'] - output += ',' + style.attrib['outline'] - output += ',' + style.attrib['shadow'] - output += ',' + style.attrib['alignment'] - output += ',' + style.attrib['margin_l'] - output += ',' + style.attrib['margin_r'] - output += ',' + style.attrib['margin_v'] - output += ',' + style.attrib['encoding'] - output += '\n' - - output += """ -[Events] -Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text -""" - for event in sub_root.findall('./events/event'): - output += 'Dialogue: 0' - output += ',' + event.attrib['start'] - output += ',' + event.attrib['end'] - output += ',' + 
event.attrib['style'] - output += ',' + event.attrib['name'] - output += ',' + event.attrib['margin_l'] - output += ',' + event.attrib['margin_r'] - output += ',' + event.attrib['margin_v'] - output += ',' + event.attrib['effect'] - output += ',' + event.attrib['text'] - output += '\n' - - return output - - def _extract_subtitles(self, subtitle): - sub_root = compat_etree_fromstring(subtitle) - return [{ - 'ext': 'srt', - 'data': self._convert_subtitles_to_srt(sub_root), - }, { - 'ext': 'ass', - 'data': self._convert_subtitles_to_ass(sub_root), - }] - - def _get_subtitles(self, video_id, webpage): - subtitles = {} - for sub_id, sub_name in re.findall(r'\bssid=([0-9]+)"[^>]+?\btitle="([^"]+)', webpage): - sub_doc = self._call_rpc_api( - 'Subtitle_GetXml', video_id, - 'Downloading subtitles for ' + sub_name, data={ - 'subtitle_script_id': sub_id, - }) - if not isinstance(sub_doc, compat_etree_Element): - continue - sid = sub_doc.get('id') - iv = xpath_text(sub_doc, 'iv', 'subtitle iv') - data = xpath_text(sub_doc, 'data', 'subtitle data') - if not sid or not iv or not data: - continue - subtitle = self._decrypt_subtitles(data, iv, sid).decode('utf-8') - lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) - if not lang_code: - continue - subtitles[lang_code] = self._extract_subtitles(subtitle) - return subtitles - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - - if mobj.group('prefix') == 'm': - mobile_webpage = self._download_webpage(url, video_id, 'Downloading mobile webpage') - webpage_url = self._search_regex(r'<link rel="canonical" href="([^"]+)" />', mobile_webpage, 'webpage_url') - else: - webpage_url = 'http://www.' + mobj.group('url') - - webpage = self._download_webpage( - self._add_skip_wall(webpage_url), video_id, - headers=self.geo_verification_headers()) - if re.search(r'<div id="preload-data">', webpage): - return self._redirect_to_beta(webpage, CrunchyrollBetaIE.ie_key(), video_id) - note_m = self._html_search_regex( - r'<div class="showmedia-trailer-notice">(.+?)</div>', - webpage, 'trailer-notice', default='') - if note_m: - raise ExtractorError(note_m, expected=True) - - mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage) - if mobj: - msg = json.loads(mobj.group('msg')) - if msg.get('type') == 'error': - raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True) - - if 'To view this, please log in to verify you are 18 or older.' in webpage: - self.raise_login_required() - - media = self._parse_json(self._search_regex( - r'vilos\.config\.media\s*=\s*({.+?});', - webpage, 'vilos media', default='{}'), video_id) - media_metadata = media.get('metadata') or {} - - language = self._search_regex( - r'(?:vilos\.config\.player\.language|LOCALE)\s*=\s*(["\'])(?P<lang>(?:(?!\1).)+)\1', - webpage, 'language', default=None, group='lang') - - video_title = self._html_search_regex( - (r'(?s)<h1[^>]*>((?:(?!<h1).)*?<(?:span[^>]+itemprop=["\']title["\']|meta[^>]+itemprop=["\']position["\'])[^>]*>(?:(?!<h1).)+?)</h1>', - r'<title>(.+?),\s+-\s+.+? Crunchyroll'), - webpage, 'video_title', default=None) - if not video_title: - video_title = re.sub(r'^Watch\s+', '', self._og_search_description(webpage)) - video_title = re.sub(r' {2,}', ' ', video_title) - video_description = (self._parse_json(self._html_search_regex( - r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id, - webpage, 'description', default='{}'), video_id) or media_metadata).get('description') - - thumbnails = [] - thumbnail_url = (self._parse_json(self._html_search_regex( - r'<script type="application\/ld\+json">\n\s*(.+?)<\/script>', - webpage, 'thumbnail_url', default='{}'), video_id)).get('image') - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - 'width': 1920, - 'height': 1080 - }) - - if video_description: - video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) - video_uploader = self._html_search_regex( - # try looking for both an uploader that's a link and one that's not - [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'], - webpage, 'video_uploader', default=False) - - requested_languages = self._configuration_arg('language') - requested_hardsubs = [('' if val == 'none' else val) for val in self._configuration_arg('hardsub')] - language_preference = qualities((requested_languages or [language or ''])[::-1]) - hardsub_preference = qualities((requested_hardsubs or ['', language or ''])[::-1]) - - formats = [] - for stream in media.get('streams', []): - audio_lang = stream.get('audio_lang') or '' - hardsub_lang = stream.get('hardsub_lang') or '' - if (requested_languages and audio_lang.lower() not in requested_languages - or requested_hardsubs and hardsub_lang.lower() not in requested_hardsubs): - continue - vrv_formats = self._extract_vrv_formats( - stream.get('url'), video_id, stream.get('format'), - audio_lang, hardsub_lang) - for f in vrv_formats: - f['language_preference'] = language_preference(audio_lang) - f['quality'] = hardsub_preference(hardsub_lang) - formats.extend(vrv_formats) - if not formats: - available_fmts = [] - for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): - attrs = extract_attributes(a) - href = attrs.get('href') - if href and '/freetrial' in href: - continue - available_fmts.append(fmt) - if not available_fmts: - for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'): - available_fmts = re.findall(p, webpage) - if available_fmts: - break - if not
available_fmts: - available_fmts = self._FORMAT_IDS.keys() - video_encode_ids = [] - - for fmt in available_fmts: - stream_quality, stream_format = self._FORMAT_IDS[fmt] - video_format = fmt + 'p' - stream_infos = [] - streamdata = self._call_rpc_api( - 'VideoPlayer_GetStandardConfig', video_id, - 'Downloading media info for %s' % video_format, data={ - 'media_id': video_id, - 'video_format': stream_format, - 'video_quality': stream_quality, - 'current_page': url, - }) - if isinstance(streamdata, compat_etree_Element): - stream_info = streamdata.find('./{default}preload/stream_info') - if stream_info is not None: - stream_infos.append(stream_info) - stream_info = self._call_rpc_api( - 'VideoEncode_GetStreamInfo', video_id, - 'Downloading stream info for %s' % video_format, data={ - 'media_id': video_id, - 'video_format': stream_format, - 'video_encode_quality': stream_quality, - }) - if isinstance(stream_info, compat_etree_Element): - stream_infos.append(stream_info) - for stream_info in stream_infos: - video_encode_id = xpath_text(stream_info, './video_encode_id') - if video_encode_id in video_encode_ids: - continue - video_encode_ids.append(video_encode_id) - - video_file = xpath_text(stream_info, './file') - if not video_file: - continue - if video_file.startswith('http'): - formats.extend(self._extract_m3u8_formats( - video_file, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue - - video_url = xpath_text(stream_info, './host') - if not video_url: - continue - metadata = stream_info.find('./metadata') - format_info = { - 'format': video_format, - 'height': int_or_none(xpath_text(metadata, './height')), - 'width': int_or_none(xpath_text(metadata, './width')), - } - - if '.fplive.net/' in video_url: - video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) - parsed_video_url = compat_urlparse.urlparse(video_url) - direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( - netloc='v.lvlt.crcdn.net', - path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) - if self._is_valid_url(direct_video_url, video_id, video_format): - format_info.update({ - 'format_id': 'http-' + video_format, - 'url': direct_video_url, - }) - formats.append(format_info) - continue - - format_info.update({ - 'format_id': 'rtmp-' + video_format, - 'url': video_url, - 'play_path': video_file, - 'ext': 'flv', - }) - formats.append(format_info) - self._sort_formats(formats) - - metadata = self._call_rpc_api( - 'VideoPlayer_GetMediaMetadata', video_id, - note='Downloading media info', data={ - 'media_id': video_id, - }) - - subtitles = {} - for subtitle in media.get('subtitles', []): - subtitle_url = subtitle.get('url') - if not subtitle_url: - continue - subtitles.setdefault(subtitle.get('language', 'enUS'), []).append({ - 'url': subtitle_url, - 'ext': subtitle.get('format', 'ass'), - }) - if not subtitles: - subtitles = self.extract_subtitles(video_id, webpage) - - # webpage provide more accurate data than series_title from XML - series = self._html_search_regex( - r'(?s)<h\d[^>]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)</h\d', - webpage, 'series', fatal=False) - - season = episode = episode_number = duration = None - - if isinstance(metadata, compat_etree_Element): - season = xpath_text(metadata, 'series_title') - episode = xpath_text(metadata, 'episode_title') - episode_number = int_or_none(xpath_text(metadata, 'episode_number')) - duration = float_or_none(media_metadata.get('duration'), 1000) - - if not episode: - 
episode = media_metadata.get('title') - if not episode_number: - episode_number = int_or_none(media_metadata.get('episode_number')) - thumbnail_url = try_get(media, lambda x: x['thumbnail']['url']) - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - 'width': 640, - 'height': 360 - }) - - season_number = int_or_none(self._search_regex( - r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)', - webpage, 'season number', default=None)) - - info = self._search_json_ld(webpage, video_id, default={}) - - return merge_dicts({ - 'id': video_id, - 'title': video_title, - 'description': video_description, - 'duration': duration, - 'thumbnails': thumbnails, - 'uploader': video_uploader, - 'series': series, - 'season': season, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, - 'subtitles': subtitles, - 'formats': formats, - }, info) - - -class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): - IE_NAME = 'crunchyroll:playlist' - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:\w{1,2}/)?(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)' - - _TESTS = [{ - 'url': 'https://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', - 'info_dict': { - 'id': 'a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', - 'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi' - }, - 'playlist_count': 13, - }, { - # geo-restricted (US), 18+ maturity wall, non-premium available - 'url': 'http://www.crunchyroll.com/cosplay-complex-ova', - 'info_dict': { - 'id': 'cosplay-complex-ova', - 'title': 'Cosplay Complex OVA' - }, - 'playlist_count': 3, - 'skip': 'Georestricted', - }, { - # geo-restricted (US), 18+ maturity wall, non-premium will be available since 2015.11.14 - 'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1', - 'only_matching': True, - }, { - 'url': 'http://www.crunchyroll.com/fr/ladies-versus-butlers', - 'only_matching': True, - }] - - def _real_extract(self, url): - show_id = self._match_id(url) - - webpage = self._download_webpage( - # https:// gives a 403, but http:// does not - self._add_skip_wall(url).replace('https://', 'http://'), show_id, - headers=self.geo_verification_headers()) - if re.search(r'<div id="preload-data">', webpage): - return self._redirect_to_beta(webpage, CrunchyrollBetaShowIE.ie_key(), show_id) - title = self._html_search_meta('name', webpage, default=None) - - episode_re = r'<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"' - season_re = r'<a [^>]+season-dropdown[^>]+>([^<]+)' - paths = re.findall(f'(?s){episode_re}|{season_re}', webpage) - - entries, current_season = [], None - for ep_id, ep, season in paths: - if season: - current_season = season - continue - entries.append(self.url_result( - f'http://www.crunchyroll.com{ep}', CrunchyrollIE.ie_key(), ep_id, season=current_season)) - - return { - '_type': 'playlist', - 'id': show_id, - 'title': title, - 'entries': reversed(entries), - } + def _get_params(self, lang): + if not CrunchyrollBaseIE.params: + if self._get_cookies(f'https://www.crunchyroll.com/{lang}').get('etp_rt'): + grant_type, key = 'etp_rt_cookie', 'accountAuthClientId' + else: + grant_type, key = 'client_id', 'anonClientId' + initial_state, app_config = self._get_embedded_json(self._download_webpage( + f'https://www.crunchyroll.com/{lang}', None, note='Retrieving main page'), None) + api_domain = 
app_config['cxApiParams']['apiDomain'].replace('beta.crunchyroll.com', 'www.crunchyroll.com') -class CrunchyrollBetaBaseIE(CrunchyrollBaseIE): - params = None - - def _get_params(self, lang): - if not CrunchyrollBetaBaseIE.params: - initial_state, app_config = self._get_beta_embedded_json(self._download_webpage( - f'https://beta.crunchyroll.com/{lang}', None, note='Retrieving main page'), None) - api_domain = app_config['cxApiParams']['apiDomain'] - basic_token = str(base64.b64encode(('%s:' % app_config['cxApiParams']['accountAuthClientId']).encode('ascii')), 'ascii') auth_response = self._download_json( - f'{api_domain}/auth/v1/token', None, note='Authenticating with cookie', + f'{api_domain}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', headers={ - 'Authorization': 'Basic ' + basic_token - }, data='grant_type=etp_rt_cookie'.encode('ascii')) + 'Authorization': 'Basic ' + str(base64.b64encode(('%s:' % app_config['cxApiParams'][key]).encode('ascii')), 'ascii') + }, data=f'grant_type={grant_type}'.encode('ascii')) policy_response = self._download_json( f'{api_domain}/index/v2', None, note='Retrieving signed policy', headers={ 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token'] }) - bucket = policy_response['cms']['bucket'] + cms = policy_response.get('cms_web') + bucket = cms['bucket'] params = { - 'Policy': policy_response['cms']['policy'], - 'Signature': policy_response['cms']['signature'], - 'Key-Pair-Id': policy_response['cms']['key_pair_id'] + 'Policy': cms['policy'], + 'Signature': cms['signature'], + 'Key-Pair-Id': cms['key_pair_id'] } locale = traverse_obj(initial_state, ('localization', 'locale')) if locale: params['locale'] = locale - CrunchyrollBetaBaseIE.params = (api_domain, bucket, params) - return CrunchyrollBetaBaseIE.params - - def _redirect_from_beta(self, url, lang, internal_id, display_id, is_episode, iekey): - initial_state, app_config = self._get_beta_embedded_json(self._download_webpage(url, display_id), display_id) - content_data = initial_state['content']['byId'][internal_id] - if is_episode: - video_id = content_data['external_id'].split('.')[1] - series_id = content_data['episode_metadata']['series_slug_title'] - else: - series_id = content_data['slug_title'] - series_id = re.sub(r'-{2,}', '-', series_id) - url = f'https://www.crunchyroll.com/{lang}{series_id}' - if is_episode: - url = url + f'/{display_id}-{video_id}' - self.to_screen(f'{display_id}: Not logged in. Redirecting to non-beta site - {url}') - return self.url_result(url, iekey, display_id) + CrunchyrollBaseIE.params = (api_domain, bucket, params) + return CrunchyrollBaseIE.params -class CrunchyrollBetaIE(CrunchyrollBetaBaseIE): - IE_NAME = 'crunchyroll:beta' - _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' +class CrunchyrollBetaIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll' + _VALID_URL = r'''(?x) + https?://(?:beta|www)\.crunchyroll\.com/ + (?P<lang>(?:\w{2}(?:-\w{2})?/)?) 
+ watch/(?P<id>\w+) + (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)''' _TESTS = [{ - 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', + 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', 'info_dict': { - 'id': '696363', + 'id': 'GY2P1Q98Y', 'ext': 'mp4', - 'timestamp': 1459610100, + 'duration': 1380.241, + 'timestamp': 1459632600, 'description': 'md5:a022fbec4fbb023d43631032c91ed64b', - 'uploader': 'Toei Animation', 'title': 'World Trigger Episode 73 – To the Future', 'upload_date': '20160402', - 'episode_number': 73, 'series': 'World Trigger', - 'average_rating': 4.9, - 'episode': 'To the Future', + 'series_id': 'GR757DMKY', 'season': 'World Trigger', - 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/c870dedca1a83137c2d3d144984155ed1459527119_main.jpg', + 'season_id': 'GR9P39NJ6', 'season_number': 1, + 'episode': 'To the Future', + 'episode_number': 73, + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$', }, - 'params': {'skip_download': 'm3u8'}, - 'expected_warnings': ['Unable to download XML'] + 'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'}, }, { - 'url': 'https://beta.crunchyroll.com/watch/GYK53DMPR/wicked-lord-shingan-reborn', + 'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR', 'info_dict': { - 'id': '648781', + 'id': 'GYE5WKQGR', 'ext': 'mp4', - 'episode_number': 1, - 'timestamp': 1389173400, - 'series': 'Love, Chunibyo & Other Delusions - Heart Throb -', - 'description': 'md5:5579d1a0355cc618558ba23d27067a62', - 'uploader': 'TBS', - 'episode': 'Wicked Lord Shingan... Reborn', - 'average_rating': 4.9, - 'season': 'Love, Chunibyo & Other Delusions - Heart Throb -', - 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/2ba0384e225a5370d5f0ee9496d91ea51389046521_main.jpg', - 'title': 'Love, Chunibyo & Other Delusions - Heart Throb - Episode 1 – Wicked Lord Shingan... 
Reborn', - 'season_number': 2, - 'upload_date': '20140108', + 'duration': 366.459, + 'timestamp': 1476788400, + 'description': 'md5:74b67283ffddd75f6e224ca7dc031e76', + 'title': 'SHELTER Episode – Porter Robinson presents Shelter the Animation', + 'upload_date': '20161018', + 'series': 'SHELTER', + 'series_id': 'GYGG09WWY', + 'season': 'SHELTER', + 'season_id': 'GR09MGK4R', + 'season_number': 1, + 'episode': 'Porter Robinson presents Shelter the Animation', + 'episode_number': 0, + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$', }, - 'params': {'skip_download': 'm3u8'}, - 'expected_warnings': ['Unable to download XML'] + 'params': {'skip_download': True}, + 'skip': 'Video is Premium only', + }, { + 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y', + 'only_matching': True, }, { - 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/', + 'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy', 'only_matching': True, }] def _real_extract(self, url): lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - - if not self._get_cookies(url).get('etp_rt'): - return self._redirect_from_beta(url, lang, internal_id, display_id, True, CrunchyrollIE.ie_key()) - api_domain, bucket, params = self._get_params(lang) episode_response = self._download_json( f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id, - note='Retrieving episode metadata', - query=params) + note='Retrieving episode metadata', query=params) if episode_response.get('is_premium_only') and not episode_response.get('playback'): raise ExtractorError('This video is for premium members only.', expected=True) - stream_response = self._download_json( - episode_response['playback'], display_id, - note='Retrieving stream info') - thumbnails = [] - for thumbnails_data in traverse_obj(episode_response, ('images', 'thumbnail')): - for thumbnail_data in thumbnails_data: - thumbnails.append({ - 'url': thumbnail_data.get('source'), - 'width': thumbnail_data.get('width'), - 'height': thumbnail_data.get('height'), - }) - subtitles = {} - for lang, subtitle_data in stream_response.get('subtitles').items(): - subtitles[lang] = [{ - 'url': subtitle_data.get('url'), - 'ext': subtitle_data.get('format') - }] + stream_response = self._download_json( + f'{api_domain}{episode_response["__links__"]["streams"]["href"]}', display_id, + note='Retrieving stream info', query=params) + get_streams = lambda name: (traverse_obj(stream_response, name) or {}).items() requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] hardsub_preference = qualities(requested_hardsubs[::-1]) requested_formats = self._configuration_arg('format') or ['adaptive_hls'] - formats = [] - for stream_type, streams in stream_response.get('streams', {}).items(): + available_formats = {} + for stream_type, streams in get_streams('streams'): if stream_type not in requested_formats: continue for stream in streams.values(): - hardsub_lang = stream.get('hardsub_locale') or '' - if hardsub_lang.lower() not in requested_hardsubs: - continue - format_id = join_nonempty( - stream_type, - format_field(stream, 'hardsub_locale', 'hardsub-%s')) if not stream.get('url'): continue - if stream_type.split('_')[-1] == 'hls': + hardsub_lang = stream.get('hardsub_locale') or '' + format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s')) + available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url']) 
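A minimal sketch of the hardsub bookkeeping above (illustrative values only; `qualities` is the ranking helper from hypervideo_dl.utils, the dict keys mirror the `hardsub_lang` values collected by the loop, and the URLs are placeholders):

    # When a hardsub-less stream ('') exists and 'all' was not requested, only the
    # explicitly requested hardsub languages get their manifests downloaded in full.
    from hypervideo_dl.utils import qualities

    requested_hardsubs = ['en-us']  # e.g. --extractor-args crunchyrollbeta:hardsub=en-US
    available_formats = {
        '': ('adaptive_hls', 'adaptive_hls', '', 'https://example.invalid/raw.m3u8'),
        'en-us': ('adaptive_hls', 'adaptive_hls-hardsub-en-US', 'en-US', 'https://example.invalid/hardsub.m3u8'),
    }
    full_format_langs = set(requested_hardsubs)  # {'en-us'}: the '' stream only yields a meta format
    hardsub_preference = qualities(requested_hardsubs[::-1])
    assert hardsub_preference('en-us') > hardsub_preference('')  # requested hardsub ranks higher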
+ + if '' in available_formats and 'all' not in requested_hardsubs: + full_format_langs = set(requested_hardsubs) + self.to_screen( + 'To get all formats of a hardsub language, use ' + '"--extractor-args crunchyrollbeta:hardsub=<language_code or all>". ' + 'See https://github.com/hypervideo/hypervideo#crunchyrollbeta for more info', + only_once=True) + else: + full_format_langs = set(map(str.lower, available_formats)) + + formats = [] + for stream_type, format_id, hardsub_lang, stream_url in available_formats.values(): + if stream_type.endswith('hls'): + if hardsub_lang.lower() in full_format_langs: adaptive_formats = self._extract_m3u8_formats( - stream['url'], display_id, 'mp4', m3u8_id=format_id, - note='Downloading %s information' % format_id, - fatal=False) - elif stream_type.split('_')[-1] == 'dash': - adaptive_formats = self._extract_mpd_formats( - stream['url'], display_id, mpd_id=format_id, - note='Downloading %s information' % format_id, - fatal=False) - for f in adaptive_formats: - if f.get('acodec') != 'none': - f['language'] = stream_response.get('audio_locale') - f['quality'] = hardsub_preference(hardsub_lang.lower()) - formats.extend(adaptive_formats) - self._sort_formats(formats) + stream_url, display_id, 'mp4', m3u8_id=format_id, + fatal=False, note=f'Downloading {format_id} HLS manifest') + else: + adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),) + elif stream_type.endswith('dash'): + adaptive_formats = self._extract_mpd_formats( + stream_url, display_id, mpd_id=format_id, + fatal=False, note=f'Downloading {format_id} MPD manifest') + else: + self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True) + continue + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = stream_response.get('audio_locale') + f['quality'] = hardsub_preference(hardsub_lang.lower()) + formats.extend(adaptive_formats) return { 'id': internal_id, - 'title': '%s Episode %s – %s' % (episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')), - 'description': episode_response.get('description').replace(r'\r\n', '\n'), + 'title': '%s Episode %s – %s' % ( + episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')), + 'description': try_get(episode_response, lambda x: x['description'].replace(r'\r\n', '\n')), 'duration': float_or_none(episode_response.get('duration_ms'), 1000), - 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(episode_response.get('upload_date')), 'series': episode_response.get('series_title'), 'series_id': episode_response.get('series_id'), 'season': episode_response.get('season_title'), @@ -887,39 +223,42 @@ class CrunchyrollBetaIE(CrunchyrollBetaBaseIE): 'season_number': episode_response.get('season_number'), 'episode': episode_response.get('title'), 'episode_number': episode_response.get('sequence_number'), - 'subtitles': subtitles, - 'formats': formats + 'formats': formats, + 'thumbnails': [{ + 'url': thumb.get('source'), + 'width': thumb.get('width'), + 'height': thumb.get('height'), + } for thumb in traverse_obj(episode_response, ('images', 'thumbnail', ..., ...)) or []], + 'subtitles': { + lang: [{ + 'url': subtitle_data.get('url'), + 'ext': subtitle_data.get('format') + }] for lang, subtitle_data in get_streams('subtitles') + }, } -class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE): - IE_NAME = 'crunchyroll:playlist:beta' - _VALID_URL = 
r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' +class CrunchyrollBetaShowIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:playlist' + _VALID_URL = r'''(?x) + https?://(?:beta|www)\.crunchyroll\.com/ + (?P<lang>(?:\w{2}(?:-\w{2})?/)?) + series/(?P<id>\w+) + (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)''' _TESTS = [{ - 'url': 'https://beta.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', + 'url': 'https://www.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', 'info_dict': { - 'id': 'girl-friend-beta', + 'id': 'GY19NQ2QR', 'title': 'Girl Friend BETA', }, 'playlist_mincount': 10, }, { - 'url': 'https://beta.crunchyroll.com/series/GYJQV73V6/love-chunibyo--other-delusions---heart-throb--', - 'info_dict': { - 'id': 'love-chunibyo-other-delusions-heart-throb-', - 'title': 'Love, Chunibyo & Other Delusions - Heart Throb -', - }, - 'playlist_mincount': 10, - }, { - 'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR/Girl-Friend-BETA', + 'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR', 'only_matching': True, }] def _real_extract(self, url): lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - - if not self._get_cookies(url).get('etp_rt'): - return self._redirect_from_beta(url, lang, internal_id, display_id, False, CrunchyrollShowPlaylistIE.ie_key()) - api_domain, bucket, params = self._get_params(lang) series_response = self._download_json( @@ -940,7 +279,7 @@ class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE): episode_display_id = episode['slug_title'] yield { '_type': 'url', - 'url': f'https://beta.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}', + 'url': f'https://www.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}', 'ie_key': CrunchyrollBetaIE.ie_key(), 'id': episode_id, 'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')), diff --git a/hypervideo_dl/extractor/cspan.py b/hypervideo_dl/extractor/cspan.py index f51159b..0075680 100644 --- a/hypervideo_dl/extractor/cspan.py +++ b/hypervideo_dl/extractor/cspan.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -165,7 +163,7 @@ class CSpanIE(InfoExtractor): video_id = m.group('id') video_type = 'program' if m.group('type') == 'prog' else 'clip' else: - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) + senate_isvp_url = SenateISVPIE._extract_url(webpage) if senate_isvp_url: title = self._og_search_title(webpage) surl = smuggle_url(senate_isvp_url, {'force_title': title}) @@ -220,7 +218,6 @@ class CSpanIE(InfoExtractor): path, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }] add_referer(formats) - self._sort_formats(formats) entries.append({ 'id': '%s_%d' % (video_id, partnum + 1), 'title': ( @@ -277,8 +274,7 @@ class CSpanCongressIE(InfoExtractor): self._search_regex(r'jwsetup\s*=\s*({(?:.|\n)[^;]+});', webpage, 'player config'), video_id, transform_source=js_to_json) - title = (self._og_search_title(webpage, default=None) - or self._html_extract_title(webpage, 'video title')) + title = self._generic_title('', webpage) description = (self._og_search_description(webpage, default=None) or self._html_search_meta('description', webpage, 'description', default=None)) diff --git a/hypervideo_dl/extractor/ctsnews.py b/hypervideo_dl/extractor/ctsnews.py index 679f1d9..cec178f 100644 --- a/hypervideo_dl/extractor/ctsnews.py 
+++ b/hypervideo_dl/extractor/ctsnews.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import unified_timestamp from .youtube import YoutubeIE diff --git a/hypervideo_dl/extractor/ctv.py b/hypervideo_dl/extractor/ctv.py index 756bcc2..f125c1c 100644 --- a/hypervideo_dl/extractor/ctv.py +++ b/hypervideo_dl/extractor/ctv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/ctvnews.py b/hypervideo_dl/extractor/ctvnews.py index 952f4c7..ad3f0d8 100644 --- a/hypervideo_dl/extractor/ctvnews.py +++ b/hypervideo_dl/extractor/ctvnews.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/cultureunplugged.py b/hypervideo_dl/extractor/cultureunplugged.py index 9002e4c..2fb2280 100644 --- a/hypervideo_dl/extractor/cultureunplugged.py +++ b/hypervideo_dl/extractor/cultureunplugged.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import time from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/curiositystream.py b/hypervideo_dl/extractor/curiositystream.py index b8abcf7..26cf24f 100644 --- a/hypervideo_dl/extractor/curiositystream.py +++ b/hypervideo_dl/extractor/curiositystream.py @@ -1,15 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor -from ..utils import ( - int_or_none, - urlencode_postdata, - compat_str, - ExtractorError, -) +from ..compat import compat_str +from ..utils import ExtractorError, int_or_none, urlencode_postdata class CuriosityStreamBaseIE(InfoExtractor): @@ -26,6 +19,11 @@ class CuriosityStreamBaseIE(InfoExtractor): def _call_api(self, path, video_id, query=None): headers = {} + if not self._auth_token: + auth_cookie = self._get_cookies('https://curiositystream.com').get('auth_token') + if auth_cookie: + self.write_debug('Obtained auth_token cookie') + self._auth_token = auth_cookie.value if self._auth_token: headers['X-Auth-Token'] = self._auth_token result = self._download_json( @@ -48,7 +46,7 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): IE_NAME = 'curiositystream' _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)' _TESTS = [{ - 'url': 'https://app.curiositystream.com/video/2', + 'url': 'http://app.curiositystream.com/video/2', 'info_dict': { 'id': '2', 'ext': 'mp4', @@ -119,7 +117,6 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): 'format_id': 'http', }) formats.append(fmt) - self._sort_formats(formats) title = media['title'] diff --git a/hypervideo_dl/extractor/cwtv.py b/hypervideo_dl/extractor/cwtv.py index 7338243..9b83264 100644 --- a/hypervideo_dl/extractor/cwtv.py +++ b/hypervideo_dl/extractor/cwtv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -94,4 +91,5 @@ class CWTVIE(InfoExtractor): 'timestamp': parse_iso8601(video_data.get('start_time')), 'age_limit': parse_age_limit(video_data.get('rating')), 'ie_key': 'ThePlatform', + 'thumbnail': video_data.get('large_thumbnail') } diff --git a/hypervideo_dl/extractor/cybrary.py b/hypervideo_dl/extractor/cybrary.py index c278f0f..73f2439 100644 --- a/hypervideo_dl/extractor/cybrary.py +++ b/hypervideo_dl/extractor/cybrary.py @@ -1,12 +1,10 @@ -# coding: utf-8 from .common import InfoExtractor - from ..utils import ( 
ExtractorError, smuggle_url, str_or_none, traverse_obj, - urlencode_postdata + urlencode_postdata, ) diff --git a/hypervideo_dl/extractor/daftsex.py b/hypervideo_dl/extractor/daftsex.py index 6037fd9..551d5e3 100644 --- a/hypervideo_dl/extractor/daftsex.py +++ b/hypervideo_dl/extractor/daftsex.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_b64decode from ..utils import ( @@ -84,7 +81,6 @@ class DaftsexIE(InfoExtractor): 'height': int_or_none(height), 'ext': ext, }) - self._sort_formats(formats) return { 'id': video_id, @@ -120,7 +116,6 @@ class DaftsexIE(InfoExtractor): 'height': int_or_none(height), 'ext': ext, }) - self._sort_formats(formats) thumbnails = [] for k, v in item.items(): diff --git a/hypervideo_dl/extractor/dailymail.py b/hypervideo_dl/extractor/dailymail.py index 67b88fd..43401e1 100644 --- a/hypervideo_dl/extractor/dailymail.py +++ b/hypervideo_dl/extractor/dailymail.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -15,6 +10,7 @@ from ..utils import ( class DailyMailIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/(?:video/[^/]+/video-|embed/video/)(?P<id>[0-9]+)' + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)'] _TESTS = [{ 'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html', 'md5': 'f6129624562251f628296c3a9ffde124', @@ -29,12 +25,6 @@ class DailyMailIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)', - webpage) - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -73,7 +63,6 @@ class DailyMailIE(InfoExtractor): 'protocol': protocol, 'ext': 'mp4' if is_hls else None, }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/dailymotion.py b/hypervideo_dl/extractor/dailymotion.py index 9cb5618..2a44718 100644 --- a/hypervideo_dl/extractor/dailymotion.py +++ b/hypervideo_dl/extractor/dailymotion.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools import json import re @@ -8,13 +5,15 @@ import re from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( + ExtractorError, + OnDemandPagedList, age_restricted, clean_html, - ExtractorError, int_or_none, - OnDemandPagedList, + traverse_obj, try_get, unescapeHTML, + unsmuggle_url, urlencode_postdata, ) @@ -100,6 +99,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): [/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))? 
''' IE_NAME = 'dailymotion' + _EMBED_REGEX = [r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1'] _TESTS = [{ 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news', 'md5': '074b95bdee76b9e3654137aee9c79dfe', @@ -209,20 +209,16 @@ class DailymotionIE(DailymotionBaseInfoExtractor): } xid''' - @staticmethod - def _extract_urls(webpage): - urls = [] - # Look for embedded Dailymotion player + @classmethod + def _extract_embed_urls(cls, url, webpage): # https://developer.dailymotion.com/player#player-parameters - for mobj in re.finditer( - r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage): - urls.append(unescapeHTML(mobj.group('url'))) + yield from super()._extract_embed_urls(url, webpage) for mobj in re.finditer( r'(?s)DM\.player\([^,]+,\s*{.*?video[\'"]?\s*:\s*["\']?(?P<id>[0-9a-zA-Z]+).+?}\s*\);', webpage): - urls.append('https://www.dailymotion.com/embed/video/' + mobj.group('id')) - return urls + yield 'https://www.dailymotion.com/embed/video/' + mobj.group('id') def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url) video_id, playlist_id = self._match_valid_url(url).groups() if playlist_id: @@ -255,7 +251,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): metadata = self._download_json( 'https://www.dailymotion.com/player/metadata/video/' + xid, xid, 'Downloading metadata JSON', - query={'app': 'com.dailymotion.neon'}) + query=traverse_obj(smuggled_data, 'query') or {'app': 'com.dailymotion.neon'}) error = metadata.get('error') if error: @@ -297,7 +293,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor): f['url'] = f['url'].split('#')[0] if not f.get('fps') and f['format_id'].endswith('@60'): f['fps'] = 60 - self._sort_formats(formats) subtitles = {} subtitles_data = try_get(metadata, lambda x: x['subtitles']['data'], dict) or {} @@ -378,6 +373,15 @@ class DailymotionPlaylistIE(DailymotionPlaylistBaseIE): }] _OBJECT_TYPE = 'collection' + @classmethod + def _extract_embed_urls(cls, url, webpage): + # Look for embedded Dailymotion playlist player (#3822) + for mobj in re.finditer( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', + webpage): + for p in re.findall(r'list\[\]=/playlist/([^/]+)/', unescapeHTML(mobj.group('url'))): + yield '//dailymotion.com/playlist/%s' % p + class DailymotionUserIE(DailymotionPlaylistBaseIE): IE_NAME = 'dailymotion:user' diff --git a/hypervideo_dl/extractor/dailywire.py b/hypervideo_dl/extractor/dailywire.py new file mode 100644 index 0000000..f177c9d --- /dev/null +++ b/hypervideo_dl/extractor/dailywire.py @@ -0,0 +1,113 @@ +from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + join_nonempty, + traverse_obj, + url_or_none, +) + + +class DailyWireBaseIE(InfoExtractor): + _JSON_PATH = { + 'episode': ('props', 'pageProps', 'episodeData', 'episode'), + 'videos': ('props', 'pageProps', 'videoData', 'video'), + 'podcasts': ('props', 'pageProps', 'episode'), + } + + def _get_json(self, url): + sites_type, slug = self._match_valid_url(url).group('sites_type', 'id') + json_data = self._search_nextjs_data(self._download_webpage(url, slug), slug) + return slug, traverse_obj(json_data, self._JSON_PATH[sites_type]) + 
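# [Editor's sketch] DailyWireBaseIE above scrapes Next.js pages: the site
# embeds its data as JSON in a <script id="__NEXT_DATA__"> tag, which
# _search_nextjs_data() parses before traverse_obj() walks one of the
# _JSON_PATH tuples. A minimal, self-contained sketch of that pattern; the
# helper names search_nextjs_data() and walk() are illustrative stand-ins,
# not the real implementations:
import json
import re

def search_nextjs_data(webpage):
    # Grab the JSON payload Next.js ships with every server-rendered page
    m = re.search(
        r'<script[^>]*\bid=["\']__NEXT_DATA__["\'][^>]*>\s*({.+?})\s*</script>',
        webpage, re.DOTALL)
    return json.loads(m.group(1)) if m else None

def walk(obj, path):
    # Poor man's traverse_obj: follow a tuple of dict keys, None on any miss
    for key in path:
        if not isinstance(obj, dict):
            return None
        obj = obj.get(key)
    return obj

page = ('<script id="__NEXT_DATA__" type="application/json">'
        '{"props": {"pageProps": {"episode": {"id": "demo"}}}}</script>')
print(walk(search_nextjs_data(page), ('props', 'pageProps', 'episode', 'id')))  # demo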
+class DailyWireIE(DailyWireBaseIE): + _VALID_URL = r'https?://(?:www\.)dailywire(?:\.com)/(?P<sites_type>episode|videos)/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.dailywire.com/episode/1-fauci', + 'info_dict': { + 'id': 'ckzsl50xnqpy30850in3v4bu7', + 'ext': 'mp4', + 'display_id': '1-fauci', + 'title': '1. Fauci', + 'description': 'md5:9df630347ef85081b7e97dd30bc22853', + 'thumbnail': 'https://daily-wire-production.imgix.net/episodes/ckzsl50xnqpy30850in3v4bu7/ckzsl50xnqpy30850in3v4bu7-1648237399554.jpg', + 'creator': 'Caroline Roberts', + 'series_id': 'ckzplm0a097fn0826r2vc3j7h', + 'series': 'China: The Enemy Within', + } + }, { + 'url': 'https://www.dailywire.com/episode/ep-124-bill-maher', + 'info_dict': { + 'id': 'cl0ngbaalplc80894sfdo9edf', + 'ext': 'mp3', + 'display_id': 'ep-124-bill-maher', + 'title': 'Ep. 124 - Bill Maher', + 'thumbnail': 'https://daily-wire-production.imgix.net/episodes/cl0ngbaalplc80894sfdo9edf/cl0ngbaalplc80894sfdo9edf-1647065568518.jpg', + 'creator': 'Caroline Roberts', + 'description': 'md5:adb0de584bcfa9c41374999d9e324e98', + 'series_id': 'cjzvep7270hp00786l9hwccob', + 'series': 'The Sunday Special', + } + }, { + 'url': 'https://www.dailywire.com/videos/the-hyperions', + 'only_matching': True, + }] + + def _real_extract(self, url): + slug, episode_info = self._get_json(url) + urls = traverse_obj( + episode_info, (('segments', 'videoUrl'), ..., ('video', 'audio')), expected_type=url_or_none) + + formats, subtitles = [], {} + for url in urls: + if determine_ext(url) != 'm3u8': + formats.append({'url': url}) + continue + format_, subs_ = self._extract_m3u8_formats_and_subtitles(url, slug) + formats.extend(format_) + self._merge_subtitles(subs_, target=subtitles) + return { + 'id': episode_info['id'], + 'display_id': slug, + 'title': traverse_obj(episode_info, 'title', 'name'), + 'description': episode_info.get('description'), + 'creator': join_nonempty(('createdBy', 'firstName'), ('createdBy', 'lastName'), from_dict=episode_info, delim=' '), + 'duration': float_or_none(episode_info.get('duration')), + 'is_live': episode_info.get('isLive'), + 'thumbnail': traverse_obj(episode_info, 'thumbnail', 'image', expected_type=url_or_none), + 'formats': formats, + 'subtitles': subtitles, + 'series_id': traverse_obj(episode_info, ('show', 'id')), + 'series': traverse_obj(episode_info, ('show', 'name')), + } + + +class DailyWirePodcastIE(DailyWireBaseIE): + _VALID_URL = r'https?://(?:www\.)dailywire(?:\.com)/(?P<sites_type>podcasts)/(?P<podcaster>[\w-]+/(?P<id>[\w-]+))' + _TESTS = [{ + 'url': 'https://www.dailywire.com/podcasts/morning-wire/get-ready-for-recession-6-15-22', + 'info_dict': { + 'id': 'cl4f01d0w8pbe0a98ydd0cfn1', + 'ext': 'm4a', + 'display_id': 'get-ready-for-recession-6-15-22', + 'title': 'Get Ready for Recession | 6.15.22', + 'description': 'md5:c4afbadda4e1c38a4496f6d62be55634', + 'thumbnail': 'https://daily-wire-production.imgix.net/podcasts/ckx4otgd71jm508699tzb6hf4-1639506575562.jpg', + 'duration': 900.117667, + } + }] + + def _real_extract(self, url): + slug, episode_info = self._get_json(url) + audio_id = traverse_obj(episode_info, 'audioMuxPlaybackId', 'VUsAipTrBVSgzw73SpC2DAJD401TYYwEp') + + return { + 'id': episode_info['id'], + 'url': f'https://stream.media.dailywire.com/{audio_id}/audio.m4a', + 'display_id': slug, + 'title': episode_info.get('title'), + 'duration': float_or_none(episode_info.get('duration')), + 'thumbnail': episode_info.get('thumbnail'), + 'description': episode_info.get('description'), + } diff --git 
a/hypervideo_dl/extractor/damtomo.py b/hypervideo_dl/extractor/damtomo.py index 456cd35..0e08e4f 100644 --- a/hypervideo_dl/extractor/damtomo.py +++ b/hypervideo_dl/extractor/damtomo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -39,7 +36,6 @@ class DamtomoBaseIE(InfoExtractor): if not m3u8_url: raise ExtractorError('Failed to obtain m3u8 URL') formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/daum.py b/hypervideo_dl/extractor/daum.py index 4362e92..3ef5140 100644 --- a/hypervideo_dl/extractor/daum.py +++ b/hypervideo_dl/extractor/daum.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from __future__ import unicode_literals - import itertools from .common import InfoExtractor @@ -129,7 +125,7 @@ class DaumClipIE(DaumBaseIE): self._KAKAO_EMBED_BASE + video_id, 'Kakao', video_id) -class DaumListIE(InfoExtractor): +class DaumListIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor def _get_entries(self, list_id, list_id_type): name = None entries = [] diff --git a/hypervideo_dl/extractor/daystar.py b/hypervideo_dl/extractor/daystar.py index 4f59d90..ef3520a 100644 --- a/hypervideo_dl/extractor/daystar.py +++ b/hypervideo_dl/extractor/daystar.py @@ -36,7 +36,6 @@ class DaystarClipIE(InfoExtractor): video_id, 'mp4', fatal=False, headers={'Referer': src_iframe}) formats.extend(fmts) subtitles = self._merge_subtitles(subtitles, subs) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/dbtv.py b/hypervideo_dl/extractor/dbtv.py index 8e73176..18be46f 100644 --- a/hypervideo_dl/extractor/dbtv.py +++ b/hypervideo_dl/extractor/dbtv.py @@ -1,13 +1,9 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor class DBTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dagbladet\.no/video/(?:(?:embed|(?P<display_id>[^/]+))/)?(?P<id>[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8})' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dagbladet\.no/video/embed/(?:[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8}).*?)\1'] _TESTS = [{ 'url': 'https://www.dagbladet.no/video/PynxJnNWChE/', 'md5': 'b8f850ba1860adbda668d367f9b77699', @@ -31,12 +27,6 @@ class DBTVIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?dagbladet\.no/video/embed/(?:[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8}).*?)\1', - webpage)] - def _real_extract(self, url): display_id, video_id = self._match_valid_url(url).groups() info = { diff --git a/hypervideo_dl/extractor/dctp.py b/hypervideo_dl/extractor/dctp.py index e700f8d..24bb6ac 100644 --- a/hypervideo_dl/extractor/dctp.py +++ b/hypervideo_dl/extractor/dctp.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/deezer.py b/hypervideo_dl/extractor/deezer.py index 7ba02e5..f61f12a 100644 --- a/hypervideo_dl/extractor/deezer.py +++ b/hypervideo_dl/extractor/deezer.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -64,7 +62,6 @@ class DeezerPlaylistIE(DeezerBaseInfoExtractor): 'preference': -100, # Only the first 30 seconds 'ext': 'mp3', }] - self._sort_formats(formats) 
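# [Editor's sketch] This hunk, like dozens of others in this patch, deletes
# an explicit self._sort_formats(formats) call: in this upstream sync the
# extractors stop ordering formats themselves and sorting is applied
# centrally after extraction. Roughly what such a sort does (the field
# weights below are assumptions; the real sorter is far more configurable):
formats = [
    {'format_id': 'hls-720', 'height': 720, 'tbr': 2500},
    {'format_id': 'http-360', 'height': 360, 'tbr': 800},
    {'format_id': 'preview', 'height': 240, 'preference': -100},  # 30s preview
]

def sort_key(fmt):
    # A negative 'preference' always loses; then prefer height, then bitrate
    return (fmt.get('preference') or 0, fmt.get('height') or 0, fmt.get('tbr') or 0)

formats.sort(key=sort_key)  # worst first, best last, per this codebase's convention
print([f['format_id'] for f in formats])  # ['preview', 'http-360', 'hls-720']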
artists = ', '.join( orderedSet(a.get('ART_NAME') for a in s.get('ARTISTS'))) entries.append({ @@ -117,7 +114,6 @@ class DeezerAlbumIE(DeezerBaseInfoExtractor): 'preference': -100, # Only the first 30 seconds 'ext': 'mp3', }] - self._sort_formats(formats) artists = ', '.join( orderedSet(a.get('ART_NAME') for a in s.get('ARTISTS'))) entries.append({ diff --git a/hypervideo_dl/extractor/defense.py b/hypervideo_dl/extractor/defense.py index 9fe144e..7d73ea8 100644 --- a/hypervideo_dl/extractor/defense.py +++ b/hypervideo_dl/extractor/defense.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/democracynow.py b/hypervideo_dl/extractor/democracynow.py index 5c9c0ec..1624d08 100644 --- a/hypervideo_dl/extractor/democracynow.py +++ b/hypervideo_dl/extractor/democracynow.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re import os.path @@ -62,8 +59,6 @@ class DemocracynowIE(InfoExtractor): 'vcodec': 'none' if key == 'audio' else None, }) - self._sort_formats(formats) - default_lang = 'en' subtitles = {} diff --git a/hypervideo_dl/extractor/detik.py b/hypervideo_dl/extractor/detik.py new file mode 100644 index 0000000..f148054 --- /dev/null +++ b/hypervideo_dl/extractor/detik.py @@ -0,0 +1,159 @@ +from .common import InfoExtractor +from ..utils import int_or_none, merge_dicts, try_call, url_basename + + +class DetikEmbedIE(InfoExtractor): + _VALID_URL = False + _WEBPAGE_TESTS = [{ + # cnn embed + 'url': 'https://www.cnnindonesia.com/embed/video/846189', + 'info_dict': { + 'id': '846189', + 'ext': 'mp4', + 'description': 'md5:ece7b003b3ee7d81c6a5cfede7d5397d', + 'thumbnail': r're:https?://akcdn\.detik\.net\.id/visual/2022/09/11/thumbnail-video-1_169.jpeg', + 'title': 'Video CNN Indonesia - VIDEO: Momen Charles Disambut Meriah usai Dilantik jadi Raja Inggris', + 'age_limit': 0, + 'tags': ['raja charles', ' raja charles iii', ' ratu elizabeth', ' ratu elizabeth meninggal dunia', ' raja inggris', ' inggris'], + 'release_timestamp': 1662869995, + 'release_date': '20220911', + 'uploader': 'REUTERS' + } + }, { + # 20.detik + 'url': 'https://20.detik.com/otobuzz/20220704-220704093/mulai-rp-10-jutaan-ini-skema-kredit-mitsubishi-pajero-sport', + 'info_dict': { + 'display_id': 'mulai-rp-10-jutaan-ini-skema-kredit-mitsubishi-pajero-sport', + 'id': '220704093', + 'ext': 'mp4', + 'description': 'md5:9b2257341b6f375cdcf90106146d5ffb', + 'thumbnail': r're:https?://cdnv\.detik\.com/videoservice/AdminTV/2022/07/04/5d6187e402ec4a91877755a5886ff5b6-20220704161859-0s.jpg', + 'title': 'Mulai Rp 10 Jutaan! 
Ini Skema Kredit Mitsubishi Pajero Sport', + 'timestamp': 1656951521, + 'upload_date': '20220704', + 'duration': 83.0, + 'tags': ['cicilan mobil', 'mitsubishi pajero sport', 'mitsubishi', 'pajero sport'], + 'release_timestamp': 1656926321, + 'release_date': '20220704', + 'age_limit': 0, + 'uploader': 'Ridwan Arifin ' # TODO: strip trailling whitespace at uploader + } + }, { + # pasangmata.detik + 'url': 'https://pasangmata.detik.com/contribution/366649', + 'info_dict': { + 'id': '366649', + 'ext': 'mp4', + 'title': 'Saling Dorong Aparat dan Pendemo di Aksi Tolak Kenaikan BBM', + 'description': 'md5:7a6580876c8381c454679e028620bea7', + 'age_limit': 0, + 'tags': 'count:17', + 'thumbnail': 'https://akcdn.detik.net.id/community/data/media/thumbs-pasangmata/2022/09/08/366649-16626229351533009620.mp4-03.jpg', + } + }, { + # insertlive embed + 'url': 'https://www.insertlive.com/embed/video/290482', + 'info_dict': { + 'id': '290482', + 'ext': 'mp4', + 'release_timestamp': 1663063704, + 'thumbnail': 'https://akcdn.detik.net.id/visual/2022/09/13/leonardo-dicaprio_169.png?w=600&q=90', + 'age_limit': 0, + 'description': 'Aktor Leonardo DiCaprio memang baru saja putus dari kekasihnya yang bernama Camilla Morrone.', + 'release_date': '20220913', + 'title': 'Diincar Leonardo DiCaprio, Gigi Hadid Ngaku Tertarik Tapi Belum Cinta', + 'tags': ['leonardo dicaprio', ' gigi hadid', ' hollywood'], + 'uploader': '!nsertlive', + } + }, { + # beautynesia embed + 'url': 'https://www.beautynesia.id/embed/video/261636', + 'info_dict': { + 'id': '261636', + 'ext': 'mp4', + 'age_limit': 0, + 'release_timestamp': 1662375600, + 'description': 'Menurut ramalan astrologi, tiga zodiak ini bakal hoki sepanjang September 2022.', + 'title': '3 Zodiak Paling Beruntung Selama September 2022', + 'release_date': '20220905', + 'tags': ['zodiac update', ' zodiak', ' ramalan bintang', ' zodiak beruntung 2022', ' zodiak hoki september 2022', ' zodiak beruntung september 2022'], + 'thumbnail': 'https://akcdn.detik.net.id/visual/2022/09/05/3-zodiak-paling-beruntung-selama-september-2022_169.jpeg?w=600&q=90', + 'uploader': 'amh', + } + }, { + # cnbcindonesia embed + 'url': 'https://www.cnbcindonesia.com/embed/video/371839', + 'info_dict': { + 'id': '371839', + 'ext': 'mp4', + 'title': 'Puluhan Pejabat Rusia Tuntut Putin Mundur', + 'tags': ['putin'], + 'age_limit': 0, + 'thumbnail': 'https://awsimages.detik.net.id/visual/2022/09/13/cnbc-indonesia-tv-3_169.png?w=600&q=80', + 'description': 'md5:8b9111e37555fcd95fe549a9b4ae6fdc', + } + }, { + # detik shortlink (we can get it from https://dtk.id/?<url>) + 'url': 'https://dtk.id/NkISKr', + 'info_dict': { + 'id': '220914049', + 'ext': 'mp4', + 'release_timestamp': 1663114488, + 'uploader': 'Tim 20Detik', + 'title': 'Pakar Bicara soal Tim Khusus Jokowi dan Mereka yang Pro ke Bjorka', + 'age_limit': 0, + 'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/09/14/f15cae71d7b640c58e75b254ecbb1ce1-20220914071613-0s.jpg?w=400&q=80', + 'display_id': 'pakar-bicara-soal-tim-khusus-jokowi-dan-mereka-yang-pro-ke-bjorka', + 'upload_date': '20220914', + 'release_date': '20220914', + 'description': 'md5:5eb03225f7ee40207dd3a1e18a73f1ff', + 'timestamp': 1663139688, + 'duration': 213.0, + 'tags': ['hacker bjorka', 'bjorka', 'hacker bjorka bocorkan data rahasia presiden jokowi', 'jokowi'], + } + }] + + def _extract_from_webpage(self, url, webpage): + player_type, video_data = self._search_regex( + 
r'<script\s*[^>]+src="https?://(aws)?cdn\.detik\.net\.id/(?P<type>flowplayer|detikVideo)[^>]+>\s*(?P<video_data>{[^}]+})', + webpage, 'playerjs', group=('type', 'video_data'), default=(None, '')) + if not player_type: + return + + display_id, extra_info_dict = url_basename(url), {} + + if player_type == 'flowplayer': + video_json_data = self._parse_json(video_data.replace('\'', '"'), display_id) + video_url = video_json_data['videoUrl'] + + extra_info_dict = { + 'id': self._search_regex(r'identifier\s*:\s*\'([^\']+)', webpage, 'identifier'), + 'thumbnail': video_json_data.get('imageUrl'), + } + + elif player_type == 'detikVideo': + video_url = self._search_regex( + r'videoUrl\s*:\s*[\'"]?([^"\']+)', video_data, 'videoUrl') + extra_info_dict = { + 'id': self._html_search_meta(['video_id', 'dtk:video_id'], webpage), + 'thumbnail': self._search_regex(r'imageUrl\s*:\s*[\'"]?([^"\']+)', video_data, 'videoUrl'), + 'duration': int_or_none(self._html_search_meta('duration', webpage, fatal=False, default=None)), + 'release_timestamp': int_or_none(self._html_search_meta('dtk:publishdateunix', webpage, fatal=False, default=None), 1000), + 'timestamp': int_or_none(self._html_search_meta('dtk:createdateunix', webpage, fatal=False, default=None), 1000), + 'uploader': self._search_regex( + r'([^-]+)', self._html_search_meta('dtk:author', webpage, default='').strip(), 'uploader', + default=None) + } + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, display_id) + + json_ld_data = self._search_json_ld(webpage, display_id, default={}) + yield merge_dicts(json_ld_data, extra_info_dict, { + 'display_id': display_id, + 'title': self._html_search_meta(['og:title', 'originalTitle'], webpage) or self._html_extract_title(webpage), + 'description': self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage), + 'formats': formats, + 'subtitles': subtitles, + 'tags': try_call(lambda: self._html_search_meta( + ['keywords', 'keyword', 'dtk:keywords'], webpage).split(',')), + }) diff --git a/hypervideo_dl/extractor/deuxm.py b/hypervideo_dl/extractor/deuxm.py new file mode 100644 index 0000000..74a6da6 --- /dev/null +++ b/hypervideo_dl/extractor/deuxm.py @@ -0,0 +1,76 @@ +from .common import InfoExtractor +from ..utils import url_or_none + + +class DeuxMIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?2m\.ma/[^/]+/replay/single/(?P<id>([\w.]{1,24})+)' + + _TESTS = [{ + 'url': 'https://2m.ma/fr/replay/single/6351d439b15e1a613b3debe8', + 'md5': '5f761f04c9d686e553b685134dca5d32', + 'info_dict': { + 'id': '6351d439b15e1a613b3debe8', + 'ext': 'mp4', + 'title': 'Grand Angle : Jeudi 20 Octobre 2022', + 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$' + } + }, { + 'url': 'https://2m.ma/fr/replay/single/635c0aeab4eec832622356da', + 'md5': 'ad6af2f5e4d5b2ad2194a84b6e890b4c', + 'info_dict': { + 'id': '635c0aeab4eec832622356da', + 'ext': 'mp4', + 'title': 'Journal Amazigh : Vendredi 28 Octobre 2022', + 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$' + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video = self._download_json( + f'https://2m.ma/api/watchDetail/{video_id}', video_id)['response']['News'] + return { + 'id': video_id, + 'title': video.get('titre'), + 'url': video['url'], + 'description': video.get('description'), + 'thumbnail': url_or_none(video.get('image')), + } + + +class DeuxMNewsIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?2m\.ma/(?P<lang>\w+)/news/(?P<id>[^/#?]+)' + + _TESTS = [{ + 'url': 'https://2m.ma/fr/news/Kan-Ya-Mkan-d%C3%A9poussi%C3%A8re-l-histoire-du-phare-du-Cap-Beddouza-20221028', + 'md5': '43d5e693a53fa0b71e8a5204c7d4542a', + 'info_dict': { + 'id': '635c5d1233b83834e35b282e', + 'ext': 'mp4', + 'title': 'Kan Ya Mkan d\u00e9poussi\u00e8re l\u2019histoire du phare du Cap Beddouza', + 'description': 'md5:99dcf29b82f1d7f2a4acafed1d487527', + 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$' + } + }, { + 'url': 'https://2m.ma/fr/news/Interview-Casablanca-hors-des-sentiers-battus-avec-Abderrahim-KASSOU-Replay--20221017', + 'md5': '7aca29f02230945ef635eb8290283c0c', + 'info_dict': { + 'id': '634d9e108b70d40bc51a844b', + 'ext': 'mp4', + 'title': 'Interview: Casablanca hors des sentiers battus avec Abderrahim KASSOU (Replay) ', + 'description': 'md5:3b8e78111de9fcc6ef7f7dd6cff2430c', + 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$' + } + }] + + def _real_extract(self, url): + article_name, lang = self._match_valid_url(url).group('id', 'lang') + video = self._download_json( + f'https://2m.ma/api/articlesByUrl?lang={lang}&url=/news/{article_name}', article_name)['response']['article'][0] + return { + 'id': video['id'], + 'title': video.get('title'), + 'url': video['image'][0], + 'description': video.get('content'), + 'thumbnail': url_or_none(video.get('cover')), + } diff --git a/hypervideo_dl/extractor/dfb.py b/hypervideo_dl/extractor/dfb.py index 97f70fc..c4fb5c2 100644 --- a/hypervideo_dl/extractor/dfb.py +++ b/hypervideo_dl/extractor/dfb.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import unified_strdate @@ -44,7 +41,6 @@ class DFBIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( manifest_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/dhm.py b/hypervideo_dl/extractor/dhm.py index aee72a6..3d42fc2 100644 --- a/hypervideo_dl/extractor/dhm.py +++ b/hypervideo_dl/extractor/dhm.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import parse_duration diff --git a/hypervideo_dl/extractor/digg.py b/hypervideo_dl/extractor/digg.py index 913c175..86e8a6f 100644 --- a/hypervideo_dl/extractor/digg.py +++ b/hypervideo_dl/extractor/digg.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import js_to_json diff --git a/hypervideo_dl/extractor/digitalconcerthall.py b/hypervideo_dl/extractor/digitalconcerthall.py index 8398ae3..3461e36 100644 --- a/hypervideo_dl/extractor/digitalconcerthall.py +++ b/hypervideo_dl/extractor/digitalconcerthall.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( @@ -89,9 +86,8 @@ class DigitalConcertHallIE(InfoExtractor): }) m3u8_url = traverse_obj( - stream_info, ('channel', lambda x: x.startswith('vod_mixed'), 'stream', 0, 'url'), get_all=False) + stream_info, ('channel', lambda k, _: k.startswith('vod_mixed'), 'stream', 0, 'url'), get_all=False) formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native', fatal=False) - self._sort_formats(formats) yield { 'id': video_id, diff --git a/hypervideo_dl/extractor/digiteka.py b/hypervideo_dl/extractor/digiteka.py index d632047..912e33b 100644 --- 
a/hypervideo_dl/extractor/digiteka.py +++ b/hypervideo_dl/extractor/digiteka.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import int_or_none @@ -28,6 +23,7 @@ class DigitekaIE(InfoExtractor): ) /id )/(?P<id>[\d+a-z]+)''' + _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)'] _TESTS = [{ # news 'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r', @@ -61,14 +57,6 @@ class DigitekaIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)', - webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') @@ -93,8 +81,6 @@ class DigitekaIE(InfoExtractor): 'format_id': source.get('label'), }) - self._sort_formats(formats) - title = deliver_info['title'] thumbnail = jwconf.get('image') duration = int_or_none(deliver_info.get('duration')) diff --git a/hypervideo_dl/extractor/discovery.py b/hypervideo_dl/extractor/discovery.py index fd3ad75..fd3fc8f 100644 --- a/hypervideo_dl/extractor/discovery.py +++ b/hypervideo_dl/extractor/discovery.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import random import string diff --git a/hypervideo_dl/extractor/discoverygo.py b/hypervideo_dl/extractor/discoverygo.py index 9e7b14a..1f3d8e3 100644 --- a/hypervideo_dl/extractor/discoverygo.py +++ b/hypervideo_dl/extractor/discoverygo.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -52,7 +50,6 @@ class DiscoveryGoBaseIE(InfoExtractor): elif stream_kind == 'hds': formats.extend(self._extract_f4m_formats( stream_url, display_id, f4m_id=stream_kind, fatal=False)) - self._sort_formats(formats) video_id = video.get('id') or display_id description = video.get('description', {}).get('detailed') diff --git a/hypervideo_dl/extractor/discoverynetworks.py b/hypervideo_dl/extractor/discoverynetworks.py deleted file mode 100644 index f43c871..0000000 --- a/hypervideo_dl/extractor/discoverynetworks.py +++ /dev/null @@ -1,42 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - - -from .dplay import DPlayIE - - -class DiscoveryNetworksDeIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)' - - _TESTS = [{ - 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', - 'info_dict': { - 'id': '78867', - 'ext': 'mp4', - 'title': 'Die Welt da draußen', - 'description': 'md5:61033c12b73286e409d99a41742ef608', - 'timestamp': 1554069600, - 'upload_date': '20190331', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - }, { - 'url': 'https://www.dmax.de/programme/dmax-highlights/video/tuning-star-sidney-hoffmann-exklusiv-bei-dmax/191023082312316', - 'only_matching': True, - }, { - 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', - 'only_matching': True, - }, { - 'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/', - 'only_matching': True, - }] - - def _real_extract(self, url): - domain, programme, alternate_id = 
self._match_valid_url(url).groups() - country = 'GB' if domain == 'dplay.co.uk' else 'DE' - realm = 'questuk' if country == 'GB' else domain.replace('.', '') - return self._get_disco_api_info( - url, '%s/%s' % (programme, alternate_id), - 'sonic-eu1-prod.disco-api.com', realm, country) diff --git a/hypervideo_dl/extractor/discoveryplusindia.py b/hypervideo_dl/extractor/discoveryplusindia.py deleted file mode 100644 index 5180140..0000000 --- a/hypervideo_dl/extractor/discoveryplusindia.py +++ /dev/null @@ -1,98 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json - -from ..compat import compat_str -from ..utils import try_get -from .common import InfoExtractor -from .dplay import DPlayIE - - -class DiscoveryPlusIndiaIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/videos?' + DPlayIE._PATH_REGEX - _TESTS = [{ - 'url': 'https://www.discoveryplus.in/videos/how-do-they-do-it/fugu-and-more?seasonId=8&type=EPISODE', - 'info_dict': { - 'id': '27104', - 'ext': 'mp4', - 'display_id': 'how-do-they-do-it/fugu-and-more', - 'title': 'Fugu and More', - 'description': 'The Japanese catch, prepare and eat the deadliest fish on the planet.', - 'duration': 1319, - 'timestamp': 1582309800, - 'upload_date': '20200221', - 'series': 'How Do They Do It?', - 'season_number': 8, - 'episode_number': 2, - 'creator': 'Discovery Channel', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - 'skip': 'Cookies (not necessarily logged in) are needed' - }] - - def _update_disco_api_headers(self, headers, disco_base, display_id, realm): - headers['x-disco-params'] = 'realm=%s' % realm - headers['x-disco-client'] = 'WEB:UNKNOWN:dplus-india:17.0.0' - - def _download_video_playback_info(self, disco_base, video_id, headers): - return self._download_json( - disco_base + 'playback/v3/videoPlaybackInfo', - video_id, headers=headers, data=json.dumps({ - 'deviceInfo': { - 'adBlocker': False, - }, - 'videoId': video_id, - }).encode('utf-8'))['data']['attributes']['streaming'] - - def _real_extract(self, url): - display_id = self._match_id(url) - return self._get_disco_api_info( - url, display_id, 'ap2-prod-direct.discoveryplus.in', 'dplusindia', 'in') - - -class DiscoveryPlusIndiaShowIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/show/(?P<show_name>[^/]+)/?(?:[?#]|$)' - _TESTS = [{ - 'url': 'https://www.discoveryplus.in/show/how-do-they-do-it', - 'playlist_mincount': 140, - 'info_dict': { - 'id': 'how-do-they-do-it', - }, - }] - - def _entries(self, show_name): - headers = { - 'x-disco-client': 'WEB:UNKNOWN:dplus-india:prod', - 'x-disco-params': 'realm=dplusindia', - 'referer': 'https://www.discoveryplus.in/', - } - show_url = 'https://ap2-prod-direct.discoveryplus.in/cms/routes/show/{}?include=default'.format(show_name) - show_json = self._download_json(show_url, - video_id=show_name, - headers=headers)['included'][4]['attributes']['component'] - show_id = show_json['mandatoryParams'].split('=')[-1] - season_url = 'https://ap2-prod-direct.discoveryplus.in/content/videos?sort=episodeNumber&filter[seasonNumber]={}&filter[show.id]={}&page[size]=100&page[number]={}' - for season in show_json['filters'][0]['options']: - season_id = season['id'] - total_pages, page_num = 1, 0 - while page_num < total_pages: - season_json = self._download_json(season_url.format(season_id, show_id, compat_str(page_num + 1)), - video_id=show_id, headers=headers, - note='Downloading JSON metadata%s' % (' page %d' % page_num if page_num else '')) - if page_num == 0: 
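# [Editor's sketch] The deleted _entries() around this hunk implements a
# standard paged-API walk: request page 1, read meta.totalPages from that
# first response, then keep fetching until page_num catches up. The same
# loop in isolation; fetch_page is a hypothetical stand-in for the
# _download_json call:
def iter_episodes(fetch_page):
    total_pages, page_num = 1, 0
    while page_num < total_pages:
        page = fetch_page(page_num + 1)
        if page_num == 0:  # only the first response carries the page count
            total_pages = page.get('meta', {}).get('totalPages') or 1
        yield from page.get('data', [])
        page_num += 1

fake_api = {1: {'meta': {'totalPages': 2}, 'data': ['ep1', 'ep2']},
            2: {'data': ['ep3']}}
print(list(iter_episodes(fake_api.get)))  # ['ep1', 'ep2', 'ep3']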
- total_pages = try_get(season_json, lambda x: x['meta']['totalPages'], int) or 1 - episodes_json = season_json['data'] - for episode in episodes_json: - video_id = episode['attributes']['path'] - yield self.url_result( - 'https://discoveryplus.in/videos/%s' % video_id, - ie=DiscoveryPlusIndiaIE.ie_key(), video_id=video_id) - page_num += 1 - - def _real_extract(self, url): - show_name = self._match_valid_url(url).group('show_name') - return self.playlist_result(self._entries(show_name), playlist_id=show_name) diff --git a/hypervideo_dl/extractor/discoveryvr.py b/hypervideo_dl/extractor/discoveryvr.py deleted file mode 100644 index cb63c26..0000000 --- a/hypervideo_dl/extractor/discoveryvr.py +++ /dev/null @@ -1,59 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import parse_duration - - -class DiscoveryVRIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?discoveryvr\.com/watch/(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://www.discoveryvr.com/watch/discovery-vr-an-introduction', - 'md5': '32b1929798c464a54356378b7912eca4', - 'info_dict': { - 'id': 'discovery-vr-an-introduction', - 'ext': 'mp4', - 'title': 'Discovery VR - An Introduction', - 'description': 'md5:80d418a10efb8899d9403e61d8790f06', - } - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - bootstrap_data = self._search_regex( - r'root\.DVR\.bootstrapData\s+=\s+"({.+?})";', - webpage, 'bootstrap data') - bootstrap_data = self._parse_json( - bootstrap_data.encode('utf-8').decode('unicode_escape'), - display_id) - videos = self._parse_json(bootstrap_data['videos'], display_id)['allVideos'] - video_data = next(video for video in videos if video.get('slug') == display_id) - - series = video_data.get('showTitle') - title = episode = video_data.get('title') or series - if series and series != title: - title = '%s - %s' % (series, title) - - formats = [] - for f, format_id in (('cdnUriM3U8', 'mobi'), ('webVideoUrlSd', 'sd'), ('webVideoUrlHd', 'hd')): - f_url = video_data.get(f) - if not f_url: - continue - formats.append({ - 'format_id': format_id, - 'url': f_url, - }) - - return { - 'id': display_id, - 'display_id': display_id, - 'title': title, - 'description': video_data.get('description'), - 'thumbnail': video_data.get('thumbnail'), - 'duration': parse_duration(video_data.get('runTime')), - 'formats': formats, - 'episode': episode, - 'series': series, - } diff --git a/hypervideo_dl/extractor/disney.py b/hypervideo_dl/extractor/disney.py index 0ad7b1f..430de32 100644 --- a/hypervideo_dl/extractor/disney.py +++ b/hypervideo_dl/extractor/disney.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -137,7 +134,6 @@ class DisneyIE(InfoExtractor): self.raise_no_formats( '%s said: %s' % (self.IE_NAME, page_data['translations']['video_expired']), expected=True) - self._sort_formats(formats) subtitles = {} for caption in video_data.get('captions', []): diff --git a/hypervideo_dl/extractor/dispeak.py b/hypervideo_dl/extractor/dispeak.py index 3d651f3..37f89b9 100644 --- a/hypervideo_dl/extractor/dispeak.py +++ b/hypervideo_dl/extractor/dispeak.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -119,7 +117,6 @@ class DigitallySpeakingIE(InfoExtractor): video_formats = self._parse_mp4(metadata) if video_formats is None: video_formats = self._parse_flv(metadata) - 
self._sort_formats(video_formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/dlive.py b/hypervideo_dl/extractor/dlive.py index 7410eb6..30fcf9f 100644 --- a/hypervideo_dl/extractor/dlive.py +++ b/hypervideo_dl/extractor/dlive.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -42,7 +40,6 @@ class DLiveVODIE(InfoExtractor): title = broadcast['title'] formats = self._extract_m3u8_formats( broadcast['playbackUrl'], vod_id, 'mp4', 'm3u8_native') - self._sort_formats(formats) return { 'id': vod_id, 'title': title, @@ -81,7 +78,6 @@ class DLiveStreamIE(InfoExtractor): formats = self._extract_m3u8_formats( 'https://live.prd.dlive.tv/hls/live/%s.m3u8' % username, display_name, 'mp4') - self._sort_formats(formats) return { 'id': display_name, 'title': title, diff --git a/hypervideo_dl/extractor/doodstream.py b/hypervideo_dl/extractor/doodstream.py deleted file mode 100644 index f692127..0000000 --- a/hypervideo_dl/extractor/doodstream.py +++ /dev/null @@ -1,76 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import string -import random -import time - -from .common import InfoExtractor - - -class DoodStreamIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dood\.(?:to|watch)/[ed]/(?P<id>[a-z0-9]+)' - _TESTS = [{ - 'url': 'http://dood.to/e/5s1wmbdacezb', - 'md5': '4568b83b31e13242b3f1ff96c55f0595', - 'info_dict': { - 'id': '5s1wmbdacezb', - 'ext': 'mp4', - 'title': 'Kat Wonders - Monthly May 2020', - 'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com', - 'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg', - } - }, { - 'url': 'http://dood.watch/d/5s1wmbdacezb', - 'md5': '4568b83b31e13242b3f1ff96c55f0595', - 'info_dict': { - 'id': '5s1wmbdacezb', - 'ext': 'mp4', - 'title': 'Kat Wonders - Monthly May 2020', - 'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com', - 'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg', - } - }, { - 'url': 'https://dood.to/d/jzrxn12t2s7n', - 'md5': '3207e199426eca7c2aa23c2872e6728a', - 'info_dict': { - 'id': 'jzrxn12t2s7n', - 'ext': 'mp4', - 'title': 'Stacy Cruz Cute ALLWAYSWELL', - 'description': 'Stacy Cruz Cute ALLWAYSWELL | DoodStream.com', - 'thumbnail': 'https://img.doodcdn.com/snaps/8edqd5nppkac3x8u.jpg', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - url = f'https://dood.to/e/{video_id}' - webpage = self._download_webpage(url, video_id) - - title = self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None) - thumb = self._html_search_meta(['og:image', 'twitter:image'], webpage, default=None) - token = self._html_search_regex(r'[?&]token=([a-z0-9]+)[&\']', webpage, 'token') - description = self._html_search_meta( - ['og:description', 'description', 'twitter:description'], webpage, default=None) - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/66.0', - 'referer': url - } - - pass_md5 = self._html_search_regex(r'(/pass_md5.*?)\'', webpage, 'pass_md5') - final_url = ''.join(( - self._download_webpage(f'https://dood.to{pass_md5}', video_id, headers=headers), - *(random.choice(string.ascii_letters + string.digits) for _ in range(10)), - f'?token={token}&expiry={int(time.time() * 1000)}', - )) - - return { - 'id': video_id, - 'title': title, - 'url': final_url, - 'http_headers': headers, - 'ext': 'mp4', - 'description': description, - 'thumbnail': thumb, - } diff --git a/hypervideo_dl/extractor/dotsub.py 
b/hypervideo_dl/extractor/dotsub.py index 148605c..079f837 100644 --- a/hypervideo_dl/extractor/dotsub.py +++ b/hypervideo_dl/extractor/dotsub.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( float_or_none, diff --git a/hypervideo_dl/extractor/douyutv.py b/hypervideo_dl/extractor/douyutv.py index 26a8d64..477f468 100644 --- a/hypervideo_dl/extractor/douyutv.py +++ b/hypervideo_dl/extractor/douyutv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import time import hashlib import re diff --git a/hypervideo_dl/extractor/dplay.py b/hypervideo_dl/extractor/dplay.py index a25f27c..8eb4d8f 100644 --- a/hypervideo_dl/extractor/dplay.py +++ b/hypervideo_dl/extractor/dplay.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import uuid @@ -11,6 +8,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + remove_start, strip_or_none, try_get, unified_timestamp, @@ -128,7 +126,6 @@ class DPlayBaseIE(InfoExtractor): 'url': format_url, 'format_id': format_id, }) - self._sort_formats(formats) creator = series = None tags = [] @@ -314,7 +311,7 @@ class DPlayIE(DPlayBaseIE): def _real_extract(self, url): mobj = self._match_valid_url(url) display_id = mobj.group('id') - domain = mobj.group('domain').lstrip('www.') + domain = remove_start(mobj.group('domain'), 'www.') country = mobj.group('country') or mobj.group('subdomain_country') or mobj.group('plus_country') host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com' return self._get_disco_api_info( @@ -720,6 +717,72 @@ class TLCIE(DiscoveryPlusBaseIE): } +class MotorTrendIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:watch\.)?motortrend\.com/video' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://watch.motortrend.com/video/car-issues-motortrend-atve-us/double-dakotas', + 'info_dict': { + 'id': '"4859182"', + 'display_id': 'double-dakotas', + 'ext': 'mp4', + 'title': 'Double Dakotas', + 'description': 'Tylers buy-one-get-one Dakota deal has the Wizard pulling double duty.', + 'season_number': 2, + 'episode_number': 3, + }, + 'skip': 'Available for Premium users', + }, { + 'url': 'https://watch.motortrend.com/video/car-issues-motortrend-atve-us/double-dakotas', + 'only_matching': True, + }] + + _PRODUCT = 'vel' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.watch.motortrend.com', + 'realm': 'go', + 'country': 'us', + } + + +class MotorTrendOnDemandIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:www\.)?motortrendondemand\.com/detail' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://www.motortrendondemand.com/detail/wheelstanding-dump-truck-stubby-bobs-comeback/37699/784', + 'info_dict': { + 'id': '37699', + 'display_id': 'wheelstanding-dump-truck-stubby-bobs-comeback/37699', + 'ext': 'mp4', + 'title': 'Wheelstanding Dump Truck! 
Stubby Bob’s Comeback', + 'description': 'md5:996915abe52a1c3dfc83aecea3cce8e7', + 'season_number': 5, + 'episode_number': 52, + 'episode': 'Episode 52', + 'season': 'Season 5', + 'thumbnail': r're:^https?://.+\.jpe?g$', + 'timestamp': 1388534401, + 'duration': 1887.345, + 'creator': 'Originals', + 'series': 'Roadkill', + 'upload_date': '20140101', + 'tags': [], + }, + }] + + _PRODUCT = 'MTOD' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.motortrendondemand.com', + 'realm': 'motortrend', + 'country': 'us', + } + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers.update({ + 'x-disco-params': f'realm={realm}', + 'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:4.39.1-gi1', + 'Authorization': self._get_auth(disco_base, display_id, realm), + }) + + class DiscoveryPlusIE(DiscoveryPlusBaseIE): _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/(?!it/)(?:\w{2}/)?video' + DPlayBaseIE._PATH_REGEX _TESTS = [{ @@ -882,6 +945,9 @@ class DiscoveryPlusItalyIE(DiscoveryPlusBaseIE): _TESTS = [{ 'url': 'https://www.discoveryplus.com/it/video/i-signori-della-neve/stagione-2-episodio-1-i-preparativi', 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.com/it/video/super-benny/trailer', + 'only_matching': True, }] _PRODUCT = 'dplus_us' @@ -891,6 +957,13 @@ class DiscoveryPlusItalyIE(DiscoveryPlusBaseIE): 'country': 'it', } + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers.update({ + 'x-disco-params': 'realm=%s' % realm, + 'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:25.2.6', + 'Authorization': self._get_auth(disco_base, display_id, realm), + }) + class DiscoveryPlusItalyShowIE(DiscoveryPlusShowBaseIE): _VALID_URL = r'https?://(?:www\.)?discoveryplus\.it/programmi/(?P<show_name>[^/]+)/?(?:[?#]|$)' diff --git a/hypervideo_dl/extractor/drbonanza.py b/hypervideo_dl/extractor/drbonanza.py index ea0f06d..824d70d 100644 --- a/hypervideo_dl/extractor/drbonanza.py +++ b/hypervideo_dl/extractor/drbonanza.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( js_to_json, @@ -33,7 +30,6 @@ class DRBonanzaIE(InfoExtractor): info = self._parse_html5_media_entries( url, webpage, display_id, m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0] - self._sort_formats(info['formats']) asset = self._parse_json( self._search_regex( diff --git a/hypervideo_dl/extractor/dreisat.py b/hypervideo_dl/extractor/dreisat.py index 5a07c18..8a59c23 100644 --- a/hypervideo_dl/extractor/dreisat.py +++ b/hypervideo_dl/extractor/dreisat.py @@ -1,9 +1,7 @@ -from __future__ import unicode_literals - from .zdf import ZDFIE -class DreiSatIE(ZDFIE): +class DreiSatIE(ZDFIE): # XXX: Do not subclass from concrete IE IE_NAME = '3sat' _VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html' _TESTS = [{ diff --git a/hypervideo_dl/extractor/drooble.py b/hypervideo_dl/extractor/drooble.py index 0584250..106e5c4 100644 --- a/hypervideo_dl/extractor/drooble.py +++ b/hypervideo_dl/extractor/drooble.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/dropbox.py b/hypervideo_dl/extractor/dropbox.py index 2559657..214b309 100644 --- a/hypervideo_dl/extractor/dropbox.py +++ b/hypervideo_dl/extractor/dropbox.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import os.path import re @@ -56,8 +53,8 @@ class DropboxIE(InfoExtractor): else: 
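# [Editor's sketch] The dropbox hunk continuing below swaps a greedy regex
# capture of the InitReact payload for _search_json(), which finds the
# opening brace and decodes exactly one balanced JSON object instead of
# guessing where it ends. A simplified sketch of the brace-balanced idea
# (extract_json_after is hypothetical; the real helper also honors
# contains_pattern/end_pattern):
import json

def extract_json_after(text, anchor):
    start = text.index('{', text.index(anchor))
    obj, _ = json.JSONDecoder().raw_decode(text[start:])  # stops at matching brace
    return obj

js = 'InitReact.mountComponent(Foo, {"props": {"file": "a.mp4"}});'
print(extract_json_after(js, 'InitReact.mountComponent'))  # {'props': {'file': 'a.mp4'}}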
raise ExtractorError('Password protected video, use --video-password <password>', expected=True) - json_string = self._html_search_regex(r'InitReact\.mountComponent\(.*?,\s*(\{.+\})\s*?\)', webpage, 'Info JSON') - info_json = self._parse_json(json_string, video_id).get('props') + info_json = self._search_json(r'InitReact\.mountComponent\(.*?,', webpage, 'mountComponent', video_id, + contains_pattern=r'{.+?"preview".+?}', end_pattern=r'\)')['props'] transcode_url = traverse_obj(info_json, ((None, 'preview'), 'file', 'preview', 'content', 'transcode_url'), get_all=False) formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id) @@ -66,7 +63,6 @@ class DropboxIE(InfoExtractor): video_url = re.sub(r'[?&]dl=0', '', url) video_url += ('?' if '?' not in video_url else '&') + 'dl=1' formats.append({'url': video_url, 'format_id': 'original', 'format_note': 'Original', 'quality': 1}) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/dropout.py b/hypervideo_dl/extractor/dropout.py index 2fa6195..e280b1c 100644 --- a/hypervideo_dl/extractor/dropout.py +++ b/hypervideo_dl/extractor/dropout.py @@ -1,9 +1,8 @@ -# coding: utf-8 from .common import InfoExtractor from .vimeo import VHXEmbedIE from ..utils import ( - clean_html, ExtractorError, + clean_html, get_element_by_class, get_element_by_id, get_elements_by_class, @@ -97,11 +96,12 @@ class DropoutIE(InfoExtractor): def _login(self, display_id): username, password = self._get_login_info() - if not (username and password): - self.raise_login_required(method='password') + if not username: + return True response = self._download_webpage( - self._LOGIN_URL, display_id, note='Logging in', data=urlencode_postdata({ + self._LOGIN_URL, display_id, note='Logging in', fatal=False, + data=urlencode_postdata({ 'email': username, 'password': password, 'authenticity_token': self._get_authenticity_token(display_id), @@ -111,19 +111,25 @@ class DropoutIE(InfoExtractor): user_has_subscription = self._search_regex( r'user_has_subscription:\s*["\'](.+?)["\']', response, 'subscription status', default='none') if user_has_subscription.lower() == 'true': - return response + return elif user_has_subscription.lower() == 'false': - raise ExtractorError('Account is not subscribed') + return 'Account is not subscribed' else: - raise ExtractorError('Incorrect username/password') + return 'Incorrect username/password' def _real_extract(self, url): display_id = self._match_id(url) - try: - self._login(display_id) - webpage = self._download_webpage(url, display_id, note='Downloading video webpage') - finally: - self._download_webpage('https://www.dropout.tv/logout', display_id, note='Logging out', fatal=False) + + webpage = None + if self._get_cookies('https://www.dropout.tv').get('_session'): + webpage = self._download_webpage(url, display_id) + if not webpage or '<div id="watch-unauthorized"' in webpage: + login_err = self._login(display_id) + webpage = self._download_webpage(url, display_id) + if login_err and '<div id="watch-unauthorized"' in webpage: + if login_err is True: + self.raise_login_required(method='any') + raise ExtractorError(login_err, expected=True) embed_url = self._search_regex(r'embed_url:\s*["\'](.+?)["\']', webpage, 'embed url') thumbnail = self._og_search_thumbnail(webpage) @@ -138,7 +144,7 @@ class DropoutIE(InfoExtractor): return { '_type': 'url_transparent', 'ie_key': VHXEmbedIE.ie_key(), - 'url': embed_url, + 'url': VHXEmbedIE._smuggle_referrer(embed_url, 'https://www.dropout.tv'), 
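# [Editor's sketch] _smuggle_referrer() above piggybacks the Referer on the
# embed URL itself so the VHX extractor that later handles the result can
# recover the header after unsmuggling. The mechanism is URL-fragment
# smuggling; a simplified sketch (the real smuggle_url/unsmuggle_url helpers
# in utils.py differ in naming and detail):
import json
import urllib.parse

def smuggle(url, data):
    # Stash JSON in the fragment; fragments are never sent to the server
    return url + '#__smuggle=' + urllib.parse.quote(json.dumps(data))

def unsmuggle(url, default=None):
    if '#__smuggle=' not in url:
        return url, default
    url, _, payload = url.partition('#__smuggle=')
    return url, json.loads(urllib.parse.unquote(payload))

u = smuggle('https://embed.vhx.tv/videos/123', {'referrer': 'https://www.dropout.tv'})
print(unsmuggle(u))  # ('https://embed.vhx.tv/videos/123', {'referrer': 'https://www.dropout.tv'})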
'id': self._search_regex(r'embed\.vhx\.tv/videos/(.+?)\?', embed_url, 'id'), 'display_id': display_id, 'title': title, diff --git a/hypervideo_dl/extractor/drtuber.py b/hypervideo_dl/extractor/drtuber.py index 540b86a..e5dab6a 100644 --- a/hypervideo_dl/extractor/drtuber.py +++ b/hypervideo_dl/extractor/drtuber.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -13,6 +11,7 @@ from ..utils import ( class DrTuberIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|m)\.)?drtuber\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[\w-]+))?' + _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?drtuber\.com/embed/\d+)'] _TESTS = [{ 'url': 'http://www.drtuber.com/video/1740434/hot-perky-blonde-naked-golf', 'md5': '93e680cf2536ad0dfb7e74d94a89facd', @@ -35,12 +34,6 @@ class DrTuberIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?drtuber\.com/embed/\d+)', - webpage) - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') @@ -65,7 +58,6 @@ class DrTuberIE(InfoExtractor): 'quality': 2 if format_id == 'hq' else 1, 'url': video_url }) - self._sort_formats(formats) duration = int_or_none(video_data.get('duration')) or parse_duration( video_data.get('duration_format')) diff --git a/hypervideo_dl/extractor/drtv.py b/hypervideo_dl/extractor/drtv.py index 37e4d5b..128f439 100644 --- a/hypervideo_dl/extractor/drtv.py +++ b/hypervideo_dl/extractor/drtv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import binascii import hashlib import re @@ -26,7 +23,7 @@ class DRTVIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*| + (?:www\.)?dr\.dk/(?:tv/se|nyheder|(?:radio|lyd)(?:/ondemand)?)/(?:[^/]+/)*| (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/ ) (?P<id>[\da-z_-]+) @@ -54,6 +51,7 @@ class DRTVIE(InfoExtractor): 'release_year': 2016, }, 'expected_warnings': ['Unable to download f4m manifest'], + 'skip': 'this video has been removed', }, { # embed 'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang', @@ -74,31 +72,41 @@ class DRTVIE(InfoExtractor): # with SignLanguage formats 'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder', 'info_dict': { - 'id': 'historien-om-danmark-stenalder', + 'id': '00831690010', 'ext': 'mp4', 'title': 'Historien om Danmark: Stenalder', 'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a', 'timestamp': 1546628400, 'upload_date': '20190104', - 'duration': 3502.56, + 'duration': 3504.618, 'formats': 'mincount:20', + 'release_year': 2017, + 'season_id': 'urn:dr:mu:bundle:5afc03ad6187a4065ca5fd35', + 'season_number': 1, + 'season': 'Historien om Danmark', + 'series': 'Historien om Danmark', }, 'params': { 'skip_download': True, }, }, { - 'url': 'https://www.dr.dk/radio/p4kbh/regionale-nyheder-kh4/p4-nyheder-2019-06-26-17-30-9', + 'url': 'https://www.dr.dk/lyd/p4kbh/regionale-nyheder-kh4/p4-nyheder-2019-06-26-17-30-9', 'only_matching': True, }, { 'url': 'https://www.dr.dk/drtv/se/bonderoeven_71769', 'info_dict': { 'id': '00951930010', 'ext': 'mp4', - 'title': 'Bonderøven (1:8)', - 'description': 'md5:3cf18fc0d3b205745d4505f896af8121', - 'timestamp': 1546542000, - 'upload_date': '20190103', + 'title': 'Bonderøven 2019 (1:8)', + 'description': 
'md5:b6dcfe9b6f0bea6703e9a0092739a5bd', + 'timestamp': 1603188600, + 'upload_date': '20201020', 'duration': 2576.6, + 'season': 'Bonderøven 2019', + 'season_id': 'urn:dr:mu:bundle:5c201667a11fa01ca4528ce5', + 'release_year': 2019, + 'season_number': 2019, + 'series': 'Frank & Kastaniegaarden' }, 'params': { 'skip_download': True, @@ -112,6 +120,24 @@ class DRTVIE(InfoExtractor): }, { 'url': 'https://www.dr.dk/drtv/program/jagten_220924', 'only_matching': True, + }, { + 'url': 'https://www.dr.dk/lyd/p4aarhus/regionale-nyheder-ar4/regionale-nyheder-2022-05-05-12-30-3', + 'info_dict': { + 'id': 'urn:dr:mu:programcard:6265cb2571401424d0360113', + 'title': "Regionale nyheder", + 'ext': 'mp4', + 'duration': 120.043, + 'series': 'P4 Østjylland regionale nyheder', + 'timestamp': 1651746600, + 'season': 'Regionale nyheder', + 'release_year': 0, + 'season_id': 'urn:dr:mu:bundle:61c26889539f0201586b73c5', + 'description': '', + 'upload_date': '20220505', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -274,8 +300,6 @@ class DRTVIE(InfoExtractor): 'Unfortunately, DR is not allowed to show this program outside Denmark.', countries=self._GEO_COUNTRIES) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, @@ -340,7 +364,6 @@ class DRTVLiveIE(InfoExtractor): formats.extend(self._extract_f4m_formats(update_url_query( '%s/%s' % (server, stream_path), {'hdcore': '3.7.0'}), channel_id, f4m_id=link_type, fatal=False)) - self._sort_formats(formats) return { 'id': channel_id, diff --git a/hypervideo_dl/extractor/dtube.py b/hypervideo_dl/extractor/dtube.py index ad247b7..25a98f6 100644 --- a/hypervideo_dl/extractor/dtube.py +++ b/hypervideo_dl/extractor/dtube.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from socket import timeout diff --git a/hypervideo_dl/extractor/duboku.py b/hypervideo_dl/extractor/duboku.py index a875978..fb0546c 100644 --- a/hypervideo_dl/extractor/duboku.py +++ b/hypervideo_dl/extractor/duboku.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -54,31 +51,39 @@ def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, e class DubokuIE(InfoExtractor): IE_NAME = 'duboku' - IE_DESC = 'www.duboku.co' + IE_DESC = 'www.duboku.io' - _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*' + _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*' _TESTS = [{ - 'url': 'https://www.duboku.co/vodplay/1575-1-1.html', + 'url': 'https://w.duboku.io/vodplay/1575-1-1.html', 'info_dict': { 'id': '1575-1-1', - 'ext': 'ts', + 'ext': 'mp4', 'series': '白色月光', 'title': 'contains:白色月光', 'season_number': 1, 'episode_number': 1, + 'season': 'Season 1', + 'episode_id': '1', + 'season_id': '1', + 'episode': 'Episode 1', }, 'params': { 'skip_download': 'm3u8 download', }, }, { - 'url': 'https://www.duboku.co/vodplay/1588-1-1.html', + 'url': 'https://w.duboku.io/vodplay/1588-1-1.html', 'info_dict': { 'id': '1588-1-1', - 'ext': 'ts', + 'ext': 'mp4', 'series': '亲爱的自己', - 'title': 'contains:预告片', + 'title': 'contains:第1集', 'season_number': 1, 'episode_number': 1, + 'episode': 'Episode 1', + 'season': 'Season 1', + 'episode_id': '1', + 'season_id': '1', }, 'params': { 'skip_download': 'm3u8 download', @@ -94,7 +99,7 @@ class DubokuIE(InfoExtractor): season_id = temp[1] episode_id = temp[2] - webpage_url = 'https://www.duboku.co/vodplay/%s.html' % video_id + webpage_url = 
'https://w.duboku.io/vodplay/%s.html' % video_id webpage_html = self._download_webpage(webpage_url, video_id) # extract video url @@ -127,12 +132,13 @@ class DubokuIE(InfoExtractor): data_from = player_data.get('from') # if it is an embedded iframe, maybe it's an external source + headers = {'Referer': webpage_url} if data_from == 'iframe': # use _type url_transparent to retain the meaningful details # of the video. return { '_type': 'url_transparent', - 'url': smuggle_url(data_url, {'http_headers': {'Referer': webpage_url}}), + 'url': smuggle_url(data_url, {'http_headers': headers}), 'id': video_id, 'title': title, 'series': series_title, @@ -142,7 +148,7 @@ class DubokuIE(InfoExtractor): 'episode_id': episode_id, } - formats = self._extract_m3u8_formats(data_url, video_id, 'mp4') + formats = self._extract_m3u8_formats(data_url, video_id, 'mp4', headers=headers) return { 'id': video_id, @@ -153,36 +159,29 @@ class DubokuIE(InfoExtractor): 'episode_number': int_or_none(episode_id), 'episode_id': episode_id, 'formats': formats, - 'http_headers': {'Referer': 'https://www.duboku.co/static/player/videojs.html'} + 'http_headers': headers } class DubokuPlaylistIE(InfoExtractor): IE_NAME = 'duboku:list' - IE_DESC = 'www.duboku.co entire series' + IE_DESC = 'www.duboku.io entire series' - _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*' + _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*' _TESTS = [{ - 'url': 'https://www.duboku.co/voddetail/1575.html', + 'url': 'https://w.duboku.io/voddetail/1575.html', 'info_dict': { 'id': 'startswith:1575', 'title': '白色月光', }, 'playlist_count': 12, }, { - 'url': 'https://www.duboku.co/voddetail/1554.html', + 'url': 'https://w.duboku.io/voddetail/1554.html', 'info_dict': { 'id': 'startswith:1554', 'title': '以家人之名', }, 'playlist_mincount': 30, - }, { - 'url': 'https://www.duboku.co/voddetail/1554.html#playlist2', - 'info_dict': { - 'id': '1554#playlist2', - 'title': '以家人之名', - }, - 'playlist_mincount': 27, }] def _real_extract(self, url): @@ -192,7 +191,7 @@ class DubokuPlaylistIE(InfoExtractor): series_id = mobj.group('id') fragment = compat_urlparse.urlparse(url).fragment - webpage_url = 'https://www.duboku.co/voddetail/%s.html' % series_id + webpage_url = 'https://w.duboku.io/voddetail/%s.html' % series_id webpage_html = self._download_webpage(webpage_url, series_id) # extract title @@ -237,6 +236,6 @@ class DubokuPlaylistIE(InfoExtractor): # return url results return self.playlist_result([ self.url_result( - compat_urlparse.urljoin('https://www.duboku.co', x['href']), + compat_urlparse.urljoin('https://w.duboku.io', x['href']), ie=DubokuIE.ie_key(), video_title=x.get('title')) for x in playlist], series_id + '#' + playlist_id, title) diff --git a/hypervideo_dl/extractor/dumpert.py b/hypervideo_dl/extractor/dumpert.py index d9d9afd..010c2d0 100644 --- a/hypervideo_dl/extractor/dumpert.py +++ b/hypervideo_dl/extractor/dumpert.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -51,7 +48,6 @@ class DumpertIE(InfoExtractor): 'format_id': version, 'quality': quality(version), }) - self._sort_formats(formats) thumbnails = [] stills = item.get('stills') or {} diff --git a/hypervideo_dl/extractor/dvtv.py b/hypervideo_dl/extractor/dvtv.py index 08663cf..e671433 100644 --- a/hypervideo_dl/extractor/dvtv.py +++ b/hypervideo_dl/extractor/dvtv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import 
unicode_literals - import re from .common import InfoExtractor @@ -145,7 +142,6 @@ class DVTVIE(InfoExtractor): 'format_id': join_nonempty('http', ext, label), 'height': int_or_none(height), }) - self._sort_formats(formats) return { 'id': data.get('mediaid') or video_id, diff --git a/hypervideo_dl/extractor/dw.py b/hypervideo_dl/extractor/dw.py index 6eaee07..9c4a08e 100644 --- a/hypervideo_dl/extractor/dw.py +++ b/hypervideo_dl/extractor/dw.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -65,7 +62,6 @@ class DWIE(InfoExtractor): transform_source=lambda s: s.replace( 'rtmp://tv-od.dw.de/flash/', 'http://tv-download.dw.de/dwtv_video/flv/')) - self._sort_formats(formats) upload_date = hidden_inputs.get('display_date') if not upload_date: diff --git a/hypervideo_dl/extractor/eagleplatform.py b/hypervideo_dl/extractor/eagleplatform.py index f86731a..9ebd24d 100644 --- a/hypervideo_dl/extractor/eagleplatform.py +++ b/hypervideo_dl/extractor/eagleplatform.py @@ -1,6 +1,4 @@ -# coding: utf-8 -from __future__ import unicode_literals - +import functools import re from .common import InfoExtractor @@ -8,6 +6,7 @@ from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, + smuggle_url, unsmuggle_url, url_or_none, ) @@ -21,6 +20,7 @@ class EaglePlatformIE(InfoExtractor): ) (?P<id>\d+) ''' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1'] _TESTS = [{ # http://lenta.ru/news/2015/03/06/navalny/ 'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201', @@ -55,14 +55,14 @@ class EaglePlatformIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - # Regular iframe embedding - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1', - webpage) - if mobj is not None: - return mobj.group('url') + @classmethod + def _extract_embed_urls(cls, url, webpage): + add_referer = functools.partial(smuggle_url, data={'referrer': url}) + + res = tuple(super()._extract_embed_urls(url, webpage)) + if res: + return map(add_referer, res) + PLAYER_JS_RE = r''' <script[^>]+ src=(?P<qjs>["\'])(?:https?:)?//(?P<host>(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs) @@ -77,7 +77,7 @@ class EaglePlatformIE(InfoExtractor): data-id=["\'](?P<id>\d+) ''' % PLAYER_JS_RE, webpage) if mobj is not None: - return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() + return [add_referer('eagleplatform:%(host)s:%(id)s' % mobj.groupdict())] # Generalization of "Javascript code usage", "Combined usage" and # "Usage without attaching to DOM" embeddings (see # http://dultonmedia.github.io/eplayer/) @@ -98,7 +98,7 @@ class EaglePlatformIE(InfoExtractor): </script> ''' % PLAYER_JS_RE, webpage) if mobj is not None: - return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() + return [add_referer('eagleplatform:%(host)s:%(id)s' % mobj.groupdict())] @staticmethod def _handle_error(response): @@ -192,8 +192,6 @@ class EaglePlatformIE(InfoExtractor): f['url'] = format_url formats.append(f) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, @@ -204,3 +202,14 @@ class EaglePlatformIE(InfoExtractor): 'age_limit': age_limit, 'formats': formats, } + + +class ClipYouEmbedIE(InfoExtractor): + _VALID_URL = False + + @classmethod + def _extract_embed_urls(cls, url, 
webpage): + mobj = re.search( + r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) + if mobj is not None: + yield smuggle_url('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), {'referrer': url}) diff --git a/hypervideo_dl/extractor/ebaumsworld.py b/hypervideo_dl/extractor/ebaumsworld.py index c97682c..0854d03 100644 --- a/hypervideo_dl/extractor/ebaumsworld.py +++ b/hypervideo_dl/extractor/ebaumsworld.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/echomsk.py b/hypervideo_dl/extractor/echomsk.py index 6b7cc65..850eabb 100644 --- a/hypervideo_dl/extractor/echomsk.py +++ b/hypervideo_dl/extractor/echomsk.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/egghead.py b/hypervideo_dl/extractor/egghead.py index b6b8676..a4b2a12 100644 --- a/hypervideo_dl/extractor/egghead.py +++ b/hypervideo_dl/extractor/egghead.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -120,7 +117,6 @@ class EggheadLessonIE(EggheadBaseIE): formats.append({ 'url': format_url, }) - self._sort_formats(formats) return { 'id': lesson_id, diff --git a/hypervideo_dl/extractor/ehow.py b/hypervideo_dl/extractor/ehow.py index b1cd4f5..74469ce 100644 --- a/hypervideo_dl/extractor/ehow.py +++ b/hypervideo_dl/extractor/ehow.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote diff --git a/hypervideo_dl/extractor/eighttracks.py b/hypervideo_dl/extractor/eighttracks.py index 9a44f89..3dd9ab1 100644 --- a/hypervideo_dl/extractor/eighttracks.py +++ b/hypervideo_dl/extractor/eighttracks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import random diff --git a/hypervideo_dl/extractor/einthusan.py b/hypervideo_dl/extractor/einthusan.py index 7af279a..53bc253 100644 --- a/hypervideo_dl/extractor/einthusan.py +++ b/hypervideo_dl/extractor/einthusan.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -92,8 +89,6 @@ class EinthusanIE(InfoExtractor): 'url': mp4_url, }) - self._sort_formats(formats) - description = get_elements_by_class('synopsis', webpage)[0] thumbnail = self._html_search_regex( r'''<img[^>]+src=(["'])(?P<url>(?!\1).+?/moviecovers/(?!\1).+?)\1''', diff --git a/hypervideo_dl/extractor/eitb.py b/hypervideo_dl/extractor/eitb.py index ee5ead1..bd027da 100644 --- a/hypervideo_dl/extractor/eitb.py +++ b/hypervideo_dl/extractor/eitb.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( float_or_none, @@ -74,8 +71,6 @@ class EitbIE(InfoExtractor): '%s?hdcore=3.7.0' % hds_url.replace('euskalsvod', 'euskalvod'), video_id, f4m_id='hds', fatal=False)) - self._sort_formats(formats) - return { 'id': video_id, 'title': media.get('NAME_ES') or media.get('name') or media['NAME_EU'], diff --git a/hypervideo_dl/extractor/ellentube.py b/hypervideo_dl/extractor/ellentube.py index d451bc0..6eb00f9 100644 --- a/hypervideo_dl/extractor/ellentube.py +++ b/hypervideo_dl/extractor/ellentube.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import 
InfoExtractor from ..utils import ( clean_html, @@ -31,7 +28,6 @@ class EllenTubeBaseIE(InfoExtractor): entry_protocol='m3u8_native', m3u8_id='hls') duration = int_or_none(entry.get('duration')) break - self._sort_formats(formats) def get_insight(kind): return int_or_none(try_get( diff --git a/hypervideo_dl/extractor/elonet.py b/hypervideo_dl/extractor/elonet.py index 9c6aea2..c5558ff 100644 --- a/hypervideo_dl/extractor/elonet.py +++ b/hypervideo_dl/extractor/elonet.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import determine_ext @@ -56,7 +53,6 @@ class ElonetIE(InfoExtractor): else: formats, subtitles = [], {} self.raise_no_formats(f'Unknown streaming format {ext}') - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/elpais.py b/hypervideo_dl/extractor/elpais.py index b89f6db..7c6c880 100644 --- a/hypervideo_dl/extractor/elpais.py +++ b/hypervideo_dl/extractor/elpais.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import strip_jsonp, unified_strdate diff --git a/hypervideo_dl/extractor/embedly.py b/hypervideo_dl/extractor/embedly.py index a5820b2..483d018 100644 --- a/hypervideo_dl/extractor/embedly.py +++ b/hypervideo_dl/extractor/embedly.py @@ -1,6 +1,5 @@ -# coding: utf-8 -from __future__ import unicode_literals - +import re +import urllib.parse from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote @@ -12,5 +11,14 @@ class EmbedlyIE(InfoExtractor): 'only_matching': True, }] + @classmethod + def _extract_embed_urls(cls, url, webpage): + # Bypass suitable check + for mobj in re.finditer(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage): + yield mobj.group('url') + + for mobj in re.finditer(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage): + yield urllib.parse.unquote(mobj.group('url')) + def _real_extract(self, url): return self.url_result(compat_urllib_parse_unquote(self._match_id(url))) diff --git a/hypervideo_dl/extractor/engadget.py b/hypervideo_dl/extractor/engadget.py index 733bf32..e7c5d7b 100644 --- a/hypervideo_dl/extractor/engadget.py +++ b/hypervideo_dl/extractor/engadget.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/epicon.py b/hypervideo_dl/extractor/epicon.py index cd19325..3bfcc54 100644 --- a/hypervideo_dl/extractor/epicon.py +++ b/hypervideo_dl/extractor/epicon.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -62,7 +59,6 @@ class EpiconIE(InfoExtractor): description = self._og_search_description(webpage) or None thumbnail = self._og_search_thumbnail(webpage) or None formats = self._extract_m3u8_formats(data_json['url']['video_url'], id) - self._sort_formats(formats) subtitles = {} for subtitle in data_json.get('subtitles', []): diff --git a/hypervideo_dl/extractor/epoch.py b/hypervideo_dl/extractor/epoch.py new file mode 100644 index 0000000..110e78c --- /dev/null +++ b/hypervideo_dl/extractor/epoch.py @@ -0,0 +1,55 @@ +from .common import InfoExtractor +from ..utils import extract_attributes, get_element_html_by_id + + +class EpochIE(InfoExtractor): + _VALID_URL = r'https?://www.theepochtimes\.com/[\w-]+_(?P<id>\d+).html' + _TESTS = [ + { + 'url': 
'https://www.theepochtimes.com/they-can-do-audio-video-physical-surveillance-on-you-24h-365d-a-year-rex-lee-on-intrusive-apps_4661688.html', + 'info_dict': { + 'id': 'a3dd732c-4750-4bc8-8156-69180668bda1', + 'ext': 'mp4', + 'title': '‘They Can Do Audio, Video, Physical Surveillance on You 24H/365D a Year’: Rex Lee on Intrusive Apps', + } + }, + { + 'url': 'https://www.theepochtimes.com/the-communist-partys-cyberattacks-on-america-explained-rex-lee-talks-tech-hybrid-warfare_4342413.html', + 'info_dict': { + 'id': '276c7f46-3bbf-475d-9934-b9bbe827cf0a', + 'ext': 'mp4', + 'title': 'The Communist Party’s Cyberattacks on America Explained; Rex Lee Talks Tech Hybrid Warfare', + } + }, + { + 'url': 'https://www.theepochtimes.com/kash-patel-a-6-year-saga-of-government-corruption-from-russiagate-to-mar-a-lago_4690250.html', + 'info_dict': { + 'id': 'aa9ceecd-a127-453d-a2de-7153d6fd69b6', + 'ext': 'mp4', + 'title': 'Kash Patel: A ‘6-Year-Saga’ of Government Corruption, From Russiagate to Mar-a-Lago', + } + }, + { + 'url': 'https://www.theepochtimes.com/dick-morris-discusses-his-book-the-return-trumps-big-2024-comeback_4819205.html', + 'info_dict': { + 'id': '9489f994-2a20-4812-b233-ac0e5c345632', + 'ext': 'mp4', + 'title': 'Dick Morris Discusses His Book ‘The Return: Trump’s Big 2024 Comeback’', + } + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + youmaker_video_id = extract_attributes(get_element_html_by_id('videobox', webpage))['data-id'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'http://vs1.youmaker.com/assets/{youmaker_video_id}/playlist.m3u8', video_id, 'mp4', m3u8_id='hls') + + return { + 'id': youmaker_video_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': self._html_extract_title(webpage) + } diff --git a/hypervideo_dl/extractor/eporner.py b/hypervideo_dl/extractor/eporner.py index 25a0d97..a233797 100644 --- a/hypervideo_dl/extractor/eporner.py +++ b/hypervideo_dl/extractor/eporner.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( encode_base_n, @@ -110,7 +106,6 @@ class EpornerIE(InfoExtractor): 'height': height, 'fps': fps, }) - self._sort_formats(formats) json_ld = self._search_json_ld(webpage, display_id, default={}) diff --git a/hypervideo_dl/extractor/eroprofile.py b/hypervideo_dl/extractor/eroprofile.py index 5d5e7f2..2b61f3b 100644 --- a/hypervideo_dl/extractor/eroprofile.py +++ b/hypervideo_dl/extractor/eroprofile.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/ertgr.py b/hypervideo_dl/extractor/ertgr.py index 19ce23f..9ecdf5d 100644 --- a/hypervideo_dl/extractor/ertgr.py +++ b/hypervideo_dl/extractor/ertgr.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re @@ -18,7 +15,6 @@ from ..utils import ( parse_iso8601, str_or_none, try_get, - unescapeHTML, url_or_none, variadic, ) @@ -77,7 +73,7 @@ class ERTFlixCodenameIE(ERTFlixBaseIE): }, ] - def _extract_formats_and_subs(self, video_id, allow_none=True): + def _extract_formats_and_subs(self, video_id): media_info = self._call_api(video_id, codename=video_id) formats, subs = [], {} for media_file in try_get(media_info, lambda x: x['MediaFiles'], list) or []: @@ -101,8 +97,6 @@ class ERTFlixCodenameIE(ERTFlixBaseIE): formats.extend(formats_) self._merge_subtitles(subs_, target=subs) - if 
formats or not allow_none: - self._sort_formats(formats) return formats, subs def _real_extract(self, url): @@ -122,7 +116,7 @@ class ERTFlixCodenameIE(ERTFlixBaseIE): class ERTFlixIE(ERTFlixBaseIE): IE_NAME = 'ertflix' IE_DESC = 'ERTFLIX videos' - _VALID_URL = r'https?://www\.ertflix\.gr/(?:series|vod)/(?P<id>[a-z]{3}\.\d+)' + _VALID_URL = r'https?://www\.ertflix\.gr/(?:[^/]+/)?(?:series|vod)/(?P<id>[a-z]{3}\.\d+)' _TESTS = [{ 'url': 'https://www.ertflix.gr/vod/vod.173258-aoratoi-ergates', 'md5': '6479d5e60fd7e520b07ba5411dcdd6e7', @@ -174,6 +168,9 @@ class ERTFlixIE(ERTFlixBaseIE): 'title': 'Το δίκτυο', }, 'playlist_mincount': 9, + }, { + 'url': 'https://www.ertflix.gr/en/vod/vod.127652-ta-kalytera-mas-chronia-ep1-mia-volta-sto-feggari', + 'only_matching': True, }] def _extract_episode(self, episode): @@ -275,6 +272,7 @@ class ERTWebtvEmbedIE(InfoExtractor): IE_DESC = 'ert.gr webtv embedded videos' _BASE_PLAYER_URL_RE = re.escape('//www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php') _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?f=(?P<id>[^#&]+)' + _EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>(?:https?:)?{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)'] _TESTS = [{ 'url': 'https://www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php?f=trailers/E2251_TO_DIKTYO_E09_16-01_1900.mp4&bgimg=/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg', @@ -287,23 +285,11 @@ class ERTWebtvEmbedIE(InfoExtractor): }, }] - @classmethod - def _extract_urls(cls, webpage): - EMBED_URL_RE = rf'(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+' - EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{EMBED_URL_RE})(?P=_q1)' - - for mobj in re.finditer(EMBED_RE, webpage): - url = unescapeHTML(mobj.group('url')) - if not cls.suitable(url): - continue - yield url - def _real_extract(self, url): video_id = self._match_id(url) formats, subs = self._extract_m3u8_formats_and_subtitles( f'https://mediastream.ert.gr/vodedge/_definst_/mp4:dvrorigin/{video_id}/playlist.m3u8', video_id, 'mp4') - self._sort_formats(formats) thumbnail_id = parse_qs(url).get('bgimg', [None])[0] if thumbnail_id and not thumbnail_id.startswith('http'): thumbnail_id = f'https://program.ert.gr{thumbnail_id}' diff --git a/hypervideo_dl/extractor/escapist.py b/hypervideo_dl/extractor/escapist.py index 4cd815e..85a1cbf 100644 --- a/hypervideo_dl/extractor/escapist.py +++ b/hypervideo_dl/extractor/escapist.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -97,7 +95,6 @@ class EscapistIE(InfoExtractor): 'format_id': '%s-%sp' % (determine_ext(video['src']), video['res']), 'height': int_or_none(video.get('res')), } for video in data['files']['videos']] - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/espn.py b/hypervideo_dl/extractor/espn.py index dc50f3b..f4b0134 100644 --- a/hypervideo_dl/extractor/espn.py +++ b/hypervideo_dl/extractor/espn.py @@ -1,14 +1,16 @@ -from __future__ import unicode_literals - +import base64 +import json import re +import urllib.parse +from .adobepass import AdobePassIE from .common import InfoExtractor from .once import OnceIE -from ..compat import compat_str from ..utils import ( determine_ext, dict_get, int_or_none, + traverse_obj, unified_strdate, unified_timestamp, ) @@ -26,7 +28,6 @@ class ESPNIE(OnceIE): (?: (?: video/(?:clip|iframe/twitter)| - watch/player ) (?: .*?\?.*?\bid=| @@ -49,6 +50,8 @@ class ESPNIE(OnceIE): 'description': 
'md5:39370c2e016cb4ecf498ffe75bef7f0f', 'timestamp': 1390936111, 'upload_date': '20140128', + 'duration': 1302, + 'thumbnail': r're:https://.+\.jpg', }, 'params': { 'skip_download': True, @@ -73,15 +76,6 @@ class ESPNIE(OnceIE): }, { 'url': 'https://cdn.espn.go.com/video/clip/_/id/19771774', 'only_matching': True, - }, { - 'url': 'http://www.espn.com/watch/player?id=19141491', - 'only_matching': True, - }, { - 'url': 'http://www.espn.com/watch/player?bucketId=257&id=19505875', - 'only_matching': True, - }, { - 'url': 'http://www.espn.com/watch/player/_/id/19141491', - 'only_matching': True, }, { 'url': 'http://www.espn.com/video/clip?id=10365079', 'only_matching': True, @@ -100,7 +94,13 @@ class ESPNIE(OnceIE): }, { 'url': 'http://www.espn.com/espnw/video/26066627/arkansas-gibson-completes-hr-cycle-four-innings', 'only_matching': True, - }] + }, { + 'url': 'http://www.espn.com/watch/player?id=19141491', + 'only_matching': True, + }, { + 'url': 'http://www.espn.com/watch/player?bucketId=257&id=19505875', + 'only_matching': True, + }, ] def _real_extract(self, url): video_id = self._match_id(url) @@ -118,7 +118,7 @@ class ESPNIE(OnceIE): for source_id, source in source.items(): if source_id == 'alert': continue - elif isinstance(source, compat_str): + elif isinstance(source, str): extract_source(source, base_source_id) elif isinstance(source, dict): traverse_source( @@ -162,7 +162,6 @@ class ESPNIE(OnceIE): links = clip.get('links', {}) traverse_source(links.get('source', {})) traverse_source(links.get('mobile', {})) - self._sort_formats(formats) description = clip.get('caption') or clip.get('description') thumbnail = clip.get('thumbnail') @@ -198,7 +197,7 @@ class ESPNArticleIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if ESPNIE.suitable(url) else super(ESPNArticleIE, cls).suitable(url) + return False if (ESPNIE.suitable(url) or WatchESPNIE.suitable(url)) else super().suitable(url) def _real_extract(self, url): video_id = self._match_id(url) @@ -269,7 +268,6 @@ class ESPNCricInfoIE(InfoExtractor): 'url': item['url'], 'vcodec': 'none', }) - self._sort_formats(formats) return { 'id': id, 'title': data_json.get('title'), @@ -279,3 +277,134 @@ class ESPNCricInfoIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } + + +class WatchESPNIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?espn\.com/(?:watch|espnplus)/player/_/id/(?P<id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})' + _TESTS = [{ + 'url': 'https://www.espn.com/watch/player/_/id/dbbc6b1d-c084-4b47-9878-5f13c56ce309', + 'info_dict': { + 'id': 'dbbc6b1d-c084-4b47-9878-5f13c56ce309', + 'ext': 'mp4', + 'title': 'Huddersfield vs. Burnley', + 'duration': 7500, + 'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/dbbc6b1d-c084-4b47-9878-5f13c56ce309/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.espn.com/watch/player/_/id/a049a56e-a7ce-477e-aef3-c7e48ef8221c', + 'info_dict': { + 'id': 'a049a56e-a7ce-477e-aef3-c7e48ef8221c', + 'ext': 'mp4', + 'title': 'Dynamo Dresden vs. 
VfB Stuttgart (Round #1) (German Cup)', + 'duration': 8335, + 'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/bd1f3d12-0654-47d9-852e-71b85ea695c7/16x9.jpg?timestamp=202201112217&showBadge=true&cb=12&package=ESPN_PLUS', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.espn.com/espnplus/player/_/id/317f5fd1-c78a-4ebe-824a-129e0d348421', + 'info_dict': { + 'id': '317f5fd1-c78a-4ebe-824a-129e0d348421', + 'ext': 'mp4', + 'title': 'The Wheel - Episode 10', + 'duration': 3352, + 'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/317f5fd1-c78a-4ebe-824a-129e0d348421/16x9.jpg?timestamp=202205031523&showBadge=true&cb=12&package=ESPN_PLUS', + }, + 'params': { + 'skip_download': True, + }, + }] + + _API_KEY = 'ZXNwbiZicm93c2VyJjEuMC4w.ptUt7QxsteaRruuPmGZFaJByOoqKvDP2a5YkInHrc7c' + + def _call_bamgrid_api(self, path, video_id, payload=None, headers={}): + if 'Authorization' not in headers: + headers['Authorization'] = f'Bearer {self._API_KEY}' + parse = urllib.parse.urlencode if path == 'token' else json.dumps + return self._download_json( + f'https://espn.api.edge.bamgrid.com/{path}', video_id, headers=headers, data=parse(payload).encode()) + + def _real_extract(self, url): + video_id = self._match_id(url) + cdn_data = self._download_json( + f'https://watch-cdn.product.api.espn.com/api/product/v3/watchespn/web/playback/event?id={video_id}', + video_id) + video_data = cdn_data['playbackState'] + + # ESPN+ subscription required, through cookies + if 'DTC' in video_data.get('sourceId'): + cookie = self._get_cookies(url).get('ESPN-ONESITE.WEB-PROD.token') + if not cookie: + self.raise_login_required(method='cookies') + + assertion = self._call_bamgrid_api( + 'devices', video_id, + headers={'Content-Type': 'application/json; charset=UTF-8'}, + payload={ + 'deviceFamily': 'android', + 'applicationRuntime': 'android', + 'deviceProfile': 'tv', + 'attributes': {}, + })['assertion'] + token = self._call_bamgrid_api( + 'token', video_id, payload={ + 'subject_token': assertion, + 'subject_token_type': 'urn:bamtech:params:oauth:token-type:device', + 'platform': 'android', + 'grant_type': 'urn:ietf:params:oauth:grant-type:token-exchange' + })['access_token'] + + assertion = self._call_bamgrid_api( + 'accounts/grant', video_id, payload={'id_token': cookie.value.split('|')[1]}, + headers={ + 'Authorization': token, + 'Content-Type': 'application/json; charset=UTF-8' + })['assertion'] + token = self._call_bamgrid_api( + 'token', video_id, payload={ + 'subject_token': assertion, + 'subject_token_type': 'urn:bamtech:params:oauth:token-type:account', + 'platform': 'android', + 'grant_type': 'urn:ietf:params:oauth:grant-type:token-exchange' + })['access_token'] + + playback = self._download_json( + video_data['videoHref'].format(scenario='browser~ssai'), video_id, + headers={ + 'Accept': 'application/vnd.media-service+json; version=5', + 'Authorization': token + }) + m3u8_url, headers = playback['stream']['complete'][0]['url'], {'authorization': token} + + # No login required + elif video_data.get('sourceId') == 'ESPN_FREE': + asset = self._download_json( + f'https://watch.auth.api.espn.com/video/auth/media/{video_id}/asset?apikey=uiqlbgzdwuru14v627vdusswb', + video_id) + m3u8_url, headers = asset['stream'], {} + + # TV Provider required + else: + resource = self._get_mvpd_resource('ESPN', video_data['name'], video_id, None) + auth = self._extract_mvpd_auth(url, video_id, 'ESPN', resource).encode() + + asset = self._download_json( + 
f'https://watch.auth.api.espn.com/video/auth/media/{video_id}/asset?apikey=uiqlbgzdwuru14v627vdusswb', + video_id, data=f'adobeToken={urllib.parse.quote_plus(base64.b64encode(auth))}&drmSupport=HLS'.encode()) + m3u8_url, headers = asset['stream'], {} + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'duration': traverse_obj(cdn_data, ('tracking', 'duration')), + 'title': video_data.get('name'), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': video_data.get('posterHref'), + 'http_headers': headers, + } diff --git a/hypervideo_dl/extractor/esri.py b/hypervideo_dl/extractor/esri.py index e9dcaeb..02e7efa 100644 --- a/hypervideo_dl/extractor/esri.py +++ b/hypervideo_dl/extractor/esri.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -46,7 +43,6 @@ class EsriVideoIE(InfoExtractor): 'height': int(height), 'filesize_approx': parse_filesize(filesize), }) - self._sort_formats(formats) title = self._html_search_meta('title', webpage, 'title') description = self._html_search_meta( diff --git a/hypervideo_dl/extractor/europa.py b/hypervideo_dl/extractor/europa.py index 60ab2ce..c2b4937 100644 --- a/hypervideo_dl/extractor/europa.py +++ b/hypervideo_dl/extractor/europa.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -79,7 +76,6 @@ class EuropaIE(InfoExtractor): 'format_note': xpath_text(file_, './lglabel'), 'language_preference': language_preference(lang) }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/europeantour.py b/hypervideo_dl/extractor/europeantour.py index e28f067..1995a74 100644 --- a/hypervideo_dl/extractor/europeantour.py +++ b/hypervideo_dl/extractor/europeantour.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/eurosport.py b/hypervideo_dl/extractor/eurosport.py new file mode 100644 index 0000000..654e112 --- /dev/null +++ b/hypervideo_dl/extractor/eurosport.py @@ -0,0 +1,97 @@ +from .common import InfoExtractor +from ..utils import traverse_obj + + +class EurosportIE(InfoExtractor): + _VALID_URL = r'https?://www\.eurosport\.com/\w+/[\w-]+/\d+/[\w-]+_(?P<id>vid\d+)' + _TESTS = [{ + 'url': 'https://www.eurosport.com/tennis/roland-garros/2022/highlights-rafael-nadal-brushes-aside-caper-ruud-to-win-record-extending-14th-french-open-title_vid1694147/video.shtml', + 'info_dict': { + 'id': '2480939', + 'ext': 'mp4', + 'title': 'Highlights: Rafael Nadal brushes aside Caper Ruud to win record-extending 14th French Open title', + 'description': 'md5:b564db73ecfe4b14ebbd8e62a3692c76', + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2022/06/05/3388285-69245968-2560-1440.png', + 'duration': 195.0, + 'display_id': 'vid1694147', + 'timestamp': 1654446698, + 'upload_date': '20220605', + } + }, { + 'url': 'https://www.eurosport.com/tennis/roland-garros/2022/watch-the-top-five-shots-from-men-s-final-as-rafael-nadal-beats-casper-ruud-to-seal-14th-french-open_vid1694283/video.shtml', + 'info_dict': { + 'id': '2481254', + 'ext': 'mp4', + 'title': 'md5:149dcc5dfb38ab7352acc008cc9fb071', + 'duration': 130.0, + 'thumbnail': 
'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2022/06/05/3388422-69248708-2560-1440.png', + 'description': 'md5:a0c8a7f6b285e48ae8ddbe7aa85cfee6', + 'display_id': 'vid1694283', + 'timestamp': 1654456090, + 'upload_date': '20220605', + } + }, { + # geo-fenced, but can be bypassed by xff + 'url': 'https://www.eurosport.com/cycling/tour-de-france-femmes/2022/incredible-ride-marlen-reusser-storms-to-stage-4-win-at-tour-de-france-femmes_vid1722221/video.shtml', + 'info_dict': { + 'id': '2582552', + 'ext': 'mp4', + 'title': '‘Incredible ride!’ - Marlen Reusser storms to Stage 4 win at Tour de France Femmes', + 'duration': 188.0, + 'display_id': 'vid1722221', + 'timestamp': 1658936167, + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2022/07/27/3423347-69852108-2560-1440.jpg', + 'description': 'md5:32bbe3a773ac132c57fb1e8cca4b7c71', + 'upload_date': '20220727', + } + }] + + _TOKEN = None + + # actually defined in https://netsport.eurosport.io/?variables={"databaseId":<databaseId>,"playoutType":"VDP"}&extensions={"persistedQuery":{"version":1 .. + # but this method requires getting the sha256 hash + _GEO_COUNTRIES = ['DE', 'NL', 'EU', 'IT', 'FR'] # Not a complete list, but it should work + + def _real_initialize(self): + if EurosportIE._TOKEN is None: + EurosportIE._TOKEN = self._download_json( + 'https://eu3-prod-direct.eurosport.com/token?realm=eurosport', None, + 'Trying to get token')['data']['attributes']['token'] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + json_data = self._download_json( + f'https://eu3-prod-direct.eurosport.com/playback/v2/videoPlaybackInfo/sourceSystemId/eurosport-{display_id}', + display_id, query={'usePreAuth': True}, headers={'Authorization': f'Bearer {EurosportIE._TOKEN}'})['data'] + + json_ld_data = self._search_json_ld(webpage, display_id) + + formats, subtitles = [], {} + for stream_type in json_data['attributes']['streaming']: + if stream_type == 'hls': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id, ext='mp4') + elif stream_type == 'dash': + fmts, subs = self._extract_mpd_formats_and_subtitles( + traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id) + elif stream_type == 'mss': + fmts, subs = self._extract_ism_formats_and_subtitles( + traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id) + + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': json_data['id'], + 'title': json_ld_data.get('title') or self._og_search_title(webpage), + 'display_id': display_id, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': json_ld_data.get('thumbnails'), + 'description': (json_ld_data.get('description') + or self._html_search_meta(['og:description', 'description'], webpage)), + 'duration': json_ld_data.get('duration'), + 'timestamp': json_ld_data.get('timestamp'), + } diff --git a/hypervideo_dl/extractor/euscreen.py b/hypervideo_dl/extractor/euscreen.py index 2759e74..65a1dc7 100644 --- a/hypervideo_dl/extractor/euscreen.py +++ b/hypervideo_dl/extractor/euscreen.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( @@ -48,7 +45,6 @@ class EUScreenIE(InfoExtractor): formats = [{ 'url': 
source['src'], } for source in video_json.get('sources', [])] - self._sort_formats(formats) return { 'id': id, diff --git a/hypervideo_dl/extractor/everyonesmixtape.py b/hypervideo_dl/extractor/everyonesmixtape.py deleted file mode 100644 index 80cb032..0000000 --- a/hypervideo_dl/extractor/everyonesmixtape.py +++ /dev/null @@ -1,76 +0,0 @@ -from __future__ import unicode_literals - - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - sanitized_Request, -) - - -class EveryonesMixtapeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?everyonesmixtape\.com/#/mix/(?P<id>[0-9a-zA-Z]+)(?:/(?P<songnr>[0-9]))?$' - - _TESTS = [{ - 'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi/5', - 'info_dict': { - 'id': '5bfseWNmlds', - 'ext': 'mp4', - 'title': "Passion Pit - \"Sleepyhead\" (Official Music Video)", - 'uploader': 'FKR.TV', - 'uploader_id': 'frenchkissrecords', - 'description': "Music video for \"Sleepyhead\" from Passion Pit's debut EP Chunk Of Change.\nBuy on iTunes: https://itunes.apple.com/us/album/chunk-of-change-ep/id300087641\n\nDirected by The Wilderness.\n\nhttp://www.passionpitmusic.com\nhttp://www.frenchkissrecords.com", - 'upload_date': '20081015' - }, - 'params': { - 'skip_download': True, # This is simply YouTube - } - }, { - 'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi', - 'info_dict': { - 'id': 'm7m0jJAbMQi', - 'title': 'Driving', - }, - 'playlist_count': 24 - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - playlist_id = mobj.group('id') - - pllist_url = 'http://everyonesmixtape.com/mixtape.php?a=getMixes&u=-1&linked=%s&explore=' % playlist_id - pllist_req = sanitized_Request(pllist_url) - pllist_req.add_header('X-Requested-With', 'XMLHttpRequest') - - playlist_list = self._download_json( - pllist_req, playlist_id, note='Downloading playlist metadata') - try: - playlist_no = next(playlist['id'] - for playlist in playlist_list - if playlist['code'] == playlist_id) - except StopIteration: - raise ExtractorError('Playlist id not found') - - pl_url = 'http://everyonesmixtape.com/mixtape.php?a=getMix&id=%s&userId=null&code=' % playlist_no - pl_req = sanitized_Request(pl_url) - pl_req.add_header('X-Requested-With', 'XMLHttpRequest') - playlist = self._download_json( - pl_req, playlist_id, note='Downloading playlist info') - - entries = [{ - '_type': 'url', - 'url': t['url'], - 'title': t['title'], - } for t in playlist['tracks']] - - if mobj.group('songnr'): - songnr = int(mobj.group('songnr')) - 1 - return entries[songnr] - - playlist_title = playlist['mixData']['name'] - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': playlist_title, - 'entries': entries, - } diff --git a/hypervideo_dl/extractor/expotv.py b/hypervideo_dl/extractor/expotv.py index 95a8977..bda6e3c 100644 --- a/hypervideo_dl/extractor/expotv.py +++ b/hypervideo_dl/extractor/expotv.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -51,7 +49,6 @@ class ExpoTVIE(InfoExtractor): r'filename=.*\.([a-z0-9_A-Z]+)&', media_url, 'file extension', default=None) or fcfg.get('type'), }) - self._sort_formats(formats) title = self._og_search_title(webpage) description = self._og_search_description(webpage) diff --git a/hypervideo_dl/extractor/expressen.py b/hypervideo_dl/extractor/expressen.py index dc8b855..86967b6 100644 --- a/hypervideo_dl/extractor/expressen.py +++ b/hypervideo_dl/extractor/expressen.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import 
unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -20,11 +15,13 @@ class ExpressenIE(InfoExtractor): tv/(?:[^/]+/)* (?P<id>[^/?#&]+) ''' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1'] _TESTS = [{ 'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/', - 'md5': '2fbbe3ca14392a6b1b36941858d33a45', + 'md5': 'deb2ca62e7b1dcd19fa18ba37523f66e', 'info_dict': { - 'id': '8690962', + 'id': 'ba90f5a9-78d1-4511-aa02-c177b9c99136', + 'display_id': 'ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden', 'ext': 'mp4', 'title': 'Ledarsnack: Om arbetslösheten bland kvinnor i speciellt utsatta områden', 'description': 'md5:f38c81ff69f3de4d269bbda012fcbbba', @@ -47,13 +44,6 @@ class ExpressenIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') for mobj in re.finditer( - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1', - webpage)] - def _real_extract(self, url): display_id = self._match_id(url) @@ -67,7 +57,7 @@ class ExpressenIE(InfoExtractor): display_id, transform_source=unescapeHTML) info = extract_data('video-tracking-info') - video_id = info['videoId'] + video_id = info['contentId'] data = extract_data('article-data') stream = data['stream'] @@ -80,7 +70,6 @@ class ExpressenIE(InfoExtractor): formats = [{ 'url': stream, }] - self._sort_formats(formats) title = info.get('titleRaw') or data['title'] description = info.get('descriptionRaw') diff --git a/hypervideo_dl/extractor/extractors.py b/hypervideo_dl/extractor/extractors.py index 457f4c2..610e02f 100644 --- a/hypervideo_dl/extractor/extractors.py +++ b/hypervideo_dl/extractor/extractors.py @@ -1,2144 +1,26 @@ -# flake8: noqa -from __future__ import unicode_literals +import contextlib +import os -from .abc import ( - ABCIE, - ABCIViewIE, - ABCIViewShowSeriesIE, -) -from .abcnews import ( - AbcNewsIE, - AbcNewsVideoIE, -) -from .abcotvs import ( - ABCOTVSIE, - ABCOTVSClipsIE, -) -from .abematv import ( - AbemaTVIE, - AbemaTVTitleIE, -) -from .academicearth import AcademicEarthCourseIE -from .acast import ( - ACastIE, - ACastChannelIE, -) -from .adn import ADNIE -from .adobeconnect import AdobeConnectIE -from .adobetv import ( - AdobeTVEmbedIE, - AdobeTVIE, - AdobeTVShowIE, - AdobeTVChannelIE, - AdobeTVVideoIE, -) -from .adultswim import AdultSwimIE -from .aenetworks import ( - AENetworksIE, - AENetworksCollectionIE, - AENetworksShowIE, - HistoryTopicIE, - HistoryPlayerIE, - BiographyIE, -) -from .afreecatv import ( - AfreecaTVIE, - AfreecaTVLiveIE, -) -from .airmozilla import AirMozillaIE -from .aljazeera import AlJazeeraIE -from .alphaporno import AlphaPornoIE -from .amara import AmaraIE -from .alura import ( - AluraIE, - AluraCourseIE -) -from .amcnetworks import AMCNetworksIE -from .animelab import ( - AnimeLabIE, - AnimeLabShowsIE, -) -from .amazon import AmazonStoreIE -from .americastestkitchen import ( - AmericasTestKitchenIE, - AmericasTestKitchenSeasonIE, -) -from .animeondemand import AnimeOnDemandIE -from .anvato import AnvatoIE -from .aol import AolIE -from .allocine import AllocineIE -from .aliexpress import AliExpressLiveIE -from .alsace20tv import ( - Alsace20TVIE, - Alsace20TVEmbedIE, -) -from .apa import APAIE -from .aparat import AparatIE -from 
.appleconnect import AppleConnectIE -from .appletrailers import ( - AppleTrailersIE, - AppleTrailersSectionIE, -) -from .applepodcasts import ApplePodcastsIE -from .archiveorg import ( - ArchiveOrgIE, - YoutubeWebArchiveIE, -) -from .arcpublishing import ArcPublishingIE -from .arkena import ArkenaIE -from .ard import ( - ARDBetaMediathekIE, - ARDIE, - ARDMediathekIE, -) -from .arte import ( - ArteTVIE, - ArteTVEmbedIE, - ArteTVPlaylistIE, - ArteTVCategoryIE, -) -from .arnes import ArnesIE -from .asiancrush import ( - AsianCrushIE, - AsianCrushPlaylistIE, -) -from .atresplayer import AtresPlayerIE -from .atttechchannel import ATTTechChannelIE -from .atvat import ATVAtIE -from .audimedia import AudiMediaIE -from .audioboom import AudioBoomIE -from .audiomack import AudiomackIE, AudiomackAlbumIE -from .audius import ( - AudiusIE, - AudiusTrackIE, - AudiusPlaylistIE, - AudiusProfileIE, -) -from .awaan import ( - AWAANIE, - AWAANVideoIE, - AWAANLiveIE, - AWAANSeasonIE, -) -from .azmedien import AZMedienIE -from .baidu import BaiduVideoIE -from .banbye import ( - BanByeIE, - BanByeChannelIE, -) -from .bandaichannel import BandaiChannelIE -from .bandcamp import ( - BandcampIE, - BandcampAlbumIE, - BandcampWeeklyIE, - BandcampUserIE, -) -from .bannedvideo import BannedVideoIE -from .bbc import ( - BBCCoUkIE, - BBCCoUkArticleIE, - BBCCoUkIPlayerEpisodesIE, - BBCCoUkIPlayerGroupIE, - BBCCoUkPlaylistIE, - BBCIE, -) -from .beeg import BeegIE -from .behindkink import BehindKinkIE -from .bellmedia import BellMediaIE -from .beatport import BeatportIE -from .bet import BetIE -from .bfi import BFIPlayerIE -from .bfmtv import ( - BFMTVIE, - BFMTVLiveIE, - BFMTVArticleIE, -) -from .bibeltv import BibelTVIE -from .bigflix import BigflixIE -from .bigo import BigoIE -from .bild import BildIE -from .bilibili import ( - BiliBiliIE, - BiliBiliSearchIE, - BilibiliCategoryIE, - BiliBiliBangumiIE, - BilibiliAudioIE, - BilibiliAudioAlbumIE, - BiliBiliPlayerIE, - BilibiliChannelIE, - BiliIntlIE, - BiliIntlSeriesIE, -) -from .biobiochiletv import BioBioChileTVIE -from .bitchute import ( - BitChuteIE, - BitChuteChannelIE, -) -from .bitwave import ( - BitwaveReplayIE, - BitwaveStreamIE, -) -from .biqle import BIQLEIE -from .blackboardcollaborate import BlackboardCollaborateIE -from .bleacherreport import ( - BleacherReportIE, - BleacherReportCMSIE, -) -from .blogger import BloggerIE -from .bloomberg import BloombergIE -from .bokecc import BokeCCIE -from .bongacams import BongaCamsIE -from .bostonglobe import BostonGlobeIE -from .box import BoxIE -from .bpb import BpbIE -from .br import ( - BRIE, - BRMediathekIE, -) -from .bravotv import BravoTVIE -from .breakcom import BreakIE -from .breitbart import BreitBartIE -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) -from .businessinsider import BusinessInsiderIE -from .buzzfeed import BuzzFeedIE -from .byutv import BYUtvIE -from .c56 import C56IE -from .cableav import CableAVIE -from .callin import CallinIE -from .caltrans import CaltransIE -from .cam4 import CAM4IE -from .camdemy import ( - CamdemyIE, - CamdemyFolderIE -) -from .cammodels import CamModelsIE -from .camwithher import CamWithHerIE -from .canalalpha import CanalAlphaIE -from .canalplus import CanalplusIE -from .canalc2 import Canalc2IE -from .canvas import ( - CanvasIE, - CanvasEenIE, - VrtNUIE, - DagelijkseKostIE, -) -from .carambatv import ( - CarambaTVIE, - CarambaTVPageIE, -) -from .cartoonnetwork import CartoonNetworkIE -from .cbc import ( - CBCIE, - CBCPlayerIE, - CBCGemIE, - 
CBCGemPlaylistIE, - CBCGemLiveIE, -) -from .cbs import CBSIE -from .cbslocal import ( - CBSLocalIE, - CBSLocalArticleIE, -) -from .cbsinteractive import CBSInteractiveIE -from .cbsnews import ( - CBSNewsEmbedIE, - CBSNewsIE, - CBSNewsLiveVideoIE, -) -from .cbssports import ( - CBSSportsEmbedIE, - CBSSportsIE, - TwentyFourSevenSportsIE, -) -from .ccc import ( - CCCIE, - CCCPlaylistIE, -) -from .ccma import CCMAIE -from .cctv import CCTVIE -from .cda import CDAIE -from .ceskatelevize import CeskaTelevizeIE -from .cgtn import CGTNIE -from .channel9 import Channel9IE -from .charlierose import CharlieRoseIE -from .chaturbate import ChaturbateIE -from .chilloutzone import ChilloutzoneIE -from .chingari import ( - ChingariIE, - ChingariUserIE, -) -from .chirbit import ( - ChirbitIE, - ChirbitProfileIE, -) -from .cinchcast import CinchcastIE -from .cinemax import CinemaxIE -from .ciscolive import ( - CiscoLiveSessionIE, - CiscoLiveSearchIE, -) -from .ciscowebex import CiscoWebexIE -from .cjsw import CJSWIE -from .cliphunter import CliphunterIE -from .clippit import ClippitIE -from .cliprs import ClipRsIE -from .clipsyndicate import ClipsyndicateIE -from .closertotruth import CloserToTruthIE -from .cloudflarestream import CloudflareStreamIE -from .cloudy import CloudyIE -from .clubic import ClubicIE -from .clyp import ClypIE -from .cmt import CMTIE -from .cnbc import ( - CNBCIE, - CNBCVideoIE, -) -from .cnn import ( - CNNIE, - CNNBlogsIE, - CNNArticleIE, -) -from .coub import CoubIE -from .comedycentral import ( - ComedyCentralIE, - ComedyCentralTVIE, -) -from .commonmistakes import CommonMistakesIE, UnicodeBOMIE -from .commonprotocols import ( - MmsIE, - RtmpIE, - ViewSourceIE, -) -from .condenast import CondeNastIE -from .contv import CONtvIE -from .corus import CorusIE -from .cpac import ( - CPACIE, - CPACPlaylistIE, -) -from .cozytv import CozyTVIE -from .cracked import CrackedIE -from .crackle import CrackleIE -from .craftsy import CraftsyIE -from .crooksandliars import CrooksAndLiarsIE -from .crowdbunker import ( - CrowdBunkerIE, - CrowdBunkerChannelIE, -) -from .crunchyroll import ( - CrunchyrollIE, - CrunchyrollShowPlaylistIE, - CrunchyrollBetaIE, - CrunchyrollBetaShowIE, -) -from .cspan import CSpanIE, CSpanCongressIE -from .ctsnews import CtsNewsIE -from .ctv import CTVIE -from .ctvnews import CTVNewsIE -from .cultureunplugged import CultureUnpluggedIE -from .curiositystream import ( - CuriosityStreamIE, - CuriosityStreamCollectionsIE, - CuriosityStreamSeriesIE, -) -from .cwtv import CWTVIE -from .cybrary import ( - CybraryIE, - CybraryCourseIE -) -from .daftsex import DaftsexIE -from .dailymail import DailyMailIE -from .dailymotion import ( - DailymotionIE, - DailymotionPlaylistIE, - DailymotionUserIE, -) -from .damtomo import ( - DamtomoRecordIE, - DamtomoVideoIE, -) -from .daum import ( - DaumIE, - DaumClipIE, - DaumPlaylistIE, - DaumUserIE, -) -from .daystar import DaystarClipIE -from .dbtv import DBTVIE -from .dctp import DctpTvIE -from .deezer import ( - DeezerPlaylistIE, - DeezerAlbumIE, -) -from .democracynow import DemocracynowIE -from .dfb import DFBIE -from .dhm import DHMIE -from .digg import DiggIE -from .dotsub import DotsubIE -from .douyutv import ( - DouyuShowIE, - DouyuTVIE, -) -from .dplay import ( - DPlayIE, - DiscoveryPlusIE, - HGTVDeIE, - GoDiscoveryIE, - TravelChannelIE, - CookingChannelIE, - HGTVUsaIE, - FoodNetworkIE, - InvestigationDiscoveryIE, - DestinationAmericaIE, - AmHistoryChannelIE, - ScienceChannelIE, - DIYNetworkIE, - DiscoveryLifeIE, - AnimalPlanetIE, - 
TLCIE, - DiscoveryPlusIndiaIE, - DiscoveryNetworksDeIE, - DiscoveryPlusItalyIE, - DiscoveryPlusItalyShowIE, - DiscoveryPlusIndiaShowIE, -) -from .dreisat import DreiSatIE -from .drbonanza import DRBonanzaIE -from .drtuber import DrTuberIE -from .drtv import ( - DRTVIE, - DRTVLiveIE, -) -from .dtube import DTubeIE -from .dvtv import DVTVIE -from .duboku import ( - DubokuIE, - DubokuPlaylistIE -) -from .dumpert import DumpertIE -from .defense import DefenseGouvFrIE -from .digitalconcerthall import DigitalConcertHallIE -from .discovery import DiscoveryIE -from .disney import DisneyIE -from .dispeak import DigitallySpeakingIE -from .doodstream import DoodStreamIE -from .dropbox import DropboxIE -from .dropout import ( - DropoutSeasonIE, - DropoutIE -) -from .dw import ( - DWIE, - DWArticleIE, -) -from .eagleplatform import EaglePlatformIE -from .ebaumsworld import EbaumsWorldIE -from .echomsk import EchoMskIE -from .egghead import ( - EggheadCourseIE, - EggheadLessonIE, -) -from .ehow import EHowIE -from .eighttracks import EightTracksIE -from .einthusan import EinthusanIE -from .eitb import EitbIE -from .ellentube import ( - EllenTubeIE, - EllenTubeVideoIE, - EllenTubePlaylistIE, -) -from .elonet import ElonetIE -from .elpais import ElPaisIE -from .embedly import EmbedlyIE -from .engadget import EngadgetIE -from .epicon import ( - EpiconIE, - EpiconSeriesIE, -) -from .eporner import EpornerIE -from .eroprofile import ( - EroProfileIE, - EroProfileAlbumIE, -) -from .ertgr import ( - ERTFlixCodenameIE, - ERTFlixIE, - ERTWebtvEmbedIE, -) -from .escapist import EscapistIE -from .espn import ( - ESPNIE, - ESPNArticleIE, - FiveThirtyEightIE, - ESPNCricInfoIE, -) -from .esri import EsriVideoIE -from .europa import EuropaIE -from .europeantour import EuropeanTourIE -from .euscreen import EUScreenIE -from .expotv import ExpoTVIE -from .expressen import ExpressenIE -from .extremetube import ExtremeTubeIE -from .eyedotv import EyedoTVIE -from .facebook import ( - FacebookIE, - FacebookPluginsVideoIE, - FacebookRedirectURLIE, -) -from .fancode import ( - FancodeVodIE, - FancodeLiveIE -) +from ..utils import load_plugins -from .faz import FazIE -from .fc2 import ( - FC2IE, - FC2EmbedIE, - FC2LiveIE, -) -from .fczenit import FczenitIE -from .filmmodu import FilmmoduIE -from .filmon import ( - FilmOnIE, - FilmOnChannelIE, -) -from .filmweb import FilmwebIE -from .firsttv import FirstTVIE -from .fivetv import FiveTVIE -from .flickr import FlickrIE -from .folketinget import FolketingetIE -from .footyroom import FootyRoomIE -from .formula1 import Formula1IE -from .fourtube import ( - FourTubeIE, - PornTubeIE, - PornerBrosIE, - FuxIE, -) -from .fox import FOXIE -from .fox9 import ( - FOX9IE, - FOX9NewsIE, -) -from .foxgay import FoxgayIE -from .foxnews import ( - FoxNewsIE, - FoxNewsArticleIE, -) -from .foxsports import FoxSportsIE -from .fptplay import FptplayIE -from .franceculture import FranceCultureIE -from .franceinter import FranceInterIE -from .francetv import ( - FranceTVIE, - FranceTVSiteIE, - FranceTVInfoIE, -) -from .freesound import FreesoundIE -from .freespeech import FreespeechIE -from .frontendmasters import ( - FrontendMastersIE, - FrontendMastersLessonIE, - FrontendMastersCourseIE -) -from .fujitv import FujiTVFODPlus7IE -from .funimation import ( - FunimationIE, - FunimationPageIE, - FunimationShowIE, -) -from .funk import FunkIE -from .fusion import FusionIE -from .gab import ( - GabTVIE, - GabIE, -) -from .gaia import GaiaIE -from .gameinformer import GameInformerIE -from .gamejolt import ( 
- GameJoltIE, - GameJoltUserIE, - GameJoltGameIE, - GameJoltGameSoundtrackIE, - GameJoltCommunityIE, - GameJoltSearchIE, -) -from .gamespot import GameSpotIE -from .gamestar import GameStarIE -from .gaskrank import GaskrankIE -from .gazeta import GazetaIE -from .gdcvault import GDCVaultIE -from .gedidigital import GediDigitalIE -from .generic import GenericIE -from .gettr import ( - GettrIE, - GettrStreamingIE, -) -from .gfycat import GfycatIE -from .giantbomb import GiantBombIE -from .giga import GigaIE -from .glide import GlideIE -from .globo import ( - GloboIE, - GloboArticleIE, -) -from .go import GoIE -from .godtube import GodTubeIE -from .gofile import GofileIE -from .golem import GolemIE -from .googledrive import GoogleDriveIE -from .googlepodcasts import ( - GooglePodcastsIE, - GooglePodcastsFeedIE, -) -from .googlesearch import GoogleSearchIE -from .gopro import GoProIE -from .goshgay import GoshgayIE -from .gotostage import GoToStageIE -from .gputechconf import GPUTechConfIE -from .gronkh import GronkhIE -from .groupon import GrouponIE -from .hbo import HBOIE -from .hearthisat import HearThisAtIE -from .heise import HeiseIE -from .hellporno import HellPornoIE -from .helsinki import HelsinkiIE -from .hentaistigma import HentaiStigmaIE -from .hgtv import HGTVComShowIE -from .hketv import HKETVIE -from .hidive import HiDiveIE -from .historicfilms import HistoricFilmsIE -from .hitbox import HitboxIE, HitboxLiveIE -from .hitrecord import HitRecordIE -from .hotnewhiphop import HotNewHipHopIE -from .hotstar import ( - HotStarIE, - HotStarPlaylistIE, - HotStarSeriesIE, -) -from .howcast import HowcastIE -from .howstuffworks import HowStuffWorksIE -from .hrfensehen import HRFernsehenIE -from .hrti import ( - HRTiIE, - HRTiPlaylistIE, -) -from .hse import ( - HSEShowIE, - HSEProductIE, -) -from .huajiao import HuajiaoIE -from .huya import HuyaLiveIE -from .huffpost import HuffPostIE -from .hungama import ( - HungamaIE, - HungamaSongIE, - HungamaAlbumPlaylistIE, -) -from .hypem import HypemIE -from .ichinanalive import ( - IchinanaLiveIE, - IchinanaLiveClipIE, -) -from .ign import ( - IGNIE, - IGNVideoIE, - IGNArticleIE, -) -from .iheart import ( - IHeartRadioIE, - IHeartRadioPodcastIE, -) -from .imdb import ( - ImdbIE, - ImdbListIE -) -from .imgur import ( - ImgurIE, - ImgurAlbumIE, - ImgurGalleryIE, -) -from .ina import InaIE -from .inc import IncIE -from .indavideo import IndavideoEmbedIE -from .infoq import InfoQIE -from .instagram import ( - InstagramIE, - InstagramIOSIE, - InstagramUserIE, - InstagramTagIE, - InstagramStoryIE, -) -from .internazionale import InternazionaleIE -from .internetvideoarchive import InternetVideoArchiveIE -from .iprima import ( - IPrimaIE, - IPrimaCNNIE -) -from .iqiyi import ( - IqiyiIE, - IqIE, - IqAlbumIE -) +# NB: Must be before other imports so that plugins can be correctly injected +_PLUGIN_CLASSES = load_plugins('extractor', 'IE', {}) -from .itprotv import ( - ITProTVIE, - ITProTVCourseIE -) +_LAZY_LOADER = False +if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): + with contextlib.suppress(ImportError): + from .lazy_extractors import * # noqa: F403 + from .lazy_extractors import _ALL_CLASSES + _LAZY_LOADER = True -from .itv import ( - ITVIE, - ITVBTCCIE, -) -from .ivi import ( - IviIE, - IviCompilationIE -) -from .ivideon import IvideonIE -from .iwara import IwaraIE -from .izlesene import IzleseneIE -from .jamendo import ( - JamendoIE, - JamendoAlbumIE, -) -from .jeuxvideo import JeuxVideoIE -from .jove import JoveIE -from .joj import JojIE -from 
.jwplatform import JWPlatformIE -from .kakao import KakaoIE -from .kaltura import KalturaIE -from .karaoketv import KaraoketvIE -from .karrierevideos import KarriereVideosIE -from .keezmovies import KeezMoviesIE -from .kelbyone import KelbyOneIE -from .ketnet import KetnetIE -from .khanacademy import ( - KhanAcademyIE, - KhanAcademyUnitIE, -) -from .kickstarter import KickStarterIE -from .kinja import KinjaEmbedIE -from .kinopoisk import KinoPoiskIE -from .konserthusetplay import KonserthusetPlayIE -from .koo import KooIE -from .krasview import KrasViewIE -from .ku6 import Ku6IE -from .kusi import KUSIIE -from .kuwo import ( - KuwoIE, - KuwoAlbumIE, - KuwoChartIE, - KuwoSingerIE, - KuwoCategoryIE, - KuwoMvIE, -) -from .la7 import ( - LA7IE, - LA7PodcastEpisodeIE, - LA7PodcastIE, -) -from .laola1tv import ( - Laola1TvEmbedIE, - Laola1TvIE, - EHFTVIE, - ITTFIE, -) -from .lastfm import ( - LastFMIE, - LastFMPlaylistIE, - LastFMUserIE, -) -from .lbry import ( - LBRYIE, - LBRYChannelIE, -) -from .lci import LCIIE -from .lcp import ( - LcpPlayIE, - LcpIE, -) -from .lecture2go import Lecture2GoIE -from .lecturio import ( - LecturioIE, - LecturioCourseIE, - LecturioDeCourseIE, -) -from .leeco import ( - LeIE, - LePlaylistIE, - LetvCloudIE, -) -from .lego import LEGOIE -from .lemonde import LemondeIE -from .lenta import LentaIE -from .libraryofcongress import LibraryOfCongressIE -from .libsyn import LibsynIE -from .lifenews import ( - LifeNewsIE, - LifeEmbedIE, -) -from .limelight import ( - LimelightMediaIE, - LimelightChannelIE, - LimelightChannelListIE, -) -from .line import ( - LineLiveIE, - LineLiveChannelIE, -) -from .linkedin import ( - LinkedInIE, - LinkedInLearningIE, - LinkedInLearningCourseIE, -) -from .linuxacademy import LinuxAcademyIE -from .litv import LiTVIE -from .livejournal import LiveJournalIE -from .livestream import ( - LivestreamIE, - LivestreamOriginalIE, - LivestreamShortenerIE, -) -from .lnkgo import ( - LnkGoIE, - LnkIE, -) -from .localnews8 import LocalNews8IE -from .lovehomeporn import LoveHomePornIE -from .lrt import LRTIE -from .lynda import ( - LyndaIE, - LyndaCourseIE -) -from .m6 import M6IE -from .magentamusik360 import MagentaMusik360IE -from .mailru import ( - MailRuIE, - MailRuMusicIE, - MailRuMusicSearchIE, -) -from .mainstreaming import MainStreamingIE -from .malltv import MallTVIE -from .mangomolo import ( - MangomoloVideoIE, - MangomoloLiveIE, -) -from .manoto import ( - ManotoTVIE, - ManotoTVShowIE, - ManotoTVLiveIE, -) -from .manyvids import ManyVidsIE -from .maoritv import MaoriTVIE -from .markiza import ( - MarkizaIE, - MarkizaPageIE, -) -from .massengeschmacktv import MassengeschmackTVIE -from .matchtv import MatchTVIE -from .mdr import MDRIE -from .medaltv import MedalTVIE -from .mediaite import MediaiteIE -from .mediaklikk import MediaKlikkIE -from .mediaset import ( - MediasetIE, - MediasetShowIE, -) -from .mediasite import ( - MediasiteIE, - MediasiteCatalogIE, - MediasiteNamedCatalogIE, -) -from .medici import MediciIE -from .megaphone import MegaphoneIE -from .meipai import MeipaiIE -from .melonvod import MelonVODIE -from .meta import METAIE -from .metacafe import MetacafeIE -from .metacritic import MetacriticIE -from .mgoon import MgoonIE -from .mgtv import MGTVIE -from .miaopai import MiaoPaiIE -from .microsoftstream import MicrosoftStreamIE -from .microsoftvirtualacademy import ( - MicrosoftVirtualAcademyIE, - MicrosoftVirtualAcademyCourseIE, -) -from .mildom import ( - MildomIE, - MildomVodIE, - MildomClipIE, - MildomUserVodIE, -) -from 
.minds import ( - MindsIE, - MindsChannelIE, - MindsGroupIE, -) -from .ministrygrid import MinistryGridIE -from .minoto import MinotoIE -from .miomio import MioMioIE -from .mirrativ import ( - MirrativIE, - MirrativUserIE, -) -from .mit import TechTVMITIE, OCWMITIE -from .mitele import MiTeleIE -from .mixch import ( - MixchIE, - MixchArchiveIE, -) -from .mixcloud import ( - MixcloudIE, - MixcloudUserIE, - MixcloudPlaylistIE, -) -from .mlb import ( - MLBIE, - MLBVideoIE, -) -from .mlssoccer import MLSSoccerIE -from .mnet import MnetIE -from .moevideo import MoeVideoIE -from .mofosex import ( - MofosexIE, - MofosexEmbedIE, -) -from .mojvideo import MojvideoIE -from .morningstar import MorningstarIE -from .motherless import ( - MotherlessIE, - MotherlessGroupIE -) -from .motorsport import MotorsportIE -from .movieclips import MovieClipsIE -from .moviezine import MoviezineIE -from .movingimage import MovingImageIE -from .msn import MSNIE -from .mtv import ( - MTVIE, - MTVVideoIE, - MTVServicesEmbeddedIE, - MTVDEIE, - MTVJapanIE, - MTVItaliaIE, - MTVItaliaProgrammaIE, -) -from .muenchentv import MuenchenTVIE -from .murrtube import MurrtubeIE, MurrtubeUserIE -from .musescore import MuseScoreIE -from .musicdex import ( - MusicdexSongIE, - MusicdexAlbumIE, - MusicdexArtistIE, - MusicdexPlaylistIE, -) -from .mwave import MwaveIE, MwaveMeetGreetIE -from .mxplayer import ( - MxplayerIE, - MxplayerShowIE, -) -from .mychannels import MyChannelsIE -from .myspace import MySpaceIE, MySpaceAlbumIE -from .myspass import MySpassIE -from .myvi import ( - MyviIE, - MyviEmbedIE, -) -from .myvideoge import MyVideoGeIE -from .myvidster import MyVidsterIE -from .n1 import ( - N1InfoAssetIE, - N1InfoIIE, -) -from .nate import ( - NateIE, - NateProgramIE, -) -from .nationalgeographic import ( - NationalGeographicVideoIE, - NationalGeographicTVIE, -) -from .naver import ( - NaverIE, - NaverLiveIE, -) -from .nba import ( - NBAWatchEmbedIE, - NBAWatchIE, - NBAWatchCollectionIE, - NBAEmbedIE, - NBAIE, - NBAChannelIE, -) -from .nbc import ( - NBCIE, - NBCNewsIE, - NBCOlympicsIE, - NBCOlympicsStreamIE, - NBCSportsIE, - NBCSportsStreamIE, - NBCSportsVPlayerIE, -) -from .ndr import ( - NDRIE, - NJoyIE, - NDREmbedBaseIE, - NDREmbedIE, - NJoyEmbedIE, -) -from .ndtv import NDTVIE -from .nebula import ( - NebulaIE, - NebulaCollectionIE, -) -from .nerdcubed import NerdCubedFeedIE -from .netzkino import NetzkinoIE -from .neteasemusic import ( - NetEaseMusicIE, - NetEaseMusicAlbumIE, - NetEaseMusicSingerIE, - NetEaseMusicListIE, - NetEaseMusicMvIE, - NetEaseMusicProgramIE, - NetEaseMusicDjRadioIE, -) -from .newgrounds import ( - NewgroundsIE, - NewgroundsPlaylistIE, - NewgroundsUserIE, -) -from .newstube import NewstubeIE -from .newsy import NewsyIE -from .nextmedia import ( - NextMediaIE, - NextMediaActionNewsIE, - AppleDailyIE, - NextTVIE, -) -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .nfb import NFBIE -from .nfhsnetwork import NFHSNetworkIE -from .nfl import ( - NFLIE, - NFLArticleIE, -) -from .nhk import ( - NhkVodIE, - NhkVodProgramIE, - NhkForSchoolBangumiIE, - NhkForSchoolSubjectIE, - NhkForSchoolProgramListIE, -) -from .nhl import NHLIE -from .nick import ( - NickIE, - NickBrIE, - NickDeIE, - NickNightIE, - NickRuIE, -) -from .niconico import ( - NiconicoIE, - NiconicoPlaylistIE, - NiconicoUserIE, - NiconicoSeriesIE, - NiconicoHistoryIE, - NicovideoSearchDateIE, - NicovideoSearchIE, - NicovideoSearchURLIE, - NicovideoTagURLIE, -) -from .ninecninemedia import ( - NineCNineMediaIE, - CPTwentyFourIE, -) -from 
.ninegag import NineGagIE -from .ninenow import NineNowIE -from .nintendo import NintendoIE -from .nitter import NitterIE -from .njpwworld import NJPWWorldIE -from .nobelprize import NobelPrizeIE -from .nonktube import NonkTubeIE -from .noodlemagazine import NoodleMagazineIE -from .noovo import NoovoIE -from .normalboots import NormalbootsIE -from .nosvideo import NosVideoIE -from .nova import ( - NovaEmbedIE, - NovaIE, -) -from .novaplay import NovaPlayIE -from .nowness import ( - NownessIE, - NownessPlaylistIE, - NownessSeriesIE, -) -from .noz import NozIE -from .npo import ( - AndereTijdenIE, - NPOIE, - NPOLiveIE, - NPORadioIE, - NPORadioFragmentIE, - SchoolTVIE, - HetKlokhuisIE, - VPROIE, - WNLIE, -) -from .npr import NprIE -from .nrk import ( - NRKIE, - NRKPlaylistIE, - NRKSkoleIE, - NRKTVIE, - NRKTVDirekteIE, - NRKRadioPodkastIE, - NRKTVEpisodeIE, - NRKTVEpisodesIE, - NRKTVSeasonIE, - NRKTVSeriesIE, -) -from .nrl import NRLTVIE -from .ntvcojp import NTVCoJpCUIE -from .ntvde import NTVDeIE -from .ntvru import NTVRuIE -from .nytimes import ( - NYTimesIE, - NYTimesArticleIE, - NYTimesCookingIE, -) -from .nuvid import NuvidIE -from .nzherald import NZHeraldIE -from .nzz import NZZIE -from .odatv import OdaTVIE -from .odnoklassniki import OdnoklassnikiIE -from .oktoberfesttv import OktoberfestTVIE -from .olympics import OlympicsReplayIE -from .on24 import On24IE -from .ondemandkorea import OnDemandKoreaIE -from .onefootball import OneFootballIE -from .onet import ( - OnetIE, - OnetChannelIE, - OnetMVPIE, - OnetPlIE, -) -from .onionstudios import OnionStudiosIE -from .ooyala import ( - OoyalaIE, - OoyalaExternalIE, -) -from .opencast import ( - OpencastIE, - OpencastPlaylistIE, -) -from .openrec import ( - OpenRecIE, - OpenRecCaptureIE, - OpenRecMovieIE, -) -from .ora import OraTVIE -from .orf import ( - ORFTVthekIE, - ORFFM4IE, - ORFFM4StoryIE, - ORFOE1IE, - ORFOE3IE, - ORFNOEIE, - ORFWIEIE, - ORFBGLIE, - ORFOOEIE, - ORFSTMIE, - ORFKTNIE, - ORFSBGIE, - ORFTIRIE, - ORFVBGIE, - ORFIPTVIE, -) -from .outsidetv import OutsideTVIE -from .packtpub import ( - PacktPubIE, - PacktPubCourseIE, -) -from .palcomp3 import ( - PalcoMP3IE, - PalcoMP3ArtistIE, - PalcoMP3VideoIE, -) -from .pandoratv import PandoraTVIE -from .panopto import ( - PanoptoIE, - PanoptoListIE, - PanoptoPlaylistIE -) -from .paramountplus import ( - ParamountPlusIE, - ParamountPlusSeriesIE, -) -from .parliamentliveuk import ParliamentLiveUKIE -from .parlview import ParlviewIE -from .patreon import ( - PatreonIE, - PatreonUserIE -) -from .pbs import PBSIE -from .pearvideo import PearVideoIE -from .peekvids import PeekVidsIE, PlayVidsIE -from .peertube import ( - PeerTubeIE, - PeerTubePlaylistIE, -) -from .peertv import PeerTVIE -from .peloton import ( - PelotonIE, - PelotonLiveIE -) -from .people import PeopleIE -from .performgroup import PerformGroupIE -from .periscope import ( - PeriscopeIE, - PeriscopeUserIE, -) -from .philharmoniedeparis import PhilharmonieDeParisIE -from .phoenix import PhoenixIE -from .photobucket import PhotobucketIE -from .piapro import PiaproIE -from .picarto import ( - PicartoIE, - PicartoVodIE, -) -from .piksel import PikselIE -from .pinkbike import PinkbikeIE -from .pinterest import ( - PinterestIE, - PinterestCollectionIE, -) -from .pixivsketch import ( - PixivSketchIE, - PixivSketchUserIE, -) -from .pladform import PladformIE -from .planetmarathi import PlanetMarathiIE -from .platzi import ( - PlatziIE, - PlatziCourseIE, -) -from .playfm import PlayFMIE -from .playplustv import PlayPlusTVIE -from 
.plays import PlaysTVIE -from .playstuff import PlayStuffIE -from .playtvak import PlaytvakIE -from .playvid import PlayvidIE -from .playwire import PlaywireIE -from .plutotv import PlutoTVIE -from .pluralsight import ( - PluralsightIE, - PluralsightCourseIE, -) -from .podomatic import PodomaticIE -from .pokemon import ( - PokemonIE, - PokemonWatchIE, - PokemonSoundLibraryIE, -) -from .pokergo import ( - PokerGoIE, - PokerGoCollectionIE, -) -from .polsatgo import PolsatGoIE -from .polskieradio import ( - PolskieRadioIE, - PolskieRadioCategoryIE, - PolskieRadioPlayerIE, - PolskieRadioPodcastIE, - PolskieRadioPodcastListIE, - PolskieRadioRadioKierowcowIE, -) -from .popcorntimes import PopcorntimesIE -from .popcorntv import PopcornTVIE -from .porn91 import Porn91IE -from .porncom import PornComIE -from .pornflip import PornFlipIE -from .pornhd import PornHdIE -from .pornhub import ( - PornHubIE, - PornHubUserIE, - PornHubPlaylistIE, - PornHubPagedVideoListIE, - PornHubUserVideosUploadIE, -) -from .pornotube import PornotubeIE -from .pornovoisines import PornoVoisinesIE -from .pornoxo import PornoXOIE -from .pornez import PornezIE -from .puhutv import ( - PuhuTVIE, - PuhuTVSerieIE, -) -from .presstv import PressTVIE -from .projectveritas import ProjectVeritasIE -from .prosiebensat1 import ProSiebenSat1IE -from .prx import ( - PRXStoryIE, - PRXSeriesIE, - PRXAccountIE, - PRXStoriesSearchIE, - PRXSeriesSearchIE -) -from .puls4 import Puls4IE -from .pyvideo import PyvideoIE -from .qqmusic import ( - QQMusicIE, - QQMusicSingerIE, - QQMusicAlbumIE, - QQMusicToplistIE, - QQMusicPlaylistIE, -) -from .r7 import ( - R7IE, - R7ArticleIE, -) -from .radiko import RadikoIE, RadikoRadioIE -from .radiocanada import ( - RadioCanadaIE, - RadioCanadaAudioVideoIE, -) -from .radiode import RadioDeIE -from .radiojavan import RadioJavanIE -from .radiobremen import RadioBremenIE -from .radiofrance import RadioFranceIE -from .radiozet import RadioZetPodcastIE -from .radiokapital import ( - RadioKapitalIE, - RadioKapitalShowIE, -) -from .radlive import ( - RadLiveIE, - RadLiveChannelIE, - RadLiveSeasonIE, -) -from .rai import ( - RaiPlayIE, - RaiPlayLiveIE, - RaiPlayPlaylistIE, - RaiPlaySoundIE, - RaiPlaySoundLiveIE, - RaiPlaySoundPlaylistIE, - RaiIE, -) -from .raywenderlich import ( - RayWenderlichIE, - RayWenderlichCourseIE, -) -from .rbmaradio import RBMARadioIE -from .rcs import ( - RCSIE, - RCSEmbedsIE, - RCSVariousIE, -) -from .rcti import ( - RCTIPlusIE, - RCTIPlusSeriesIE, - RCTIPlusTVIE, -) -from .rds import RDSIE -from .redbulltv import ( - RedBullTVIE, - RedBullEmbedIE, - RedBullTVRrnContentIE, - RedBullIE, -) -from .reddit import RedditIE -from .redgifs import ( - RedGifsIE, - RedGifsSearchIE, - RedGifsUserIE, -) -from .redtube import RedTubeIE -from .regiotv import RegioTVIE -from .rentv import ( - RENTVIE, - RENTVArticleIE, -) -from .restudy import RestudyIE -from .reuters import ReutersIE -from .reverbnation import ReverbNationIE -from .rice import RICEIE -from .rmcdecouverte import RMCDecouverteIE -from .rockstargames import RockstarGamesIE -from .rokfin import ( - RokfinIE, - RokfinStackIE, - RokfinChannelIE, -) -from .roosterteeth import RoosterTeethIE, RoosterTeethSeriesIE -from .rottentomatoes import RottenTomatoesIE -from .rozhlas import RozhlasIE -from .rtbf import RTBFIE -from .rte import RteIE, RteRadioIE -from .rtlnl import RtlNlIE -from .rtl2 import ( - RTL2IE, - RTL2YouIE, - RTL2YouSeriesIE, -) -from .rtnews import ( - RTNewsIE, - RTDocumentryIE, - RTDocumentryPlaylistIE, - RuptlyIE, -) 
-from .rtp import RTPIE -from .rtrfm import RTRFMIE -from .rts import RTSIE -from .rtve import ( - RTVEALaCartaIE, - RTVEAudioIE, - RTVELiveIE, - RTVEInfantilIE, - RTVETelevisionIE, -) -from .rtvnh import RTVNHIE -from .rtvs import RTVSIE -from .ruhd import RUHDIE -from .rule34video import Rule34VideoIE -from .rumble import ( - RumbleEmbedIE, - RumbleChannelIE, -) -from .rutube import ( - RutubeIE, - RutubeChannelIE, - RutubeEmbedIE, - RutubeMovieIE, - RutubePersonIE, - RutubePlaylistIE, - RutubeTagsIE, -) -from .glomex import ( - GlomexIE, - GlomexEmbedIE, -) -from .megatvcom import ( - MegaTVComIE, - MegaTVComEmbedIE, -) -from .ant1newsgr import ( - Ant1NewsGrWatchIE, - Ant1NewsGrArticleIE, - Ant1NewsGrEmbedIE, -) -from .rutv import RUTVIE -from .ruutu import RuutuIE -from .ruv import ( - RuvIE, - RuvSpilaIE -) -from .safari import ( - SafariIE, - SafariApiIE, - SafariCourseIE, -) -from .saitosan import SaitosanIE -from .samplefocus import SampleFocusIE -from .sapo import SapoIE -from .savefrom import SaveFromIE -from .sbs import SBSIE -from .screencast import ScreencastIE -from .screencastomatic import ScreencastOMaticIE -from .scrippsnetworks import ( - ScrippsNetworksWatchIE, - ScrippsNetworksIE, -) -from .scte import ( - SCTEIE, - SCTECourseIE, -) -from .seeker import SeekerIE -from .senategov import SenateISVPIE, SenateGovIE -from .sendtonews import SendtoNewsIE -from .servus import ServusIE -from .sevenplus import SevenPlusIE -from .sexu import SexuIE -from .seznamzpravy import ( - SeznamZpravyIE, - SeznamZpravyArticleIE, -) -from .shahid import ( - ShahidIE, - ShahidShowIE, -) -from .shared import ( - SharedIE, - VivoIE, -) -from .shemaroome import ShemarooMeIE -from .showroomlive import ShowRoomLiveIE -from .simplecast import ( - SimplecastIE, - SimplecastEpisodeIE, - SimplecastPodcastIE, -) -from .sina import SinaIE -from .sixplay import SixPlayIE -from .skeb import SkebIE -from .skyit import ( - SkyItPlayerIE, - SkyItVideoIE, - SkyItVideoLiveIE, - SkyItIE, - SkyItAcademyIE, - SkyItArteIE, - CieloTVItIE, - TV8ItIE, -) -from .skylinewebcams import SkylineWebcamsIE -from .skynewsarabia import ( - SkyNewsArabiaIE, - SkyNewsArabiaArticleIE, -) -from .skynewsau import SkyNewsAUIE -from .sky import ( - SkyNewsIE, - SkyNewsStoryIE, - SkySportsIE, - SkySportsNewsIE, -) -from .slideshare import SlideshareIE -from .slideslive import SlidesLiveIE -from .slutload import SlutloadIE -from .snotr import SnotrIE -from .sohu import SohuIE -from .sonyliv import ( - SonyLIVIE, - SonyLIVSeriesIE, -) -from .soundcloud import ( - SoundcloudEmbedIE, - SoundcloudIE, - SoundcloudSetIE, - SoundcloudRelatedIE, - SoundcloudUserIE, - SoundcloudTrackStationIE, - SoundcloudPlaylistIE, - SoundcloudSearchIE, -) -from .soundgasm import ( - SoundgasmIE, - SoundgasmProfileIE -) -from .southpark import ( - SouthParkIE, - SouthParkDeIE, - SouthParkDkIE, - SouthParkEsIE, - SouthParkNlIE -) -from .sovietscloset import ( - SovietsClosetIE, - SovietsClosetPlaylistIE -) -from .spankbang import ( - SpankBangIE, - SpankBangPlaylistIE, -) -from .spankwire import SpankwireIE -from .spiegel import SpiegelIE -from .spike import ( - BellatorIE, - ParamountNetworkIE, -) -from .stitcher import ( - StitcherIE, - StitcherShowIE, -) -from .sport5 import Sport5IE -from .sportbox import SportBoxIE -from .sportdeutschland import SportDeutschlandIE -from .spotify import ( - SpotifyIE, - SpotifyShowIE, -) -from .spreaker import ( - SpreakerIE, - SpreakerPageIE, - SpreakerShowIE, - SpreakerShowPageIE, -) -from .springboardplatform import 
SpringboardPlatformIE -from .sprout import SproutIE -from .srgssr import ( - SRGSSRIE, - SRGSSRPlayIE, -) -from .srmediathek import SRMediathekIE -from .stanfordoc import StanfordOpenClassroomIE -from .startv import StarTVIE -from .steam import SteamIE -from .storyfire import ( - StoryFireIE, - StoryFireUserIE, - StoryFireSeriesIE, -) -from .streamable import StreamableIE -from .streamanity import StreamanityIE -from .streamcloud import StreamcloudIE -from .streamcz import StreamCZIE -from .streamff import StreamFFIE -from .streetvoice import StreetVoiceIE -from .stretchinternet import StretchInternetIE -from .stripchat import StripchatIE -from .stv import STVPlayerIE -from .sunporno import SunPornoIE -from .sverigesradio import ( - SverigesRadioEpisodeIE, - SverigesRadioPublicationIE, -) -from .svt import ( - SVTIE, - SVTPageIE, - SVTPlayIE, - SVTSeriesIE, -) -from .swrmediathek import SWRMediathekIE -from .syfy import SyfyIE -from .sztvhu import SztvHuIE -from .tagesschau import TagesschauIE -from .tass import TassIE -from .tbs import TBSIE -from .tdslifeway import TDSLifewayIE -from .teachable import ( - TeachableIE, - TeachableCourseIE, -) -from .teachertube import ( - TeacherTubeIE, - TeacherTubeUserIE, -) -from .teachingchannel import TeachingChannelIE -from .teamcoco import TeamcocoIE -from .teamtreehouse import TeamTreeHouseIE -from .techtalks import TechTalksIE -from .ted import ( - TedEmbedIE, - TedPlaylistIE, - TedSeriesIE, - TedTalkIE, -) -from .tele5 import Tele5IE -from .tele13 import Tele13IE -from .telebruxelles import TeleBruxellesIE -from .telecinco import TelecincoIE -from .telegraaf import TelegraafIE -from .telegram import TelegramEmbedIE -from .telemb import TeleMBIE -from .telemundo import TelemundoIE -from .telequebec import ( - TeleQuebecIE, - TeleQuebecSquatIE, - TeleQuebecEmissionIE, - TeleQuebecLiveIE, - TeleQuebecVideoIE, -) -from .teletask import TeleTaskIE -from .telewebion import TelewebionIE -from .tennistv import TennisTVIE -from .tenplay import TenPlayIE -from .testurl import TestURLIE -from .tf1 import TF1IE -from .tfo import TFOIE -from .theintercept import TheInterceptIE -from .theplatform import ( - ThePlatformIE, - ThePlatformFeedIE, -) -from .thestar import TheStarIE -from .thesun import TheSunIE -from .theta import ( - ThetaVideoIE, - ThetaStreamIE, -) -from .theweatherchannel import TheWeatherChannelIE -from .thisamericanlife import ThisAmericanLifeIE -from .thisav import ThisAVIE -from .thisoldhouse import ThisOldHouseIE -from .threespeak import ( - ThreeSpeakIE, - ThreeSpeakUserIE, -) -from .threeqsdn import ThreeQSDNIE -from .tiktok import ( - TikTokIE, - TikTokUserIE, - TikTokSoundIE, - TikTokEffectIE, - TikTokTagIE, - TikTokVMIE, - DouyinIE, -) -from .tinypic import TinyPicIE -from .tmz import TMZIE -from .tnaflix import ( - TNAFlixNetworkEmbedIE, - TNAFlixIE, - EMPFlixIE, - MovieFapIE, -) -from .toggle import ( - ToggleIE, - MeWatchIE, -) -from .toggo import ( - ToggoIE, -) -from .tokentube import ( - TokentubeIE, - TokentubeChannelIE -) -from .tonline import TOnlineIE -from .toongoggles import ToonGogglesIE -from .toutv import TouTvIE -from .toypics import ToypicsUserIE, ToypicsIE -from .traileraddict import TrailerAddictIE -from .trilulilu import TriluliluIE -from .trovo import ( - TrovoIE, - TrovoVodIE, - TrovoChannelVodIE, - TrovoChannelClipIE, -) -from .trueid import TrueIDIE -from .trunews import TruNewsIE -from .trutv import TruTVIE -from .tube8 import Tube8IE -from .tubitv import ( - TubiTvIE, - TubiTvShowIE, -) -from .tumblr import 
TumblrIE -from .tunein import ( - TuneInClipIE, - TuneInStationIE, - TuneInProgramIE, - TuneInTopicIE, - TuneInShortenerIE, -) -from .tunepk import TunePkIE -from .turbo import TurboIE -from .tv2 import ( - TV2IE, - TV2ArticleIE, - KatsomoIE, - MTVUutisetArticleIE, -) -from .tv2dk import ( - TV2DKIE, - TV2DKBornholmPlayIE, -) -from .tv2hu import ( - TV2HuIE, - TV2HuSeriesIE, -) -from .tv4 import TV4IE -from .tv5mondeplus import TV5MondePlusIE -from .tv5unis import ( - TV5UnisVideoIE, - TV5UnisIE, -) -from .tva import ( - TVAIE, - QubIE, -) -from .tvanouvelles import ( - TVANouvellesIE, - TVANouvellesArticleIE, -) -from .tvc import ( - TVCIE, - TVCArticleIE, -) -from .tver import TVerIE -from .tvigle import TvigleIE -from .tvland import TVLandIE -from .tvn24 import TVN24IE -from .tvnet import TVNetIE -from .tvnoe import TVNoeIE -from .tvnow import ( - TVNowIE, - TVNowFilmIE, - TVNowNewIE, - TVNowSeasonIE, - TVNowAnnualIE, - TVNowShowIE, -) -from .tvopengr import ( - TVOpenGrWatchIE, - TVOpenGrEmbedIE, -) -from .tvp import ( - TVPEmbedIE, - TVPIE, - TVPStreamIE, - TVPWebsiteIE, -) -from .tvplay import ( - TVPlayIE, - ViafreeIE, - TVPlayHomeIE, -) -from .tvplayer import TVPlayerIE -from .tweakers import TweakersIE -from .twentyfourvideo import TwentyFourVideoIE -from .twentymin import TwentyMinutenIE -from .twentythreevideo import TwentyThreeVideoIE -from .twitcasting import ( - TwitCastingIE, - TwitCastingLiveIE, - TwitCastingUserIE, -) -from .twitch import ( - TwitchVodIE, - TwitchCollectionIE, - TwitchVideosIE, - TwitchVideosClipsIE, - TwitchVideosCollectionsIE, - TwitchStreamIE, - TwitchClipsIE, -) -from .twitter import ( - TwitterCardIE, - TwitterIE, - TwitterAmplifyIE, - TwitterBroadcastIE, - TwitterShortenerIE, -) -from .udemy import ( - UdemyIE, - UdemyCourseIE -) -from .udn import UDNEmbedIE -from .ufctv import ( - UFCTVIE, - UFCArabiaIE, -) -from .ukcolumn import UkColumnIE -from .uktvplay import UKTVPlayIE -from .digiteka import DigitekaIE -from .dlive import ( - DLiveVODIE, - DLiveStreamIE, -) -from .drooble import DroobleIE -from .umg import UMGDeIE -from .unistra import UnistraIE -from .unity import UnityIE -from .uol import UOLIE -from .uplynk import ( - UplynkIE, - UplynkPreplayIE, -) -from .urort import UrortIE -from .urplay import URPlayIE -from .usanetwork import USANetworkIE -from .usatoday import USATodayIE -from .ustream import UstreamIE, UstreamChannelIE -from .ustudio import ( - UstudioIE, - UstudioEmbedIE, -) -from .utreon import UtreonIE -from .varzesh3 import Varzesh3IE -from .vbox7 import Vbox7IE -from .veehd import VeeHDIE -from .veo import VeoIE -from .veoh import VeohIE -from .vesti import VestiIE -from .vevo import ( - VevoIE, - VevoPlaylistIE, -) -from .vgtv import ( - BTArticleIE, - BTVestlendingenIE, - VGTVIE, -) -from .vh1 import VH1IE -from .vice import ( - ViceIE, - ViceArticleIE, - ViceShowIE, -) -from .vidbit import VidbitIE -from .viddler import ViddlerIE -from .videa import VideaIE -from .videocampus_sachsen import ( - VideocampusSachsenIE, - VideocampusSachsenEmbedIE, -) -from .videodetective import VideoDetectiveIE -from .videofyme import VideofyMeIE -from .videomore import ( - VideomoreIE, - VideomoreVideoIE, - VideomoreSeasonIE, -) -from .videopress import VideoPressIE -from .vidio import ( - VidioIE, - VidioPremierIE, - VidioLiveIE -) -from .vidlii import VidLiiIE -from .vier import VierIE, VierVideosIE -from .viewlift import ( - ViewLiftIE, - ViewLiftEmbedIE, -) -from .viidea import ViideaIE -from .vimeo import ( - VimeoIE, - VimeoAlbumIE, - 
VimeoChannelIE, - VimeoGroupsIE, - VimeoLikesIE, - VimeoOndemandIE, - VimeoReviewIE, - VimeoUserIE, - VimeoWatchLaterIE, - VHXEmbedIE, -) -from .vimm import ( - VimmIE, - VimmRecordingIE, -) -from .vimple import VimpleIE -from .vine import ( - VineIE, - VineUserIE, -) -from .viki import ( - VikiIE, - VikiChannelIE, -) -from .viqeo import ViqeoIE -from .viu import ( - ViuIE, - ViuPlaylistIE, - ViuOTTIE, -) -from .vk import ( - VKIE, - VKUserVideosIE, - VKWallPostIE, -) -from .vlive import ( - VLiveIE, - VLivePostIE, - VLiveChannelIE, -) -from .vodlocker import VodlockerIE -from .vodpl import VODPlIE -from .vodplatform import VODPlatformIE -from .voicerepublic import VoiceRepublicIE -from .voicy import ( - VoicyIE, - VoicyChannelIE, -) -from .voot import ( - VootIE, - VootSeriesIE, -) -from .voxmedia import ( - VoxMediaVolumeIE, - VoxMediaIE, -) -from .vrt import VRTIE -from .vrak import VrakIE -from .vrv import ( - VRVIE, - VRVSeriesIE, -) -from .vshare import VShareIE -from .vtm import VTMIE -from .medialaan import MedialaanIE -from .vuclip import VuClipIE -from .vupload import VuploadIE -from .vvvvid import ( - VVVVIDIE, - VVVVIDShowIE, -) -from .vyborymos import VyboryMosIE -from .vzaar import VzaarIE -from .wakanim import WakanimIE -from .walla import WallaIE -from .washingtonpost import ( - WashingtonPostIE, - WashingtonPostArticleIE, -) -from .wasdtv import ( - WASDTVStreamIE, - WASDTVRecordIE, - WASDTVClipIE, -) -from .wat import WatIE -from .watchbox import WatchBoxIE -from .watchindianporn import WatchIndianPornIE -from .wdr import ( - WDRIE, - WDRPageIE, - WDRElefantIE, - WDRMobileIE, -) -from .webcaster import ( - WebcasterIE, - WebcasterFeedIE, -) -from .webofstories import ( - WebOfStoriesIE, - WebOfStoriesPlaylistIE, -) -from .weibo import ( - WeiboIE, - WeiboMobileIE -) -from .weiqitv import WeiqiTVIE -from .willow import WillowIE -from .wimtv import WimTVIE -from .whowatch import WhoWatchIE -from .wistia import ( - WistiaIE, - WistiaPlaylistIE, -) -from .worldstarhiphop import WorldStarHipHopIE -from .wppilot import ( - WPPilotIE, - WPPilotChannelsIE, -) -from .wsj import ( - WSJIE, - WSJArticleIE, -) -from .wwe import WWEIE -from .xbef import XBefIE -from .xboxclips import XboxClipsIE -from .xfileshare import XFileShareIE -from .xhamster import ( - XHamsterIE, - XHamsterEmbedIE, - XHamsterUserIE, -) -from .xiami import ( - XiamiSongIE, - XiamiAlbumIE, - XiamiArtistIE, - XiamiCollectionIE -) -from .ximalaya import ( - XimalayaIE, - XimalayaAlbumIE -) -from .xinpianchang import XinpianchangIE -from .xminus import XMinusIE -from .xnxx import XNXXIE -from .xstream import XstreamIE -from .xtube import XTubeUserIE, XTubeIE -from .xuite import XuiteIE -from .xvideos import XVideosIE -from .xxxymovies import XXXYMoviesIE -from .yahoo import ( - YahooIE, - YahooSearchIE, - YahooGyaOPlayerIE, - YahooGyaOIE, - YahooJapanNewsIE, -) -from .yandexdisk import YandexDiskIE -from .yandexmusic import ( - YandexMusicTrackIE, - YandexMusicAlbumIE, - YandexMusicPlaylistIE, - YandexMusicArtistTracksIE, - YandexMusicArtistAlbumsIE, -) -from .yandexvideo import ( - YandexVideoIE, - YandexVideoPreviewIE, - ZenYandexIE, - ZenYandexChannelIE, -) -from .yapfiles import YapFilesIE -from .yesjapan import YesJapanIE -from .yinyuetai import YinYueTaiIE -from .ynet import YnetIE -from .youjizz import YouJizzIE -from .youku import ( - YoukuIE, - YoukuShowIE, -) -from .younow import ( - YouNowLiveIE, - YouNowChannelIE, - YouNowMomentIE, -) -from .youporn import YouPornIE -from .yourporn import YourPornIE 
-from .yourupload import YourUploadIE
-from .youtube import (
-    YoutubeIE,
-    YoutubeClipIE,
-    YoutubeFavouritesIE,
-    YoutubeHistoryIE,
-    YoutubeTabIE,
-    YoutubeLivestreamEmbedIE,
-    YoutubePlaylistIE,
-    YoutubeRecommendedIE,
-    YoutubeSearchDateIE,
-    YoutubeSearchIE,
-    YoutubeSearchURLIE,
-    YoutubeMusicSearchURLIE,
-    YoutubeSubscriptionsIE,
-    YoutubeTruncatedIDIE,
-    YoutubeTruncatedURLIE,
-    YoutubeYtBeIE,
-    YoutubeYtUserIE,
-    YoutubeWatchLaterIE,
-)
-from .zapiks import ZapiksIE
-from .zattoo import (
-    BBVTVIE,
-    EinsUndEinsTVIE,
-    EWETVIE,
-    GlattvisionTVIE,
-    MNetTVIE,
-    MyVisionTVIE,
-    NetPlusIE,
-    OsnatelTVIE,
-    QuantumTVIE,
-    QuicklineIE,
-    QuicklineLiveIE,
-    SaltTVIE,
-    SAKTVIE,
-    VTXTVIE,
-    WalyTVIE,
-    ZattooIE,
-    ZattooLiveIE,
-)
-from .zdf import ZDFIE, ZDFChannelIE
-from .zee5 import (
-    Zee5IE,
-    Zee5SeriesIE,
-)
-from .zhihu import ZhihuIE
-from .zingmp3 import (
-    ZingMp3IE,
-    ZingMp3AlbumIE,
-)
-from .zoom import ZoomIE
-from .zype import ZypeIE
+if not _LAZY_LOADER:
+    from ._extractors import *  # noqa: F403
+    _ALL_CLASSES = [  # noqa: F811
+        klass
+        for name, klass in globals().items()
+        if name.endswith('IE') and name != 'GenericIE'
+    ]
+    _ALL_CLASSES.append(GenericIE)  # noqa: F405
+
+globals().update(_PLUGIN_CLASSES)
+_ALL_CLASSES[:0] = _PLUGIN_CLASSES.values()
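This hunk is the payoff for the thousands of deleted import lines above: the extractor names now live in _extractors.py and are only imported eagerly when the generated lazy loader is unavailable. The fallback registers every module-level name ending in 'IE' except GenericIE, which is appended last so the catch-all extractor always loses ties, and plugin classes are spliced in at the front so they take priority over the bundled ones. For context, a rough sketch of what the lazy side of this switch typically looks like; the real module is generated at build time, so the names below (LazyLoadExtractor, _module) are illustrative assumptions, not the generated code itself:

    import importlib

    class LazyLoadExtractor:
        _module = None  # e.g. 'hypervideo_dl.extractor.zoom', filled in per stub

        @classmethod
        def _get_real_class(cls):
            # the site module is imported only when the stub is first used
            mod = importlib.import_module(cls._module)
            return getattr(mod, cls.__name__)

        def __new__(cls, *args, **kwargs):
            # instantiating a stub transparently builds the real extractor
            real_cls = cls._get_real_class()
            instance = real_cls.__new__(real_cls)
            instance.__init__(*args, **kwargs)
            return instance

Until a URL actually matches a stub, only the class names and their URL patterns are loaded, which keeps startup cost independent of the size of the extractor tree.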
diff --git a/hypervideo_dl/extractor/extremetube.py b/hypervideo_dl/extractor/extremetube.py
index acd4090..2c19698 100644
--- a/hypervideo_dl/extractor/extremetube.py
+++ b/hypervideo_dl/extractor/extremetube.py
@@ -1,10 +1,8 @@
-from __future__ import unicode_literals
-
 from ..utils import str_to_int
 from .keezmovies import KeezMoviesIE
 
 
-class ExtremeTubeIE(KeezMoviesIE):
+class ExtremeTubeIE(KeezMoviesIE):  # XXX: Do not subclass from concrete IE
     _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P<id>[^/#?&]+)'
     _TESTS = [{
         'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431',
diff --git a/hypervideo_dl/extractor/eyedotv.py b/hypervideo_dl/extractor/eyedotv.py
index f62ddeb..d8b068e 100644
--- a/hypervideo_dl/extractor/eyedotv.py
+++ b/hypervideo_dl/extractor/eyedotv.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from .common import InfoExtractor
 from ..utils import (
     xpath_text,
diff --git a/hypervideo_dl/extractor/facebook.py b/hypervideo_dl/extractor/facebook.py
index 022ea85..a58d9c8 100644
--- a/hypervideo_dl/extractor/facebook.py
+++ b/hypervideo_dl/extractor/facebook.py
@@ -1,21 +1,18 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import json
 import re
+import urllib.parse
 
 from .common import InfoExtractor
 from ..compat import (
     compat_etree_fromstring,
     compat_str,
     compat_urllib_parse_unquote,
-    compat_urllib_parse_unquote_plus,
 )
 from ..utils import (
+    ExtractorError,
     clean_html,
     determine_ext,
     error_to_compat_str,
-    ExtractorError,
     float_or_none,
     get_element_by_id,
     get_first,
@@ -60,6 +57,13 @@ class FacebookIE(InfoExtractor):
                 )
                 (?P<id>[0-9]+)
                 '''
+    _EMBED_REGEX = [
+        r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1',
+        # Facebook API embed https://developers.facebook.com/docs/plugins/embedded-video-player
+        r'''(?x)<div[^>]+
+                class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
+                data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''',
+    ]
     _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
     _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
     _NETRC_MACHINE = 'facebook'
@@ -314,21 +318,6 @@ class FacebookIE(InfoExtractor):
             'graphURI': '/api/graphql/'
         }
 
-    @staticmethod
-    def _extract_urls(webpage):
-        urls = []
-        for mobj in re.finditer(
-                r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1',
-                webpage):
-            urls.append(mobj.group('url'))
-        # Facebook API embed
-        # see https://developers.facebook.com/docs/plugins/embedded-video-player
-        for mobj in re.finditer(r'''(?x)<div[^>]+
-                class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
-                data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage):
-            urls.append(mobj.group('url'))
-        return urls
-
     def _perform_login(self, username, password):
         login_page_req = sanitized_Request(self._LOGIN_URL)
         self._set_cookie('facebook.com', 'locale', 'en_US')
@@ -397,10 +386,8 @@ class FacebookIE(InfoExtractor):
             r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)]
         post = traverse_obj(post_data, (
             ..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
-        media = traverse_obj(
-            post,
-            (..., 'attachments', ..., 'media', lambda _, m: str(m['id']) == video_id and m['__typename'] == 'Video'),
-            expected_type=dict)
+        media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
+            k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
         title = get_first(media, ('title', 'text'))
         description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
         uploader_data = get_first(media, 'owner') or get_first(post, ('node', 'actors', ...)) or {}
@@ -472,15 +459,14 @@ class FacebookIE(InfoExtractor):
             dash_manifest = video.get('dash_manifest')
             if dash_manifest:
                 formats.extend(self._parse_mpd_formats(
-                    compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest))))
+                    compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest))))
 
-        def process_formats(formats):
+        def process_formats(info):
             # Downloads with browser's User-Agent are rate limited. Working around
             # with non-browser User-Agent.
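The per-extractor _extract_urls staticmethod deleted here gives way to the declarative _EMBED_REGEX class attribute added near the top of this diff; the same conversion shows up again in foxnews.py further down. The point is that the base class can now discover embeds generically instead of every extractor reimplementing the scan. A simplified sketch of the consuming side, assuming the upstream helper does little more than scan each pattern for its named 'url' group (the real one also resolves relative URLs and unescapes HTML entities):

    import re

    class InfoExtractor:
        _EMBED_REGEX = []  # per-extractor patterns, each with a (?P<url>...) group

        @classmethod
        def _extract_embed_urls(cls, url, webpage):
            # yield every embed URL that any declared pattern matches
            for pattern in cls._EMBED_REGEX:
                for mobj in re.finditer(pattern, webpage):
                    yield mobj.group('url')

Call sites change accordingly: francetv.py below now uses tuple(DailymotionIE._extract_embed_urls(url, webpage)) where it previously called DailymotionIE._extract_urls(webpage).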
-            for f in formats:
+            for f in info['formats']:
                 f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
-
-            self._sort_formats(formats, ('res', 'quality'))
+            info['_format_sort_fields'] = ('res', 'quality')
 
         def extract_relay_data(_filter):
             return self._parse_json(self._search_regex(
@@ -523,16 +509,17 @@ class FacebookIE(InfoExtractor):
                     'url': playable_url,
                 })
             extract_dash_manifest(video, formats)
-            process_formats(formats)
             v_id = video.get('videoId') or video.get('id') or video_id
             info = {
                 'id': v_id,
                 'formats': formats,
-                'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']),
+                'thumbnail': traverse_obj(
+                    video, ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri')),
                 'uploader_id': try_get(video, lambda x: x['owner']['id']),
                 'timestamp': int_or_none(video.get('publish_time')),
                 'duration': float_or_none(video.get('playable_duration_in_ms'), 1000),
             }
+            process_formats(info)
             description = try_get(video, lambda x: x['savable_description']['text'])
             title = video.get('name')
             if title:
@@ -699,13 +686,12 @@ class FacebookIE(InfoExtractor):
             if subtitles_src:
                 subtitles.setdefault('en', []).append({'url': subtitles_src})
 
-        process_formats(formats)
-
         info_dict = {
             'id': video_id,
             'formats': formats,
             'subtitles': subtitles,
         }
+        process_formats(info_dict)
         info_dict.update(extract_metadata(webpage))
 
         return info_dict
@@ -784,3 +770,30 @@ class FacebookRedirectURLIE(InfoExtractor):
         if not redirect_url:
             raise ExtractorError('Invalid facebook redirect URL', expected=True)
         return self.url_result(redirect_url)
+
+
+class FacebookReelIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/reel/(?P<id>\d+)'
+    IE_NAME = 'facebook:reel'
+
+    _TESTS = [{
+        'url': 'https://www.facebook.com/reel/1195289147628387',
+        'md5': 'c4ff9a7182ff9ff7d6f7a83603bae831',
+        'info_dict': {
+            'id': '1195289147628387',
+            'ext': 'mp4',
+            'title': 'md5:9f5b142921b2dc57004fa13f76005f87',
+            'description': 'md5:24ea7ef062215d295bdde64e778f5474',
+            'uploader': 'Beast Camp Training',
+            'uploader_id': '1738535909799870',
+            'duration': 9.536,
+            'thumbnail': r're:^https?://.*',
+            'upload_date': '20211121',
+            'timestamp': 1637502604,
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self.url_result(
+            f'https://m.facebook.com/watch/?v={video_id}&_rdr', FacebookIE, video_id)
diff --git a/hypervideo_dl/extractor/fancode.py b/hypervideo_dl/extractor/fancode.py
index 7ea16c6..1b5db81 100644
--- a/hypervideo_dl/extractor/fancode.py
+++ b/hypervideo_dl/extractor/fancode.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from .common import InfoExtractor
 from ..compat import compat_str
 
@@ -128,7 +125,7 @@ class FancodeVodIE(InfoExtractor):
     }
 
 
-class FancodeLiveIE(FancodeVodIE):
+class FancodeLiveIE(FancodeVodIE):  # XXX: Do not subclass from concrete IE
     IE_NAME = 'fancode:live'
     _VALID_URL = r'https?://(www\.)?fancode\.com/match/(?P<id>[0-9]+).+'
diff --git a/hypervideo_dl/extractor/faz.py b/hypervideo_dl/extractor/faz.py
index 312ee2a..bca62ad 100644
--- a/hypervideo_dl/extractor/faz.py
+++ b/hypervideo_dl/extractor/faz.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import re
 
 from .common import InfoExtractor
@@ -81,7 +78,6 @@ class FazIE(InfoExtractor):
                     'tbr': tbr or int(mobj.group(3)),
                 })
             formats.append(f)
-        self._sort_formats(formats)
 
         return {
             'id': video_id,
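The dropped self._sort_formats(...) calls in this span are the first of dozens in this patch (faz.py just above, then fczenit, filmmodu, filmon, firsttv, flickr, folketinget, fourtube, fox, foxgay, fptplay, freesound and others follow the same pattern): format sorting has moved out of the extractors and into the core. An extractor now returns formats exactly as collected and, only when the default ordering would be wrong for the site, records a hint in the info dict, as the facebook.py change above does:

    # before: each extractor sorted its own format list
    self._sort_formats(formats, ('res', 'quality'))

    # after: leave the list unsorted and declare the preferred sort
    # fields in the info dict; the core applies the actual sorting
    info['_format_sort_fields'] = ('res', 'quality')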
diff --git a/hypervideo_dl/extractor/fc2.py b/hypervideo_dl/extractor/fc2.py
index 54a83aa..dd5e088 100644
--- a/hypervideo_dl/extractor/fc2.py
+++ b/hypervideo_dl/extractor/fc2.py
@@ -1,19 +1,13 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import re
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_parse_qs,
-)
+from ..compat import compat_parse_qs
+from ..dependencies import websockets
 from ..utils import (
     ExtractorError,
     WebSocketsWrapper,
-    has_websockets,
     js_to_json,
     sanitized_Request,
-    std_headers,
     traverse_obj,
     update_url_query,
     urlencode_postdata,
@@ -84,7 +78,7 @@ class FC2IE(InfoExtractor):
         webpage = None
         if not url.startswith('fc2:'):
             webpage = self._download_webpage(url, video_id)
-            self._downloader.cookiejar.clear_session_cookies()  # must clear
+            self.cookiejar.clear_session_cookies()  # must clear
         self._login()
 
         title, thumbnail, description = None, None, None
@@ -173,7 +167,7 @@ class FC2LiveIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        if not has_websockets:
+        if not websockets:
             raise ExtractorError('websockets library is not available. Please install it.', expected=True)
         video_id = self._match_id(url)
         webpage = self._download_webpage('https://live.fc2.com/%s/' % video_id, video_id)
@@ -210,10 +204,10 @@ class FC2LiveIE(InfoExtractor):
             'Cookie': str(self._get_cookies('https://live.fc2.com/'))[12:],
             'Origin': 'https://live.fc2.com',
             'Accept': '*/*',
-            'User-Agent': std_headers['User-Agent'],
+            'User-Agent': self.get_param('http_headers')['User-Agent'],
         })
 
-        self.write_debug('[debug] Sending HLS server request')
+        self.write_debug('Sending HLS server request')
 
         while True:
             recv = ws.recv()
@@ -235,13 +229,10 @@ class FC2LiveIE(InfoExtractor):
             if not data or not isinstance(data, dict):
                 continue
             if data.get('name') == '_response_' and data.get('id') == 1:
-                self.write_debug('[debug] Goodbye.')
+                self.write_debug('Goodbye')
                 playlist_data = data
                 break
-            elif self._downloader.params.get('verbose', False):
-                if len(recv) > 100:
-                    recv = recv[:100] + '...'
-                self.to_screen('[debug] Server said: %s' % recv)
+            self.write_debug('Server said: %s%s' % (recv[:100], '...'
if len(recv) > 100 else '')) if not playlist_data: raise ExtractorError('Unable to fetch HLS playlist info via WebSocket') @@ -259,7 +250,6 @@ class FC2LiveIE(InfoExtractor): 'Referer': url, })) - self._sort_formats(formats) for fmt in formats: fmt.update({ 'protocol': 'fc2_live', diff --git a/hypervideo_dl/extractor/fczenit.py b/hypervideo_dl/extractor/fczenit.py index 8db7c59..8175b6b 100644 --- a/hypervideo_dl/extractor/fczenit.py +++ b/hypervideo_dl/extractor/fczenit.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -41,8 +38,6 @@ class FczenitIE(InfoExtractor): 'height': int_or_none(q.get('label')), } for q in msi_data['qualities'] if q.get('url')] - self._sort_formats(formats) - tags = [tag['label'] for tag in msi_data.get('tags', []) if tag.get('label')] return { diff --git a/hypervideo_dl/extractor/fifa.py b/hypervideo_dl/extractor/fifa.py new file mode 100644 index 0000000..dc00edc --- /dev/null +++ b/hypervideo_dl/extractor/fifa.py @@ -0,0 +1,94 @@ +from .common import InfoExtractor + +from ..utils import ( + int_or_none, + traverse_obj, + unified_timestamp, +) + + +class FifaIE(InfoExtractor): + _VALID_URL = r'https?://www.fifa.com/fifaplus/(?P<locale>\w{2})/watch/([^#?]+/)?(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://www.fifa.com/fifaplus/en/watch/7on10qPcnyLajDDU3ntg6y', + 'info_dict': { + 'id': '7on10qPcnyLajDDU3ntg6y', + 'title': 'Italy v France | Final | 2006 FIFA World Cup Germany™ | Full Match Replay', + 'description': 'md5:f4520d0ee80529c8ba4134a7d692ff8b', + 'ext': 'mp4', + 'categories': ['FIFA Tournaments'], + 'thumbnail': 'https://digitalhub.fifa.com/transform/fa6f0b3e-a2e9-4cf7-9f32-53c57bcb7360/2006_Final_ITA_FRA', + 'duration': 8165, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.fifa.com/fifaplus/pt/watch/1cg5r5Qt6Qt12ilkDgb1sV', + 'info_dict': { + 'id': '1cg5r5Qt6Qt12ilkDgb1sV', + 'title': 'Brazil v Germany | Semi-finals | 2014 FIFA World Cup Brazil™ | Extended Highlights', + 'description': 'md5:d908c74ee66322b804ae2e521b02a855', + 'ext': 'mp4', + 'categories': ['FIFA Tournaments', 'Highlights'], + 'thumbnail': 'https://digitalhub.fifa.com/transform/d8fe6f61-276d-4a73-a7fe-6878a35fd082/FIFAPLS_100EXTHL_2014BRAvGER_TMB', + 'duration': 902, + 'release_timestamp': 1404777600, + 'release_date': '20140708', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.fifa.com/fifaplus/fr/watch/3C6gQH9C2DLwzNx7BMRQdp', + 'info_dict': { + 'id': '3C6gQH9C2DLwzNx7BMRQdp', + 'title': 'Josimar goal against Northern Ireland | Classic Goals', + 'description': 'md5:cbe7e7bb52f603c9f1fe9a4780fe983b', + 'ext': 'mp4', + 'categories': ['FIFA Tournaments', 'Goal'], + 'duration': 28, + 'thumbnail': 'https://digitalhub.fifa.com/transform/f9301391-f8d9-48b5-823e-c093ac5e3e11/CG_MEN_1986_JOSIMAR', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id, locale = self._match_valid_url(url).group('id', 'locale') + webpage = self._download_webpage(url, video_id) + + preconnect_link = self._search_regex( + r'<link[^>]+rel\s*=\s*"preconnect"[^>]+href\s*=\s*"([^"]+)"', webpage, 'Preconnect Link') + + video_details = self._download_json( + f'{preconnect_link}/sections/videoDetails/{video_id}', video_id, 'Downloading Video Details', fatal=False) + + preplay_parameters = self._download_json( + f'{preconnect_link}/videoPlayerData/{video_id}', video_id, 'Downloading Preplay Parameters')['preplayParameters'] + + cid = 
preplay_parameters['contentId'] + content_data = self._download_json( + f'https://content.uplynk.com/preplay/{cid}/multiple.json', video_id, 'Downloading Content Data', query={ + 'v': preplay_parameters['preplayAPIVersion'], + 'tc': preplay_parameters['tokenCheckAlgorithmVersion'], + 'rn': preplay_parameters['randomNumber'], + 'exp': preplay_parameters['tokenExpirationDate'], + 'ct': preplay_parameters['contentType'], + 'cid': cid, + 'mbtracks': preplay_parameters['tracksAssetNumber'], + 'ad': preplay_parameters['adConfiguration'], + 'ad.preroll': int(preplay_parameters['adPreroll']), + 'ad.cmsid': preplay_parameters['adCMSSourceId'], + 'ad.vid': preplay_parameters['adSourceVideoID'], + 'sig': preplay_parameters['signature'], + }) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(content_data['playURL'], video_id) + + return { + 'id': video_id, + 'title': video_details.get('title'), + 'description': video_details.get('description'), + 'duration': int_or_none(video_details.get('duration')), + 'release_timestamp': unified_timestamp(video_details.get('dateOfRelease')), + 'categories': traverse_obj(video_details, (('videoCategory', 'videoSubcategory'),)), + 'thumbnail': traverse_obj(video_details, ('backgroundImage', 'src')), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/filmmodu.py b/hypervideo_dl/extractor/filmmodu.py index 2746876..9eb550e 100644 --- a/hypervideo_dl/extractor/filmmodu.py +++ b/hypervideo_dl/extractor/filmmodu.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import int_or_none @@ -54,8 +51,6 @@ class FilmmoduIE(InfoExtractor): 'protocol': 'm3u8_native', } for source in data['sources']] - self._sort_formats(formats) - subtitles = {} if data.get('subtitle'): diff --git a/hypervideo_dl/extractor/filmon.py b/hypervideo_dl/extractor/filmon.py index 7b43ecc..9a93cb9 100644 --- a/hypervideo_dl/extractor/filmon.py +++ b/hypervideo_dl/extractor/filmon.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_str, @@ -68,7 +65,6 @@ class FilmOnIE(InfoExtractor): 'quality': QUALITY(stream.get('quality')), 'protocol': 'm3u8_native', }) - self._sort_formats(formats) thumbnails = [] poster = response.get('poster', {}) @@ -156,7 +152,6 @@ class FilmOnChannelIE(InfoExtractor): 'ext': 'mp4', 'quality': QUALITY(quality), }) - self._sort_formats(formats) thumbnails = [] for name, width, height in self._THUMBNAIL_RES: diff --git a/hypervideo_dl/extractor/filmweb.py b/hypervideo_dl/extractor/filmweb.py index 5e323b4..cfea1f2 100644 --- a/hypervideo_dl/extractor/filmweb.py +++ b/hypervideo_dl/extractor/filmweb.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/firsttv.py b/hypervideo_dl/extractor/firsttv.py index ccad173..f74bd13 100644 --- a/hypervideo_dl/extractor/firsttv.py +++ b/hypervideo_dl/extractor/firsttv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_str, @@ -126,7 +123,6 @@ class FirstTVIE(InfoExtractor): % (path, m3u8_path), display_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) thumbnail = item.get('poster') or self._og_search_thumbnail(webpage) duration = int_or_none(item.get('duration') or self._html_search_meta( diff --git 
a/hypervideo_dl/extractor/fivemin.py b/hypervideo_dl/extractor/fivemin.py deleted file mode 100644 index f3f876e..0000000 --- a/hypervideo_dl/extractor/fivemin.py +++ /dev/null @@ -1,54 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class FiveMinIE(InfoExtractor): - IE_NAME = '5min' - _VALID_URL = r'(?:5min:|https?://(?:[^/]*?5min\.com/|delivery\.vidible\.tv/aol)(?:(?:Scripts/PlayerSeed\.js|playerseed/?)?\?.*?playList=)?)(?P<id>\d+)' - - _TESTS = [ - { - # From http://www.engadget.com/2013/11/15/ipad-mini-retina-display-review/ - 'url': 'http://pshared.5min.com/Scripts/PlayerSeed.js?sid=281&width=560&height=345&playList=518013791', - 'md5': '4f7b0b79bf1a470e5004f7112385941d', - 'info_dict': { - 'id': '518013791', - 'ext': 'mp4', - 'title': 'iPad Mini with Retina Display Review', - 'description': 'iPad mini with Retina Display review', - 'duration': 177, - 'uploader': 'engadget', - 'upload_date': '20131115', - 'timestamp': 1384515288, - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, - { - # From http://on.aol.com/video/how-to-make-a-next-level-fruit-salad-518086247 - 'url': '5min:518086247', - 'md5': 'e539a9dd682c288ef5a498898009f69e', - 'info_dict': { - 'id': '518086247', - 'ext': 'mp4', - 'title': 'How to Make a Next-Level Fruit Salad', - 'duration': 184, - }, - 'skip': 'no longer available', - }, - { - 'url': 'http://embed.5min.com/518726732/', - 'only_matching': True, - }, - { - 'url': 'http://delivery.vidible.tv/aol?playList=518013791', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result('aol-video:%s' % video_id) diff --git a/hypervideo_dl/extractor/fivetv.py b/hypervideo_dl/extractor/fivetv.py index d6bebd1..1f48cfd 100644 --- a/hypervideo_dl/extractor/fivetv.py +++ b/hypervideo_dl/extractor/fivetv.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import int_or_none @@ -75,7 +71,7 @@ class FiveTVIE(InfoExtractor): r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'], webpage, 'video url') - title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage) + title = self._generic_title('', webpage) duration = int_or_none(self._og_search_property( 'video:duration', webpage, 'duration', default=None)) diff --git a/hypervideo_dl/extractor/flickr.py b/hypervideo_dl/extractor/flickr.py index 2ed6c2b..89a40d7 100644 --- a/hypervideo_dl/extractor/flickr.py +++ b/hypervideo_dl/extractor/flickr.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_str, @@ -91,12 +89,11 @@ class FlickrIE(InfoExtractor): 'url': stream['_content'], 'quality': preference(stream_type), }) - self._sort_formats(formats) owner = video_info.get('owner', {}) uploader_id = owner.get('nsid') uploader_path = owner.get('path_alias') or uploader_id - uploader_url = format_field(uploader_path, template='https://www.flickr.com/photos/%s/') + uploader_url = format_field(uploader_path, None, 'https://www.flickr.com/photos/%s/') return { 'id': video_id, diff --git a/hypervideo_dl/extractor/folketinget.py b/hypervideo_dl/extractor/folketinget.py index b3df93f..55a11e5 100644 --- a/hypervideo_dl/extractor/folketinget.py +++ b/hypervideo_dl/extractor/folketinget.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_parse_qs from ..utils 
import ( @@ -62,7 +59,6 @@ class FolketingetIE(InfoExtractor): 'url': xpath_text(n, './url', fatal=True), 'tbr': int_or_none(n.attrib['bitrate']), } for n in doc.findall('.//streams/stream')] - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/footyroom.py b/hypervideo_dl/extractor/footyroom.py index 118325b..4a1316b 100644 --- a/hypervideo_dl/extractor/footyroom.py +++ b/hypervideo_dl/extractor/footyroom.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from .streamable import StreamableIE diff --git a/hypervideo_dl/extractor/formula1.py b/hypervideo_dl/extractor/formula1.py index 67662e6..0a8ef85 100644 --- a/hypervideo_dl/extractor/formula1.py +++ b/hypervideo_dl/extractor/formula1.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/fourtube.py b/hypervideo_dl/extractor/fourtube.py index d4d955b..b6368b8 100644 --- a/hypervideo_dl/extractor/fourtube.py +++ b/hypervideo_dl/extractor/fourtube.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -37,7 +35,6 @@ class FourTubeBaseIE(InfoExtractor): 'resolution': format + 'p', 'quality': int(format), } for format in sources] - self._sort_formats(formats) return formats def _real_extract(self, url): diff --git a/hypervideo_dl/extractor/fourzerostudio.py b/hypervideo_dl/extractor/fourzerostudio.py new file mode 100644 index 0000000..c388a3a --- /dev/null +++ b/hypervideo_dl/extractor/fourzerostudio.py @@ -0,0 +1,106 @@ +from .common import InfoExtractor +from ..utils import traverse_obj, unified_timestamp + + +class FourZeroStudioArchiveIE(InfoExtractor): + _VALID_URL = r'https?://0000\.studio/(?P<uploader_id>[^/]+)/broadcasts/(?P<id>[^/]+)/archive' + IE_NAME = '0000studio:archive' + _TESTS = [{ + 'url': 'https://0000.studio/mumeijiten/broadcasts/1290f433-fce0-4909-a24a-5f7df09665dc/archive', + 'info_dict': { + 'id': '1290f433-fce0-4909-a24a-5f7df09665dc', + 'title': 'noteで『canape』様へのファンレターを執筆します。(数秘術その2)', + 'timestamp': 1653802534, + 'release_timestamp': 1653796604, + 'thumbnails': 'count:1', + 'comments': 'count:7', + 'uploader': '『中崎雄心』の執務室。', + 'uploader_id': 'mumeijiten', + } + }] + + def _real_extract(self, url): + video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') + webpage = self._download_webpage(url, video_id) + nuxt_data = self._search_nuxt_data(webpage, video_id, traverse=None) + + pcb = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorBroadcast'), get_all=False) + uploader_internal_id = traverse_obj(nuxt_data, ( + 'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'id'), get_all=False) + + formats, subs = self._extract_m3u8_formats_and_subtitles(pcb['archiveUrl'], video_id, ext='mp4') + + return { + 'id': video_id, + 'title': pcb.get('title'), + 'age_limit': 18 if pcb.get('isAdult') else None, + 'timestamp': unified_timestamp(pcb.get('finishTime')), + 'release_timestamp': unified_timestamp(pcb.get('createdAt')), + 'thumbnails': [{ + 'url': pcb['thumbnailUrl'], + 'ext': 'png', + }] if pcb.get('thumbnailUrl') else None, + 'formats': formats, + 'subtitles': subs, + 'comments': [{ + 'author': c.get('username'), + 'author_id': c.get('postedUserId'), + 'author_thumbnail': c.get('userThumbnailUrl'), + 'id': c.get('id'), + 'text': c.get('body'), + 'timestamp': unified_timestamp(c.get('createdAt')), + 'like_count': 
c.get('likeCount'), + 'is_favorited': c.get('isLikedByOwner'), + 'author_is_uploader': c.get('postedUserId') == uploader_internal_id, + } for c in traverse_obj(nuxt_data, ( + 'ssrRefs', ..., lambda _, v: v['__typename'] == 'PublicCreatorBroadcastComment')) or []], + 'uploader_id': uploader_id, + 'uploader': traverse_obj(nuxt_data, ( + 'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'username'), get_all=False), + } + + +class FourZeroStudioClipIE(InfoExtractor): + _VALID_URL = r'https?://0000\.studio/(?P<uploader_id>[^/]+)/archive-clip/(?P<id>[^/]+)' + IE_NAME = '0000studio:clip' + _TESTS = [{ + 'url': 'https://0000.studio/soeji/archive-clip/e46b0278-24cd-40a8-92e1-b8fc2b21f34f', + 'info_dict': { + 'id': 'e46b0278-24cd-40a8-92e1-b8fc2b21f34f', + 'title': 'わたベーさんからイラスト差し入れいただきました。ありがとうございました!', + 'timestamp': 1652109105, + 'like_count': 1, + 'uploader': 'ソエジマケイタ', + 'uploader_id': 'soeji', + } + }] + + def _real_extract(self, url): + video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') + webpage = self._download_webpage(url, video_id) + nuxt_data = self._search_nuxt_data(webpage, video_id, traverse=None) + + clip_info = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorArchivedClip'), get_all=False) + + info = next(( + m for m in self._parse_html5_media_entries(url, webpage, video_id) + if 'mp4' in traverse_obj(m, ('formats', ..., 'ext')) + ), None) + if not info: + self.report_warning('Failed to find a desired media element. Falling back to using NUXT data.') + info = { + 'formats': [{ + 'ext': 'mp4', + 'url': url, + } for url in clip_info.get('mediaFiles') or [] if url], + } + return { + **info, + 'id': video_id, + 'title': clip_info.get('clipComment'), + 'timestamp': unified_timestamp(clip_info.get('createdAt')), + 'like_count': clip_info.get('likeCount'), + 'uploader_id': uploader_id, + 'uploader': traverse_obj(nuxt_data, ( + 'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'username'), get_all=False), + } diff --git a/hypervideo_dl/extractor/fox.py b/hypervideo_dl/extractor/fox.py index 4c52b9a..15c0c48 100644 --- a/hypervideo_dl/extractor/fox.py +++ b/hypervideo_dl/extractor/fox.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import uuid @@ -15,8 +12,10 @@ from ..utils import ( int_or_none, parse_age_limit, parse_duration, + traverse_obj, try_get, unified_timestamp, + url_or_none, ) @@ -37,7 +36,8 @@ class FOXIE(InfoExtractor): 'creator': 'FOX', 'series': 'Gotham', 'age_limit': 14, - 'episode': 'Aftermath: Bruce Wayne Develops Into The Dark Knight' + 'episode': 'Aftermath: Bruce Wayne Develops Into The Dark Knight', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { 'skip_download': True, @@ -132,7 +132,6 @@ class FOXIE(InfoExtractor): formats = self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') - self._sort_formats(formats) data = try_get( video, lambda x: x['trackingData']['properties'], dict) or {} @@ -168,6 +167,7 @@ class FOXIE(InfoExtractor): 'season_number': int_or_none(video.get('seasonNumber')), 'episode': video.get('name'), 'episode_number': int_or_none(video.get('episodeNumber')), + 'thumbnail': traverse_obj(video, ('images', 'still', 'raw'), expected_type=url_or_none), 'release_year': int_or_none(video.get('releaseYear')), 'subtitles': subtitles, } diff --git a/hypervideo_dl/extractor/fox9.py b/hypervideo_dl/extractor/fox9.py index 91f8f7b..dfbafa7 100644 --- a/hypervideo_dl/extractor/fox9.py +++ 
b/hypervideo_dl/extractor/fox9.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/foxgay.py b/hypervideo_dl/extractor/foxgay.py index 1c53e06..f4f29c6 100644 --- a/hypervideo_dl/extractor/foxgay.py +++ b/hypervideo_dl/extractor/foxgay.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import itertools from .common import InfoExtractor @@ -33,7 +31,7 @@ class FoxgayIE(InfoExtractor): description = get_element_by_id('inf_tit', webpage) # The default user-agent with foxgay cookies leads to pages without videos - self._downloader.cookiejar.clear('.foxgay.com') + self.cookiejar.clear('.foxgay.com') # Find the URL for the iFrame which contains the actual video. iframe_url = self._html_search_regex( r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1', webpage, @@ -50,8 +48,6 @@ class FoxgayIE(InfoExtractor): } for source, resolution in zip( video_data['sources'], video_data.get('resolutions', itertools.repeat(None)))] - self._sort_formats(formats) - return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/foxnews.py b/hypervideo_dl/extractor/foxnews.py index 18fa0a5..52172aa 100644 --- a/hypervideo_dl/extractor/foxnews.py +++ b/hypervideo_dl/extractor/foxnews.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .amp import AMPIE @@ -58,13 +56,15 @@ class FoxNewsIE(AMPIE): }, ] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<(?:amp-)?iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.foxnews\.com/v/video-embed\.html?.*?\bvideo_id=\d+.*?)\1', - webpage)] + @classmethod + def _extract_embed_urls(cls, url, webpage): + for mobj in re.finditer( + r'''(?x) + <(?:script|(?:amp-)?iframe)[^>]+\bsrc=["\'] + (?:https?:)?//video\.foxnews\.com/v/(?:video-embed\.html|embed\.js)\? + (?:[^>"\']+&)?(?:video_)?id=(?P<video_id>\d+) + ''', webpage): + yield f'https://video.foxnews.com/v/video-embed.html?video_id={mobj.group("video_id")}' def _real_extract(self, url): host, video_id = self._match_valid_url(url).groups() @@ -75,6 +75,29 @@ class FoxNewsIE(AMPIE): return info +class FoxNewsVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?foxnews\.com/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.foxnews.com/video/6313058664112', + 'info_dict': { + 'id': '6313058664112', + 'ext': 'mp4', + 'thumbnail': r're:https://.+/1280x720/match/image\.jpg', + 'upload_date': '20220930', + 'description': 'New York City, Kids Therapy, Biden', + 'duration': 2415, + 'title': 'Gutfeld! 
- Thursday, September 29', + 'timestamp': 1664527538, + }, + 'expected_warnings': ['Ignoring subtitle tracks'], + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result(f'https://video.foxnews.com/v/{video_id}', FoxNewsIE, video_id) + + class FoxNewsArticleIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:insider\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' IE_NAME = 'foxnews:article' @@ -124,4 +147,4 @@ class FoxNewsArticleIE(InfoExtractor): 'http://video.foxnews.com/v/' + video_id, FoxNewsIE.ie_key()) return self.url_result( - FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key()) + next(FoxNewsIE._extract_embed_urls(url, webpage)), FoxNewsIE.ie_key()) diff --git a/hypervideo_dl/extractor/foxsports.py b/hypervideo_dl/extractor/foxsports.py index 2b2cb6c..f9d7fe5 100644 --- a/hypervideo_dl/extractor/foxsports.py +++ b/hypervideo_dl/extractor/foxsports.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/fptplay.py b/hypervideo_dl/extractor/fptplay.py index a34e90b..85613ba 100644 --- a/hypervideo_dl/extractor/fptplay.py +++ b/hypervideo_dl/extractor/fptplay.py @@ -1,18 +1,17 @@ -# coding: utf-8 -from __future__ import unicode_literals - import hashlib import time import urllib.parse from .common import InfoExtractor from ..utils import ( + clean_html, join_nonempty, + strip_or_none, ) class FptplayIE(InfoExtractor): - _VALID_URL = r'https?://fptplay\.vn/(?P<type>xem-video)/[^/]+\-(?P<id>\w+)(?:/tap-(?P<episode>[^/]+)?/?(?:[?#]|$)|)' + _VALID_URL = r'https?://fptplay\.vn/xem-video/[^/]+\-(?P<id>\w+)(?:/tap-(?P<episode>\d+)?/?(?:[?#]|$)|)' _GEO_COUNTRIES = ['VN'] IE_NAME = 'fptplay' IE_DESC = 'fptplay.vn' @@ -22,7 +21,7 @@ class FptplayIE(InfoExtractor): 'info_dict': { 'id': '621a123016f369ebbde55945', 'ext': 'mp4', - 'title': 'Nhân Duyên Đại Nhân Xin Dừng Bước - Ms. 
Cupid In Love', + 'title': 'Nhân Duyên Đại Nhân Xin Dừng Bước - Tập 1A', 'description': 'md5:23cf7d1ce0ade8e21e76ae482e6a8c6c', }, }, { @@ -31,25 +30,41 @@ class FptplayIE(InfoExtractor): 'info_dict': { 'id': '61f3aa8a6b3b1d2e73c60eb5', 'ext': 'mp4', - 'title': 'Má Tôi Là Đại Gia - 3', + 'title': 'Má Tôi Là Đại Gia - Tập 3', 'description': 'md5:ff8ba62fb6e98ef8875c42edff641d1c', }, + }, { + 'url': 'https://fptplay.vn/xem-video/lap-toi-do-giam-under-the-skin-6222d9684ec7230fa6e627a2/tap-4', + 'md5': 'bcb06c55ec14786d7d4eda07fa1ccbb9', + 'info_dict': { + 'id': '6222d9684ec7230fa6e627a2', + 'ext': 'mp4', + 'title': 'Lạp Tội Đồ Giám - Tập 2B', + 'description': 'md5:e5a47e9d35fbf7e9479ca8a77204908b', + }, }, { 'url': 'https://fptplay.vn/xem-video/nha-co-chuyen-hi-alls-well-ends-well-1997-6218995f6af792ee370459f0', 'only_matching': True, }] def _real_extract(self, url): - type_url, video_id, episode = self._match_valid_url(url).group('type', 'id', 'episode') - webpage = self._download_webpage(url, video_id=video_id, fatal=False) - info = self._download_json(self.get_api_with_st_token(video_id, episode or 0), video_id) + video_id, slug_episode = self._match_valid_url(url).group('id', 'episode') + webpage = self._download_webpage(url, video_id=video_id, fatal=False) or '' + title = self._search_regex( + r'(?s)<h4\s+class="mb-1 text-2xl text-white"[^>]*>(.+)</h4>', webpage, 'title', fatal=False) + real_episode = slug_episode if not title else self._search_regex( + r'<p.+title="(?P<episode>[^">]+)"\s+class="epi-title active"', webpage, 'episode', fatal=False) + title = strip_or_none(title) or self._html_search_meta(('og:title', 'twitter:title'), webpage) + + info = self._download_json( + self.get_api_with_st_token(video_id, int(slug_episode) - 1 if slug_episode else 0), video_id) formats, subtitles = self._extract_m3u8_formats_and_subtitles(info['data']['url'], video_id, 'mp4') - self._sort_formats(formats) return { 'id': video_id, - 'title': join_nonempty( - self._html_search_meta(('og:title', 'twitter:title'), webpage), episode, delim=' - '), - 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), + 'title': join_nonempty(title, real_episode, delim=' - '), + 'description': ( + clean_html(self._search_regex(r'<p\s+class="overflow-hidden"[^>]*>(.+)</p>', webpage, 'description')) + or self._html_search_meta(('og:description', 'twitter:description'), webpage)), 'formats': formats, 'subtitles': subtitles, } diff --git a/hypervideo_dl/extractor/franceculture.py b/hypervideo_dl/extractor/franceculture.py deleted file mode 100644 index 9dc28d8..0000000 --- a/hypervideo_dl/extractor/franceculture.py +++ /dev/null @@ -1,128 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -from .common import InfoExtractor -from ..utils import ( - determine_ext, - extract_attributes, - int_or_none, - traverse_obj, - unified_strdate, -) - - -class FranceCultureIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [{ - # playlist - 'url': 'https://www.franceculture.fr/emissions/serie/hasta-dente', - 'playlist_count': 12, - 'info_dict': { - 'id': 'hasta-dente', - 'title': 'Hasta Dente', - 'description': 'md5:57479af50648d14e9bb649e6b1f8f911', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20201024', - }, - 'playlist': [{ - 'info_dict': { - 'id': '3c1c2e55-41a0-11e5-9fe0-005056a87c89', - 'ext': 'mp3', - 'title': 'Jeudi, vous avez dit bizarre ?', - 'description': 
'md5:47cf1e00cc21c86b0210279996a812c6', - 'duration': 604, - 'upload_date': '20201024', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1603576680 - }, - }, - ], - }, { - 'url': 'https://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', - 'info_dict': { - 'id': 'rendez-vous-au-pays-des-geeks', - 'display_id': 'rendez-vous-au-pays-des-geeks', - 'ext': 'mp3', - 'title': 'Rendez-vous au pays des geeks', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20140301', - 'vcodec': 'none', - 'duration': 3569, - }, - }, { - # no thumbnail - 'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - info = { - 'id': display_id, - 'title': self._html_search_regex( - r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>', - webpage, 'title', default=self._og_search_title(webpage)), - 'description': self._html_search_regex( - r'(?s)<div[^>]+class="excerpt"[^>]*>(.*?)</div>', webpage, 'description', default=None), - 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader': self._html_search_regex( - r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None), - 'upload_date': unified_strdate(self._html_search_regex( - r'(?s)class="teaser-text-date".*?(\d{2}/\d{2}/\d{4})', webpage, 'date', default=None)), - } - - playlist_data = self._search_regex( - r'''(?sx) - <section[^>]+data-xiti-place="[^"]*?liste_episodes[^"?]*?"[^>]*> - (.*?) - </section> - ''', - webpage, 'playlist data', fatal=False, default=None) - - if playlist_data: - entries = [] - for item, item_description in re.findall( - r'(?s)(<button[^<]*class="[^"]*replay-button[^>]*>).*?<p[^>]*class="[^"]*teaser-text-chapo[^>]*>(.*?)</p>', - playlist_data): - - item_attributes = extract_attributes(item) - entries.append({ - 'id': item_attributes.get('data-emission-uuid'), - 'url': item_attributes.get('data-url'), - 'title': item_attributes.get('data-diffusion-title'), - 'duration': int_or_none(traverse_obj(item_attributes, 'data-duration-seconds', 'data-duration-seconds')), - 'description': item_description, - 'timestamp': int_or_none(item_attributes.get('data-start-time')), - 'thumbnail': info['thumbnail'], - 'uploader': info['uploader'], - }) - - return { - '_type': 'playlist', - 'entries': entries, - **info - } - - video_data = extract_attributes(self._search_regex( - r'''(?sx) - (?: - </h1>| - <div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*> - ).*? 
- (<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>) - ''', - webpage, 'video data')) - video_url = traverse_obj(video_data, 'data-url', 'data-asset-source') - ext = determine_ext(video_url.lower()) - - return { - 'display_id': display_id, - 'url': video_url, - 'ext': ext, - 'vcodec': 'none' if ext == 'mp3' else None, - 'duration': int_or_none(video_data.get('data-duration')), - **info - } diff --git a/hypervideo_dl/extractor/franceinter.py b/hypervideo_dl/extractor/franceinter.py index ae822a5..779249b 100644 --- a/hypervideo_dl/extractor/franceinter.py +++ b/hypervideo_dl/extractor/franceinter.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import month_by_name diff --git a/hypervideo_dl/extractor/francetv.py b/hypervideo_dl/extractor/francetv.py index 347a766..0523172 100644 --- a/hypervideo_dl/extractor/francetv.py +++ b/hypervideo_dl/extractor/francetv.py @@ -1,8 +1,3 @@ -# coding: utf-8 - -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -37,6 +32,7 @@ class FranceTVIE(InfoExtractor): (?P<id>[^@]+)(?:@(?P<catalog>.+))? ) ''' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1'] _TESTS = [{ # without catalog @@ -195,8 +191,6 @@ class FranceTVIE(InfoExtractor): } for sheet in spritesheets] }) - self._sort_formats(formats) - if subtitle: title += ' - %s' % subtitle title = title.strip() @@ -375,7 +369,7 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): webpage = self._download_webpage(url, display_id) - dailymotion_urls = DailymotionIE._extract_urls(webpage) + dailymotion_urls = tuple(DailymotionIE._extract_embed_urls(url, webpage)) if dailymotion_urls: return self.playlist_result([ self.url_result(dailymotion_url, DailymotionIE.ie_key()) diff --git a/hypervideo_dl/extractor/freesound.py b/hypervideo_dl/extractor/freesound.py index 138b6bc..8b5f227 100644 --- a/hypervideo_dl/extractor/freesound.py +++ b/hypervideo_dl/extractor/freesound.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -65,7 +63,6 @@ class FreesoundIE(InfoExtractor): 'format_note': channels, 'quality': quality, } for quality, format_url in enumerate(audio_urls)] - self._sort_formats(formats) return { 'id': audio_id, diff --git a/hypervideo_dl/extractor/freespeech.py b/hypervideo_dl/extractor/freespeech.py index ea9c3e3..aea5513 100644 --- a/hypervideo_dl/extractor/freespeech.py +++ b/hypervideo_dl/extractor/freespeech.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from .youtube import YoutubeIE diff --git a/hypervideo_dl/extractor/freetv.py b/hypervideo_dl/extractor/freetv.py new file mode 100644 index 0000000..757a10d --- /dev/null +++ b/hypervideo_dl/extractor/freetv.py @@ -0,0 +1,139 @@ +import itertools +import re + +from .common import InfoExtractor +from ..utils import int_or_none, traverse_obj, urlencode_postdata + + +class FreeTvBaseIE(InfoExtractor): + def _get_api_response(self, content_id, resource_type, postdata): + return self._download_json( + 'https://www.freetv.com/wordpress/wp-admin/admin-ajax.php', + content_id, data=urlencode_postdata(postdata), + note=f'Downloading {content_id} {resource_type} JSON')['data'] + + +class FreeTvMoviesIE(FreeTvBaseIE): + _VALID_URL = r'https?://(?:www\.)?freetv\.com/peliculas/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://www.freetv.com/peliculas/atrapame-si-puedes/', + 
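(Aside on the FreeTvBaseIE helper added just above: the new FreeTV extractors funnel every API call through WordPress' generic admin-ajax entry point. A minimal standalone sketch of that request flow, using only the endpoint, form fields and response keys visible in this patch; the helper name and the use of plain urllib instead of the extractor framework are illustrative:)

import json
import urllib.parse
import urllib.request

def free_tv_api(content_id, action):
    # Same endpoint and POST fields as FreeTvBaseIE._get_api_response: the site
    # multiplexes its JSON API through admin-ajax.php, selected by 'action'.
    postdata = urllib.parse.urlencode({'action': action, 'contentID': content_id}).encode()
    req = urllib.request.Request(
        'https://www.freetv.com/wordpress/wp-admin/admin-ajax.php', data=postdata)
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)['data']

# For movies (FreeTvMoviesIE), action='olyott_video_play'; the reply carries
# displayMeta.contentID and displayMeta.streamURLVideo (an m3u8 URL).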
'md5': 'dc62d5abf0514726640077cd1591aa92', + 'info_dict': { + 'id': '428021', + 'title': 'Atrápame Si Puedes', + 'description': 'md5:ca63bc00898aeb2f64ec87c6d3a5b982', + 'ext': 'mp4', + } + }, { + 'url': 'https://www.freetv.com/peliculas/monstruoso/', + 'md5': '509c15c68de41cb708d1f92d071f20aa', + 'info_dict': { + 'id': '377652', + 'title': 'Monstruoso', + 'description': 'md5:333fc19ee327b457b980e54a911ea4a3', + 'ext': 'mp4', + } + }] + + def _extract_video(self, content_id, action='olyott_video_play'): + api_response = self._get_api_response(content_id, 'video', { + 'action': action, + 'contentID': content_id, + }) + + video_id, video_url = api_response['displayMeta']['contentID'], api_response['displayMeta']['streamURLVideo'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4') + + return { + 'id': video_id, + 'title': traverse_obj(api_response, ('displayMeta', 'title')), + 'description': traverse_obj(api_response, ('displayMeta', 'desc')), + 'formats': formats, + 'subtitles': subtitles, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + return self._extract_video( + self._search_regex(( + r'class=["\'][^>]+postid-(?P<video_id>\d+)', + r'<link[^>]+freetv.com/\?p=(?P<video_id>\d+)', + r'<div[^>]+data-params=["\'][^>]+post_id=(?P<video_id>\d+)', + ), webpage, 'video id', group='video_id')) + + +class FreeTvIE(FreeTvBaseIE): + IE_NAME = 'freetv:series' + _VALID_URL = r'https?://(?:www\.)?freetv\.com/series/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://www.freetv.com/series/el-detective-l/', + 'info_dict': { + 'id': 'el-detective-l', + 'title': 'El Detective L', + 'description': 'md5:f9f1143bc33e9856ecbfcbfb97a759be' + }, + 'playlist_count': 24, + }, { + 'url': 'https://www.freetv.com/series/esmeraldas/', + 'info_dict': { + 'id': 'esmeraldas', + 'title': 'Esmeraldas', + 'description': 'md5:43d7ec45bd931d8268a4f5afaf4c77bf' + }, + 'playlist_count': 62, + }, { + 'url': 'https://www.freetv.com/series/las-aventuras-de-leonardo/', + 'info_dict': { + 'id': 'las-aventuras-de-leonardo', + 'title': 'Las Aventuras de Leonardo', + 'description': 'md5:0c47130846c141120a382aca059288f6' + }, + 'playlist_count': 13, + }, + ] + + def _extract_series_season(self, season_id, series_title): + episodes = self._get_api_response(season_id, 'series', { + 'contentID': season_id, + 'action': 'olyott_get_dynamic_series_content', + 'type': 'list', + 'perPage': '1000', + })['1'] + + for episode in episodes: + video_id = str(episode['contentID']) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(episode['streamURL'], video_id, 'mp4') + + yield { + 'id': video_id, + 'title': episode.get('fullTitle'), + 'description': episode.get('description'), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': episode.get('thumbnail'), + 'series': series_title, + 'series_id': traverse_obj(episode, ('contentMeta', 'displayMeta', 'seriesID')), + 'season_id': traverse_obj(episode, ('contentMeta', 'displayMeta', 'seasonID')), + 'season_number': traverse_obj( + episode, ('contentMeta', 'displayMeta', 'seasonNum'), expected_type=int_or_none), + 'episode_number': traverse_obj( + episode, ('contentMeta', 'displayMeta', 'episodeNum'), expected_type=int_or_none), + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + title = self._html_search_regex( + r'<h1[^>]+class=["\']synopis[^>]>(?P<title>[^<]+)', webpage, 'title', 
group='title', fatal=False) + description = self._html_search_regex( + r'<div[^>]+class=["\']+synopis content[^>]><p>(?P<description>[^<]+)', + webpage, 'description', group='description', fatal=False) + + return self.playlist_result( + itertools.chain.from_iterable( + self._extract_series_season(season_id, title) + for season_id in re.findall(r'<option[^>]+value=["\'](\d+)["\']', webpage)), + display_id, title, description) diff --git a/hypervideo_dl/extractor/freshlive.py b/hypervideo_dl/extractor/freshlive.py deleted file mode 100644 index 72a8459..0000000 --- a/hypervideo_dl/extractor/freshlive.py +++ /dev/null @@ -1,83 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - int_or_none, - try_get, - unified_timestamp, -) - - -class FreshLiveIE(InfoExtractor): - _VALID_URL = r'https?://freshlive\.tv/[^/]+/(?P<id>\d+)' - _TEST = { - 'url': 'https://freshlive.tv/satotv/74712', - 'md5': '9f0cf5516979c4454ce982df3d97f352', - 'info_dict': { - 'id': '74712', - 'ext': 'mp4', - 'title': 'テスト', - 'description': 'テスト', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1511, - 'timestamp': 1483619655, - 'upload_date': '20170105', - 'uploader': 'サトTV', - 'uploader_id': 'satotv', - 'view_count': int, - 'comment_count': int, - 'is_live': False, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - options = self._parse_json( - self._search_regex( - r'window\.__CONTEXT__\s*=\s*({.+?});\s*</script>', - webpage, 'initial context'), - video_id) - - info = options['context']['dispatcher']['stores']['ProgramStore']['programs'][video_id] - - title = info['title'] - - if info.get('status') == 'upcoming': - raise ExtractorError('Stream %s is upcoming' % video_id, expected=True) - - stream_url = info.get('liveStreamUrl') or info['archiveStreamUrl'] - - is_live = info.get('liveStreamUrl') is not None - - formats = self._extract_m3u8_formats( - stream_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls') - - if is_live: - title = self._live_title(title) - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': info.get('description'), - 'thumbnail': info.get('thumbnailUrl'), - 'duration': int_or_none(info.get('airTime')), - 'timestamp': unified_timestamp(info.get('createdAt')), - 'uploader': try_get( - info, lambda x: x['channel']['title'], compat_str), - 'uploader_id': try_get( - info, lambda x: x['channel']['code'], compat_str), - 'uploader_url': try_get( - info, lambda x: x['channel']['permalink'], compat_str), - 'view_count': int_or_none(info.get('viewCount')), - 'comment_count': int_or_none(info.get('commentCount')), - 'tags': info.get('tags', []), - 'is_live': is_live, - } diff --git a/hypervideo_dl/extractor/frontendmasters.py b/hypervideo_dl/extractor/frontendmasters.py index fc67a84..3bae8ad 100644 --- a/hypervideo_dl/extractor/frontendmasters.py +++ b/hypervideo_dl/extractor/frontendmasters.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -163,7 +160,6 @@ class FrontendMastersIE(FrontendMastersBaseIE): 'format_id': format_id, }) formats.append(f) - self._sort_formats(formats) subtitles = { 'en': [{ diff --git a/hypervideo_dl/extractor/fujitv.py b/hypervideo_dl/extractor/fujitv.py index 4fdfe12..668bb27 100644 --- a/hypervideo_dl/extractor/fujitv.py +++ b/hypervideo_dl/extractor/fujitv.py @@ -1,5 
+1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals from ..utils import HEADRequest from .common import InfoExtractor @@ -19,7 +17,7 @@ class FujiTVFODPlus7IE(InfoExtractor): 'url': 'https://fod.fujitv.co.jp/title/5d40/5d40110076', 'info_dict': { 'id': '5d40110076', - 'ext': 'mp4', + 'ext': 'ts', 'title': '#1318 『まる子、まぼろしの洋館を見る』の巻', 'series': 'ちびまる子ちゃん', 'series_id': '5d40', @@ -30,7 +28,7 @@ class FujiTVFODPlus7IE(InfoExtractor): 'url': 'https://fod.fujitv.co.jp/title/5d40/5d40810083', 'info_dict': { 'id': '5d40810083', - 'ext': 'mp4', + 'ext': 'ts', 'title': '#1324 『まる子とオニの子』の巻/『結成!2月をムダにしない会』の巻', 'description': 'md5:3972d900b896adc8ab1849e310507efa', 'series': 'ちびまる子ちゃん', @@ -47,19 +45,18 @@ class FujiTVFODPlus7IE(InfoExtractor): if token: json_info = self._download_json('https://fod-sp.fujitv.co.jp/apps/api/episode/detail/?ep_id=%s&is_premium=false' % video_id, video_id, headers={'x-authorization': f'Bearer {token.value}'}, fatal=False) else: - self.report_warning(f'The token cookie is needed to extract video metadata. {self._LOGIN_HINTS["cookies"]}') + self.report_warning(f'The token cookie is needed to extract video metadata. {self._login_hint("cookies")}') formats, subtitles = [], {} src_json = self._download_json(f'{self._BASE_URL}abrjson_v2/tv_android/{video_id}', video_id) for src in src_json['video_selector']: if not src.get('url'): continue - fmt, subs = self._extract_m3u8_formats_and_subtitles(src['url'], video_id, 'mp4') + fmt, subs = self._extract_m3u8_formats_and_subtitles(src['url'], video_id, 'ts') for f in fmt: f.update(dict(zip(('height', 'width'), self._BITRATE_MAP.get(f.get('tbr'), ())))) formats.extend(fmt) subtitles = self._merge_subtitles(subtitles, subs) - self._sort_formats(formats, ['tbr']) return { 'id': video_id, @@ -70,4 +67,5 @@ class FujiTVFODPlus7IE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, 'thumbnail': f'{self._BASE_URL}img/program/{series_id}/episode/{video_id}_a.jpg', + '_format_sort_fields': ('tbr', ) } diff --git a/hypervideo_dl/extractor/funimation.py b/hypervideo_dl/extractor/funimation.py index 6aa9bc9..18363c1 100644 --- a/hypervideo_dl/extractor/funimation.py +++ b/hypervideo_dl/extractor/funimation.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import random import re import string @@ -8,17 +5,18 @@ import string from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( + ExtractorError, determine_ext, int_or_none, join_nonempty, js_to_json, + make_archive_id, orderedSet, qualities, str_or_none, traverse_obj, try_get, urlencode_postdata, - ExtractorError, ) @@ -245,11 +243,14 @@ class FunimationIE(FunimationBaseIE): 'language_preference': language_preference(lang.lower()), }) formats.extend(current_formats) + if not formats and (requested_languages or requested_versions): + self.raise_no_formats( + 'There are no video formats matching the requested languages/versions', expected=True, video_id=display_id) self._remove_duplicate_formats(formats) - self._sort_formats(formats, ('lang', 'source')) return { - 'id': initial_experience_id if only_initial_experience else episode_id, + 'id': episode_id, + '_old_archive_ids': [make_archive_id(self, initial_experience_id)], 'display_id': display_id, 'duration': duration, 'title': episode['episodeTitle'], @@ -264,6 +265,7 @@ class FunimationIE(FunimationBaseIE): 'formats': formats, 'thumbnails': thumbnails, 'subtitles': subtitles, + '_format_sort_fields': ('lang', 'source'), } def _get_subtitles(self, 
subtitles, experience_id, episode, display_id, format_name): diff --git a/hypervideo_dl/extractor/funk.py b/hypervideo_dl/extractor/funk.py index 2c5cfe8..539d719 100644 --- a/hypervideo_dl/extractor/funk.py +++ b/hypervideo_dl/extractor/funk.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from .nexx import NexxIE from ..utils import ( diff --git a/hypervideo_dl/extractor/fusion.py b/hypervideo_dl/extractor/fusion.py index a3f44b8..689422f 100644 --- a/hypervideo_dl/extractor/fusion.py +++ b/hypervideo_dl/extractor/fusion.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -72,7 +70,6 @@ class FusionIE(InfoExtractor): 'protocol': 'm3u8_native' if ext == 'm3u8' else 'https', }) if formats: - self._sort_formats(formats) info['formats'] = formats else: info.update({ diff --git a/hypervideo_dl/extractor/fuyintv.py b/hypervideo_dl/extractor/fuyintv.py new file mode 100644 index 0000000..197901d --- /dev/null +++ b/hypervideo_dl/extractor/fuyintv.py @@ -0,0 +1,30 @@ +from .common import InfoExtractor +from ..utils import traverse_obj + + +class FuyinTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fuyin\.tv/html/(?:\d+)/(?P<id>\d+)\.html' + _TESTS = [{ + 'url': 'https://www.fuyin.tv/html/2733/44129.html', + 'info_dict': { + 'id': '44129', + 'ext': 'mp4', + 'title': '第1集', + 'description': 'md5:21a3d238dc8d49608e1308e85044b9c3', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json( + 'https://www.fuyin.tv/api/api/tv.movie/url', + video_id, query={'urlid': f'{video_id}'}) + webpage = self._download_webpage(url, video_id, fatal=False) + + return { + 'id': video_id, + 'title': traverse_obj(json_data, ('data', 'title')), + 'url': json_data['data']['url'], + 'ext': 'mp4', + 'description': self._html_search_meta('description', webpage), + } diff --git a/hypervideo_dl/extractor/fxnetworks.py b/hypervideo_dl/extractor/fxnetworks.py deleted file mode 100644 index 00e6742..0000000 --- a/hypervideo_dl/extractor/fxnetworks.py +++ /dev/null @@ -1,77 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .adobepass import AdobePassIE -from ..utils import ( - extract_attributes, - int_or_none, - parse_age_limit, - smuggle_url, - update_url_query, -) - - -class FXNetworksIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.fxnetworks.com/video/1032565827847', - 'md5': '8d99b97b4aa7a202f55b6ed47ea7e703', - 'info_dict': { - 'id': 'dRzwHC_MMqIv', - 'ext': 'mp4', - 'title': 'First Look: Better Things - Season 2', - 'description': 'Because real life is like a fart. Watch this FIRST LOOK to see what inspired the new season of Better Things.', - 'age_limit': 14, - 'uploader': 'NEWA-FNG-FX', - 'upload_date': '20170825', - 'timestamp': 1503686274, - 'episode_number': 0, - 'season_number': 2, - 'series': 'Better Things', - }, - 'add_ie': ['ThePlatform'], - }, { - 'url': 'http://www.simpsonsworld.com/video/716094019682', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - if 'The content you are trying to access is not available in your region.' 
in webpage: - self.raise_geo_restricted() - video_data = extract_attributes(self._search_regex( - r'(<a.+?rel="https?://link\.theplatform\.com/s/.+?</a>)', webpage, 'video data')) - player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None) - release_url = video_data['rel'] - title = video_data['data-title'] - rating = video_data.get('data-rating') - query = { - 'mbr': 'true', - } - if player_type == 'movies': - query.update({ - 'manifest': 'm3u', - }) - else: - query.update({ - 'switch': 'http', - }) - if video_data.get('data-req-auth') == '1': - resource = self._get_mvpd_resource( - video_data['data-channel'], title, - video_data.get('data-guid'), rating) - query['auth'] = self._extract_mvpd_auth(url, video_id, 'fx', resource) - - return { - '_type': 'url_transparent', - 'id': video_id, - 'title': title, - 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), - 'series': video_data.get('data-show-title'), - 'episode_number': int_or_none(video_data.get('data-episode')), - 'season_number': int_or_none(video_data.get('data-season')), - 'thumbnail': video_data.get('data-large-thumb'), - 'age_limit': parse_age_limit(rating), - 'ie_key': 'ThePlatform', - } diff --git a/hypervideo_dl/extractor/gab.py b/hypervideo_dl/extractor/gab.py index 9ba0b1c..5016e2f 100644 --- a/hypervideo_dl/extractor/gab.py +++ b/hypervideo_dl/extractor/gab.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -57,7 +54,6 @@ class GabTVIE(InfoExtractor): else: frmt['height'] = str_to_int(resolution.replace('p', '')) formats.append(frmt) - self._sort_formats(formats) return { 'id': id, @@ -123,8 +119,6 @@ class GabIE(InfoExtractor): } for url, f in ((media.get('url'), metadata.get('original') or {}), (media.get('source_mp4'), metadata.get('playable') or {})) if url] - self._sort_formats(formats) - author = json_data.get('account') or {} entries.append({ 'id': f'{post_id}-{idx}', diff --git a/hypervideo_dl/extractor/gaia.py b/hypervideo_dl/extractor/gaia.py index 5b0195c..c84386f 100644 --- a/hypervideo_dl/extractor/gaia.py +++ b/hypervideo_dl/extractor/gaia.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import ( compat_str, @@ -92,7 +88,6 @@ class GaiaIE(InfoExtractor): media_id, headers=headers) formats = self._extract_m3u8_formats( media['mediaUrls']['bcHLS'], media_id, 'mp4') - self._sort_formats(formats) subtitles = {} text_tracks = media.get('textTracks', {}) diff --git a/hypervideo_dl/extractor/gameinformer.py b/hypervideo_dl/extractor/gameinformer.py index f1b96c1..2664edb 100644 --- a/hypervideo_dl/extractor/gameinformer.py +++ b/hypervideo_dl/extractor/gameinformer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .brightcove import BrightcoveNewIE from .common import InfoExtractor from ..utils import ( diff --git a/hypervideo_dl/extractor/gamejolt.py b/hypervideo_dl/extractor/gamejolt.py index a13e528..440b832 100644 --- a/hypervideo_dl/extractor/gamejolt.py +++ b/hypervideo_dl/extractor/gamejolt.py @@ -1,4 +1,3 @@ -# coding: utf-8 import itertools import json import math diff --git a/hypervideo_dl/extractor/gamespot.py b/hypervideo_dl/extractor/gamespot.py index 7a1beae..8dec252 100644 --- a/hypervideo_dl/extractor/gamespot.py +++ b/hypervideo_dl/extractor/gamespot.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .once import 
OnceIE from ..compat import compat_urllib_parse_unquote @@ -67,8 +65,6 @@ class GameSpotIE(OnceIE): formats.extend(self._extract_mpd_formats( mpd_url, page_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) - return { 'id': data_video.get('guid') or page_id, 'display_id': page_id, diff --git a/hypervideo_dl/extractor/gamestar.py b/hypervideo_dl/extractor/gamestar.py index e882fa6..e9966f5 100644 --- a/hypervideo_dl/extractor/gamestar.py +++ b/hypervideo_dl/extractor/gamestar.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( int_or_none, diff --git a/hypervideo_dl/extractor/gaskrank.py b/hypervideo_dl/extractor/gaskrank.py index 03acd2a..e0bbdae 100644 --- a/hypervideo_dl/extractor/gaskrank.py +++ b/hypervideo_dl/extractor/gaskrank.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor from ..utils import ( @@ -96,6 +93,5 @@ class GaskrankIE(InfoExtractor): 'view_count': view_count, 'average_rating': average_rating, }) - self._sort_formats(entry['formats']) return entry diff --git a/hypervideo_dl/extractor/gazeta.py b/hypervideo_dl/extractor/gazeta.py index 3671870..c6868a6 100644 --- a/hypervideo_dl/extractor/gazeta.py +++ b/hypervideo_dl/extractor/gazeta.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/gdcvault.py b/hypervideo_dl/extractor/gdcvault.py index c3ad6b4..2878bbd 100644 --- a/hypervideo_dl/extractor/gdcvault.py +++ b/hypervideo_dl/extractor/gdcvault.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/gedidigital.py b/hypervideo_dl/extractor/gedidigital.py index ec386c2..1878d63 100644 --- a/hypervideo_dl/extractor/gedidigital.py +++ b/hypervideo_dl/extractor/gedidigital.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -14,7 +11,7 @@ from ..utils import ( class GediDigitalIE(InfoExtractor): - _VALID_URL = r'''(?x)(?P<url>(?:https?:)//video\. + _VALID_URL = r'''(?x:(?P<base_url>(?:https?:)//video\. 
(?: (?: (?:espresso\.)?repubblica @@ -36,7 +33,13 @@ class GediDigitalIE(InfoExtractor): |corrierealpi |lasentinella )\.gelocal - )\.it(?:/[^/]+){2,4}/(?P<id>\d+))(?:$|[?&].*)''' + )\.it(?:/[^/]+){2,4}/(?P<id>\d+))(?:$|[?&].*))''' + _EMBED_REGEX = [rf'''(?x) + (?: + data-frame-src=| + <iframe[^\n]+src= + ) + (["'])(?P<url>{_VALID_URL})\1'''] _TESTS = [{ 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', 'md5': '84658d7fb9e55a6e57ecc77b73137494', @@ -112,22 +115,9 @@ class GediDigitalIE(InfoExtractor): urls[i] = urljoin(base_url(e), url_basename(e)) return urls - @staticmethod - def _extract_urls(webpage): - entries = [ - mobj.group('eurl') - for mobj in re.finditer(r'''(?x) - (?: - data-frame-src=| - <iframe[^\n]+src= - ) - (["'])(?P<eurl>%s)\1''' % GediDigitalIE._VALID_URL, webpage)] - return GediDigitalIE._sanitize_urls(entries) - - @staticmethod - def _extract_url(webpage): - urls = GediDigitalIE._extract_urls(webpage) - return urls[0] if urls else None + @classmethod + def _extract_embed_urls(cls, url, webpage): + return cls._sanitize_urls(tuple(super()._extract_embed_urls(url, webpage))) @staticmethod def _clean_formats(formats): @@ -142,8 +132,7 @@ class GediDigitalIE(InfoExtractor): formats[:] = clean_formats def _real_extract(self, url): - video_id = self._match_id(url) - url = self._match_valid_url(url).group('url') + video_id, url = self._match_valid_url(url).group('id', 'base_url') webpage = self._download_webpage(url, video_id) title = self._html_search_meta( ['twitter:title', 'og:title'], webpage, fatal=True) @@ -197,7 +186,6 @@ class GediDigitalIE(InfoExtractor): duration = int_or_none(v) self._clean_formats(formats) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/generic.py b/hypervideo_dl/extractor/generic.py index 03e6eb2..f28a77e 100644 --- a/hypervideo_dl/extractor/generic.py +++ b/hypervideo_dl/extractor/generic.py @@ -1,162 +1,49 @@ -# coding: utf-8 - -from __future__ import unicode_literals - import os import re -import sys +import types +import urllib.parse +import xml.etree.ElementTree -from .common import InfoExtractor +from .common import InfoExtractor # isort: split +from .commonprotocols import RtmpIE from .youtube import YoutubeIE -from ..compat import ( - compat_etree_fromstring, - compat_str, - compat_urllib_parse_unquote, - compat_urlparse, - compat_xml_parse_error, -) +from ..compat import compat_etree_fromstring from ..utils import ( + KNOWN_EXTENSIONS, + MEDIA_EXTENSIONS, + ExtractorError, + UnsupportedError, determine_ext, dict_get, - ExtractorError, - float_or_none, - HEADRequest, + format_field, int_or_none, is_html, js_to_json, - KNOWN_EXTENSIONS, merge_dicts, mimetype2ext, orderedSet, parse_duration, parse_resolution, - sanitized_Request, smuggle_url, str_or_none, + traverse_obj, + try_call, unescapeHTML, unified_timestamp, unsmuggle_url, - UnsupportedError, url_or_none, + variadic, xpath_attr, xpath_text, xpath_with_ns, ) -from .commonprotocols import RtmpIE -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .nbc import NBCSportsVPlayerIE -from .ooyala import OoyalaIE -from .rutv import RUTVIE -from .tvc import TVCIE -from .sportbox import SportBoxIE -from .myvi import MyviIE -from .condenast import CondeNastIE -from .udn import UDNEmbedIE -from .senategov import SenateISVPIE -from .svt import SVTIE -from .pornhub import PornHubIE -from .xhamster import 
XHamsterEmbedIE -from .tnaflix import TNAFlixNetworkEmbedIE -from .drtuber import DrTuberIE -from .redtube import RedTubeIE -from .tube8 import Tube8IE -from .mofosex import MofosexEmbedIE -from .spankwire import SpankwireIE -from .youporn import YouPornIE -from .vimeo import ( - VimeoIE, - VHXEmbedIE, -) -from .dailymotion import DailymotionIE -from .dailymail import DailyMailIE -from .onionstudios import OnionStudiosIE -from .viewlift import ViewLiftEmbedIE -from .mtv import MTVServicesEmbeddedIE -from .pladform import PladformIE -from .videomore import VideomoreIE -from .webcaster import WebcasterFeedIE -from .googledrive import GoogleDriveIE -from .jwplatform import JWPlatformIE -from .digiteka import DigitekaIE -from .arkena import ArkenaIE -from .instagram import InstagramIE -from .threeqsdn import ThreeQSDNIE -from .theplatform import ThePlatformIE -from .kaltura import KalturaIE -from .eagleplatform import EaglePlatformIE -from .facebook import FacebookIE -from .soundcloud import SoundcloudEmbedIE -from .tunein import TuneInBaseIE -from .vbox7 import Vbox7IE -from .dbtv import DBTVIE -from .piksel import PikselIE -from .videa import VideaIE -from .twentymin import TwentyMinutenIE -from .ustream import UstreamIE -from .arte import ArteTVEmbedIE -from .videopress import VideoPressIE -from .rutube import RutubeIE -from .glomex import GlomexEmbedIE -from .megatvcom import MegaTVComEmbedIE -from .ant1newsgr import Ant1NewsGrEmbedIE -from .limelight import LimelightBaseIE -from .anvato import AnvatoIE -from .washingtonpost import WashingtonPostIE -from .wistia import WistiaIE -from .mediaset import MediasetIE -from .joj import JojIE -from .megaphone import MegaphoneIE -from .vzaar import VzaarIE -from .channel9 import Channel9IE -from .vshare import VShareIE -from .mediasite import MediasiteIE -from .springboardplatform import SpringboardPlatformIE -from .ted import TedEmbedIE -from .yapfiles import YapFilesIE -from .vice import ViceIE -from .xfileshare import XFileShareIE -from .cloudflarestream import CloudflareStreamIE -from .peertube import PeerTubeIE -from .teachable import TeachableIE -from .indavideo import IndavideoEmbedIE -from .apa import APAIE -from .foxnews import FoxNewsIE -from .viqeo import ViqeoIE -from .expressen import ExpressenIE -from .zype import ZypeIE -from .odnoklassniki import OdnoklassnikiIE -from .vk import VKIE -from .kinja import KinjaEmbedIE -from .gedidigital import GediDigitalIE -from .rcs import RCSEmbedsIE -from .bitchute import BitChuteIE -from .rumble import RumbleEmbedIE -from .arcpublishing import ArcPublishingIE -from .medialaan import MedialaanIE -from .simplecast import SimplecastIE -from .wimtv import WimTVIE -from .tvopengr import TVOpenGrEmbedIE -from .ertgr import ERTWebtvEmbedIE -from .tvp import TVPEmbedIE -from .blogger import BloggerIE -from .mainstreaming import MainStreamingIE -from .gfycat import GfycatIE -from .panopto import PanoptoBaseIE -from .ruutu import RuutuIE class GenericIE(InfoExtractor): IE_DESC = 'Generic downloader that works on some sites' _VALID_URL = r'.*' IE_NAME = 'generic' - _NETRC_MACHINE = False # Supress username warning + _NETRC_MACHINE = False # Suppress username warning _TESTS = [ # Direct link to a video { @@ -474,188 +361,6 @@ class GenericIE(InfoExtractor): }, 'skip': 'There is a limit of 200 free downloads / month for the test song', }, - { - # embedded brightcove video - # it also tests brightcove videos that need to set the 'Referer' - # in the http requests - 'add_ie': ['BrightcoveLegacy'], - 'url': 
'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', - 'info_dict': { - 'id': '2765128793001', - 'ext': 'mp4', - 'title': 'Le cours de bourse : l’analyse technique', - 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9', - 'uploader': 'BFM BUSINESS', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # embedded with itemprop embedURL and video id spelled as `idVideo` - 'add_id': ['BrightcoveLegacy'], - 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/', - 'info_dict': { - 'id': '5255628253001', - 'ext': 'mp4', - 'title': 'md5:37c519b1128915607601e75a87995fc0', - 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26', - 'uploader': 'BFM BUSINESS', - 'uploader_id': '876450612001', - 'timestamp': 1482255315, - 'upload_date': '20161220', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # https://github.com/ytdl-org/youtube-dl/issues/2253 - 'url': 'http://bcove.me/i6nfkrc3', - 'md5': '0ba9446db037002366bab3b3eb30c88c', - 'info_dict': { - 'id': '3101154703001', - 'ext': 'mp4', - 'title': 'Still no power', - 'uploader': 'thestar.com', - 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', - }, - 'add_ie': ['BrightcoveLegacy'], - 'skip': 'video gone', - }, - { - 'url': 'http://www.championat.com/video/football/v/87/87499.html', - 'md5': 'fb973ecf6e4a78a67453647444222983', - 'info_dict': { - 'id': '3414141473001', - 'ext': 'mp4', - 'title': 'Видео. Удаление Дзагоева (ЦСКА)', - 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"', - 'uploader': 'Championat', - }, - }, - { - # https://github.com/ytdl-org/youtube-dl/issues/3541 - 'add_ie': ['BrightcoveLegacy'], - 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', - 'info_dict': { - 'id': '3866516442001', - 'ext': 'mp4', - 'title': 'Leer mij vrouwen kennen: Aflevering 1', - 'description': 'Leer mij vrouwen kennen: Aflevering 1', - 'uploader': 'SBS Broadcasting', - }, - 'skip': 'Restricted to Netherlands', - 'params': { - 'skip_download': True, # m3u8 download - }, - }, - { - # Brightcove video in <iframe> - 'url': 'http://www.un.org/chinese/News/story.asp?NewsID=27724', - 'md5': '36d74ef5e37c8b4a2ce92880d208b968', - 'info_dict': { - 'id': '5360463607001', - 'ext': 'mp4', - 'title': '叙利亚失明儿童在废墟上演唱《心跳》 呼吁获得正常童年生活', - 'description': '联合国儿童基金会中东和北非区域大使、作曲家扎德·迪拉尼(Zade Dirani)在3月15日叙利亚冲突爆发7周年纪念日之际发布了为叙利亚谱写的歌曲《心跳》(HEARTBEAT),为受到六年冲突影响的叙利亚儿童发出强烈呐喊,呼吁世界做出共同努力,使叙利亚儿童重新获得享有正常童年生活的权利。', - 'uploader': 'United Nations', - 'uploader_id': '1362235914001', - 'timestamp': 1489593889, - 'upload_date': '20170315', - }, - 'add_ie': ['BrightcoveLegacy'], - }, - { - # Brightcove with alternative playerID key - 'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html', - 'info_dict': { - 'id': 'nmeth.2062_SV1', - 'title': 'Simultaneous multiview imaging of the Drosophila syncytial blastoderm : Quantitative high-speed imaging of entire developing embryos with simultaneous multiview light-sheet microscopy : Nature Methods : Nature Research', - }, - 'playlist': [{ - 'info_dict': { - 'id': '2228375078001', - 'ext': 'mp4', - 'title': 'nmeth.2062-sv1', - 'description': 'nmeth.2062-sv1', - 'timestamp': 1363357591, - 'upload_date': '20130315', - 'uploader': 'Nature Publishing Group', - 'uploader_id': '1964492299001', - }, - }], - }, - { - # Brightcove with UUID in videoPlayer 
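(Aside: Brightcove test cases like the ones removed here and below disappear from GenericIE because embed discovery now lives in each extractor, as the FranceTVIE `_EMBED_REGEX` and GediDigitalIE `_extract_embed_urls` hunks earlier in this patch show. A minimal sketch of the new convention; the extractor name, URL and pattern are invented for illustration:)

from hypervideo_dl.extractor.common import InfoExtractor

class ExamplePlayerIE(InfoExtractor):
    _VALID_URL = r'https?://player\.example\.com/v/(?P<id>\d+)'
    # GenericIE now collects embeds by calling _extract_embed_urls() on every
    # extractor; the base implementation scans these patterns (the named group
    # 'url' is required), so per-site _extract_urls() helpers can be deleted.
    _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://player\.example\.com/v/\d+)']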
- 'url': 'http://www8.hp.com/cn/zh/home.html', - 'info_dict': { - 'id': '5255815316001', - 'ext': 'mp4', - 'title': 'Sprocket Video - China', - 'description': 'Sprocket Video - China', - 'uploader': 'HP-Video Gallery', - 'timestamp': 1482263210, - 'upload_date': '20161220', - 'uploader_id': '1107601872001', - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - 'skip': 'video rotates...weekly?', - }, - { - # Brightcove:new type [2]. - 'url': 'http://www.delawaresportszone.com/video-st-thomas-more-earns-first-trip-to-basketball-semis', - 'md5': '2b35148fcf48da41c9fb4591650784f3', - 'info_dict': { - 'id': '5348741021001', - 'ext': 'mp4', - 'upload_date': '20170306', - 'uploader_id': '4191638492001', - 'timestamp': 1488769918, - 'title': 'VIDEO: St. Thomas More earns first trip to basketball semis', - - }, - }, - { - # Alternative brightcove <video> attributes - 'url': 'http://www.programme-tv.net/videos/extraits/81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche/', - 'info_dict': { - 'id': '81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche', - 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche, Extraits : toutes les vidéos avec Télé-Loisirs", - }, - 'playlist': [{ - 'md5': '732d22ba3d33f2f3fc253c39f8f36523', - 'info_dict': { - 'id': '5311302538001', - 'ext': 'mp4', - 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche", - 'description': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche (France 2, 5 février 2017)", - 'timestamp': 1486321708, - 'upload_date': '20170205', - 'uploader_id': '800000640001', - }, - 'only_matching': True, - }], - }, - { - # Brightcove with UUID in videoPlayer - 'url': 'http://www8.hp.com/cn/zh/home.html', - 'info_dict': { - 'id': '5255815316001', - 'ext': 'mp4', - 'title': 'Sprocket Video - China', - 'description': 'Sprocket Video - China', - 'uploader': 'HP-Video Gallery', - 'timestamp': 1482263210, - 'upload_date': '20161220', - 'uploader_id': '1107601872001', - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - }, # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', @@ -947,45 +652,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, - # YouTube <object> embed - { - 'url': 'http://www.improbable.com/2017/04/03/untrained-modern-youths-and-ancient-masters-in-selfie-portraits/', - 'md5': '516718101ec834f74318df76259fb3cc', - 'info_dict': { - 'id': 'msN87y-iEx0', - 'ext': 'webm', - 'title': 'Feynman: Mirrors FUN TO IMAGINE 6', - 'upload_date': '20080526', - 'description': 'md5:0ffc78ea3f01b2e2c247d5f8d1d3c18d', - 'uploader': 'Christopher Sykes', - 'uploader_id': 'ChristopherJSykes', - }, - 'add_ie': ['Youtube'], - }, - # Camtasia studio - { - 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/', - 'playlist': [{ - 'md5': '0c5e352edabf715d762b0ad4e6d9ee67', - 'info_dict': { - 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', - 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1', - 'ext': 'flv', - 'duration': 2235.90, - } - }, { - 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63', - 'info_dict': { - 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP', - 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - 
pip', - 'ext': 'flv', - 'duration': 2235.93, - } - }], - 'info_dict': { - 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', - } - }, # Flowplayer { 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html', @@ -998,20 +664,6 @@ class GenericIE(InfoExtractor): 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com', } }, - # Multiple brightcove videos - # https://github.com/ytdl-org/youtube-dl/issues/2283 - { - 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html', - 'info_dict': { - 'id': 'always-never', - 'title': 'Always / Never - The New Yorker', - }, - 'playlist_count': 3, - 'params': { - 'extract_flat': False, - 'skip_download': True, - } - }, # MLB embed { 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/', @@ -1027,36 +679,6 @@ class GenericIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', }, }, - # Wistia embed - { - 'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', - 'md5': '1953f3a698ab51cfc948ed3992a0b7ff', - 'info_dict': { - 'id': '6e2wtrbdaf', - 'ext': 'mov', - 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england', - 'description': 'a Paywall Videos video from Remilon', - 'duration': 644.072, - 'uploader': 'study.com', - 'timestamp': 1459678540, - 'upload_date': '20160403', - 'filesize': 24687186, - }, - }, - { - 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz', - 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4', - 'info_dict': { - 'id': 'uxjb0lwrcz', - 'ext': 'mp4', - 'title': 'Conversation about Hexagonal Rails Part 1', - 'description': 'a Martin Fowler video from ThoughtWorks', - 'duration': 1715.0, - 'uploader': 'thoughtworks.wistia.com', - 'timestamp': 1401832161, - 'upload_date': '20140603', - }, - }, # Wistia standard embed (async) { 'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/', @@ -1071,7 +693,8 @@ class GenericIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'webpage 404 not found', }, # Soundcloud embed { @@ -1254,18 +877,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, - { - # JWPlatform iframe - 'url': 'https://www.covermagazine.co.uk/feature/2465255/business-protection-involved', - 'info_dict': { - 'id': 'AG26UQXM', - 'ext': 'mp4', - 'upload_date': '20160719', - 'timestamp': 468923808, - 'title': '2016_05_18 Cover L&G Business Protection V1 FINAL.mp4', - }, - 'add_ie': [JWPlatformIE.ie_key()], - }, { # Video.js embed, multiple formats 'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html', @@ -1545,21 +1156,6 @@ class GenericIE(InfoExtractor): }, 'expected_warnings': ['Failed to parse JSON Expecting value'], }, - # Brightcove URL in single quotes - { - 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/', - 'md5': '4ae374f1f8b91c889c4b9203c8c752af', - 'info_dict': { - 'id': '4255764656001', - 'ext': 'mp4', - 'title': 'SN Presents: Russell Martin, World Citizen', - 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. 
Written and narrated by Stephen Brunt.', - 'uploader': 'Rogers Sportsnet', - 'uploader_id': '1704050871', - 'upload_date': '20150525', - 'timestamp': 1432570283, - }, - }, # Kinja embed { 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', @@ -1595,52 +1191,6 @@ class GenericIE(InfoExtractor): 'duration': 248.667, }, }, - # BrightcoveInPageEmbed embed - { - 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', - 'info_dict': { - 'id': '4238694884001', - 'ext': 'flv', - 'title': 'Tabletop: Dread, Last Thoughts', - 'description': 'Tabletop: Dread, Last Thoughts', - 'duration': 51690, - }, - }, - # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions' - # This video can't be played in browsers if Flash disabled and UA set to iPhone, which is actually a false alarm - { - 'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html', - 'info_dict': { - 'id': '4785848093001', - 'ext': 'mp4', - 'title': 'The Cardinal Pell Interview', - 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ', - 'uploader': 'GlobeCast Australia - GlobeStream', - 'uploader_id': '2733773828001', - 'upload_date': '20160304', - 'timestamp': 1457083087, - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - }, - { - # Brightcove embed with whitespace around attribute names - 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill', - 'info_dict': { - 'id': '3167554373001', - 'ext': 'mp4', - 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill", - 'description': 'md5:57bacb0e0f29349de4972bfda3191713', - 'uploader_id': '1079349493', - 'upload_date': '20140207', - 'timestamp': 1391810548, - }, - 'params': { - 'skip_download': True, - }, - }, # Another form of arte.tv embed { 'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html', @@ -1691,7 +1241,7 @@ class GenericIE(InfoExtractor): 'timestamp': 1464107587, 'uploader': 'TheAtlantic', }, - 'add_ie': ['BrightcoveLegacy'], + 'skip': 'Private Youtube video', }, # Facebook <iframe> embed { @@ -1800,7 +1350,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [ArkenaIE.ie_key()], + 'add_ie': ['Arkena'], }, { 'url': 'http://nova.bg/news/view/2016/08/16/156543/%D0%BD%D0%B0-%D0%BA%D0%BE%D1%81%D1%8A%D0%BC-%D0%BE%D1%82-%D0%B2%D0%B7%D1%80%D0%B8%D0%B2-%D0%BE%D1%82%D1%86%D0%B5%D0%BF%D0%B8%D1%85%D0%B0-%D1%86%D1%8F%D0%BB-%D0%BA%D0%B2%D0%B0%D1%80%D1%82%D0%B0%D0%BB-%D0%B7%D0%B0%D1%80%D0%B0%D0%B4%D0%B8-%D0%B8%D0%B7%D1%82%D0%B8%D1%87%D0%B0%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%B3%D0%B0%D0%B7-%D0%B2-%D0%BF%D0%BB%D0%BE%D0%B2%D0%B4%D0%B8%D0%B2/', @@ -1812,7 +1362,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [Vbox7IE.ie_key()], + 'add_ie': ['Vbox7'], }, { # DBTV embeds @@ -1844,7 +1394,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [TwentyMinutenIE.ie_key()], + 'add_ie': ['TwentyMinuten'], }, { # VideoPress embed @@ -1859,7 +1409,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [VideoPressIE.ie_key()], + 'add_ie': ['VideoPress'], }, { # Rutube embed @@ -1876,7 +1426,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [RutubeIE.ie_key()], + 'add_ie': ['Rutube'], }, { # 
glomex:embed @@ -1948,7 +1498,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Integrated Senate Video Player', }, - 'add_ie': [SenateISVPIE.ie_key()], + 'add_ie': ['SenateISVP'], }, { # Limelight embeds (1 channel embed + 4 media embeds) @@ -1995,7 +1545,7 @@ class GenericIE(InfoExtractor): 'uploader': 'The Washington Post', 'upload_date': '20160211', }, - 'add_ie': [WashingtonPostIE.ie_key()], + 'add_ie': ['WashingtonPost'], }, { # Mediaset embed @@ -2008,7 +1558,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [MediasetIE.ie_key()], + 'add_ie': ['Mediaset'], }, { # JOJ.sk embeds @@ -2018,7 +1568,7 @@ class GenericIE(InfoExtractor): 'title': 'Slovenskom sa prehnala vlna silných búrok', }, 'playlist_mincount': 5, - 'add_ie': [JojIE.ie_key()], + 'add_ie': ['Joj'], }, { # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video) @@ -2084,7 +1634,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [SpringboardPlatformIE.ie_key()], + 'add_ie': ['SpringboardPlatform'], }, { 'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html', @@ -2093,7 +1643,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Котята', }, - 'add_ie': [YapFilesIE.ie_key()], + 'add_ie': ['YapFiles'], 'params': { 'skip_download': True, }, @@ -2106,7 +1656,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': '31c9291ab41fac05471db4e73aa11717', }, - 'add_ie': [CloudflareStreamIE.ie_key()], + 'add_ie': ['CloudflareStream'], 'params': { 'skip_download': True, }, @@ -2133,7 +1683,7 @@ class GenericIE(InfoExtractor): 'uploader': 'StreetKitchen', 'uploader_id': '546363', }, - 'add_ie': [IndavideoEmbedIE.ie_key()], + 'add_ie': ['IndavideoEmbed'], 'params': { 'skip_download': True, }, @@ -2174,22 +1724,6 @@ class GenericIE(InfoExtractor): }, 'playlist_count': 6, }, - { - # Squarespace video embed, 2019-08-28 - 'url': 'http://ootboxford.com', - 'info_dict': { - 'id': 'Tc7b_JGdZfw', - 'title': 'Out of the Blue, at Childish Things 10', - 'ext': 'mp4', - 'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f', - 'uploader_id': 'helendouglashouse', - 'uploader': 'Helen & Douglas House', - 'upload_date': '20140328', - }, - 'params': { - 'skip_download': True, - }, - }, # { # # Zype embed # 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', @@ -2508,10 +2042,10 @@ class GenericIE(InfoExtractor): # Panopto embeds 'url': 'https://www.monash.edu/learning-teaching/teachhq/learning-technologies/panopto/how-to/insert-a-quiz-into-a-panopto-video', 'info_dict': { - 'title': 'Insert a quiz into a Panopto video', - 'id': 'insert-a-quiz-into-a-panopto-video' + 'ext': 'mp4', + 'id': '0bd3f16c-824a-436a-8486-ac5900693aef', + 'title': 'Quizzes in Panopto', }, - 'playlist_count': 1 }, { # Ruutu embed @@ -2530,114 +2064,178 @@ class GenericIE(InfoExtractor): 'upload_date': '20220308', }, }, + { + # Multiple Ruutu embeds + 'url': 'https://www.hs.fi/kotimaa/art-2000008762560.html', + 'info_dict': { + 'title': 'Koronavirus | Epidemiahuippu voi olla Suomessa ohi, mutta koronaviruksen poistamista yleisvaarallisten tautien joukosta harkitaan vasta syksyllä', + 'id': 'art-2000008762560' + }, + 'playlist_count': 3 + }, + { + # Ruutu embed in hs.fi with a single video + 'url': 'https://www.hs.fi/kotimaa/art-2000008793421.html', + 'md5': 'f8964e65d8fada6e8a562389bf366bb4', + 'info_dict': { + 'id': '4081841', + 'ext': 'mp4', + 'title': 'Puolustusvoimat siirsi panssariajoneuvoja harjoituksiin 
Niinisaloon 2.5.2022', + 'thumbnail': r're:^https?://.+\.jpg$', + 'duration': 138, + 'age_limit': 0, + 'upload_date': '20220504', + }, + }, + { + # Webpage contains double BOM + 'url': 'https://www.filmarkivet.se/movies/paris-d-moll/', + 'md5': 'df02cadc719dcc63d43288366f037754', + 'info_dict': { + 'id': 'paris-d-moll', + 'ext': 'mp4', + 'upload_date': '20220518', + 'title': 'Paris d-moll', + 'description': 'md5:319e37ea5542293db37e1e13072fe330', + 'thumbnail': 'https://www.filmarkivet.se/wp-content/uploads/parisdmoll2.jpg', + 'timestamp': 1652833414, + 'age_limit': 0, + } + }, + { + 'url': 'https://www.mollymovieclub.com/p/interstellar?s=r#details', + 'md5': '198bde8bed23d0b23c70725c83c9b6d9', + 'info_dict': { + 'id': '53602801', + 'ext': 'mpga', + 'title': 'Interstellar', + 'description': 'Listen now | Episode One', + 'thumbnail': 'md5:c30d9c83f738e16d8551d7219d321538', + 'uploader': 'Molly Movie Club', + 'uploader_id': '839621', + }, + }, + { + 'url': 'https://www.blockedandreported.org/p/episode-117-lets-talk-about-depp?s=r', + 'md5': 'c0cc44ee7415daeed13c26e5b56d6aa0', + 'info_dict': { + 'id': '57962052', + 'ext': 'mpga', + 'title': 'md5:855b2756f0ee10f6723fa00b16266f8d', + 'description': 'md5:fe512a5e94136ad260c80bde00ea4eef', + 'thumbnail': 'md5:2218f27dfe517bb5ac16c47d0aebac59', + 'uploader': 'Blocked and Reported', + 'uploader_id': '500230', + }, + }, + { + 'url': 'https://www.skimag.com/video/ski-people-1980/', + 'md5': '022a7e31c70620ebec18deeab376ee03', + 'info_dict': { + 'id': 'YTmgRiNU', + 'ext': 'mp4', + 'title': '1980 Ski People', + 'timestamp': 1610407738, + 'description': 'md5:cf9c3d101452c91e141f292b19fe4843', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/YTmgRiNU/poster.jpg?width=720', + 'duration': 5688.0, + 'upload_date': '20210111', + } + }, + { + 'note': 'JSON LD with multiple @type', + 'url': 'https://www.nu.nl/280161/video/hoe-een-bladvlo-dit-verwoestende-japanse-onkruid-moet-vernietigen.html', + 'md5': 'c7949f34f57273013fb7ccb1156393db', + 'info_dict': { + 'id': 'ipy2AcGL', + 'ext': 'mp4', + 'description': 'md5:6a9d644bab0dc2dc06849c2505d8383d', + 'thumbnail': r're:https://media\.nu\.nl/m/.+\.jpg', + 'title': 'Hoe een bladvlo dit verwoestende Japanse onkruid moet vernietigen', + 'timestamp': 1586577474, + 'upload_date': '20200411', + 'age_limit': 0, + 'duration': 111.0, + } + }, + { + 'note': 'JSON LD with unexpected data type', + 'url': 'https://www.autoweek.nl/autotests/artikel/porsche-911-gt3-rs-rij-impressie-2/', + 'info_dict': { + 'id': 'porsche-911-gt3-rs-rij-impressie-2', + 'ext': 'mp4', + 'title': 'Test: Porsche 911 GT3 RS', + 'description': 'Je ziet het niet, maar het is er wel. Downforce, hebben we het dan over. 
En in de nieuwe Porsche 911 GT3 RS is er zelfs heel veel downforce.', + 'timestamp': 1664920902, + 'upload_date': '20221004', + 'thumbnail': r're:^https://media.autoweek.nl/m/.+\.jpg$', + 'age_limit': 0, + 'direct': True, + } + } ] def report_following_redirect(self, new_url): """Report information extraction.""" self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) - def report_detected(self, name): - self._downloader.write_debug(f'Identified a {name}') + def report_detected(self, name, num=1, note=None): + if num > 1: + name += 's' + elif not num: + return + else: + num = 'a' - def _extract_rss(self, url, video_id, doc): - playlist_title = doc.find('./channel/title').text - playlist_desc_el = doc.find('./channel/description') - playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text + self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}') + def _fragment_query(self, url): + if self._configuration_arg('fragment_query'): + query_string = urllib.parse.urlparse(url).query + if query_string: + return {'extra_param_to_segment_url': query_string} + return {} + + def _extract_rss(self, url, video_id, doc): NS_MAP = { 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', } entries = [] for it in doc.findall('./channel/item'): - next_url = None - enclosure_nodes = it.findall('./enclosure') - for e in enclosure_nodes: - next_url = e.attrib.get('url') - if next_url: - break - - if not next_url: - next_url = xpath_text(it, 'link', fatal=False) - + next_url = next( + (e.attrib.get('url') for e in it.findall('./enclosure')), + xpath_text(it, 'link', fatal=False)) if not next_url: continue - if it.find('guid').text is not None: - next_url = smuggle_url(next_url, {'force_videoid': it.find('guid').text}) + guid = try_call(lambda: it.find('guid').text) + if guid: + next_url = smuggle_url(next_url, {'force_videoid': guid}) def itunes(key): - return xpath_text( - it, xpath_with_ns('./itunes:%s' % key, NS_MAP), - default=None) - - duration = itunes('duration') - explicit = (itunes('explicit') or '').lower() - if explicit in ('true', 'yes'): - age_limit = 18 - elif explicit in ('false', 'no'): - age_limit = 0 - else: - age_limit = None + return xpath_text(it, xpath_with_ns(f'./itunes:{key}', NS_MAP), default=None) entries.append({ '_type': 'url_transparent', 'url': next_url, - 'title': it.find('title').text, + 'title': try_call(lambda: it.find('title').text), 'description': xpath_text(it, 'description', default=None), - 'timestamp': unified_timestamp( - xpath_text(it, 'pubDate', default=None)), - 'duration': int_or_none(duration) or parse_duration(duration), + 'timestamp': unified_timestamp(xpath_text(it, 'pubDate', default=None)), + 'duration': parse_duration(itunes('duration')), 'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')), 'episode': itunes('title'), 'episode_number': int_or_none(itunes('episode')), 'season_number': int_or_none(itunes('season')), - 'age_limit': age_limit, + 'age_limit': {'true': 18, 'yes': 18, 'false': 0, 'no': 0}.get((itunes('explicit') or '').lower()), }) return { '_type': 'playlist', 'id': url, - 'title': playlist_title, - 'description': playlist_desc, - 'entries': entries, - } - - def _extract_camtasia(self, url, video_id, webpage): - """ Returns None if no camtasia video can be found. 
""" - - camtasia_cfg = self._search_regex( - r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);', - webpage, 'camtasia configuration file', default=None) - if camtasia_cfg is None: - return None - - title = self._html_search_meta('DC.title', webpage, fatal=True) - - camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg) - camtasia_cfg = self._download_xml( - camtasia_url, video_id, - note='Downloading camtasia configuration', - errnote='Failed to download camtasia configuration') - fileset_node = camtasia_cfg.find('./playlist/array/fileset') - - entries = [] - for n in fileset_node.getchildren(): - url_n = n.find('./uri') - if url_n is None: - continue - - entries.append({ - 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0], - 'title': '%s - %s' % (title, n.tag), - 'url': compat_urlparse.urljoin(url, url_n.text), - 'duration': float_or_none(n.find('./duration').text), - }) - - return { - '_type': 'playlist', + 'title': try_call(lambda: doc.find('./channel/title').text), + 'description': try_call(lambda: doc.find('./channel/description').text), 'entries': entries, - 'title': title, } def _kvs_getrealurl(self, video_url, license_code): @@ -2651,7 +2249,7 @@ class GenericIE(InfoExtractor): for o in range(len(newmagic) - 1, -1, -1): new = '' - l = (o + sum([int(n) for n in license[o:]])) % 32 + l = (o + sum(int(n) for n in license[o:])) % 32 for i in range(0, len(newmagic)): if i == o: @@ -2682,7 +2280,7 @@ class GenericIE(InfoExtractor): if url.startswith('//'): return self.url_result(self.http_scheme() + url) - parsed_url = compat_urlparse.urlparse(url) + parsed_url = urllib.parse.urlparse(url) if not parsed_url.scheme: default_search = self.get_param('default_search') if default_search is None: @@ -2713,59 +2311,59 @@ class GenericIE(InfoExtractor): default_search += ':' return self.url_result(default_search + url) - url, smuggled_data = unsmuggle_url(url) + original_url = url + url, smuggled_data = unsmuggle_url(url, {}) force_videoid = None - is_intentional = smuggled_data and smuggled_data.get('to_generic') - if smuggled_data and 'force_videoid' in smuggled_data: + is_intentional = smuggled_data.get('to_generic') + if 'force_videoid' in smuggled_data: force_videoid = smuggled_data['force_videoid'] video_id = force_videoid else: video_id = self._generic_id(url) - self.to_screen('%s: Requesting header' % video_id) - - head_req = HEADRequest(url) - head_response = self._request_webpage( - head_req, video_id, - note=False, errnote='Could not send HEAD request to %s' % url, - fatal=False) - - if head_response is not False: - # Check for redirect - new_url = head_response.geturl() - if url != new_url: - self.report_following_redirect(new_url) - if force_videoid: - new_url = smuggle_url( - new_url, {'force_videoid': force_videoid}) - return self.url_result(new_url) - - full_response = None - if head_response is False: - request = sanitized_Request(url) - request.add_header('Accept-Encoding', '*') - full_response = self._request_webpage(request, video_id) - head_response = full_response + # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) + # making it impossible to download only chunk of the file (yet we need only 512kB to + # test whether it's HTML or not). According to hypervideo default Accept-Encoding + # that will always result in downloading the whole file that is not desirable. + # Therefore for extraction pass we have to override Accept-Encoding to any in order + # to accept raw bytes and being able to download only a chunk. 
+ # It may probably better to solve this by checking Content-Type for application/octet-stream + # after a HEAD request, but not sure if we can rely on this. + full_response = self._request_webpage(url, video_id, headers={ + 'Accept-Encoding': '*', + **smuggled_data.get('http_headers', {}) + }) + new_url = full_response.geturl() + if new_url == urllib.parse.urlparse(url)._replace(scheme='https').geturl(): + url = new_url + elif url != new_url: + self.report_following_redirect(new_url) + if force_videoid: + new_url = smuggle_url(new_url, {'force_videoid': force_videoid}) + return self.url_result(new_url) info_dict = { 'id': video_id, 'title': self._generic_title(url), - 'timestamp': unified_timestamp(head_response.headers.get('Last-Modified')) + 'timestamp': unified_timestamp(full_response.headers.get('Last-Modified')) } # Check for direct link to a video - content_type = head_response.headers.get('Content-Type', '').lower() + content_type = full_response.headers.get('Content-Type', '').lower() m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: self.report_detected('direct video link') - format_id = compat_str(m.group('format_id')) + headers = smuggled_data.get('http_headers', {}) + format_id = str(m.group('format_id')) subtitles = {} if format_id.endswith('mpegurl'): - formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers) + info_dict.update(self._fragment_query(url)) elif format_id.endswith('mpd') or format_id.endswith('dash+xml'): - formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id) + formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers) + info_dict.update(self._fragment_query(url)) elif format_id == 'f4m': - formats = self._extract_f4m_formats(url, video_id) + formats = self._extract_f4m_formats(url, video_id, headers=headers) else: formats = [{ 'format_id': format_id, @@ -2773,28 +2371,16 @@ class GenericIE(InfoExtractor): 'vcodec': 'none' if m.group('type') == 'audio' else None }] info_dict['direct'] = True - self._sort_formats(formats) - info_dict['formats'] = formats - info_dict['subtitles'] = subtitles + info_dict.update({ + 'formats': formats, + 'subtitles': subtitles, + 'http_headers': headers, + }) return info_dict if not self.get_param('test', False) and not is_intentional: force = self.get_param('force_generic_extractor', False) - self.report_warning( - '%s on generic information extractor.' % ('Forcing' if force else 'Falling back')) - - if not full_response: - request = sanitized_Request(url) - # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) - # making it impossible to download only chunk of the file (yet we need only 512kB to - # test whether it's HTML or not). According to hypervideo default Accept-Encoding - # that will always result in downloading the whole file that is not desirable. - # Therefore for extraction pass we have to override Accept-Encoding to any in order - # to accept raw bytes and being able to download only a chunk. - # It may probably better to solve this by checking Content-Type for application/octet-stream - # after HEAD request finishes, but not sure if we can rely on this. 
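# Editor's aside (not part of the patch): what the direct-link Content-Type check
# above accepts. The lookahead after "application" restricts that prefix to Ogg
# and the M3U8 subtypes, so e.g. application/json never counts as a direct link;
# the captured format_id then drives the endswith('mpegurl') manifest dispatch a
# few lines below. Self-contained check of the same regex:
import re

CONTENT_TYPE_RE = (r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))'
                   r'/(?P<format_id>[^;\s]+)')

assert re.match(CONTENT_TYPE_RE, 'video/mp4').group('format_id') == 'mp4'
assert re.match(CONTENT_TYPE_RE, 'application/vnd.apple.mpegurl').group('format_id').endswith('mpegurl')
assert re.match(CONTENT_TYPE_RE, 'application/x-mpegurl') is not None
assert re.match(CONTENT_TYPE_RE, 'application/json') is None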
- request.add_header('Accept-Encoding', '*') - full_response = self._request_webpage(request, video_id) + self.report_warning('%s generic information extractor' % ('Forcing' if force else 'Falling back on')) first_bytes = full_response.read(512) @@ -2802,7 +2388,7 @@ class GenericIE(InfoExtractor): if first_bytes.startswith(b'#EXTM3U'): self.report_detected('M3U playlist') info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') - self._sort_formats(info_dict['formats']) + info_dict.update(self._fragment_query(url)) return info_dict # Maybe it's a direct link to a video? @@ -2828,7 +2414,7 @@ class GenericIE(InfoExtractor): try: try: doc = compat_etree_fromstring(webpage) - except compat_xml_parse_error: + except xml.etree.ElementTree.ParseError: doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': self.report_detected('RSS feed') @@ -2836,12 +2422,10 @@ class GenericIE(InfoExtractor): elif doc.tag == 'SmoothStreamingMedia': info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url) self.report_detected('ISM manifest') - self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): smil = self._parse_smil(doc, url, video_id) self.report_detected('SMIL file') - self._sort_formats(smil['formats']) return smil elif doc.tag == '{http://xspf.org/ns/0/}playlist': self.report_detected('XSPF playlist') @@ -2855,947 +2439,83 @@ class GenericIE(InfoExtractor): doc, mpd_base_url=full_response.geturl().rpartition('/')[0], mpd_url=url) + info_dict.update(self._fragment_query(url)) self.report_detected('DASH manifest') - self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id) self.report_detected('F4M manifest') - self._sort_formats(info_dict['formats']) return info_dict - except compat_xml_parse_error: + except xml.etree.ElementTree.ParseError: pass - # Is it a Camtasia project? - camtasia_res = self._extract_camtasia(url, video_id, webpage) - if camtasia_res is not None: - self.report_detected('Camtasia video') - return camtasia_res + info_dict.update({ + # it's tempting to parse this further, but you would + # have to take into account all the variations like + # Video Title - Site Name + # Site Name | Video Title + # Video Title - Tagline | Site Name + # and so on and so forth; it's just not practical + 'title': self._generic_title('', webpage, default='video'), + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'age_limit': self._rta_search(webpage), + }) + + self._downloader.write_debug('Looking for embeds') + embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict)) + if len(embeds) == 1: + return {**info_dict, **embeds[0]} + elif embeds: + return self.playlist_result(embeds, **info_dict) + raise UnsupportedError(url) + + def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): + """Returns an iterator of video entries""" + info_dict = types.MappingProxyType(info_dict) # Prevents accidental mutation + video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url) + url, smuggled_data = unsmuggle_url(url, {}) + actual_url = urlh.geturl() if urlh else url # Sometimes embedded video player is hidden behind percent encoding # (e.g. 
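# Editor's aside (not part of the patch): the try/except chain above dispatches
# on the parsed document's root tag, with namespaced tags spelled in
# ElementTree's {namespace}tag form (compare the f4m pattern in this hunk). A
# reduced model of that classification:
import re
import xml.etree.ElementTree as ET

def classify(xml_text):
    try:
        doc = ET.fromstring(xml_text)
    except ET.ParseError:
        return None  # not XML -> fall through to the HTML/embed paths
    if doc.tag == 'rss':
        return 'RSS feed'
    if doc.tag == 'SmoothStreamingMedia':
        return 'ISM manifest'
    if re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
        return 'F4M manifest'
    return 'unrecognized XML'

assert classify('<rss version="2.0"/>') == 'RSS feed'
assert classify('<manifest xmlns="http://ns.adobe.com/f4m/1.0"/>') == 'F4M manifest'
assert classify('plain text, not XML') is None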
https://github.com/ytdl-org/youtube-dl/issues/2448) # Unescaping the whole page allows to handle those cases in a generic way # FIXME: unescaping the whole page may break URLs, commenting out for now. # There probably should be a second run of generic extractor on unescaped webpage. - # webpage = compat_urllib_parse_unquote(webpage) - - # Unescape squarespace embeds to be detected by generic extractor, - # see https://github.com/ytdl-org/youtube-dl/issues/21294 - webpage = re.sub( - r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>', - lambda x: unescapeHTML(x.group(0)), webpage) - - # it's tempting to parse this further, but you would - # have to take into account all the variations like - # Video Title - Site Name - # Site Name | Video Title - # Video Title - Tagline | Site Name - # and so on and so forth; it's just not practical - video_title = (self._og_search_title(webpage, default=None) - or self._html_extract_title(webpage, 'video title', default='video')) - - # Try to detect age limit automatically - age_limit = self._rta_search(webpage) - # And then there are the jokers who advertise that they use RTA, - # but actually don't. - AGE_LIMIT_MARKERS = [ - r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', - ] - if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS): - age_limit = 18 - - # video uploader is domain name - video_uploader = self._search_regex( - r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') - - video_description = self._og_search_description(webpage, default=None) - video_thumbnail = self._og_search_thumbnail(webpage, default=None) - - info_dict.update({ - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'age_limit': age_limit, - }) - - self._downloader.write_debug('Looking for video embeds') - - # Look for Brightcove Legacy Studio embeds - bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) - if bc_urls: - entries = [{ - '_type': 'url', - 'url': smuggle_url(bc_url, {'Referer': url}), - 'ie_key': 'BrightcoveLegacy' - } for bc_url in bc_urls] - - return { - '_type': 'playlist', - 'title': video_title, - 'id': video_id, - 'entries': entries, - } - - # Look for Brightcove New Studio embeds - bc_urls = BrightcoveNewIE._extract_urls(self, webpage) - if bc_urls: - return self.playlist_from_matches( - bc_urls, video_id, video_title, - getter=lambda x: smuggle_url(x, {'referrer': url}), - ie='BrightcoveNew') - - # Look for Nexx embeds - nexx_urls = NexxIE._extract_urls(webpage) - if nexx_urls: - return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key()) - - # Look for Nexx iFrame embeds - nexx_embed_urls = NexxEmbedIE._extract_urls(webpage) - if nexx_embed_urls: - return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key()) - - # Look for ThePlatform embeds - tp_urls = ThePlatformIE._extract_urls(webpage) - if tp_urls: - return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') - - arc_urls = ArcPublishingIE._extract_urls(webpage) - if arc_urls: - return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key()) - - mychannels_urls = MedialaanIE._extract_urls(webpage) - if mychannels_urls: - return self.playlist_from_matches( - mychannels_urls, video_id, video_title, ie=MedialaanIE.ie_key()) - - # Look for embedded rtl.nl player - matches = re.findall( - 
r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', - webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') - - vimeo_urls = VimeoIE._extract_urls(url, webpage) - if vimeo_urls: - return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) - - vhx_url = VHXEmbedIE._extract_url(webpage) - if vhx_url: - return self.url_result(vhx_url, VHXEmbedIE.ie_key()) - - # Invidious Instances - # https://github.com/hypervideo/hypervideo/issues/195 - # https://github.com/iv-org/invidious/pull/1730 - youtube_url = self._search_regex( - r'<link rel="alternate" href="(https://www\.youtube\.com/watch\?v=[0-9A-Za-z_-]{11})"', - webpage, 'youtube link', default=None) - if youtube_url: - return self.url_result(youtube_url, YoutubeIE.ie_key()) - - # Look for YouTube embeds - youtube_urls = YoutubeIE._extract_urls(webpage) - if youtube_urls: - return self.playlist_from_matches( - youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key()) - - matches = DailymotionIE._extract_urls(webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title) - - # Look for embedded Dailymotion playlist player (#3822) - m = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage) - if m: - playlists = re.findall( - r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) - if playlists: - return self.playlist_from_matches( - playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) - - # Look for DailyMail embeds - dailymail_urls = DailyMailIE._extract_urls(webpage) - if dailymail_urls: - return self.playlist_from_matches( - dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key()) - - # Look for Teachable embeds, must be before Wistia - teachable_url = TeachableIE._extract_url(webpage, url) - if teachable_url: - return self.url_result(teachable_url) - - # Look for embedded Wistia player - wistia_urls = WistiaIE._extract_urls(webpage) - if wistia_urls: - playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key()) - for entry in playlist['entries']: - entry.update({ - '_type': 'url_transparent', - 'uploader': video_uploader, - }) - return playlist - - # Look for SVT player - svt_url = SVTIE._extract_url(webpage) - if svt_url: - return self.url_result(svt_url, 'SVT') - - # Look for Bandcamp pages with custom domain - mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) - if mobj is not None: - burl = unescapeHTML(mobj.group(1)) - # Don't set the extractor because it can be a track url or an album - return self.url_result(burl) - - # Look for embedded Vevo player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for embedded Viddler player - mobj = re.search( - r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for NYTimes player - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Libsyn player - mobj = re.search( - 
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Ooyala videos - mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) - or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) - or re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) - or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) - or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) - if mobj is not None: - embed_token = self._search_regex( - r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)', - webpage, 'ooyala embed token', default=None) - return OoyalaIE._build_url_result(smuggle_url( - mobj.group('ec'), { - 'domain': url, - 'embed_token': embed_token, - })) - - # Look for multiple Ooyala embeds on SBN network websites - mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) - if mobj is not None: - embeds = self._parse_json(mobj.group(1), video_id, fatal=False) - if embeds: - return self.playlist_from_matches( - embeds, video_id, video_title, - getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') - - # Look for Aparat videos - mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) - if mobj is not None: - return self.url_result(mobj.group(1), 'Aparat') - - # Look for MPORA videos - mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage) - if mobj is not None: - return self.url_result(mobj.group(1), 'Mpora') - - # Look for embedded Facebook player - facebook_urls = FacebookIE._extract_urls(webpage) - if facebook_urls: - return self.playlist_from_matches(facebook_urls, video_id, video_title) - - # Look for embedded VK player - mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'VK') + # webpage = urllib.parse.unquote(webpage) - # Look for embedded Odnoklassniki player - odnoklassniki_url = OdnoklassnikiIE._extract_url(webpage) - if odnoklassniki_url: - return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) - - # Look for sibnet embedded player - sibnet_urls = VKIE._extract_sibnet_urls(webpage) - if sibnet_urls: - return self.playlist_from_matches(sibnet_urls, video_id, video_title) - - # Look for embedded ivi player - mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Ivi') - - # Look for embedded Huffington Post player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'HuffPost') - - # Look for embed.ly - mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage) - if mobj is not None: - return self.url_result(compat_urllib_parse_unquote(mobj.group('url'))) - - # Look for funnyordie embed - matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage) - if 
matches: - return self.playlist_from_matches( - matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie') - - # Look for Simplecast embeds - simplecast_urls = SimplecastIE._extract_urls(webpage) - if simplecast_urls: - return self.playlist_from_matches( - simplecast_urls, video_id, video_title) - - # Look for BBC iPlayer embed - matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk') - - # Look for embedded RUTV player - rutv_url = RUTVIE._extract_url(webpage) - if rutv_url: - return self.url_result(rutv_url, 'RUTV') - - # Look for embedded TVC player - tvc_url = TVCIE._extract_url(webpage) - if tvc_url: - return self.url_result(tvc_url, 'TVC') - - # Look for embedded SportBox player - sportbox_urls = SportBoxIE._extract_urls(webpage) - if sportbox_urls: - return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key()) - - # Look for embedded XHamster player - xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) - if xhamster_urls: - return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed') - - # Look for embedded TNAFlixNetwork player - tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage) - if tnaflix_urls: - return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key()) - - # Look for embedded PornHub player - pornhub_urls = PornHubIE._extract_urls(webpage) - if pornhub_urls: - return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key()) - - # Look for embedded DrTuber player - drtuber_urls = DrTuberIE._extract_urls(webpage) - if drtuber_urls: - return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key()) - - # Look for embedded RedTube player - redtube_urls = RedTubeIE._extract_urls(webpage) - if redtube_urls: - return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key()) - - # Look for embedded Tube8 player - tube8_urls = Tube8IE._extract_urls(webpage) - if tube8_urls: - return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key()) - - # Look for embedded Mofosex player - mofosex_urls = MofosexEmbedIE._extract_urls(webpage) - if mofosex_urls: - return self.playlist_from_matches(mofosex_urls, video_id, video_title, ie=MofosexEmbedIE.ie_key()) - - # Look for embedded Spankwire player - spankwire_urls = SpankwireIE._extract_urls(webpage) - if spankwire_urls: - return self.playlist_from_matches(spankwire_urls, video_id, video_title, ie=SpankwireIE.ie_key()) - - # Look for embedded YouPorn player - youporn_urls = YouPornIE._extract_urls(webpage) - if youporn_urls: - return self.playlist_from_matches(youporn_urls, video_id, video_title, ie=YouPornIE.ie_key()) - - # Look for embedded Tvigle player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Tvigle') - - # Look for embedded TED player - ted_urls = TedEmbedIE._extract_urls(webpage) - if ted_urls: - return self.playlist_from_matches(ted_urls, video_id, video_title, ie=TedEmbedIE.ie_key()) - - # Look for embedded Ustream videos - ustream_url = UstreamIE._extract_url(webpage) - if ustream_url: - return self.url_result(ustream_url, UstreamIE.ie_key()) - - # Look for embedded arte.tv player - arte_urls = 
ArteTVEmbedIE._extract_urls(webpage) - if arte_urls: - return self.playlist_from_matches(arte_urls, video_id, video_title) - - # Look for embedded francetv player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for embedded Myvi.ru player - myvi_url = MyviIE._extract_url(webpage) - if myvi_url: - return self.url_result(myvi_url) - - # Look for embedded soundcloud player - soundcloud_urls = SoundcloudEmbedIE._extract_urls(webpage) - if soundcloud_urls: - return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML) - - # Look for tunein player - tunein_urls = TuneInBaseIE._extract_urls(webpage) - if tunein_urls: - return self.playlist_from_matches(tunein_urls, video_id, video_title) - - # Look for embedded mtvservices player - mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) - if mtvservices_url: - return self.url_result(mtvservices_url, ie='MTVServicesEmbedded') - - # Look for embedded yahoo player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Yahoo') - - # Look for embedded sbs.com.au player - mobj = re.search( - r'''(?x) - (?: - <meta\s+property="og:video"\s+content=| - <iframe[^>]+?src= - ) - (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'SBS') - - # Look for embedded Cinchcast player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Cinchcast') - - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1', - webpage) - if not mobj: - mobj = re.search( - r'data-video-link=["\'](?P<url>http://m\.mlb\.com/video/[^"\']+)', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'MLB') - - mobj = re.search( - r'<(?:iframe|script)[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL, - webpage) - if mobj is not None: - return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast') - - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Livestream') - - # Look for Zapiks embed - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Zapiks') - - # Look for Kaltura embeds - kaltura_urls = KalturaIE._extract_urls(webpage) - if kaltura_urls: - return self.playlist_from_matches( - kaltura_urls, video_id, video_title, - getter=lambda x: smuggle_url(x, {'source_url': url}), - ie=KalturaIE.ie_key()) - - # Look for EaglePlatform embeds - eagleplatform_url = EaglePlatformIE._extract_url(webpage) - if eagleplatform_url: - return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key()) - - # Look for ClipYou (uses EaglePlatform) embeds - mobj = re.search( - r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) - if mobj is not None: - return 
self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform') - - # Look for Pladform embeds - pladform_url = PladformIE._extract_url(webpage) - if pladform_url: - return self.url_result(pladform_url) - - # Look for Videomore embeds - videomore_url = VideomoreIE._extract_url(webpage) - if videomore_url: - return self.url_result(videomore_url) - - # Look for Webcaster embeds - webcaster_url = WebcasterFeedIE._extract_url(self, webpage) - if webcaster_url: - return self.url_result(webcaster_url, ie=WebcasterFeedIE.ie_key()) - - # Look for Playwire embeds - mobj = re.search( - r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Crooks and Liars embeds - mobj = re.search( - r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for NBC Sports VPlayer embeds - nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) - if nbc_sports_url: - return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') - - # Look for NBC News embeds - nbc_news_embed_url = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1', webpage) - if nbc_news_embed_url: - return self.url_result(nbc_news_embed_url.group('url'), 'NBCNews') - - # Look for Google Drive embeds - google_drive_url = GoogleDriveIE._extract_url(webpage) - if google_drive_url: - return self.url_result(google_drive_url, 'GoogleDrive') - - # Look for UDN embeds - mobj = re.search( - r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) - if mobj is not None: - return self.url_result( - compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed') - - # Look for Senate ISVP iframe - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) - if senate_isvp_url: - return self.url_result(senate_isvp_url, 'SenateISVP') - - # Look for Kinja embeds - kinja_embed_urls = KinjaEmbedIE._extract_urls(webpage, url) - if kinja_embed_urls: - return self.playlist_from_matches( - kinja_embed_urls, video_id, video_title) - - # Look for OnionStudios embeds - onionstudios_url = OnionStudiosIE._extract_url(webpage) - if onionstudios_url: - return self.url_result(onionstudios_url) - - # Look for Blogger embeds - blogger_urls = BloggerIE._extract_urls(webpage) - if blogger_urls: - return self.playlist_from_matches(blogger_urls, video_id, video_title, ie=BloggerIE.ie_key()) - - # Look for ViewLift embeds - viewlift_url = ViewLiftEmbedIE._extract_url(webpage) - if viewlift_url: - return self.url_result(viewlift_url) - - # Look for JWPlatform embeds - jwplatform_urls = JWPlatformIE._extract_urls(webpage) - if jwplatform_urls: - return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key()) - - # Look for Digiteka embeds - digiteka_url = DigitekaIE._extract_url(webpage) - if digiteka_url: - return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) - - # Look for Arkena embeds - arkena_url = ArkenaIE._extract_url(webpage) - if arkena_url: - return self.url_result(arkena_url, ArkenaIE.ie_key()) - - # Look for Piksel embeds - piksel_url = PikselIE._extract_url(webpage) - if piksel_url: - return self.url_result(piksel_url, PikselIE.ie_key()) - - # Look for Limelight embeds - limelight_urls = LimelightBaseIE._extract_urls(webpage, url) - 
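# Editor's aside (not part of the patch): every hand-rolled "Look for X embeds"
# probe deleted in this hunk had the same shape - a regex over the webpage, then
# url_result/playlist_from_matches. The replacement moves the regex into each
# extractor as a declarative _EMBED_REGEX (visible in the Gfycat and Glomex hunks
# further down) and lets a shared driver walk all extractors. A simplified,
# hypothetical model of that driver, not the library's actual implementation:
import re

class FakePlayerIE:
    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>https?://player\.example\.com/embed/\w+)']

    @classmethod
    def extract_embed_urls(cls, webpage):
        for pattern in cls._EMBED_REGEX:
            for mobj in re.finditer(pattern, webpage):
                yield mobj.group('url')

page = '<iframe src="https://player.example.com/embed/abc123"></iframe>'
assert list(FakePlayerIE.extract_embed_urls(page)) == ['https://player.example.com/embed/abc123']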
if limelight_urls: - return self.playlist_result( - limelight_urls, video_id, video_title, video_description) - - # Look for Anvato embeds - anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id) - if anvato_urls: - return self.playlist_result( - anvato_urls, video_id, video_title, video_description) - - # Look for AdobeTVVideo embeds - mobj = re.search( - r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group(1))), - 'AdobeTVVideo') - - # Look for Vine embeds - mobj = re.search( - r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') - - # Look for VODPlatform embeds - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:(?:www\.)?vod-platform\.net|embed\.kwikmotion\.com)/[eE]mbed/.+?)\1', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group('url'))), 'VODPlatform') - - # Look for Mangomolo embeds - mobj = re.search( - r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?// - (?: - admin\.mangomolo\.com/analytics/index\.php/customers/embed| - player\.mangomolo\.com/v1 - )/ - (?: - video\?.*?\bid=(?P<video_id>\d+)| - (?:index|live)\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+) - ).+?)\1''', webpage) - if mobj is not None: - info = { - '_type': 'url_transparent', - 'url': self._proto_relative_url(unescapeHTML(mobj.group('url'))), - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'uploader': video_uploader, - } - video_id = mobj.group('video_id') - if video_id: - info.update({ - 'ie_key': 'MangomoloVideo', - 'id': video_id, - }) - else: - info.update({ - 'ie_key': 'MangomoloLive', - 'id': mobj.group('channel_id'), - }) - return info - - # Look for Instagram embeds - instagram_embed_url = InstagramIE._extract_embed_url(webpage) - if instagram_embed_url is not None: - return self.url_result( - self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) - - # Look for 3Q SDN embeds - threeqsdn_url = ThreeQSDNIE._extract_url(webpage) - if threeqsdn_url: - return { - '_type': 'url_transparent', - 'ie_key': ThreeQSDNIE.ie_key(), - 'url': self._proto_relative_url(threeqsdn_url), - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'uploader': video_uploader, - } - - # Look for VBOX7 embeds - vbox7_url = Vbox7IE._extract_url(webpage) - if vbox7_url: - return self.url_result(vbox7_url, Vbox7IE.ie_key()) - - # Look for DBTV embeds - dbtv_urls = DBTVIE._extract_urls(webpage) - if dbtv_urls: - return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key()) - - # Look for Videa embeds - videa_urls = VideaIE._extract_urls(webpage) - if videa_urls: - return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key()) - - # Look for 20 minuten embeds - twentymin_urls = TwentyMinutenIE._extract_urls(webpage) - if twentymin_urls: - return self.playlist_from_matches( - twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key()) - - # Look for VideoPress embeds - videopress_urls = VideoPressIE._extract_urls(webpage) - if videopress_urls: - return self.playlist_from_matches( - videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key()) - - # Look for Rutube embeds - 
rutube_urls = RutubeIE._extract_urls(webpage) - if rutube_urls: - return self.playlist_from_matches( - rutube_urls, video_id, video_title, ie=RutubeIE.ie_key()) - - # Look for Glomex embeds - glomex_urls = list(GlomexEmbedIE._extract_urls(webpage, url)) - if glomex_urls: - return self.playlist_from_matches( - glomex_urls, video_id, video_title, ie=GlomexEmbedIE.ie_key()) - - # Look for megatv.com embeds - megatvcom_urls = list(MegaTVComEmbedIE._extract_urls(webpage)) - if megatvcom_urls: - return self.playlist_from_matches( - megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key()) - - # Look for ant1news.gr embeds - ant1newsgr_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage)) - if ant1newsgr_urls: - return self.playlist_from_matches( - ant1newsgr_urls, video_id, video_title, ie=Ant1NewsGrEmbedIE.ie_key()) - - # Look for WashingtonPost embeds - wapo_urls = WashingtonPostIE._extract_urls(webpage) - if wapo_urls: - return self.playlist_from_matches( - wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) - - # Look for Mediaset embeds - mediaset_urls = MediasetIE._extract_urls(self, webpage) - if mediaset_urls: - return self.playlist_from_matches( - mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) - - # Look for JOJ.sk embeds - joj_urls = JojIE._extract_urls(webpage) - if joj_urls: - return self.playlist_from_matches( - joj_urls, video_id, video_title, ie=JojIE.ie_key()) - - # Look for megaphone.fm embeds - mpfn_urls = MegaphoneIE._extract_urls(webpage) - if mpfn_urls: - return self.playlist_from_matches( - mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key()) - - # Look for vzaar embeds - vzaar_urls = VzaarIE._extract_urls(webpage) - if vzaar_urls: - return self.playlist_from_matches( - vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key()) - - channel9_urls = Channel9IE._extract_urls(webpage) - if channel9_urls: - return self.playlist_from_matches( - channel9_urls, video_id, video_title, ie=Channel9IE.ie_key()) - - vshare_urls = VShareIE._extract_urls(webpage) - if vshare_urls: - return self.playlist_from_matches( - vshare_urls, video_id, video_title, ie=VShareIE.ie_key()) - - # Look for Mediasite embeds - mediasite_urls = MediasiteIE._extract_urls(webpage) - if mediasite_urls: - entries = [ - self.url_result(smuggle_url( - compat_urlparse.urljoin(url, mediasite_url), - {'UrlReferrer': url}), ie=MediasiteIE.ie_key()) - for mediasite_url in mediasite_urls] - return self.playlist_result(entries, video_id, video_title) - - springboardplatform_urls = SpringboardPlatformIE._extract_urls(webpage) - if springboardplatform_urls: - return self.playlist_from_matches( - springboardplatform_urls, video_id, video_title, - ie=SpringboardPlatformIE.ie_key()) - - yapfiles_urls = YapFilesIE._extract_urls(webpage) - if yapfiles_urls: - return self.playlist_from_matches( - yapfiles_urls, video_id, video_title, ie=YapFilesIE.ie_key()) - - vice_urls = ViceIE._extract_urls(webpage) - if vice_urls: - return self.playlist_from_matches( - vice_urls, video_id, video_title, ie=ViceIE.ie_key()) - - xfileshare_urls = XFileShareIE._extract_urls(webpage) - if xfileshare_urls: - return self.playlist_from_matches( - xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key()) - - cloudflarestream_urls = CloudflareStreamIE._extract_urls(webpage) - if cloudflarestream_urls: - return self.playlist_from_matches( - cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) - - peertube_urls = PeerTubeIE._extract_urls(webpage, url) - if peertube_urls: - return 
self.playlist_from_matches( - peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) - - indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) - if indavideo_urls: - return self.playlist_from_matches( - indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key()) - - apa_urls = APAIE._extract_urls(webpage) - if apa_urls: - return self.playlist_from_matches( - apa_urls, video_id, video_title, ie=APAIE.ie_key()) - - foxnews_urls = FoxNewsIE._extract_urls(webpage) - if foxnews_urls: - return self.playlist_from_matches( - foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key()) - - sharevideos_urls = [sharevideos_mobj.group('url') for sharevideos_mobj in re.finditer( - r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', - webpage)] - if sharevideos_urls: - return self.playlist_from_matches( - sharevideos_urls, video_id, video_title) - - viqeo_urls = ViqeoIE._extract_urls(webpage) - if viqeo_urls: - return self.playlist_from_matches( - viqeo_urls, video_id, video_title, ie=ViqeoIE.ie_key()) - - expressen_urls = ExpressenIE._extract_urls(webpage) - if expressen_urls: - return self.playlist_from_matches( - expressen_urls, video_id, video_title, ie=ExpressenIE.ie_key()) - - zype_urls = ZypeIE._extract_urls(webpage) - if zype_urls: - return self.playlist_from_matches( - zype_urls, video_id, video_title, ie=ZypeIE.ie_key()) - - gedi_urls = GediDigitalIE._extract_urls(webpage) - if gedi_urls: - return self.playlist_from_matches( - gedi_urls, video_id, video_title, ie=GediDigitalIE.ie_key()) - - # Look for RCS media group embeds - rcs_urls = RCSEmbedsIE._extract_urls(webpage) - if rcs_urls: - return self.playlist_from_matches( - rcs_urls, video_id, video_title, ie=RCSEmbedsIE.ie_key()) - - wimtv_urls = WimTVIE._extract_urls(webpage) - if wimtv_urls: - return self.playlist_from_matches( - wimtv_urls, video_id, video_title, ie=WimTVIE.ie_key()) - - bitchute_urls = BitChuteIE._extract_urls(webpage) - if bitchute_urls: - return self.playlist_from_matches( - bitchute_urls, video_id, video_title, ie=BitChuteIE.ie_key()) - - rumble_urls = RumbleEmbedIE._extract_urls(webpage) - if len(rumble_urls) == 1: - return self.url_result(rumble_urls[0], RumbleEmbedIE.ie_key()) - if rumble_urls: - return self.playlist_from_matches( - rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key()) - - # Look for (tvopen|ethnos).gr embeds - tvopengr_urls = list(TVOpenGrEmbedIE._extract_urls(webpage)) - if tvopengr_urls: - return self.playlist_from_matches(tvopengr_urls, video_id, video_title, ie=TVOpenGrEmbedIE.ie_key()) - - # Look for ert.gr webtv embeds - ertwebtv_urls = list(ERTWebtvEmbedIE._extract_urls(webpage)) - if len(ertwebtv_urls) == 1: - return self.url_result(self._proto_relative_url(ertwebtv_urls[0]), video_title=video_title, url_transparent=True) - elif ertwebtv_urls: - return self.playlist_from_matches(ertwebtv_urls, video_id, video_title, ie=ERTWebtvEmbedIE.ie_key()) - - tvp_urls = TVPEmbedIE._extract_urls(webpage) - if tvp_urls: - return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key()) - - # Look for MainStreaming embeds - mainstreaming_urls = MainStreamingIE._extract_urls(webpage) - if mainstreaming_urls: - return self.playlist_from_matches(mainstreaming_urls, video_id, video_title, ie=MainStreamingIE.ie_key()) - - # Look for Gfycat Embeds - gfycat_urls = GfycatIE._extract_urls(webpage) - if gfycat_urls: - return self.playlist_from_matches(gfycat_urls, video_id, video_title, 
ie=GfycatIE.ie_key()) - - panopto_urls = PanoptoBaseIE._extract_urls(webpage) - if panopto_urls: - return self.playlist_from_matches(panopto_urls, video_id, video_title) - - # Look for Ruutu embeds - ruutu_url = RuutuIE._extract_url(webpage) - if ruutu_url: - return self.url_result(ruutu_url, RuutuIE) - - # Look for HTML5 media - entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') - if entries: - self.report_detected('HTML5 media') - if len(entries) == 1: - entries[0].update({ - 'id': video_id, - 'title': video_title, - }) - else: - for num, entry in enumerate(entries, start=1): - entry.update({ - 'id': '%s-%s' % (video_id, num), - 'title': '%s (%d)' % (video_title, num), - }) - for entry in entries: - self._sort_formats(entry['formats']) - return self.playlist_result(entries, video_id, video_title) + embeds = [] + for ie in self._downloader._ies.values(): + if ie.ie_key() in smuggled_data.get('block_ies', []): + continue + gen = ie.extract_from_webpage(self._downloader, url, webpage) + current_embeds = [] + try: + while True: + current_embeds.append(next(gen)) + except self.StopExtraction: + self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds), + embeds and 'discarding other embeds') + return current_embeds + except StopIteration: + self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds)) + embeds.extend(current_embeds) + + if embeds: + return embeds jwplayer_data = self._find_jwplayer_data( webpage, video_id, transform_source=js_to_json) if jwplayer_data: if isinstance(jwplayer_data.get('playlist'), str): self.report_detected('JW Player playlist') - return { - **info_dict, - '_type': 'url', - 'ie_key': JWPlatformIE.ie_key(), - 'url': jwplayer_data['playlist'], - } + return [self.url_result(jwplayer_data['playlist'], 'JWPlatform')] try: info = self._parse_jwplayer_data( jwplayer_data, video_id, require_title=False, base_url=url) - self.report_detected('JW Player data') - return merge_dicts(info, info_dict) + if traverse_obj(info, 'formats', ('entries', ..., 'formats')): + self.report_detected('JW Player data') + return [info] except ExtractorError: # See https://github.com/ytdl-org/youtube-dl/pull/16735 pass @@ -3806,24 +2526,21 @@ class GenericIE(InfoExtractor): webpage) if mobj is not None: varname = mobj.group(1) - sources = self._parse_json( - mobj.group(2), video_id, transform_source=js_to_json, - fatal=False) or [] - if not isinstance(sources, list): - sources = [sources] + sources = variadic(self._parse_json( + mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or []) formats = [] subtitles = {} for source in sources: src = source.get('src') - if not src or not isinstance(src, compat_str): + if not src or not isinstance(src, str): continue - src = compat_urlparse.urljoin(url, src) + src = urllib.parse.urljoin(url, src) src_type = source.get('type') - if isinstance(src_type, compat_str): + if isinstance(src_type, str): src_type = src_type.lower() ext = determine_ext(src).lower() if src_type == 'video/youtube': - return self.url_result(src, YoutubeIE.ie_key()) + return [self.url_result(src, YoutubeIE.ie_key())] if src_type == 'application/dash+xml' or ext == 'mpd': fmts, subs = self._extract_mpd_formats_and_subtitles( src, video_id, mpd_id='dash', fatal=False) @@ -3835,13 +2552,16 @@ class GenericIE(InfoExtractor): m3u8_id='hls', fatal=False) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) - else: + for fmt in formats: + fmt.update(self._fragment_query(src)) + + if not formats: 
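# Editor's aside (not part of the patch): the _extract_embeds loop above drives
# each extractor's generator by hand so it can distinguish normal exhaustion
# (StopIteration: keep collecting) from an extractor claiming the page
# exclusively (self.StopExtraction: return only its entries). A minimal
# standalone model of that control flow, with made-up entry data:
class StopExtraction(Exception):
    pass

def exclusive_extractor(webpage):
    yield {'url': 'https://example.com/only-this'}
    raise StopExtraction  # discard whatever other extractors may have found

def collect_embeds(generators):
    embeds = []
    for gen in generators:
        current = []
        try:
            while True:
                current.append(next(gen))
        except StopExtraction:
            return current
        except StopIteration:
            embeds.extend(current)
    return embeds

assert collect_embeds([exclusive_extractor('')]) == [{'url': 'https://example.com/only-this'}]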
formats.append({ 'url': src, 'ext': (mimetype2ext(src_type) or ext if ext in KNOWN_EXTENSIONS else 'mp4'), 'http_headers': { - 'Referer': full_response.geturl(), + 'Referer': actual_url, }, }) # https://docs.videojs.com/player#addRemoteTextTrack @@ -3853,39 +2573,36 @@ class GenericIE(InfoExtractor): if not src: continue subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({ - 'url': compat_urlparse.urljoin(url, src), + 'url': urllib.parse.urljoin(url, src), 'name': sub.get('label'), 'http_headers': { - 'Referer': full_response.geturl(), + 'Referer': actual_url, }, }) if formats or subtitles: self.report_detected('video.js embed') - self._sort_formats(formats) - info_dict['formats'] = formats - info_dict['subtitles'] = subtitles - return info_dict + return [{'formats': formats, 'subtitles': subtitles}] # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld(webpage, video_id, default={}) if json_ld.get('url') not in (url, None): self.report_detected('JSON LD') - if determine_ext(json_ld['url']) == 'm3u8': - json_ld['formats'], json_ld['subtitles'] = self._extract_m3u8_formats_and_subtitles( - json_ld['url'], video_id, 'mp4') - json_ld.pop('url') - self._sort_formats(json_ld['formats']) - else: - json_ld['_type'] = 'url_transparent' - json_ld['url'] = smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True}) - return merge_dicts(json_ld, info_dict) + is_direct = json_ld.get('ext') not in (None, *MEDIA_EXTENSIONS.manifests) + return [merge_dicts({ + '_type': 'video' if is_direct else 'url_transparent', + 'url': smuggle_url(json_ld['url'], { + 'force_videoid': video_id, + 'to_generic': True, + 'http_headers': {'Referer': url}, + }), + }, json_ld)] def check_video(vurl): if YoutubeIE.suitable(vurl): return True if RtmpIE.suitable(vurl): return True - vpath = compat_urlparse.urlparse(vurl).path + vpath = urllib.parse.urlparse(vurl).path vext = determine_ext(vpath, None) return vext not in (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml') @@ -3947,15 +2664,13 @@ class GenericIE(InfoExtractor): if not formats[-1].get('height'): formats[-1]['quality'] = 1 - self._sort_formats(formats) - - return { + return [{ 'id': flashvars['video_id'], 'display_id': display_id, 'title': title, 'thumbnail': thumbnail, 'formats': formats, - } + }] if not found: # Broaden the search a little bit found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) @@ -3994,7 +2709,7 @@ class GenericIE(InfoExtractor): self.report_detected('Twitter card') if not found: # We look for Open Graph info: - # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am) + # We have to match any number spaces between elements, some sites try to align them, e.g.: statigr.am m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage) # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: if m_video_type is not None: @@ -4009,20 +2724,14 @@ class GenericIE(InfoExtractor): webpage) if not found: # Look also in Refresh HTTP header - refresh_header = head_response.headers.get('Refresh') + refresh_header = urlh and urlh.headers.get('Refresh') if refresh_header: - # In python 2 response HTTP headers are bytestrings - if sys.version_info < (3, 0) and isinstance(refresh_header, str): - refresh_header = refresh_header.decode('iso-8859-1') found = re.search(REDIRECT_REGEX, refresh_header) if found: - new_url = 
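# Editor's aside (not part of the patch): the Refresh branch here turns an HTTP
# header like "Refresh: 0; url=/next" into an absolute URL to follow. The regex
# below is illustrative only, not the module's REDIRECT_REGEX:
import re
import urllib.parse

def refresh_target(base_url, refresh_header):
    mobj = re.search(r'(?i)\d+\s*;\s*url=[\'"]?([^\'"\s]+)', refresh_header or '')
    return urllib.parse.urljoin(base_url, mobj.group(1)) if mobj else None

assert refresh_target('https://example.com/a', '0; url=/b') == 'https://example.com/b'
assert refresh_target('https://example.com/a', None) is None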
compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) + new_url = urllib.parse.urljoin(url, unescapeHTML(found.group(1))) if new_url != url: self.report_following_redirect(new_url) - return { - '_type': 'url', - 'url': new_url, - } + return [self.url_result(new_url)] else: found = None @@ -4033,34 +2742,35 @@ class GenericIE(InfoExtractor): embed_url = self._html_search_meta('twitter:player', webpage, default=None) if embed_url and embed_url != url: self.report_detected('twitter:player iframe') - return self.url_result(embed_url) + return [self.url_result(embed_url)] if not found: - raise UnsupportedError(url) + return [] + + domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader', default=None) entries = [] for video_url in orderedSet(found): video_url = unescapeHTML(video_url) video_url = video_url.replace('\\/', '/') - video_url = compat_urlparse.urljoin(url, video_url) - video_id = compat_urllib_parse_unquote(os.path.basename(video_url)) + video_url = urllib.parse.urljoin(url, video_url) + video_id = urllib.parse.unquote(os.path.basename(video_url)) # Sometimes, jwplayer extraction will result in a YouTube URL if YoutubeIE.suitable(video_url): entries.append(self.url_result(video_url, 'Youtube')) continue - # here's a fun little line of code for you: video_id = os.path.splitext(video_id)[0] headers = { - 'referer': full_response.geturl() + 'referer': actual_url } entry_info_dict = { 'id': video_id, - 'uploader': video_uploader, - 'title': video_title, - 'age_limit': age_limit, + 'uploader': domain_name, + 'title': info_dict['title'], + 'age_limit': info_dict['age_limit'], 'http_headers': headers, } @@ -4077,11 +2787,13 @@ class GenericIE(InfoExtractor): if ext == 'smil': entry_info_dict = {**self._extract_smil_info(video_url, video_id), **entry_info_dict} elif ext == 'xspf': - return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id) + return [self._extract_xspf_playlist(video_url, video_id)] elif ext == 'm3u8': entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers) + entry_info_dict.update(self._fragment_query(video_url)) elif ext == 'mpd': entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers) + entry_info_dict.update(self._fragment_query(video_url)) elif ext == 'f4m': entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers) elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: @@ -4102,19 +2814,11 @@ class GenericIE(InfoExtractor): else: entry_info_dict['url'] = video_url - if entry_info_dict.get('formats'): - self._sort_formats(entry_info_dict['formats']) - entries.append(entry_info_dict) - if len(entries) == 1: - return entries[0] - else: + if len(entries) > 1: for num, e in enumerate(entries, start=1): # 'url' results don't have a title if e.get('title') is not None: e['title'] = '%s (%d)' % (e['title'], num) - return { - '_type': 'playlist', - 'entries': entries, - } + return entries diff --git a/hypervideo_dl/extractor/genericembeds.py b/hypervideo_dl/extractor/genericembeds.py new file mode 100644 index 0000000..9b4f14d --- /dev/null +++ b/hypervideo_dl/extractor/genericembeds.py @@ -0,0 +1,114 @@ +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import make_archive_id, unescapeHTML + + +class HTML5MediaEmbedIE(InfoExtractor): + _VALID_URL = False 
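# Editor's aside (not part of the patch): HTML5 media entries used to be emitted
# directly by GenericIE, so the new extractor beginning here backfills
# _old_archive_ids (see its _extract_from_webpage just below) with keys under the
# old "generic" name. make_archive_id is imported from ..utils above; the
# one-line stand-in here only assumes the key format is the lowercased IE name
# plus the video id:
def make_archive_id(ie_name, video_id):
    return f'{ie_name.lower()} {video_id}'

entry = {'id': 'myvid-1', '_old_archive_ids': [make_archive_id('Generic', 'myvid-1')]}
assert entry['_old_archive_ids'] == ['generic myvid-1']  # old download-archive lines keep matching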
+ IE_NAME = 'html5' + _WEBPAGE_TESTS = [ + { + 'url': 'https://html.com/media/', + 'info_dict': { + 'title': 'HTML5 Media', + 'description': 'md5:933b2d02ceffe7a7a0f3c8326d91cc2a', + }, + 'playlist_count': 2 + } + ] + + def _extract_from_webpage(self, url, webpage): + video_id, title = self._generic_id(url), self._generic_title(url, webpage) + entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') or [] + for num, entry in enumerate(entries, start=1): + entry.update({ + 'id': f'{video_id}-{num}', + 'title': f'{title} ({num})', + '_old_archive_ids': [ + make_archive_id('generic', f'{video_id}-{num}' if len(entries) > 1 else video_id), + ], + }) + yield entry + + +class QuotedHTMLIE(InfoExtractor): + """For common cases of quoted/escaped html parts in the webpage""" + _VALID_URL = False + IE_NAME = 'generic:quoted-html' + IE_DESC = False # Do not list + _WEBPAGE_TESTS = [{ + # 2 YouTube embeds in data-html + 'url': 'https://24tv.ua/bronetransporteri-ozbroyenni-zsu-shho-vidomo-pro-bronovik-wolfhound_n2167966', + 'info_dict': { + 'id': 'bronetransporteri-ozbroyenni-zsu-shho-vidomo-pro-bronovik-wolfhound_n2167966', + 'title': 'Броньовик Wolfhound: гігант, який допомагає ЗСУ знищувати окупантів на фронті', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'timestamp': float, + 'upload_date': str, + 'description': 'md5:6816e1e5a65304bd7898e4c7eb1b26f7', + 'age_limit': 0, + }, + 'playlist_count': 2 + }, { + # Generic iframe embed of TV24UAPlayerIE within data-html + 'url': 'https://24tv.ua/harkivyani-zgaduyut-misto-do-viyni-shhemlive-video_n1887584', + 'info_dict': { + 'id': '1887584', + 'ext': 'mp4', + 'title': 'Харків\'яни згадують місто до війни: щемливе відео', + 'thumbnail': r're:^https?://.*\.jpe?g', + }, + 'params': {'skip_download': True} + }, { + # YouTube embeds on Squarespace (data-html): https://github.com/ytdl-org/youtube-dl/issues/21294 + 'url': 'https://www.harvardballetcompany.org/past-productions', + 'info_dict': { + 'id': 'past-productions', + 'title': 'Productions — Harvard Ballet Company', + 'age_limit': 0, + 'description': 'Past Productions', + }, + 'playlist_mincount': 26 + }, { + # Squarespace video embed, 2019-08-28, data-html + 'url': 'http://ootboxford.com', + 'info_dict': { + 'id': 'Tc7b_JGdZfw', + 'title': 'Out of the Blue, at Childish Things 10', + 'ext': 'mp4', + 'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f', + 'uploader_id': 'helendouglashouse', + 'uploader': 'Helen & Douglas House', + 'upload_date': '20140328', + 'availability': 'public', + 'view_count': int, + 'channel': 'Helen & Douglas House', + 'comment_count': int, + 'uploader_url': 'http://www.youtube.com/user/helendouglashouse', + 'duration': 253, + 'channel_url': 'https://www.youtube.com/channel/UCTChGezrZVmlYlpMlkmulPA', + 'playable_in_embed': True, + 'age_limit': 0, + 'channel_follower_count': int, + 'channel_id': 'UCTChGezrZVmlYlpMlkmulPA', + 'tags': 'count:6', + 'categories': ['Nonprofits & Activism'], + 'like_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/Tc7b_JGdZfw/hqdefault.jpg', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _extract_from_webpage(self, url, webpage): + combined = '' + for _, html in re.findall(r'(?s)\bdata-html=(["\'])((?:(?!\1).)+)\1', webpage): + # unescapeHTML can handle " etc., unquote can handle percent encoding + unquoted_html = unescapeHTML(urllib.parse.unquote(html)) + if unquoted_html != html: + combined += unquoted_html + if combined: + yield from self._extract_generic_embeds(url, combined) diff --git 
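# Editor's aside (not part of the patch): QuotedHTMLIE above recovers embeds that
# sites stash in data-html attributes as entity- and/or percent-encoded markup.
# Decoding is a no-op on plain text, so "decoded != original" doubles as the
# detection test. Round trip with stdlib stand-ins for the utils used above
# (html.unescape playing the role of unescapeHTML):
import html
import urllib.parse

stored = '&lt;iframe src=&quot;https://www.youtube.com/embed/abc&quot;&gt;&lt;/iframe&gt;'
decoded = html.unescape(urllib.parse.unquote(stored))
assert decoded == '<iframe src="https://www.youtube.com/embed/abc"></iframe>'
assert decoded != stored  # changed -> worth rescanning for embeds
assert html.unescape(urllib.parse.unquote('plain text')) == 'plain text'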
a/hypervideo_dl/extractor/genius.py b/hypervideo_dl/extractor/genius.py new file mode 100644 index 0000000..62f5a28 --- /dev/null +++ b/hypervideo_dl/extractor/genius.py @@ -0,0 +1,127 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + js_to_json, + smuggle_url, + str_or_none, + traverse_obj, + unescapeHTML, +) + + +class GeniusIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?genius\.com/videos/(?P<id>[^?/#]+)' + _TESTS = [{ + 'url': 'https://genius.com/videos/Vince-staples-breaks-down-the-meaning-of-when-sparks-fly', + 'md5': '64c2ad98cfafcfda23bfa0ad0c512f4c', + 'info_dict': { + 'id': '6313303597112', + 'ext': 'mp4', + 'title': 'Vince Staples Breaks Down The Meaning Of “When Sparks Fly”', + 'description': 'md5:bc15e00342c537c0039d414423ae5752', + 'tags': 'count:1', + 'uploader_id': '4863540648001', + 'duration': 388.416, + 'upload_date': '20221005', + 'timestamp': 1664982341, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, { + 'url': 'https://genius.com/videos/Breaking-down-drakes-certified-lover-boy-kanye-beef-way-2-sexy-cudi', + 'md5': 'b8ed87a5efd1473bd027c20a969d4060', + 'info_dict': { + 'id': '6271792014001', + 'ext': 'mp4', + 'title': 'md5:c6355f7fa8a70bc86492a3963919fc15', + 'description': 'md5:1774638c31548b31b037c09e9b821393', + 'tags': 'count:3', + 'uploader_id': '4863540648001', + 'duration': 2685.099, + 'upload_date': '20210909', + 'timestamp': 1631209167, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + metadata = self._search_json( + r'<meta content="', webpage, 'metadata', display_id, transform_source=unescapeHTML) + video_id = traverse_obj( + metadata, ('video', 'provider_id'), + ('dfp_kv', lambda _, x: x['name'] == 'brightcove_video_id', 'values', 0), get_all=False) + if not video_id: + raise ExtractorError('Brightcove video id not found in webpage') + + config = self._search_json(r'var\s*APP_CONFIG\s*=', webpage, 'config', video_id, default={}) + account_id = config.get('brightcove_account_id', '4863540648001') + player_id = traverse_obj( + config, 'brightcove_standard_web_player_id', 'brightcove_standard_no_autoplay_web_player_id', + 'brightcove_modal_web_player_id', 'brightcove_song_story_web_player_id', default='S1ZcmcOC1x') + + return self.url_result( + smuggle_url( + f'https://players.brightcove.net/{account_id}/{player_id}_default/index.html?videoId={video_id}', + {'referrer': url}), 'BrightcoveNew', video_id) + + +class GeniusLyricsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?genius\.com/(?P<id>[^?/#]+)-lyrics[?/#]?' + _TESTS = [{ + 'url': 'https://genius.com/Lil-baby-heyy-lyrics', + 'playlist_mincount': 2, + 'info_dict': { + 'id': '8454545', + 'title': 'Heyy', + 'description': 'Heyy by Lil Baby', + }, + }, { + 'url': 'https://genius.com/Outkast-two-dope-boyz-in-a-cadillac-lyrics', + 'playlist_mincount': 1, + 'info_dict': { + 'id': '36239', + 'title': 'Two Dope Boyz (In a Cadillac)', + 'description': 'Two Dope Boyz (In a Cadillac) by OutKast', + }, + }, { + 'url': 'https://genius.com/Playboi-carti-rip-lyrics', + 'playlist_mincount': 1, + 'info_dict': { + 'id': '3710582', + 'title': 'R.I.P.', + 'description': 'R.I.P. 
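# Editor's aside (not part of the patch): the traverse_obj call in GeniusIE above
# walks metadata['dfp_kv'], keeps the entry whose name is 'brightcove_video_id',
# and takes values[0]. The same lookup written out in plain Python, over made-up
# sample data:
metadata = {'dfp_kv': [
    {'name': 'song_id', 'values': ['123']},
    {'name': 'brightcove_video_id', 'values': ['6313303597112']},
]}

video_id = next(
    (kv['values'][0] for kv in metadata.get('dfp_kv', [])
     if kv.get('name') == 'brightcove_video_id' and kv.get('values')),
    None)
assert video_id == '6313303597112'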
by Playboi Carti', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + json_string = self._search_json( + r'window\.__PRELOADED_STATE__\s*=\s*JSON\.parse\(', webpage, 'json string', + display_id, transform_source=js_to_json, contains_pattern=r'\'{(?s:.+)}\'') + song_info = self._parse_json(json_string, display_id) + song_id = str_or_none(traverse_obj(song_info, ('songPage', 'song'))) + if not song_id: + raise ExtractorError('Song id not found in webpage') + + title = traverse_obj( + song_info, ('songPage', 'trackingData', lambda _, x: x['key'] == 'Title', 'value'), + get_all=False, default='untitled') + artist = traverse_obj( + song_info, ('songPage', 'trackingData', lambda _, x: x['key'] == 'Primary Artist', 'value'), + get_all=False, default='unknown artist') + media = traverse_obj( + song_info, ('entities', 'songs', song_id, 'media'), expected_type=list, default=[]) + + entries = [] + for m in media: + if m.get('type') in ('video', 'audio') and m.get('url'): + if m.get('provider') == 'spotify': + self.to_screen(f'{song_id}: Skipping Spotify audio embed') + else: + entries.append(self.url_result(m['url'])) + + return self.playlist_result(entries, song_id, title, f'{title} by {artist}') diff --git a/hypervideo_dl/extractor/gettr.py b/hypervideo_dl/extractor/gettr.py index 327a4d0..7795dc5 100644 --- a/hypervideo_dl/extractor/gettr.py +++ b/hypervideo_dl/extractor/gettr.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( bool_or_none, @@ -124,8 +121,6 @@ class GettrIE(GettrBaseIE): 'height': int_or_none(post_data.get('vid_hgt')), }) - self._sort_formats(formats) - return { 'id': post_id, 'title': title, @@ -195,8 +190,6 @@ class GettrStreamingIE(GettrBaseIE): 'url': urljoin(self._MEDIA_BASE_URL, thumbnail), } for thumbnail in try_get(video_info, lambda x: x['postData']['imgs'], list) or []] - self._sort_formats(formats) - return { 'id': video_id, 'title': try_get(video_info, lambda x: x['postData']['ttl'], str), diff --git a/hypervideo_dl/extractor/gfycat.py b/hypervideo_dl/extractor/gfycat.py index 2ad03e2..edc2e56 100644 --- a/hypervideo_dl/extractor/gfycat.py +++ b/hypervideo_dl/extractor/gfycat.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -13,7 +8,8 @@ from ..utils import ( class GfycatIE(InfoExtractor): - _VALID_URL = r'(?i)https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\."\']+)' + _VALID_URL = r'https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?i:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\."\']+)' + _EMBED_REGEX = [rf'<(?:iframe|source)[^>]+\bsrc=["\'](?P<url>{_VALID_URL})'] _TESTS = [{ 'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher', 'info_dict': { @@ -85,14 +81,6 @@ class GfycatIE(InfoExtractor): 'only_matching': True }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<(?:iframe|source)[^>]+\bsrc=["\'](?P<url>%s)' % GfycatIE._VALID_URL, - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) @@ -139,7 +127,6 @@ class GfycatIE(InfoExtractor): 'filesize': filesize, 'quality': quality(format_id), }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/giantbomb.py b/hypervideo_dl/extractor/giantbomb.py index 1920923..1125723 100644 --- 
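# Editor's aside (not part of the patch): the Gfycat hunk above replaces a global
# (?i) flag with a scoped (?i:...) group, so only the ru/ifr/gifs-detail path
# prefix is case-insensitive while the rest of the pattern stays case-sensitive.
# Reduced contrast (hypothetical pattern):
import re

scoped = r'https?://gfycat\.com/(?i:ifr/)?(?P<id>[^/?#]+)'
assert re.match(scoped, 'https://gfycat.com/IFR/SomeClip').group('id') == 'SomeClip'
assert re.match(scoped, 'HTTPS://gfycat.com/SomeClip') is None  # scheme part still case-sensitive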
a/hypervideo_dl/extractor/giantbomb.py +++ b/hypervideo_dl/extractor/giantbomb.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -76,8 +74,6 @@ class GiantBombIE(InfoExtractor): if youtube_id: return self.url_result(youtube_id, 'Youtube') - self._sort_formats(formats) - return { 'id': video_id, 'display_id': display_id, diff --git a/hypervideo_dl/extractor/giga.py b/hypervideo_dl/extractor/giga.py index 5a9992a..b59c129 100644 --- a/hypervideo_dl/extractor/giga.py +++ b/hypervideo_dl/extractor/giga.py @@ -1,16 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools from .common import InfoExtractor -from ..utils import ( - qualities, - compat_str, - parse_duration, - parse_iso8601, - str_to_int, -) +from ..compat import compat_str +from ..utils import parse_duration, parse_iso8601, qualities, str_to_int class GigaIE(InfoExtractor): @@ -67,7 +59,6 @@ class GigaIE(InfoExtractor): 'format_id': '%s-%s' % (fmt['quality'], fmt['type'].split('/')[-1]), 'quality': quality(fmt['quality']), }) - self._sort_formats(formats) title = self._html_search_meta( 'title', webpage, 'title', fatal=True) diff --git a/hypervideo_dl/extractor/gigya.py b/hypervideo_dl/extractor/gigya.py index 4121784..c5bc86b 100644 --- a/hypervideo_dl/extractor/gigya.py +++ b/hypervideo_dl/extractor/gigya.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( diff --git a/hypervideo_dl/extractor/glide.py b/hypervideo_dl/extractor/glide.py index 12af859..d114f34 100644 --- a/hypervideo_dl/extractor/glide.py +++ b/hypervideo_dl/extractor/glide.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor @@ -23,7 +20,7 @@ class GlideIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = self._html_extract_title(webpage, default=None) or self._og_search_title(webpage) + title = self._generic_title('', webpage) video_url = self._proto_relative_url(self._search_regex( r'<source[^>]+src=(["\'])(?P<url>.+?)\1', webpage, 'video URL', default=None, diff --git a/hypervideo_dl/extractor/globo.py b/hypervideo_dl/extractor/globo.py index f6aaae1..a7be2cb 100644 --- a/hypervideo_dl/extractor/globo.py +++ b/hypervideo_dl/extractor/globo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 import hashlib import json @@ -142,7 +139,6 @@ class GloboIE(InfoExtractor): fmts, subtitles = self._extract_m3u8_formats_and_subtitles( signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) formats.extend(fmts) - self._sort_formats(formats) for resource in video['resources']: if resource.get('type') == 'subtitle': @@ -181,12 +177,12 @@ class GloboArticleIE(InfoExtractor): _VALID_URL = r'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/.]+)(?:\.html)?' 
_VIDEOID_REGEXES = [ - r'\bdata-video-id=["\'](\d{7,})', - r'\bdata-player-videosids=["\'](\d{7,})', + r'\bdata-video-id=["\'](\d{7,})["\']', + r'\bdata-player-videosids=["\'](\d{7,})["\']', r'\bvideosIDs\s*:\s*["\']?(\d{7,})', - r'\bdata-id=["\'](\d{7,})', - r'<div[^>]+\bid=["\'](\d{7,})', - r'<bs-player[^>]+\bvideoid=["\'](\d{8,})', + r'\bdata-id=["\'](\d{7,})["\']', + r'<div[^>]+\bid=["\'](\d{7,})["\']', + r'<bs-player[^>]+\bvideoid=["\'](\d{8,})["\']', ] _TESTS = [{ @@ -222,6 +218,14 @@ class GloboArticleIE(InfoExtractor): 'description': 'md5:2d089d036c4c9675117d3a56f8c61739', }, 'playlist_count': 1, + }, { + 'url': 'https://redeglobo.globo.com/rpc/meuparana/noticia/a-producao-de-chocolates-no-parana.ghtml', + 'info_dict': { + 'id': 'a-producao-de-chocolates-no-parana', + 'title': 'A produção de chocolates no Paraná', + 'description': 'md5:f2e3daf00ffd1dc0e9a8a6c7cfb0a89e', + }, + 'playlist_count': 2, }] @classmethod @@ -237,6 +241,6 @@ class GloboArticleIE(InfoExtractor): entries = [ self.url_result('globo:%s' % video_id, GloboIE.ie_key()) for video_id in orderedSet(video_ids)] - title = self._og_search_title(webpage) + title = self._og_search_title(webpage).strip() description = self._html_search_meta('description', webpage) return self.playlist_result(entries, display_id, title, description) diff --git a/hypervideo_dl/extractor/glomex.py b/hypervideo_dl/extractor/glomex.py index d9ef433..22aac0d 100644 --- a/hypervideo_dl/extractor/glomex.py +++ b/hypervideo_dl/extractor/glomex.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re import urllib.parse @@ -85,7 +82,6 @@ class GlomexBaseIE(InfoExtractor): if video.get('language'): for fmt in formats: fmt['language'] = video['language'] - self._sort_formats(formats) images = (video.get('images') or []) + [video.get('image') or {}] thumbnails = [{ @@ -177,7 +173,7 @@ class GlomexEmbedIE(GlomexBaseIE): return cls._smuggle_origin_url(f'https:{cls._BASE_PLAYER_URL}?{query_string}', origin_url) @classmethod - def _extract_urls(cls, webpage, origin_url): + def _extract_embed_urls(cls, url, webpage): # https://docs.glomex.com/publisher/video-player-integration/javascript-api/ quot_re = r'["\']' @@ -186,9 +182,9 @@ class GlomexEmbedIE(GlomexBaseIE): (?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=q)).)+ )(?P=q)''' for mobj in re.finditer(regex, webpage): - url = unescapeHTML(mobj.group('url')) - if cls.suitable(url): - yield cls._smuggle_origin_url(url, origin_url) + embed_url = unescapeHTML(mobj.group('url')) + if cls.suitable(embed_url): + yield cls._smuggle_origin_url(embed_url, url) regex = fr'''(?x) <glomex-player [^>]+?>| @@ -196,7 +192,7 @@ class GlomexEmbedIE(GlomexBaseIE): for mobj in re.finditer(regex, webpage): attrs = extract_attributes(mobj.group(0)) if attrs.get('data-integration-id') and attrs.get('data-playlist-id'): - yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], origin_url) + yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], url) # naive parsing of inline scripts for hard-coded integration parameters regex = fr'''(?x) @@ -209,7 +205,7 @@ class GlomexEmbedIE(GlomexBaseIE): continue playlist_id = re.search(regex % 'playlistId', script) if playlist_id: - yield cls.build_player_url(playlist_id, integration_id, origin_url) + yield cls.build_player_url(playlist_id, integration_id, url) def _real_extract(self, url): url, origin_url = self._unsmuggle_origin_url(url) diff --git a/hypervideo_dl/extractor/go.py 
b/hypervideo_dl/extractor/go.py index f92e166..b075a02 100644 --- a/hypervideo_dl/extractor/go.py +++ b/hypervideo_dl/extractor/go.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .adobepass import AdobePassIE @@ -14,6 +11,8 @@ from ..utils import ( try_get, urlencode_postdata, ExtractorError, + unified_timestamp, + traverse_obj, ) @@ -73,7 +72,7 @@ class GoIE(AdobePassIE): }, 'skip': 'This content is no longer available.', }, { - 'url': 'http://watchdisneyxd.go.com/doraemon', + 'url': 'https://disneynow.com/shows/big-hero-6-the-series', 'info_dict': { 'title': 'Doraemon', 'id': 'SH55574025', @@ -83,10 +82,19 @@ class GoIE(AdobePassIE): 'url': 'http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood', 'info_dict': { 'id': 'VDKA3609139', - 'ext': 'mp4', 'title': 'This Guilty Blood', 'description': 'md5:f18e79ad1c613798d95fdabfe96cd292', 'age_limit': 14, + 'episode': 'Episode 1', + 'upload_date': '20170102', + 'season': 'Season 2', + 'thumbnail': 'http://cdn1.edgedatg.com/aws/v2/abcf/Shadowhunters/video/201/ae5f75608d86bf88aa4f9f4aa76ab1b7/579x325-Q100_ae5f75608d86bf88aa4f9f4aa76ab1b7.jpg', + 'duration': 2544, + 'season_number': 2, + 'series': 'Shadowhunters', + 'episode_number': 1, + 'timestamp': 1483387200, + 'ext': 'mp4' }, 'params': { 'geo_bypass_ip_block': '3.244.239.0/24', @@ -94,13 +102,22 @@ class GoIE(AdobePassIE): 'skip_download': True, }, }, { - 'url': 'https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet', + 'url': 'https://abc.com/shows/the-rookie/episode-guide/season-04/12-the-knock', 'info_dict': { - 'id': 'VDKA13435179', - 'ext': 'mp4', - 'title': 'The Bet', - 'description': 'md5:c66de8ba2e92c6c5c113c3ade84ab404', + 'id': 'VDKA26050359', + 'title': 'The Knock', + 'description': 'md5:0c2947e3ada4c31f28296db7db14aa64', 'age_limit': 14, + 'ext': 'mp4', + 'thumbnail': 'http://cdn1.edgedatg.com/aws/v2/abc/TheRookie/video/412/daf830d06e83b11eaf5c0a299d993ae3/1556x876-Q75_daf830d06e83b11eaf5c0a299d993ae3.jpg', + 'episode': 'Episode 12', + 'season_number': 4, + 'season': 'Season 4', + 'timestamp': 1642975200, + 'episode_number': 12, + 'upload_date': '20220123', + 'series': 'The Rookie', + 'duration': 2572, }, 'params': { 'geo_bypass_ip_block': '3.244.239.0/24', @@ -111,24 +128,18 @@ class GoIE(AdobePassIE): 'url': 'https://fxnow.fxnetworks.com/shows/better-things/video/vdka12782841', 'info_dict': { 'id': 'VDKA12782841', - 'ext': 'mp4', 'title': 'First Look: Better Things - Season 2', 'description': 'md5:fa73584a95761c605d9d54904e35b407', - }, - 'params': { - 'geo_bypass_ip_block': '3.244.239.0/24', - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot', - 'info_dict': { - 'id': 'VDKA22600213', 'ext': 'mp4', - 'title': 'Pilot', - 'description': 'md5:74306df917cfc199d76d061d66bebdb4', + 'age_limit': 14, + 'upload_date': '20170825', + 'duration': 161, + 'series': 'Better Things', + 'thumbnail': 'http://cdn1.edgedatg.com/aws/v2/fx/BetterThings/video/12782841/b6b05e58264121cc2c98811318e6d507/1556x876-Q75_b6b05e58264121cc2c98811318e6d507.jpg', + 'timestamp': 1503661074, }, 'params': { + 'geo_bypass_ip_block': '3.244.239.0/24', # m3u8 download 'skip_download': True, }, @@ -282,7 +293,6 @@ class GoIE(AdobePassIE): 'height': height, }) formats.append(f) - self._sort_formats(formats) for cc in video_data.get('closedcaption', {}).get('src', []): cc_url = cc.get('value') @@ -319,4 +329,5 @@ class GoIE(AdobePassIE): 'thumbnails': 
thumbnails, 'formats': formats, 'subtitles': subtitles, + 'timestamp': unified_timestamp(traverse_obj(video_data, ('airdates', 'airdate', 0))), } diff --git a/hypervideo_dl/extractor/godtube.py b/hypervideo_dl/extractor/godtube.py index 96e68b4..6975401 100644 --- a/hypervideo_dl/extractor/godtube.py +++ b/hypervideo_dl/extractor/godtube.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( parse_duration, diff --git a/hypervideo_dl/extractor/gofile.py b/hypervideo_dl/extractor/gofile.py index 62d778c..ddbce2e 100644 --- a/hypervideo_dl/extractor/gofile.py +++ b/hypervideo_dl/extractor/gofile.py @@ -1,4 +1,5 @@ -# coding: utf-8 +import hashlib + from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -19,22 +20,34 @@ class GofileIE(InfoExtractor): 'id': 'de571ac1-5edc-42e2-8ec2-bdac83ad4a31', 'filesize': 928116, 'ext': 'mp4', - 'title': 'nuuh' + 'title': 'nuuh', + 'release_timestamp': 1638338704, + 'release_date': '20211201', } }] - }, { # URL to test mixed file types - 'url': 'https://gofile.io/d/avt34h', + }, { + 'url': 'https://gofile.io/d/is8lKr', 'info_dict': { - 'id': 'avt34h', - }, - 'playlist_mincount': 1, - }, { # URL to test no video/audio error - 'url': 'https://gofile.io/d/aB03lZ', - 'info_dict': { - 'id': 'aB03lZ', + 'id': 'TMjXd9', + 'ext': 'mp4', }, 'playlist_count': 0, 'skip': 'No video/audio found at provided URL.', + }, { + 'url': 'https://gofile.io/d/TMjXd9', + 'info_dict': { + 'id': 'TMjXd9', + }, + 'playlist_count': 1, + }, { + 'url': 'https://gofile.io/d/gqOtRf', + 'info_dict': { + 'id': 'gqOtRf', + }, + 'playlist_mincount': 1, + 'params': { + 'videopassword': 'password', + }, }] _TOKEN = None @@ -50,12 +63,22 @@ class GofileIE(InfoExtractor): self._set_cookie('gofile.io', 'accountToken', self._TOKEN) def _entries(self, file_id): + query_params = { + 'contentId': file_id, + 'token': self._TOKEN, + 'websiteToken': 12345, + } + password = self.get_param('videopassword') + if password: + query_params['password'] = hashlib.sha256(password.encode('utf-8')).hexdigest() files = self._download_json( - f'https://api.gofile.io/getContent?contentId={file_id}&token={self._TOKEN}&websiteToken=websiteToken&cache=true', - 'Gofile', note='Getting filelist') + 'https://api.gofile.io/getContent', file_id, note='Getting filelist', query=query_params) status = files['status'] - if status != 'ok': + if status == 'error-passwordRequired': + raise ExtractorError( + 'This video is protected by a password, use the --video-password option', expected=True) + elif status != 'ok': raise ExtractorError(f'{self.IE_NAME} said: status {status}', expected=True) found_files = False @@ -65,7 +88,7 @@ class GofileIE(InfoExtractor): continue found_files = True - file_url = file.get('directLink') + file_url = file.get('link') if file_url: yield { 'id': file['id'], diff --git a/hypervideo_dl/extractor/golem.py b/hypervideo_dl/extractor/golem.py index 47a068e..c33d950 100644 --- a/hypervideo_dl/extractor/golem.py +++ b/hypervideo_dl/extractor/golem.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_str, @@ -54,7 +51,6 @@ class GolemIE(InfoExtractor): 'filesize': self._int(e.findtext('filesize'), 'filesize'), 'ext': determine_ext(e.findtext('./filename')), }) - self._sort_formats(formats) info['formats'] = formats thumbnails = [] diff --git a/hypervideo_dl/extractor/goodgame.py b/hypervideo_dl/extractor/goodgame.py new file mode 100644 
index 0000000..c17ad56 --- /dev/null +++ b/hypervideo_dl/extractor/goodgame.py @@ -0,0 +1,57 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + str_or_none, + traverse_obj, +) + + +class GoodGameIE(InfoExtractor): + IE_NAME = 'goodgame:stream' + _VALID_URL = r'https?://goodgame\.ru/channel/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://goodgame.ru/channel/Pomi/#autoplay', + 'info_dict': { + 'id': 'pomi', + 'ext': 'mp4', + 'title': r're:Reynor vs Special \(1/2,bo3\) Wardi Spring EU \- playoff \(финальный день\) \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'channel_id': '1644', + 'channel': 'Pomi', + 'channel_url': 'https://goodgame.ru/channel/Pomi/', + 'description': 'md5:4a87b775ee7b2b57bdccebe285bbe171', + 'thumbnail': r're:^https?://.*\.jpg$', + 'live_status': 'is_live', + 'view_count': int, + }, + 'params': {'skip_download': 'm3u8'}, + 'skip': 'May not be online', + }] + + def _real_extract(self, url): + channel_name = self._match_id(url) + response = self._download_json(f'https://api2.goodgame.ru/v2/streams/{channel_name}', channel_name) + player_id = response['channel']['gg_player_src'] + + formats, subtitles = [], {} + if response.get('status') == 'Live': + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'https://hls.goodgame.ru/manifest/{player_id}_master.m3u8', + channel_name, 'mp4', live=True) + else: + self.raise_no_formats('User is offline', expected=True, video_id=channel_name) + + return { + 'id': player_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': traverse_obj(response, ('channel', 'title')), + 'channel': channel_name, + 'channel_id': str_or_none(traverse_obj(response, ('channel', 'id'))), + 'channel_url': response.get('url'), + 'description': clean_html(traverse_obj(response, ('channel', 'description'))), + 'thumbnail': traverse_obj(response, ('channel', 'thumb')), + 'is_live': bool(formats), + 'view_count': int_or_none(response.get('viewers')), + 'age_limit': 18 if traverse_obj(response, ('channel', 'adult')) else None, + } diff --git a/hypervideo_dl/extractor/googledrive.py b/hypervideo_dl/extractor/googledrive.py index 7b5bf28..e027ea7 100644 --- a/hypervideo_dl/extractor/googledrive.py +++ b/hypervideo_dl/extractor/googledrive.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -79,13 +77,13 @@ class GoogleDriveIE(InfoExtractor): _caption_formats_ext = [] _captions_xml = None - @staticmethod - def _extract_url(webpage): + @classmethod + def _extract_embed_urls(cls, url, webpage): mobj = re.search( r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})', webpage) if mobj: - return 'https://drive.google.com/file/d/%s' % mobj.group('id') + yield 'https://drive.google.com/file/d/%s' % mobj.group('id') def _download_subtitles_xml(self, video_id, subtitles_id, hl): if self._captions_xml: @@ -255,8 +253,6 @@ class GoogleDriveIE(InfoExtractor): if not formats and reason: self.raise_no_formats(reason, expected=True) - self._sort_formats(formats) - hl = get_value('hl') subtitles_id = None ttsurl = get_value('ttsurl') @@ -266,7 +262,7 @@ class GoogleDriveIE(InfoExtractor): subtitles_id = ttsurl.encode('utf-8').decode( 'unicode_escape').split('=')[-1] - self._downloader.cookiejar.clear(domain='.google.com', path='/', name='NID') + self.cookiejar.clear(domain='.google.com', path='/', name='NID') return { 'id': video_id, @@ -278,3 +274,59 @@ class GoogleDriveIE(InfoExtractor): 
'automatic_captions': self.extract_automatic_captions( video_id, subtitles_id, hl), } + + +class GoogleDriveFolderIE(InfoExtractor): + IE_NAME = 'GoogleDrive:Folder' + _VALID_URL = r'https?://(?:docs|drive)\.google\.com/drive/folders/(?P<id>[\w-]{28,})' + _TESTS = [{ + 'url': 'https://drive.google.com/drive/folders/1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI', + 'info_dict': { + 'id': '1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI', + 'title': 'Forrest' + }, + 'playlist_count': 3, + }] + _BOUNDARY = '=====vc17a3rwnndj=====' + _REQUEST = "/drive/v2beta/files?openDrive=true&reason=102&syncType=0&errorRecovery=false&q=trashed%20%3D%20false%20and%20'{folder_id}'%20in%20parents&fields=kind%2CnextPageToken%2Citems(kind%2CmodifiedDate%2CmodifiedByMeDate%2ClastViewedByMeDate%2CfileSize%2Cowners(kind%2CpermissionId%2Cid)%2ClastModifyingUser(kind%2CpermissionId%2Cid)%2ChasThumbnail%2CthumbnailVersion%2Ctitle%2Cid%2CresourceKey%2Cshared%2CsharedWithMeDate%2CuserPermission(role)%2CexplicitlyTrashed%2CmimeType%2CquotaBytesUsed%2Ccopyable%2CfileExtension%2CsharingUser(kind%2CpermissionId%2Cid)%2Cspaces%2Cversion%2CteamDriveId%2ChasAugmentedPermissions%2CcreatedDate%2CtrashingUser(kind%2CpermissionId%2Cid)%2CtrashedDate%2Cparents(id)%2CshortcutDetails(targetId%2CtargetMimeType%2CtargetLookupStatus)%2Ccapabilities(canCopy%2CcanDownload%2CcanEdit%2CcanAddChildren%2CcanDelete%2CcanRemoveChildren%2CcanShare%2CcanTrash%2CcanRename%2CcanReadTeamDrive%2CcanMoveTeamDriveItem)%2Clabels(starred%2Ctrashed%2Crestricted%2Cviewed))%2CincompleteSearch&appDataFilter=NO_APP_DATA&spaces=drive&pageToken={page_token}&maxResults=50&supportsTeamDrives=true&includeItemsFromAllDrives=true&corpora=default&orderBy=folder%2Ctitle_natural%20asc&retryCount=0&key={key} HTTP/1.1" + _DATA = f'''--{_BOUNDARY} +content-type: application/http +content-transfer-encoding: binary + +GET %s + +--{_BOUNDARY} +''' + + def _call_api(self, folder_id, key, data, **kwargs): + response = self._download_webpage( + 'https://clients6.google.com/batch/drive/v2beta', + folder_id, data=data.encode('utf-8'), + headers={ + 'Content-Type': 'text/plain;charset=UTF-8;', + 'Origin': 'https://drive.google.com', + }, query={ + '$ct': f'multipart/mixed; boundary="{self._BOUNDARY}"', + 'key': key + }, **kwargs) + return self._search_json('', response, 'api response', folder_id, **kwargs) or {} + + def _get_folder_items(self, folder_id, key): + page_token = '' + while page_token is not None: + request = self._REQUEST.format(folder_id=folder_id, page_token=page_token, key=key) + page = self._call_api(folder_id, key, self._DATA % request) + yield from page['items'] + page_token = page.get('nextPageToken') + + def _real_extract(self, url): + folder_id = self._match_id(url) + + webpage = self._download_webpage(url, folder_id) + key = self._search_regex(r'"(\w{39})"', webpage, 'key') + + folder_info = self._call_api(folder_id, key, self._DATA % f'/drive/v2beta/files/{folder_id} HTTP/1.1', fatal=False) + + return self.playlist_from_matches( + self._get_folder_items(folder_id, key), folder_id, folder_info.get('title'), + ie=GoogleDriveIE, getter=lambda item: f'https://drive.google.com/file/d/{item["id"]}') diff --git a/hypervideo_dl/extractor/googlepodcasts.py b/hypervideo_dl/extractor/googlepodcasts.py index 25631e2..8b2351b 100644 --- a/hypervideo_dl/extractor/googlepodcasts.py +++ b/hypervideo_dl/extractor/googlepodcasts.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git 
a/hypervideo_dl/extractor/googlesearch.py b/hypervideo_dl/extractor/googlesearch.py index 4b8b1bc..67ca0e5 100644 --- a/hypervideo_dl/extractor/googlesearch.py +++ b/hypervideo_dl/extractor/googlesearch.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import itertools import re diff --git a/hypervideo_dl/extractor/goplay.py b/hypervideo_dl/extractor/goplay.py new file mode 100644 index 0000000..2882b49 --- /dev/null +++ b/hypervideo_dl/extractor/goplay.py @@ -0,0 +1,394 @@ +import base64 +import binascii +import datetime +import hashlib +import hmac +import json +import os + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + unescapeHTML, +) + + +class GoPlayIE(InfoExtractor): + _VALID_URL = r'https?://(www\.)?goplay\.be/video/([^/]+/[^/]+/|)(?P<display_id>[^/#]+)' + + _NETRC_MACHINE = 'goplay' + + _TESTS = [{ + 'url': 'https://www.goplay.be/video/de-container-cup/de-container-cup-s3/de-container-cup-s3-aflevering-2#autoplay', + 'info_dict': { + 'id': '9c4214b8-e55d-4e4b-a446-f015f6c6f811', + 'ext': 'mp4', + 'title': 'S3 - Aflevering 2', + 'series': 'De Container Cup', + 'season': 'Season 3', + 'season_number': 3, + 'episode': 'Episode 2', + 'episode_number': 2, + }, + 'skip': 'This video is only available for registered users' + }, { + 'url': 'https://www.goplay.be/video/a-family-for-thr-holidays-s1-aflevering-1#autoplay', + 'info_dict': { + 'id': '74e3ed07-748c-49e4-85a0-393a93337dbf', + 'ext': 'mp4', + 'title': 'A Family for the Holidays', + }, + 'skip': 'This video is only available for registered users' + }] + + _id_token = None + + def _perform_login(self, username, password): + self.report_login() + aws = AwsIdp(ie=self, pool_id='eu-west-1_dViSsKM5Y', client_id='6s1h851s8uplco5h6mqh1jac8m') + self._id_token, _ = aws.authenticate(username=username, password=password) + + def _real_initialize(self): + if not self._id_token: + raise self.raise_login_required(method='password') + + def _real_extract(self, url): + url, display_id = self._match_valid_url(url).group(0, 'display_id') + webpage = self._download_webpage(url, display_id) + video_data_json = self._html_search_regex(r'<div\s+data-hero="([^"]+)"', webpage, 'video_data') + video_data = self._parse_json(unescapeHTML(video_data_json), display_id).get('data') + + movie = video_data.get('movie') + if movie: + video_id = movie['videoUuid'] + info_dict = { + 'title': movie.get('title') + } + else: + episode = traverse_obj(video_data, ('playlists', ..., 'episodes', lambda _, v: v['pageInfo']['url'] == url), get_all=False) + video_id = episode['videoUuid'] + info_dict = { + 'title': episode.get('episodeTitle'), + 'series': traverse_obj(episode, ('program', 'title')), + 'season_number': episode.get('seasonNumber'), + 'episode_number': episode.get('episodeNumber'), + } + + api = self._download_json( + f'https://api.viervijfzes.be/content/{video_id}', + video_id, headers={'Authorization': self._id_token}) + + formats, subs = self._extract_m3u8_formats_and_subtitles( + api['video']['S'], video_id, ext='mp4', m3u8_id='HLS') + + info_dict.update({ + 'id': video_id, + 'formats': formats, + }) + + return info_dict + + +# Taken from https://github.com/add-ons/plugin.video.viervijfzes/blob/master/resources/lib/viervijfzes/auth_awsidp.py +# Released into Public domain by https://github.com/michaelarnauts + +class InvalidLoginException(ExtractorError): + """ The login credentials are invalid """ + + +class AuthenticationException(ExtractorError): + """ Something went wrong while logging in """ + 
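[Note on the code below: goplay.py vendors a hand-rolled AWS Cognito SRP ("Secure Remote Password") client, AwsIdp, so that logging in does not require boto3 or warrant as a dependency. A minimal usage sketch, mirroring GoPlayIE._perform_login above (the pool and client IDs are the real GoPlay values from this patch; `ie` and the credentials are placeholders):

    # Sketch only, not part of the patch. `ie` stands for any InfoExtractor
    # instance; AwsIdp uses it solely for its _download_json helper.
    aws = AwsIdp(ie=ie, pool_id='eu-west-1_dViSsKM5Y', client_id='6s1h851s8uplco5h6mqh1jac8m')
    id_token, refresh_token = aws.authenticate(username='user@example.com', password='secret')
    # The returned IdToken is then sent as the Authorization header when
    # fetching https://api.viervijfzes.be/content/<video_id> (see
    # GoPlayIE._real_extract above).

The handshake is two HTTP calls: InitiateAuth, carrying the client's public SRP value (SRP_A), then RespondToAuthChallenge, carrying an HMAC-SHA256 signature computed with an HKDF-derived key, as implemented in authenticate() below.]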
+
+class AwsIdp:
+    """ AWS Identity Provider """
+
+    def __init__(self, ie, pool_id, client_id):
+        """
+        :param InfoExtractor ie: The extractor that instantiated this class.
+        :param str pool_id: The AWS user pool to connect to (format: <region>_<poolid>).
+               E.g.: eu-west-1_aLkOfYN3T
+        :param str client_id: The client application ID (the ID of the application connecting)
+        """
+
+        self.ie = ie
+
+        self.pool_id = pool_id
+        if "_" not in self.pool_id:
+            raise ValueError("Invalid pool_id format. Should be <region>_<poolid>.")
+
+        self.client_id = client_id
+        self.region = self.pool_id.split("_")[0]
+        self.url = "https://cognito-idp.%s.amazonaws.com/" % (self.region,)
+
+        # Initialize the values
+        # https://github.com/aws/amazon-cognito-identity-js/blob/master/src/AuthenticationHelper.js#L22
+        self.n_hex = 'FFFFFFFFFFFFFFFFC90FDAA22168C234C4C6628B80DC1CD1' + \
+                     '29024E088A67CC74020BBEA63B139B22514A08798E3404DD' + \
+                     'EF9519B3CD3A431B302B0A6DF25F14374FE1356D6D51C245' + \
+                     'E485B576625E7EC6F44C42E9A637ED6B0BFF5CB6F406B7ED' + \
+                     'EE386BFB5A899FA5AE9F24117C4B1FE649286651ECE45B3D' + \
+                     'C2007CB8A163BF0598DA48361C55D39A69163FA8FD24CF5F' + \
+                     '83655D23DCA3AD961C62F356208552BB9ED529077096966D' + \
+                     '670C354E4ABC9804F1746C08CA18217C32905E462E36CE3B' + \
+                     'E39E772C180E86039B2783A2EC07A28FB5C55DF06F4C52C9' + \
+                     'DE2BCBF6955817183995497CEA956AE515D2261898FA0510' + \
+                     '15728E5A8AAAC42DAD33170D04507A33A85521ABDF1CBA64' + \
+                     'ECFB850458DBEF0A8AEA71575D060C7DB3970F85A6E1E4C7' + \
+                     'ABF5AE8CDB0933D71E8C94E04A25619DCEE3D2261AD2EE6B' + \
+                     'F12FFA06D98A0864D87602733EC86A64521F2B18177B200C' + \
+                     'BBE117577A615D6C770988C0BAD946E208E24FA074E5AB31' + \
+                     '43DB5BFCE0FD108E4B82D120A93AD2CAFFFFFFFFFFFFFFFF'
+        # https://github.com/aws/amazon-cognito-identity-js/blob/master/src/AuthenticationHelper.js#L49
+        self.g_hex = '2'
+        self.info_bits = bytearray('Caldera Derived Key', 'utf-8')
+
+        self.big_n = self.__hex_to_long(self.n_hex)
+        self.g = self.__hex_to_long(self.g_hex)
+        self.k = self.__hex_to_long(self.__hex_hash('00' + self.n_hex + '0' + self.g_hex))
+        self.small_a_value = self.__generate_random_small_a()
+        self.large_a_value = self.__calculate_a()
+
+    def authenticate(self, username, password):
+        """ Authenticate with a username and password. """
+        # Step 1: First initiate an authentication request
+        auth_data_dict = self.__get_authentication_request(username)
+        auth_data = json.dumps(auth_data_dict).encode("utf-8")
+        auth_headers = {
+            "X-Amz-Target": "AWSCognitoIdentityProviderService.InitiateAuth",
+            "Accept-Encoding": "identity",
+            "Content-Type": "application/x-amz-json-1.1"
+        }
+        auth_response_json = self.ie._download_json(
+            self.url, None, data=auth_data, headers=auth_headers,
+            note='Authenticating username', errnote='Invalid username')
+        challenge_parameters = auth_response_json.get("ChallengeParameters")
+
+        if auth_response_json.get("ChallengeName") != "PASSWORD_VERIFIER":
+            raise AuthenticationException(auth_response_json["message"])
+
+        # Step 2: Respond to the Challenge with a valid ChallengeResponse
+        challenge_request = self.__get_challenge_response_request(challenge_parameters, password)
+        challenge_data = json.dumps(challenge_request).encode("utf-8")
+        challenge_headers = {
+            "X-Amz-Target": "AWSCognitoIdentityProviderService.RespondToAuthChallenge",
+            "Content-Type": "application/x-amz-json-1.1"
+        }
+        auth_response_json = self.ie._download_json(
+            self.url, None, data=challenge_data, headers=challenge_headers,
+            note='Authenticating password', errnote='Invalid password')
+
+        if 'message' in auth_response_json:
+            raise InvalidLoginException(auth_response_json['message'])
+        return (
+            auth_response_json['AuthenticationResult']['IdToken'],
+            auth_response_json['AuthenticationResult']['RefreshToken']
+        )
+
+    def __get_authentication_request(self, username):
+        """
+
+        :param str username: The username to use
+
+        :return: A full Authorization request.
+        :rtype: dict
+        """
+        auth_request = {
+            "AuthParameters": {
+                "USERNAME": username,
+                "SRP_A": self.__long_to_hex(self.large_a_value)
+            },
+            "AuthFlow": "USER_SRP_AUTH",
+            "ClientId": self.client_id
+        }
+        return auth_request
+
+    def __get_challenge_response_request(self, challenge_parameters, password):
+        """ Create a Challenge Response Request object.
+
+        :param dict[str,str|int] challenge_parameters: The parameters for the challenge.
+        :param str password: The password.
+
+        :return: A valid and full request data object to use as a response for a challenge.
+        :rtype: dict
+        """
+        user_id = challenge_parameters["USERNAME"]
+        user_id_for_srp = challenge_parameters["USER_ID_FOR_SRP"]
+        srp_b = challenge_parameters["SRP_B"]
+        salt = challenge_parameters["SALT"]
+        secret_block = challenge_parameters["SECRET_BLOCK"]
+
+        timestamp = self.__get_current_timestamp()
+
+        # Get a HKDF key for the password, SrpB and the Salt
+        hkdf = self.__get_hkdf_key_for_password(
+            user_id_for_srp,
+            password,
+            self.__hex_to_long(srp_b),
+            salt
+        )
+        secret_block_bytes = base64.standard_b64decode(secret_block)
+
+        # the message is a combo of the pool_id, provided SRP userId, the Secret and Timestamp
+        msg = \
+            bytearray(self.pool_id.split('_')[1], 'utf-8') + \
+            bytearray(user_id_for_srp, 'utf-8') + \
+            bytearray(secret_block_bytes) + \
+            bytearray(timestamp, 'utf-8')
+        hmac_obj = hmac.new(hkdf, msg, digestmod=hashlib.sha256)
+        signature_string = base64.standard_b64encode(hmac_obj.digest()).decode('utf-8')
+        challenge_request = {
+            "ChallengeResponses": {
+                "USERNAME": user_id,
+                "TIMESTAMP": timestamp,
+                "PASSWORD_CLAIM_SECRET_BLOCK": secret_block,
+                "PASSWORD_CLAIM_SIGNATURE": signature_string
+            },
+            "ChallengeName": "PASSWORD_VERIFIER",
+            "ClientId": self.client_id
+        }
+        return challenge_request
+
+    def __get_hkdf_key_for_password(self, username, password, server_b_value, salt):
+        """ Calculates the final hkdf based on computed S value, and computed U value and the key.
+
+        :param str username: Username.
+        :param str password: Password.
+        :param int server_b_value: Server B value.
+        :param int salt: Generated salt.
+
+        :return: Computed HKDF value.
+        :rtype: object
+        """
+
+        u_value = self.__calculate_u(self.large_a_value, server_b_value)
+        if u_value == 0:
+            raise ValueError('U cannot be zero.')
+        username_password = '%s%s:%s' % (self.pool_id.split('_')[1], username, password)
+        username_password_hash = self.__hash_sha256(username_password.encode('utf-8'))
+
+        x_value = self.__hex_to_long(self.__hex_hash(self.__pad_hex(salt) + username_password_hash))
+        g_mod_pow_xn = pow(self.g, x_value, self.big_n)
+        int_value2 = server_b_value - self.k * g_mod_pow_xn
+        s_value = pow(int_value2, self.small_a_value + u_value * x_value, self.big_n)
+        hkdf = self.__compute_hkdf(
+            bytearray.fromhex(self.__pad_hex(s_value)),
+            bytearray.fromhex(self.__pad_hex(self.__long_to_hex(u_value)))
+        )
+        return hkdf
+
+    def __compute_hkdf(self, ikm, salt):
+        """ Standard hkdf algorithm
+
+        :param {Buffer} ikm Input key material.
+        :param {Buffer} salt Salt value.
+        :return: {Buffer} Strong key material.
+        """
+
+        prk = hmac.new(salt, ikm, hashlib.sha256).digest()
+        info_bits_update = self.info_bits + bytearray(chr(1), 'utf-8')
+        hmac_hash = hmac.new(prk, info_bits_update, hashlib.sha256).digest()
+        return hmac_hash[:16]
+
+    def __calculate_u(self, big_a, big_b):
+        """ Calculate the client's value U which is the hash of A and B
+
+        :param int big_a: Large A value.
+        :param int big_b: Server B value.
+
+        :return: Computed U value.
+        :rtype: int
+        """
+
+        u_hex_hash = self.__hex_hash(self.__pad_hex(big_a) + self.__pad_hex(big_b))
+        return self.__hex_to_long(u_hex_hash)
+
+    def __generate_random_small_a(self):
+        """ Helper function to generate a random big integer
+
+        :return: a random value.
+        :rtype: int
+        """
+        random_long_int = self.__get_random(128)
+        return random_long_int % self.big_n
+
+    def __calculate_a(self):
+        """ Calculate the client's public value A = g^a%N with the generated random number a
+
+        :return: Computed large A.
+ :rtype: int + """ + + big_a = pow(self.g, self.small_a_value, self.big_n) + # safety check + if (big_a % self.big_n) == 0: + raise ValueError('Safety check for A failed') + return big_a + + @staticmethod + def __long_to_hex(long_num): + return '%x' % long_num + + @staticmethod + def __hex_to_long(hex_string): + return int(hex_string, 16) + + @staticmethod + def __hex_hash(hex_string): + return AwsIdp.__hash_sha256(bytearray.fromhex(hex_string)) + + @staticmethod + def __hash_sha256(buf): + """AuthenticationHelper.hash""" + digest = hashlib.sha256(buf).hexdigest() + return (64 - len(digest)) * '0' + digest + + @staticmethod + def __pad_hex(long_int): + """ Converts a Long integer (or hex string) to hex format padded with zeroes for hashing + + :param int|str long_int: Number or string to pad. + + :return Padded hex string. + :rtype: str + """ + + if not isinstance(long_int, str): + hash_str = AwsIdp.__long_to_hex(long_int) + else: + hash_str = long_int + if len(hash_str) % 2 == 1: + hash_str = '0%s' % hash_str + elif hash_str[0] in '89ABCDEFabcdef': + hash_str = '00%s' % hash_str + return hash_str + + @staticmethod + def __get_random(nbytes): + random_hex = binascii.hexlify(os.urandom(nbytes)) + return AwsIdp.__hex_to_long(random_hex) + + @staticmethod + def __get_current_timestamp(): + """ Creates a timestamp with the correct English format. + + :return: timestamp in format 'Sun Jan 27 19:00:04 UTC 2019' + :rtype: str + """ + + # We need US only data, so we cannot just do a strftime: + # Sun Jan 27 19:00:04 UTC 2019 + months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] + + time_now = datetime.datetime.utcnow() + format_string = "{} {} {} %H:%M:%S UTC %Y".format(days[time_now.weekday()], months[time_now.month], time_now.day) + time_string = datetime.datetime.utcnow().strftime(format_string) + return time_string + + def __str__(self): + return "AWS IDP Client for:\nRegion: %s\nPoolId: %s\nAppId: %s" % ( + self.region, self.pool_id.split("_")[1], self.client_id + ) diff --git a/hypervideo_dl/extractor/gopro.py b/hypervideo_dl/extractor/gopro.py index 10cc1ae..ae96537 100644 --- a/hypervideo_dl/extractor/gopro.py +++ b/hypervideo_dl/extractor/gopro.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -81,8 +78,6 @@ class GoProIE(InfoExtractor): 'height': int_or_none(fmt.get('height')), }) - self._sort_formats(formats) - title = str_or_none( try_get(metadata, lambda x: x['collection']['title']) or self._html_search_meta(['og:title', 'twitter:title'], webpage) diff --git a/hypervideo_dl/extractor/goshgay.py b/hypervideo_dl/extractor/goshgay.py index 377981d..9a1f32b 100644 --- a/hypervideo_dl/extractor/goshgay.py +++ b/hypervideo_dl/extractor/goshgay.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_parse_qs, diff --git a/hypervideo_dl/extractor/gotostage.py b/hypervideo_dl/extractor/gotostage.py index 6aa9610..112293b 100644 --- a/hypervideo_dl/extractor/gotostage.py +++ b/hypervideo_dl/extractor/gotostage.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/gputechconf.py b/hypervideo_dl/extractor/gputechconf.py index 73dc62c..2d13bf4 100644 --- 
a/hypervideo_dl/extractor/gputechconf.py +++ b/hypervideo_dl/extractor/gputechconf.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/gronkh.py b/hypervideo_dl/extractor/gronkh.py index c9f1dd2..b9370e3 100644 --- a/hypervideo_dl/extractor/gronkh.py +++ b/hypervideo_dl/extractor/gronkh.py @@ -1,20 +1,34 @@ -# coding: utf-8 -from __future__ import unicode_literals +import functools from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( + OnDemandPagedList, + traverse_obj, + unified_strdate, +) class GronkhIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gronkh\.tv/(?:watch/)?stream/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?gronkh\.tv/(?:watch/)?streams?/(?P<id>\d+)' _TESTS = [{ + 'url': 'https://gronkh.tv/streams/657', + 'info_dict': { + 'id': '657', + 'ext': 'mp4', + 'title': 'H.O.R.D.E. - DAS ZWEiTE ZEiTALTER 🎲 Session 1', + 'view_count': int, + 'thumbnail': 'https://01.cdn.vod.farm/preview/9e2555d3a23bf4e5c5b7c6b3b70a9d84.jpg', + 'upload_date': '20221111' + }, + 'params': {'skip_download': True} + }, { 'url': 'https://gronkh.tv/stream/536', 'info_dict': { 'id': '536', 'ext': 'mp4', 'title': 'GTV0536, 2021-10-01 - MARTHA IS DEAD #FREiAB1830 !FF7 !horde !archiv', - 'view_count': 19491, + 'view_count': int, 'thumbnail': 'https://01.cdn.vod.farm/preview/6436746cce14e25f751260a692872b9b.jpg', 'upload_date': '20211001' }, @@ -34,7 +48,6 @@ class GronkhIE(InfoExtractor): 'url': data_json['vtt_url'], 'ext': 'vtt', }) - self._sort_formats(formats) return { 'id': id, 'title': data_json.get('title'), @@ -44,3 +57,54 @@ class GronkhIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } + + +class GronkhFeedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gronkh\.tv(?:/feed)?/?(?:#|$)' + IE_NAME = 'gronkh:feed' + + _TESTS = [{ + 'url': 'https://gronkh.tv/feed', + 'info_dict': { + 'id': 'feed', + }, + 'playlist_count': 16, + }, { + 'url': 'https://gronkh.tv', + 'only_matching': True, + }] + + def _entries(self): + for type_ in ('recent', 'views'): + info = self._download_json( + f'https://api.gronkh.tv/v1/video/discovery/{type_}', 'feed', note=f'Downloading {type_} API JSON') + for item in traverse_obj(info, ('discovery', ...)) or []: + yield self.url_result(f'https://gronkh.tv/watch/stream/{item["episode"]}', GronkhIE, item.get('title')) + + def _real_extract(self, url): + return self.playlist_result(self._entries(), 'feed') + + +class GronkhVodsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gronkh\.tv/vods/streams/?(?:#|$)' + IE_NAME = 'gronkh:vods' + + _TESTS = [{ + 'url': 'https://gronkh.tv/vods/streams', + 'info_dict': { + 'id': 'vods', + }, + 'playlist_mincount': 150, + }] + _PER_PAGE = 25 + + def _fetch_page(self, page): + items = traverse_obj(self._download_json( + 'https://api.gronkh.tv/v1/search', 'vods', query={'offset': self._PER_PAGE * page, 'first': self._PER_PAGE}, + note=f'Downloading stream video page {page + 1}'), ('results', 'videos', ...)) + for item in items or []: + yield self.url_result(f'https://gronkh.tv/watch/stream/{item["episode"]}', GronkhIE, item['episode'], item.get('title')) + + def _real_extract(self, url): + entries = OnDemandPagedList(functools.partial(self._fetch_page), self._PER_PAGE) + return self.playlist_result(entries, 'vods') diff --git a/hypervideo_dl/extractor/groupon.py b/hypervideo_dl/extractor/groupon.py index a6da909..362d3ff 100644 --- a/hypervideo_dl/extractor/groupon.py +++ 
b/hypervideo_dl/extractor/groupon.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/harpodeon.py b/hypervideo_dl/extractor/harpodeon.py new file mode 100644 index 0000000..0aa4733 --- /dev/null +++ b/hypervideo_dl/extractor/harpodeon.py @@ -0,0 +1,70 @@ +from .common import InfoExtractor +from ..utils import unified_strdate + + +class HarpodeonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?harpodeon\.com/(?:video|preview)/\w+/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.harpodeon.com/video/The_Smoking_Out_of_Bella_Butts/268068288', + 'md5': '727371564a6a9ebccef2073535b5b6bd', + 'skip': 'Free video could become unavailable', + 'info_dict': { + 'id': '268068288', + 'ext': 'mp4', + 'title': 'The Smoking Out of Bella Butts', + 'description': 'md5:47e16bdb41fc8a79c83ab83af11c8b77', + 'creator': 'Vitagraph Company of America', + 'release_date': '19150101' + } + }, { + 'url': 'https://www.harpodeon.com/preview/The_Smoking_Out_of_Bella_Butts/268068288', + 'md5': '6dfea5412845f690c7331be703f884db', + 'info_dict': { + 'id': '268068288', + 'ext': 'mp4', + 'title': 'The Smoking Out of Bella Butts', + 'description': 'md5:47e16bdb41fc8a79c83ab83af11c8b77', + 'creator': 'Vitagraph Company of America', + 'release_date': '19150101' + } + }, { + 'url': 'https://www.harpodeon.com/preview/Behind_the_Screen/421838710', + 'md5': '7979df9ca04637282cb7d172ab3a9c3b', + 'info_dict': { + 'id': '421838710', + 'ext': 'mp4', + 'title': 'Behind the Screen', + 'description': 'md5:008972a3dc51fba3965ee517d2ba9155', + 'creator': 'Lone Star Corporation', + 'release_date': '19160101' + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title, creator, release_year = self._search_regex( + r'''(?x) + <div[^>]+videoInfo[^<]*<h2[^>]*>(?P<title>[^>]+)</h2> + (?:\s*<p[^>]*>\((?P<creator>.+),\s*)?(?P<release_year>\d{4})?''', + webpage, 'title', group=('title', 'creator', 'release_year'), + fatal=False) or (None, None, None) + + hp_base = self._html_search_regex(r'hpBase\(\s*["\']([^"\']+)', webpage, 'hp_base') + + hp_inject_video, hp_resolution = self._search_regex( + r'''(?x) + hpInjectVideo\([\'\"](?P<hp_inject_video>\w+)[\'\"], + [\'\"](?P<hp_resolution>\d+)[\'\"]''', + webpage, 'hp_inject_video', group=['hp_inject_video', 'hp_resolution']) + + return { + 'id': video_id, + 'title': title, + 'url': f'{hp_base}{hp_inject_video}_{hp_resolution}.mp4', + 'http_headers': {'Referer': url}, + 'description': self._html_search_meta('description', webpage, fatal=False), + 'creator': creator, + 'release_date': unified_strdate(f'{release_year}0101') + } diff --git a/hypervideo_dl/extractor/hbo.py b/hypervideo_dl/extractor/hbo.py index 68df748..530bdb7 100644 --- a/hypervideo_dl/extractor/hbo.py +++ b/hypervideo_dl/extractor/hbo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -115,7 +112,6 @@ class HBOBaseIE(InfoExtractor): 'width': format_info.get('width'), 'height': format_info.get('height'), }) - self._sort_formats(formats) thumbnails = [] card_sizes = xpath_element(video_data, 'titleCardSizes') diff --git a/hypervideo_dl/extractor/hearthisat.py b/hypervideo_dl/extractor/hearthisat.py index a3d6a05..d1a400d 100644 --- a/hypervideo_dl/extractor/hearthisat.py +++ b/hypervideo_dl/extractor/hearthisat.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common 
import InfoExtractor from ..utils import ( determine_ext, @@ -85,7 +81,6 @@ class HearThisAtIE(InfoExtractor): 'acodec': ext, 'quality': 2, # Usually better quality }) - self._sort_formats(formats) return { 'id': track_id, diff --git a/hypervideo_dl/extractor/heise.py b/hypervideo_dl/extractor/heise.py index cbe564a..27d737c 100644 --- a/hypervideo_dl/extractor/heise.py +++ b/hypervideo_dl/extractor/heise.py @@ -1,13 +1,12 @@ -# coding: utf-8 -from __future__ import unicode_literals +import urllib.parse from .common import InfoExtractor from .kaltura import KalturaIE from .youtube import YoutubeIE from ..utils import ( + NO_DEFAULT, determine_ext, int_or_none, - NO_DEFAULT, parse_iso8601, smuggle_url, xpath_text, @@ -26,6 +25,9 @@ class HeiseIE(InfoExtractor): 'timestamp': 1512734959, 'upload_date': '20171208', 'description': 'md5:c934cbfb326c669c2bcabcbe3d3fcd20', + 'thumbnail': 're:^https?://.*/thumbnail/.*', + 'duration': 2845, + 'view_count': int, }, 'params': { 'skip_download': True, @@ -37,11 +39,27 @@ class HeiseIE(InfoExtractor): 'info_dict': { 'id': '6kmWbXleKW4', 'ext': 'mp4', - 'title': 'NEU IM SEPTEMBER | Netflix', - 'description': 'md5:2131f3c7525e540d5fd841de938bd452', + 'title': 'Neu im September 2017 | Netflix', + 'description': 'md5:d6852d1f96bb80760608eed3b907437c', 'upload_date': '20170830', 'uploader': 'Netflix Deutschland, Österreich und Schweiz', 'uploader_id': 'netflixdach', + 'categories': ['Entertainment'], + 'tags': 'count:27', + 'age_limit': 0, + 'availability': 'public', + 'comment_count': int, + 'channel_id': 'UCZqgRlLcvO3Fnx_npQJygcQ', + 'thumbnail': 'https://i.ytimg.com/vi_webp/6kmWbXleKW4/maxresdefault.webp', + 'uploader_url': 'http://www.youtube.com/user/netflixdach', + 'playable_in_embed': True, + 'live_status': 'not_live', + 'channel_url': 'https://www.youtube.com/channel/UCZqgRlLcvO3Fnx_npQJygcQ', + 'view_count': int, + 'channel': 'Netflix Deutschland, Österreich und Schweiz', + 'channel_follower_count': int, + 'like_count': int, + 'duration': 67, }, 'params': { 'skip_download': True, @@ -55,11 +73,15 @@ class HeiseIE(InfoExtractor): 'description': 'md5:47e8ffb6c46d85c92c310a512d6db271', 'timestamp': 1512470717, 'upload_date': '20171205', + 'duration': 786, + 'view_count': int, + 'thumbnail': 're:^https?://.*/thumbnail/.*', }, 'params': { 'skip_download': True, }, }, { + # FIXME: Video m3u8 fails to download; issue with Kaltura extractor 'url': 'https://www.heise.de/ct/artikel/c-t-uplink-20-8-Staubsaugerroboter-Xiaomi-Vacuum-2-AR-Brille-Meta-2-und-Android-rooten-3959893.html', 'info_dict': { 'id': '1_59mk80sf', @@ -72,6 +94,18 @@ class HeiseIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # videout + 'url': 'https://www.heise.de/ct/artikel/c-t-uplink-3-8-Anonyme-SIM-Karten-G-Sync-Monitore-Citizenfour-2440327.html', + 'info_dict': { + 'id': '2440327', + 'ext': 'mp4', + 'title': 'c\'t uplink 3.8: Anonyme SIM-Karten, G-Sync-Monitore, Citizenfour', + 'thumbnail': 'http://www.heise.de/imagine/yxM2qmol0xV3iFB7qFb70dGvXjc/gallery/', + 'description': 'md5:fa164d8c8707dff124a9626d39205f5d', + 'timestamp': 1414825200, + 'upload_date': '20141101', + } }, { 'url': 'http://www.heise.de/ct/artikel/c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2403911.html', 'only_matching': True, @@ -124,26 +158,28 @@ class HeiseIE(InfoExtractor): if kaltura_id: return _make_kaltura_result('kaltura:2238431:%s' % kaltura_id) - yt_urls = YoutubeIE._extract_urls(webpage) + yt_urls = tuple(YoutubeIE._extract_embed_urls(url, webpage)) if yt_urls: return 
self.playlist_from_matches( yt_urls, video_id, title, ie=YoutubeIE.ie_key()) title = extract_title() - - container_id = self._search_regex( - r'<div class="videoplayerjw"[^>]+data-container="([0-9]+)"', - webpage, 'container ID') - - sequenz_id = self._search_regex( - r'<div class="videoplayerjw"[^>]+data-sequenz="([0-9]+)"', - webpage, 'sequenz ID') - - doc = self._download_xml( - 'http://www.heise.de/videout/feed', video_id, query={ + api_params = urllib.parse.parse_qs( + self._search_regex(r'/videout/feed\.json\?([^\']+)', webpage, 'feed params', default=None) or '') + if not api_params or 'container' not in api_params or 'sequenz' not in api_params: + container_id = self._search_regex( + r'<div class="videoplayerjw"[^>]+data-container="([0-9]+)"', + webpage, 'container ID') + + sequenz_id = self._search_regex( + r'<div class="videoplayerjw"[^>]+data-sequenz="([0-9]+)"', + webpage, 'sequenz ID') + api_params = { 'container': container_id, 'sequenz': sequenz_id, - }) + } + doc = self._download_xml( + 'http://www.heise.de/videout/feed', video_id, query=api_params) formats = [] for source_node in doc.findall('.//{http://rss.jwpcdn.com/}source'): @@ -158,7 +194,6 @@ class HeiseIE(InfoExtractor): 'format_id': '%s_%s' % (ext, label), 'height': height, }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/hellporno.py b/hypervideo_dl/extractor/hellporno.py index 92d32cd..fa32b27 100644 --- a/hypervideo_dl/extractor/hellporno.py +++ b/hypervideo_dl/extractor/hellporno.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -41,7 +39,6 @@ class HellPornoIE(InfoExtractor): title = remove_end(self._html_extract_title(webpage), ' - Hell Porno') info = self._parse_html5_media_entries(url, webpage, display_id)[0] - self._sort_formats(info['formats']) video_id = self._search_regex( (r'chs_object\s*=\s*["\'](\d+)', diff --git a/hypervideo_dl/extractor/helsinki.py b/hypervideo_dl/extractor/helsinki.py index 575fb33..e518cae 100644 --- a/hypervideo_dl/extractor/helsinki.py +++ b/hypervideo_dl/extractor/helsinki.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import js_to_json @@ -33,7 +29,6 @@ class HelsinkiIE(InfoExtractor): 'url': s['file'], 'ext': 'mp4', } for s in params['sources']] - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/hentaistigma.py b/hypervideo_dl/extractor/hentaistigma.py index 86a93de..ca5ffc2 100644 --- a/hypervideo_dl/extractor/hentaistigma.py +++ b/hypervideo_dl/extractor/hentaistigma.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/hgtv.py b/hypervideo_dl/extractor/hgtv.py index a4f3325..c40017d 100644 --- a/hypervideo_dl/extractor/hgtv.py +++ b/hypervideo_dl/extractor/hgtv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/hidive.py b/hypervideo_dl/extractor/hidive.py index 46d7d62..3a53f2c 100644 --- a/hypervideo_dl/extractor/hidive.py +++ b/hypervideo_dl/extractor/hidive.py @@ -1,4 +1,3 @@ -# coding: utf-8 import re from .common import InfoExtractor @@ -39,7 +38,9 @@ class HiDiveIE(InfoExtractor): webpage = self._download_webpage(self._LOGIN_URL, None) form = self._search_regex( r'(?s)<form[^>]+action="/account/login"[^>]*>(.+?)</form>', - webpage, 'login 
form') + webpage, 'login form', default=None) + if not form: # logged in + return data = self._hidden_inputs(form) data.update({ 'Email': username, @@ -102,7 +103,6 @@ class HiDiveIE(InfoExtractor): f['language'] = audio f['format_note'] = f'{version}, {extra}' formats.extend(frmt) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/historicfilms.py b/hypervideo_dl/extractor/historicfilms.py index 56343e9..c428fee 100644 --- a/hypervideo_dl/extractor/historicfilms.py +++ b/hypervideo_dl/extractor/historicfilms.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import parse_duration diff --git a/hypervideo_dl/extractor/hitbox.py b/hypervideo_dl/extractor/hitbox.py index 0470d0a..f0c6898 100644 --- a/hypervideo_dl/extractor/hitbox.py +++ b/hypervideo_dl/extractor/hitbox.py @@ -1,16 +1,13 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( clean_html, - parse_iso8601, + determine_ext, float_or_none, int_or_none, - compat_str, - determine_ext, + parse_iso8601, ) @@ -121,7 +118,6 @@ class HitboxIE(InfoExtractor): 'tbr': bitrate, 'format_note': label, }) - self._sort_formats(formats) metadata = self._extract_metadata( 'https://www.smashcast.tv/api/media/video', video_id) @@ -130,7 +126,7 @@ class HitboxIE(InfoExtractor): return metadata -class HitboxLiveIE(HitboxIE): +class HitboxLiveIE(HitboxIE): # XXX: Do not subclass from concrete IE IE_NAME = 'hitbox:live' _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?P<id>[^/?#&]+)' _TESTS = [{ @@ -203,7 +199,6 @@ class HitboxLiveIE(HitboxIE): 'page_url': url, 'player_url': 'http://www.hitbox.tv/static/player/flowplayer/flowplayer.commercial-3.2.16.swf', }) - self._sort_formats(formats) metadata = self._extract_metadata( 'https://www.smashcast.tv/api/media/live', video_id) diff --git a/hypervideo_dl/extractor/hitrecord.py b/hypervideo_dl/extractor/hitrecord.py index fd5dc29..902af44 100644 --- a/hypervideo_dl/extractor/hitrecord.py +++ b/hypervideo_dl/extractor/hitrecord.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/hketv.py b/hypervideo_dl/extractor/hketv.py index 1f3502b..1087956 100644 --- a/hypervideo_dl/extractor/hketv.py +++ b/hypervideo_dl/extractor/hketv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -140,7 +137,6 @@ class HKETVIE(InfoExtractor): 'width': w, 'height': h, }) - self._sort_formats(formats) subtitles = {} tracks = try_get(playlist0, lambda x: x['tracks'], list) or [] diff --git a/hypervideo_dl/extractor/holodex.py b/hypervideo_dl/extractor/holodex.py new file mode 100644 index 0000000..a2b73ec --- /dev/null +++ b/hypervideo_dl/extractor/holodex.py @@ -0,0 +1,100 @@ +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import traverse_obj + + +class HolodexIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.|staging\.)?holodex\.net/(?: + api/v2/playlist/(?P<playlist>\d+)| + watch/(?P<id>[\w-]{11})(?:\?(?:[^#]+&)?playlist=(?P<playlist2>\d+))? 
+ )''' + _TESTS = [{ + 'url': 'https://holodex.net/watch/9kQ2GtvDV3s', + 'md5': 'be5ffce2f0feae8ba4c01553abc0f175', + 'info_dict': { + 'ext': 'mp4', + 'id': '9kQ2GtvDV3s', + 'title': '【おちゃめ機能】ホロライブが吹っ切れた【24人で歌ってみた】', + 'channel_id': 'UCJFZiqLMntJufDCHc6bQixg', + 'playable_in_embed': True, + 'tags': 'count:43', + 'age_limit': 0, + 'live_status': 'not_live', + 'description': 'md5:040e866c09dc4ab899b36479f4b7c7a2', + 'channel_url': 'https://www.youtube.com/channel/UCJFZiqLMntJufDCHc6bQixg', + 'upload_date': '20200406', + 'uploader_url': 'http://www.youtube.com/channel/UCJFZiqLMntJufDCHc6bQixg', + 'view_count': int, + 'channel': 'hololive ホロライブ - VTuber Group', + 'categories': ['Music'], + 'uploader': 'hololive ホロライブ - VTuber Group', + 'channel_follower_count': int, + 'uploader_id': 'UCJFZiqLMntJufDCHc6bQixg', + 'availability': 'public', + 'thumbnail': 'https://i.ytimg.com/vi_webp/9kQ2GtvDV3s/maxresdefault.webp', + 'duration': 263, + 'like_count': int, + }, + }, { + 'url': 'https://holodex.net/api/v2/playlist/239', + 'info_dict': { + 'id': '239', + 'title': 'Songs/Videos that made fall into the rabbit hole (from my google activity history)', + }, + 'playlist_count': 14, + }, { + 'url': 'https://holodex.net/watch/_m2mQyaofjI?foo=bar&playlist=69', + 'info_dict': { + 'id': '69', + 'title': '拿著金斧頭的藍髮大姊姊' + }, + 'playlist_count': 3, + }, { + 'url': 'https://holodex.net/watch/_m2mQyaofjI?playlist=69', + 'info_dict': { + 'id': '_m2mQyaofjI', + 'ext': 'mp4', + 'playable_in_embed': True, + 'like_count': int, + 'uploader': 'Ernst / エンスト', + 'duration': 11, + 'uploader_url': 'http://www.youtube.com/channel/UCqSX4PPZY0cyetqKVY_wRVA', + 'categories': ['Entertainment'], + 'title': '【星街すいせい】星街向你獻上晚安', + 'upload_date': '20210705', + 'description': 'md5:8b8ffb157bae77f2d109021a0b577d4a', + 'channel': 'Ernst / エンスト', + 'channel_id': 'UCqSX4PPZY0cyetqKVY_wRVA', + 'channel_follower_count': int, + 'view_count': int, + 'tags': [], + 'live_status': 'not_live', + 'channel_url': 'https://www.youtube.com/channel/UCqSX4PPZY0cyetqKVY_wRVA', + 'availability': 'public', + 'thumbnail': 'https://i.ytimg.com/vi_webp/_m2mQyaofjI/maxresdefault.webp', + 'age_limit': 0, + 'uploader_id': 'UCqSX4PPZY0cyetqKVY_wRVA', + 'comment_count': int, + }, + 'params': {'noplaylist': True}, + }, { + 'url': 'https://staging.holodex.net/api/v2/playlist/125', + 'only_matching': True, + }, { + 'url': 'https://staging.holodex.net/watch/rJJTJA_T_b0?playlist=25', + 'only_matching': True, + }, { + 'url': 'https://staging.holodex.net/watch/s1ifBeukThg', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, playlist_id, pl_id2 = self._match_valid_url(url).group('id', 'playlist', 'playlist2') + playlist_id = playlist_id or pl_id2 + + if not self._yes_playlist(playlist_id, video_id): + return self.url_result(f'https://www.youtube.com/watch?v={video_id}', YoutubeIE) + + data = self._download_json(f'https://holodex.net/api/v2/playlist/{playlist_id}', playlist_id) + return self.playlist_from_matches( + traverse_obj(data, ('videos', ..., 'id')), playlist_id, data.get('name'), ie=YoutubeIE) diff --git a/hypervideo_dl/extractor/hornbunny.py b/hypervideo_dl/extractor/hornbunny.py deleted file mode 100644 index c458a95..0000000 --- a/hypervideo_dl/extractor/hornbunny.py +++ /dev/null @@ -1,49 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, -) - - -class HornBunnyIE(InfoExtractor): - _VALID_URL = 
r'http?://(?:www\.)?hornbunny\.com/videos/(?P<title_dash>[a-z-]+)-(?P<id>\d+)\.html' - _TEST = { - 'url': 'http://hornbunny.com/videos/panty-slut-jerk-off-instruction-5227.html', - 'md5': 'e20fd862d1894b67564c96f180f43924', - 'info_dict': { - 'id': '5227', - 'ext': 'mp4', - 'title': 'panty slut jerk off instruction', - 'duration': 550, - 'age_limit': 18, - 'view_count': int, - 'thumbnail': r're:^https?://.*\.jpg$', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage) - info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] - - duration = parse_duration(self._search_regex( - r'<strong>Runtime:</strong>\s*([0-9:]+)</div>', - webpage, 'duration', fatal=False)) - view_count = int_or_none(self._search_regex( - r'<strong>Views:</strong>\s*(\d+)</div>', - webpage, 'view count', fatal=False)) - - info_dict.update({ - 'id': video_id, - 'title': title, - 'duration': duration, - 'view_count': view_count, - 'age_limit': 18, - }) - - return info_dict diff --git a/hypervideo_dl/extractor/hotnewhiphop.py b/hypervideo_dl/extractor/hotnewhiphop.py index 4703e18..f8570cb 100644 --- a/hypervideo_dl/extractor/hotnewhiphop.py +++ b/hypervideo_dl/extractor/hotnewhiphop.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_b64decode from ..utils import ( diff --git a/hypervideo_dl/extractor/hotstar.py b/hypervideo_dl/extractor/hotstar.py index d55a79b..61eec7b 100644 --- a/hypervideo_dl/extractor/hotstar.py +++ b/hypervideo_dl/extractor/hotstar.py @@ -1,31 +1,33 @@ -# coding: utf-8 -from __future__ import unicode_literals - import hashlib import hmac +import json import re import time import uuid -import json from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str -) +from ..compat import compat_HTTPError, compat_str from ..utils import ( - determine_ext, ExtractorError, + determine_ext, int_or_none, + join_nonempty, str_or_none, - try_get, + traverse_obj, url_or_none, ) class HotStarBaseIE(InfoExtractor): + _BASE_URL = 'https://www.hotstar.com' + _API_URL = 'https://api.hotstar.com' _AKAMAI_ENCRYPTION_KEY = b'\x05\xfc\x1a\x01\xca\xc9\x4b\xc4\x12\xfc\x53\x12\x07\x75\xf9\xee' + def _call_api_v1(self, path, *args, **kwargs): + return self._download_json( + f'{self._API_URL}/o/v1/{path}', *args, **kwargs, + headers={'x-country-code': 'IN', 'x-platform-code': 'PCTV'}) + def _call_api_impl(self, path, video_id, query, st=None, cookies=None): st = int_or_none(st) or int(time.time()) exp = st + 6000 @@ -36,7 +38,7 @@ class HotStarBaseIE(InfoExtractor): token = cookies.get('userUP').value else: token = self._download_json( - 'https://api.hotstar.com/um/v3/users', + f'{self._API_URL}/um/v3/users', video_id, note='Downloading token', data=json.dumps({"device_ids": [{"id": compat_str(uuid.uuid4()), "type": "device_id"}]}).encode('utf-8'), headers={ @@ -46,58 +48,48 @@ class HotStarBaseIE(InfoExtractor): })['user_identity'] response = self._download_json( - 'https://api.hotstar.com/' + path, video_id, headers={ + f'{self._API_URL}/{path}', video_id, query=query, + headers={ 'hotstarauth': auth, 'x-hs-appversion': '6.72.2', 'x-hs-platform': 'web', 'x-hs-usertoken': token, - }, query=query) + }) if response['message'] != "Playback URL's fetched successfully": raise ExtractorError( response['message'], expected=True) return response['data'] - def _call_api(self, path, video_id, 
query_name='contentId'): - return self._download_json('https://api.hotstar.com/' + path, video_id=video_id, query={ - query_name: video_id, - 'tas': 10000, - }, headers={ - 'x-country-code': 'IN', - 'x-platform-code': 'PCTV', - }) - def _call_api_v2(self, path, video_id, st=None, cookies=None): return self._call_api_impl( - '%s/content/%s' % (path, video_id), video_id, st=st, cookies=cookies, query={ + f'{path}/content/{video_id}', video_id, st=st, cookies=cookies, query={ 'desired-config': 'audio_channel:stereo|container:fmp4|dynamic_range:hdr|encryption:plain|ladder:tv|package:dash|resolution:fhd|subs-tag:HotstarVIP|video_codec:h265', 'device-id': cookies.get('device_id').value if cookies.get('device_id') else compat_str(uuid.uuid4()), 'os-name': 'Windows', 'os-version': '10', }) + def _playlist_entries(self, path, item_id, root=None, **kwargs): + results = self._call_api_v1(path, item_id, **kwargs)['body']['results'] + for video in traverse_obj(results, (('assets', None), 'items', ...)): + if video.get('contentId'): + yield self.url_result( + HotStarIE._video_url(video['contentId'], root=root), HotStarIE, video['contentId']) + class HotStarIE(HotStarBaseIE): IE_NAME = 'hotstar' _VALID_URL = r'''(?x) - (?: - hotstar\:| - https?://(?:www\.)?hotstar\.com(?:/in)?/(?!in/) - ) - (?: - (?P<type>movies|sports|episode|(?P<tv>tv)) - (?: - \:| - /[^/?#]+/ - (?(tv) - (?:[^/?#]+/){2}| - (?:[^/?#]+/)* - ) - )| - [^/?#]+/ - )? - (?P<id>\d{10}) - ''' + https?://(?:www\.)?hotstar\.com(?:/in)?/(?!in/) + (?: + (?P<type>movies|sports|episode|(?P<tv>tv))/ + (?(tv)(?:[^/?#]+/){2}|[^?#]*) + )? + [^/?#]+/ + (?P<id>\d{10}) + ''' + _TESTS = [{ 'url': 'https://www.hotstar.com/can-you-not-spread-rumours/1000076273', 'info_dict': { @@ -108,38 +100,9 @@ class HotStarIE(HotStarBaseIE): 'timestamp': 1447248600, 'upload_date': '20151111', 'duration': 381, + 'episode': 'Can You Not Spread Rumours?', }, - }, { - 'url': 'hotstar:1000076273', - 'only_matching': True, - }, { - 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', - 'info_dict': { - 'id': '1000057157', - 'ext': 'mp4', - 'title': 'Radha Gopalam', - 'description': 'md5:be3bc342cc120bbc95b3b0960e2b0d22', - 'timestamp': 1140805800, - 'upload_date': '20060224', - 'duration': 9182, - }, - }, { - 'url': 'hotstar:movies:1000057157', - 'only_matching': True, - }, { - 'url': 'https://www.hotstar.com/in/sports/cricket/follow-the-blues-2021/recap-eng-fight-back-on-day-2/1260066104', - 'only_matching': True, - }, { - 'url': 'https://www.hotstar.com/in/sports/football/most-costly-pl-transfers-ft-grealish/1260065956', - 'only_matching': True, - }, { - # contentData - 'url': 'hotstar:sports:1260065956', - 'only_matching': True, - }, { - # contentData - 'url': 'hotstar:sports:1260066104', - 'only_matching': True, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://www.hotstar.com/tv/ek-bhram-sarvagun-sampanna/s-2116/janhvi-targets-suman/1000234847', 'info_dict': { @@ -158,12 +121,19 @@ class HotStarIE(HotStarBaseIE): 'season_id': 6771, 'episode': 'Janhvi Targets Suman', 'episode_number': 8, - }, + } }, { - 'url': 'hotstar:episode:1000234847', + 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', + 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/in/sports/cricket/follow-the-blues-2021/recap-eng-fight-back-on-day-2/1260066104', + 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/in/sports/football/most-costly-pl-transfers-ft-grealish/1260065956', 'only_matching': True, }] _GEO_BYPASS = False + _TYPE = { 'movies': 
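The new _playlist_entries helper leans on traverse_obj branching: the inner tuple ('assets', None) tries the assets key first and then the object itself, so a single path serves both response shapes returned by the tray endpoints. A small sketch with invented payloads:

from hypervideo_dl.utils import traverse_obj

nested = {'assets': {'items': [{'contentId': '111'}]}}  # season/series trays
flat = {'items': [{'contentId': '222'}]}                # find/tray responses

path = (('assets', None), 'items', ...)  # `...` walks every list element
print(traverse_obj(nested, path))  # [{'contentId': '111'}]
print(traverse_obj(flat, path))    # [{'contentId': '222'}]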
'movie', 'sports': 'match', @@ -172,41 +142,54 @@ class HotStarIE(HotStarBaseIE): None: 'content', } + _IGNORE_MAP = { + 'res': 'resolution', + 'vcodec': 'video_codec', + 'dr': 'dynamic_range', + } + + @classmethod + def _video_url(cls, video_id, video_type=None, *, slug='ignore_me', root=None): + assert None in (video_type, root) + if not root: + root = join_nonempty(cls._BASE_URL, video_type, delim='/') + return f'{root}/{slug}/{video_id}' + def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - video_type = mobj.group('type') - cookies = self._get_cookies(url) + video_id, video_type = self._match_valid_url(url).group('id', 'type') video_type = self._TYPE.get(video_type, video_type) - video_data = self._call_api(f'o/v1/{video_type}/detail', video_id)['body']['results']['item'] - title = video_data['title'] + cookies = self._get_cookies(url) # Cookies before any request + video_data = self._call_api_v1(f'{video_type}/detail', video_id, + query={'tas': 10000, 'contentId': video_id})['body']['results']['item'] if not self.get_param('allow_unplayable_formats') and video_data.get('drmProtected'): self.report_drm(video_id) - headers = {'Referer': 'https://www.hotstar.com/in'} - formats = [] - subs = {} + # See https://github.com/hypervideo/hypervideo/issues/396 + st = self._download_webpage_handle(f'{self._BASE_URL}/in', video_id)[1].headers.get('x-origin-date') + geo_restricted = False - _, urlh = self._download_webpage_handle('https://www.hotstar.com/in', video_id) - # Required to fix https://github.com/hypervideo/hypervideo/issues/396 - st = urlh.headers.get('x-origin-date') + formats, subs = [], {} + headers = {'Referer': f'{self._BASE_URL}/in'} + # change to v2 in the future playback_sets = self._call_api_v2('play/v1/playback', video_id, st=st, cookies=cookies)['playBackSets'] for playback_set in playback_sets: if not isinstance(playback_set, dict): continue - dr = re.search(r'dynamic_range:(?P<dr>[a-z]+)', playback_set.get('tagsCombination')).group('dr') + tags = str_or_none(playback_set.get('tagsCombination')) or '' + if any(f'{prefix}:{ignore}' in tags + for key, prefix in self._IGNORE_MAP.items() + for ignore in self._configuration_arg(key)): + continue + format_url = url_or_none(playback_set.get('playbackUrl')) if not format_url: continue - format_url = re.sub( - r'(?<=//staragvod)(\d)', r'web\1', format_url) - tags = str_or_none(playback_set.get('tagsCombination')) or '' - ingored_res, ignored_vcodec, ignored_dr = self._configuration_arg('res'), self._configuration_arg('vcodec'), self._configuration_arg('dr') - if any(f'resolution:{ig_res}' in tags for ig_res in ingored_res) or any(f'video_codec:{ig_vc}' in tags for ig_vc in ignored_vcodec) or any(f'dynamic_range:{ig_dr}' in tags for ig_dr in ignored_dr): - continue + format_url = re.sub(r'(?<=//staragvod)(\d)', r'web\1', format_url) + dr = re.search(r'dynamic_range:(?P<dr>[a-z]+)', playback_set.get('tagsCombination')).group('dr') ext = determine_ext(format_url) + current_formats, current_subs = [], {} try: if 'package:hls' in tags or ext == 'm3u8': @@ -218,8 +201,7 @@ class HotStarIE(HotStarBaseIE): current_formats, current_subs = self._extract_mpd_formats_and_subtitles( format_url, video_id, mpd_id=f'{dr}-dash', headers=headers) elif ext == 'f4m': - # produce broken files - pass + pass # XXX: produce broken files else: current_formats = [{ 'url': format_url, @@ -230,6 +212,7 @@ class HotStarIE(HotStarBaseIE): if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: geo_restricted 
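_IGNORE_MAP folds the three ad-hoc _configuration_arg lookups of the old code into one table: each extractor-arg key maps to the tag prefix used in a playback set's tagsCombination string, and any hit skips that set. The predicate in isolation, with illustrative tags and arguments standing in for self._configuration_arg:

_IGNORE_MAP = {
    'res': 'resolution',
    'vcodec': 'video_codec',
    'dr': 'dynamic_range',
}

def should_skip(tags, extractor_args):
    # tags is the raw tagsCombination string of one playback set
    return any(
        f'{prefix}:{ignored}' in tags
        for key, prefix in _IGNORE_MAP.items()
        for ignored in extractor_args.get(key, ()))

tags = 'encryption:plain|package:dash|resolution:fhd|video_codec:h265'
print(should_skip(tags, {'vcodec': ['h265']}))  # True, e.g. --extractor-args "hotstar:vcodec=h265"
print(should_skip(tags, {'res': ['4k']}))       # False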
= True continue + if tags and 'encryption:plain' not in tags: for f in current_formats: f['has_drm'] = True @@ -238,18 +221,18 @@ class HotStarIE(HotStarBaseIE): for f in current_formats: if not f.get('langauge'): f['language'] = lang + formats.extend(current_formats) subs = self._merge_subtitles(subs, current_subs) + if not formats and geo_restricted: self.raise_geo_restricted(countries=['IN'], metadata_available=True) - self._sort_formats(formats) - for f in formats: f.setdefault('http_headers', {}).update(headers) return { 'id': video_id, - 'title': title, + 'title': video_data.get('title'), 'description': video_data.get('description'), 'duration': int_or_none(video_data.get('duration')), 'timestamp': int_or_none(video_data.get('broadcastDate') or video_data.get('startDate')), @@ -261,17 +244,51 @@ class HotStarIE(HotStarBaseIE): 'season': video_data.get('seasonName'), 'season_number': int_or_none(video_data.get('seasonNo')), 'season_id': video_data.get('seasonId'), - 'episode': title, + 'episode': video_data.get('title'), 'episode_number': int_or_none(video_data.get('episodeNo')), - 'http_headers': { - 'Referer': 'https://www.hotstar.com/in', - } } +class HotStarPrefixIE(InfoExtractor): + """ The "hotstar:" prefix is no longer in use, but this is kept for backward compatibility """ + IE_DESC = False + _VALID_URL = r'hotstar:(?:(?P<type>\w+):)?(?P<id>\d+)$' + _TESTS = [{ + 'url': 'hotstar:1000076273', + 'only_matching': True, + }, { + 'url': 'hotstar:movies:1260009879', + 'info_dict': { + 'id': '1260009879', + 'ext': 'mp4', + 'title': 'Nuvvu Naaku Nachav', + 'description': 'md5:d43701b1314e6f8233ce33523c043b7d', + 'timestamp': 1567525674, + 'upload_date': '20190903', + 'duration': 10787, + 'episode': 'Nuvvu Naaku Nachav', + }, + }, { + 'url': 'hotstar:episode:1000234847', + 'only_matching': True, + }, { + # contentData + 'url': 'hotstar:sports:1260065956', + 'only_matching': True, + }, { + # contentData + 'url': 'hotstar:sports:1260066104', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, video_type = self._match_valid_url(url).group('id', 'type') + return self.url_result(HotStarIE._video_url(video_id, video_type), HotStarIE, video_id) + + class HotStarPlaylistIE(HotStarBaseIE): IE_NAME = 'hotstar:playlist' - _VALID_URL = r'https?://(?:www\.)?hotstar\.com/tv/[^/]+/s-\w+/list/[^/]+/t-(?P<id>\w+)' + _VALID_URL = r'https?://(?:www\.)?hotstar\.com(?:/in)?/tv(?:/[^/]+){2}/list/[^/]+/t-(?P<id>\w+)' _TESTS = [{ 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/popular-clips/t-3_2_26', 'info_dict': { @@ -281,25 +298,49 @@ class HotStarPlaylistIE(HotStarBaseIE): }, { 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/extras/t-2480', 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/in/tv/karthika-deepam/15457/list/popular-clips/t-3_2_1272', + 'only_matching': True, }] def _real_extract(self, url): - playlist_id = self._match_id(url) + id_ = self._match_id(url) + return self.playlist_result( + self._playlist_entries('tray/find', id_, query={'tas': 10000, 'uqId': id_}), id_) - collection = self._call_api('o/v1/tray/find', playlist_id, 'uqId')['body']['results'] - entries = [ - self.url_result( - 'https://www.hotstar.com/%s' % video['contentId'], - ie=HotStarIE.ie_key(), video_id=video['contentId']) - for video in collection['assets']['items'] - if video.get('contentId')] - return self.playlist_result(entries, playlist_id) +class HotStarSeasonIE(HotStarBaseIE): + IE_NAME = 'hotstar:season' + _VALID_URL = 
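HotStarIE._video_url, introduced above, is what lets HotStarPrefixIE and the playlist extractors synthesize canonical URLs: the slug segment is arbitrary (the callers pass 'ignore_me'), since _VALID_URL only keys on the trailing ten-digit id. Restated as a standalone sketch:

from hypervideo_dl.utils import join_nonempty

_BASE_URL = 'https://www.hotstar.com'

def video_url(video_id, video_type=None, *, slug='ignore_me', root=None):
    assert None in (video_type, root)  # video_type and root are mutually exclusive
    if not root:
        root = join_nonempty(_BASE_URL, video_type, delim='/')
    return f'{root}/{slug}/{video_id}'

print(video_url('1000076273'))            # https://www.hotstar.com/ignore_me/1000076273
print(video_url('1000057157', 'movies'))  # https://www.hotstar.com/movies/ignore_me/1000057157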
r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/\w+)/seasons/[^/]+/ss-(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://www.hotstar.com/tv/radhakrishn/1260000646/seasons/season-2/ss-8028', + 'info_dict': { + 'id': '8028', + }, + 'playlist_mincount': 35, + }, { + 'url': 'https://www.hotstar.com/in/tv/ishqbaaz/9567/seasons/season-2/ss-4357', + 'info_dict': { + 'id': '4357', + }, + 'playlist_mincount': 30, + }, { + 'url': 'https://www.hotstar.com/in/tv/bigg-boss/14714/seasons/season-4/ss-8208/', + 'info_dict': { + 'id': '8208', + }, + 'playlist_mincount': 19, + }] + + def _real_extract(self, url): + url, season_id = self._match_valid_url(url).groups() + return self.playlist_result(self._playlist_entries( + 'season/asset', season_id, url, query={'tao': 0, 'tas': 0, 'size': 10000, 'id': season_id}), season_id) class HotStarSeriesIE(HotStarBaseIE): IE_NAME = 'hotstar:series' - _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P<id>\d+))' + _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P<id>\d+))/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://www.hotstar.com/in/tv/radhakrishn/1260000646', 'info_dict': { @@ -317,25 +358,13 @@ class HotStarSeriesIE(HotStarBaseIE): 'info_dict': { 'id': '435', }, - 'playlist_mincount': 269, + 'playlist_mincount': 267, }] def _real_extract(self, url): url, series_id = self._match_valid_url(url).groups() - headers = { - 'x-country-code': 'IN', - 'x-platform-code': 'PCTV', - } - detail_json = self._download_json('https://api.hotstar.com/o/v1/show/detail?contentId=' + series_id, - video_id=series_id, headers=headers) - id = compat_str(try_get(detail_json, lambda x: x['body']['results']['item']['id'], int)) - item_json = self._download_json('https://api.hotstar.com/o/v1/tray/g/1/items?etid=0&tao=0&tas=10000&eid=' + id, - video_id=series_id, headers=headers) - entries = [ - self.url_result( - '%s/ignoreme/%d' % (url, video['contentId']), - ie=HotStarIE.ie_key(), video_id=video['contentId']) - for video in item_json['body']['results']['items'] - if video.get('contentId')] - - return self.playlist_result(entries, series_id) + id_ = self._call_api_v1( + 'show/detail', series_id, query={'contentId': series_id})['body']['results']['item']['id'] + + return self.playlist_result(self._playlist_entries( + 'tray/g/1/items', series_id, url, query={'tao': 0, 'tas': 10000, 'etid': 0, 'eid': id_}), series_id) diff --git a/hypervideo_dl/extractor/howcast.py b/hypervideo_dl/extractor/howcast.py index 7e36b85..59cf80f 100644 --- a/hypervideo_dl/extractor/howcast.py +++ b/hypervideo_dl/extractor/howcast.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import parse_iso8601 diff --git a/hypervideo_dl/extractor/howstuffworks.py b/hypervideo_dl/extractor/howstuffworks.py index cf90ab3..238fc0b 100644 --- a/hypervideo_dl/extractor/howstuffworks.py +++ b/hypervideo_dl/extractor/howstuffworks.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( find_xpath_attr, @@ -77,8 +75,6 @@ class HowStuffWorksIE(InfoExtractor): 'vbr': vbr, }) - self._sort_formats(formats) - return { 'id': '%s' % video_id, 'display_id': display_id, diff --git a/hypervideo_dl/extractor/hrfensehen.py b/hypervideo_dl/extractor/hrfensehen.py index e39ded2..35e9f67 100644 --- a/hypervideo_dl/extractor/hrfensehen.py +++ b/hypervideo_dl/extractor/hrfensehen.py @@ -1,17 +1,19 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re 
-from ..utils import int_or_none, unified_timestamp, unescapeHTML from .common import InfoExtractor +from ..utils import ( + int_or_none, + traverse_obj, + try_call, + unescapeHTML, + unified_timestamp, +) class HRFernsehenIE(InfoExtractor): IE_NAME = 'hrfernsehen' _VALID_URL = r'^https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P<id>[0-9]{6})\.html' - _TESTS = [{ 'url': 'https://www.hessenschau.de/tv-sendung/hessenschau-vom-26082020,video-130546.html', 'md5': '5c4e0ba94677c516a2f65a84110fc536', @@ -24,10 +26,11 @@ class HRFernsehenIE(InfoExtractor): 'subtitles': {'de': [{ 'url': 'https://hr-a.akamaihd.net/video/as/hessenschau/2020_08/hrLogo_200826200407_L385592_512x288-25p-500kbit.vtt' }]}, - 'timestamp': 1598470200, + 'timestamp': 1598400000, 'upload_date': '20200826', - 'thumbnail': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg', - 'title': 'hessenschau vom 26.08.2020' + 'thumbnail': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9.jpg', + 'title': 'hessenschau vom 26.08.2020', + 'duration': 1654 } }, { 'url': 'https://www.hr-fernsehen.de/sendungen-a-z/mex/sendungen/fair-und-gut---was-hinter-aldis-eigenem-guetesiegel-steckt,video-130544.html', @@ -36,25 +39,18 @@ class HRFernsehenIE(InfoExtractor): _GEO_COUNTRIES = ['DE'] - def extract_airdate(self, loader_data): - airdate_str = loader_data.get('mediaMetadata', {}).get('agf', {}).get('airdate') - - if airdate_str is None: - return None - - return unified_timestamp(airdate_str) - def extract_formats(self, loader_data): stream_formats = [] - for stream_obj in loader_data["videoResolutionLevels"]: + data = loader_data['mediaCollection']['streams'][0]['media'] + for inner in data[1:]: stream_format = { - 'format_id': str(stream_obj['verticalResolution']) + "p", - 'height': stream_obj['verticalResolution'], - 'url': stream_obj['url'], + 'format_id': try_call(lambda: f'{inner["maxHResolutionPx"]}p'), + 'height': inner.get('maxHResolutionPx'), + 'url': inner['url'], } quality_information = re.search(r'([0-9]{3,4})x([0-9]{3,4})-([0-9]{2})p-([0-9]{3,4})kbit', - stream_obj['url']) + inner['url']) if quality_information: stream_format['width'] = int_or_none(quality_information.group(1)) stream_format['height'] = int_or_none(quality_information.group(2)) @@ -62,8 +58,6 @@ class HRFernsehenIE(InfoExtractor): stream_format['tbr'] = int_or_none(quality_information.group(4)) stream_formats.append(stream_format) - - self._sort_formats(stream_formats) return stream_formats def _real_extract(self, url): @@ -75,22 +69,22 @@ class HRFernsehenIE(InfoExtractor): description = self._html_search_meta( ['description'], webpage) - loader_str = unescapeHTML(self._search_regex(r"data-new-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader")) + loader_str = unescapeHTML(self._search_regex(r"data-(?:new-)?hr-mediaplayer-loader='([^']*)'", webpage, 'ardloader')) loader_data = json.loads(loader_str) + subtitle = traverse_obj(loader_data, ('mediaCollection', 'subTitles', 0, 'sources', 0, 'url')) + info = { 'id': video_id, 'title': title, 'description': description, 'formats': self.extract_formats(loader_data), - 'timestamp': self.extract_airdate(loader_data) + 'subtitles': {'de': [{'url': subtitle}]}, + 'timestamp': unified_timestamp(self._search_regex( + r'<time\sdatetime="(\d{4}\W\d{1,2}\W\d{1,2})', webpage, 'datetime', fatal=False)), + 'duration': int_or_none(traverse_obj( + loader_data, ('playerConfig', 'pluginData', 'trackingAti@all', 'richMedia', 'duration'))), + 'thumbnail': 
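The rewritten extract_formats walks mediaCollection.streams[0].media and, where the API omits dimensions, recovers them from the CDN filename: the ..._512x288-25p-500kbit suffix encodes width x height, frame rate and bitrate (the group(3) assignment is elided in the hunk above; fps is the natural reading). The regex in isolation:

import re

def quality_from_url(url):
    # <width>x<height>-<fps>p-<tbr>kbit, as produced by the HR CDN
    m = re.search(r'([0-9]{3,4})x([0-9]{3,4})-([0-9]{2})p-([0-9]{3,4})kbit', url)
    if not m:
        return {}
    width, height, fps, tbr = map(int, m.groups())
    return {'width': width, 'height': height, 'fps': fps, 'tbr': tbr}

print(quality_from_url('https://hr-a.akamaihd.net/video/as/hessenschau/2020_08/hrLogo_200826200407_L385592_512x288-25p-500kbit.mp4'))
# {'width': 512, 'height': 288, 'fps': 25, 'tbr': 500}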
self._search_regex(r'thumbnailUrl\W*([^"]+)', webpage, 'thumbnail', default=None), } - if "subtitle" in loader_data: - info["subtitles"] = {"de": [{"url": loader_data["subtitle"]}]} - - thumbnails = list(set([t for t in loader_data.get("previewImageUrl", {}).values()])) - if len(thumbnails) > 0: - info["thumbnails"] = [{"url": t} for t in thumbnails] - return info diff --git a/hypervideo_dl/extractor/hrti.py b/hypervideo_dl/extractor/hrti.py index 36d6007..cfec80d 100644 --- a/hypervideo_dl/extractor/hrti.py +++ b/hypervideo_dl/extractor/hrti.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -147,7 +144,6 @@ class HRTiIE(HRTiBaseIE): formats = self._extract_m3u8_formats( m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') - self._sort_formats(formats) description = clean_html(title_info.get('summary_long')) age_limit = parse_age_limit(video.get('parental_control', {}).get('rating')) diff --git a/hypervideo_dl/extractor/hse.py b/hypervideo_dl/extractor/hse.py index 9144ff8..3cb21d2 100644 --- a/hypervideo_dl/extractor/hse.py +++ b/hypervideo_dl/extractor/hse.py @@ -1,4 +1,3 @@ -# coding: utf-8 from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -26,7 +25,6 @@ class HSEShowBaseInfoExtractor(InfoExtractor): fmts, subs = self._extract_m3u8_formats_and_subtitles(src['url'], video_id, ext='mp4') formats.extend(fmts) subtitles = self._merge_subtitles(subtitles, subs) - self._sort_formats(formats) return formats, subtitles diff --git a/hypervideo_dl/extractor/huajiao.py b/hypervideo_dl/extractor/huajiao.py index 4ca275d..c498fa3 100644 --- a/hypervideo_dl/extractor/huajiao.py +++ b/hypervideo_dl/extractor/huajiao.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( parse_duration, diff --git a/hypervideo_dl/extractor/huffpost.py b/hypervideo_dl/extractor/huffpost.py index 54385ba..69fdc34 100644 --- a/hypervideo_dl/extractor/huffpost.py +++ b/hypervideo_dl/extractor/huffpost.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -19,6 +17,7 @@ class HuffPostIE(InfoExtractor): HPLEmbedPlayer/\?segmentId= ) (?P<id>[0-9a-f]+)''' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1'] _TEST = { 'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677', @@ -80,8 +79,6 @@ class HuffPostIE(InfoExtractor): 'vcodec': 'none' if key.startswith('audio/') else None, }) - self._sort_formats(formats) - return { 'id': video_id, 'title': video_title, diff --git a/hypervideo_dl/extractor/hungama.py b/hypervideo_dl/extractor/hungama.py index 821b16e..2e99396 100644 --- a/hypervideo_dl/extractor/hungama.py +++ b/hypervideo_dl/extractor/hungama.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -23,15 +20,17 @@ class HungamaIE(InfoExtractor): ''' _TESTS = [{ 'url': 'http://www.hungama.com/video/krishna-chants/39349649/', - 'md5': 'a845a6d1ebd08d80c1035126d49bd6a0', + 'md5': '687c5f1e9f832f3b59f44ed0eb1f120a', 'info_dict': { - 'id': '2931166', + 'id': '39349649', 'ext': 'mp4', - 'title': 'Lucky Ali - Kitni Haseen Zindagi', - 'track': 'Kitni Haseen Zindagi', - 'artist': 'Lucky Ali', - 'album': 'Aks', - 'release_year': 2000, + 'title': 'Krishna Chants', + 'description': 'Watch Krishna Chants video now. 
You can also watch other latest videos only at Hungama', + 'upload_date': '20180829', + 'duration': 264, + 'timestamp': 1535500800, + 'view_count': int, + 'thumbnail': 'https://images.hungama.com/c/1/0dc/2ca/39349649/39349649_700x394.jpg', } }, { 'url': 'https://www.hungama.com/movie/kahaani-2/44129919/', @@ -43,12 +42,7 @@ class HungamaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - info = self._search_json_ld(webpage, video_id) - - m3u8_url = self._download_json( + video_json = self._download_json( 'https://www.hungama.com/index.php', video_id, data=urlencode_postdata({'content_id': video_id}), headers={ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', @@ -56,18 +50,24 @@ class HungamaIE(InfoExtractor): }, query={ 'c': 'common', 'm': 'get_video_mdn_url', - })['stream_url'] + }) - formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - self._sort_formats(formats) + formats = self._extract_m3u8_formats(video_json['stream_url'], video_id, ext='mp4', m3u8_id='hls') - info.update({ + json_ld = self._search_json_ld( + self._download_webpage(url, video_id, fatal=False) or '', video_id, fatal=False) + + return { + **json_ld, 'id': video_id, 'formats': formats, - }) - return info + 'subtitles': { + 'en': [{ + 'url': video_json['sub_title'], + 'ext': 'vtt', + }] + } if video_json.get('sub_title') else None, + } class HungamaSongIE(InfoExtractor): diff --git a/hypervideo_dl/extractor/huya.py b/hypervideo_dl/extractor/huya.py index 4e96f22..b6e9eec 100644 --- a/hypervideo_dl/extractor/huya.py +++ b/hypervideo_dl/extractor/huya.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import hashlib import random @@ -9,7 +6,6 @@ from ..compat import compat_urlparse, compat_b64decode from ..utils import ( ExtractorError, int_or_none, - js_to_json, str_or_none, try_get, unescapeHTML, @@ -58,11 +54,7 @@ class HuyaLiveIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id=video_id) - json_stream = self._search_regex(r'"stream":\s+"([a-zA-Z0-9+=/]+)"', webpage, 'stream', default=None) - if not json_stream: - raise ExtractorError('Video is offline', expected=True) - stream_data = self._parse_json(compat_b64decode(json_stream).decode(), video_id=video_id, - transform_source=js_to_json) + stream_data = self._search_json(r'stream:\s', webpage, 'stream', video_id=video_id, default=None) room_info = try_get(stream_data, lambda x: x['data'][0]['gameLiveInfo']) if not room_info: raise ExtractorError('Can not extract the room info', expected=True) @@ -70,6 +62,8 @@ class HuyaLiveIE(InfoExtractor): screen_type = room_info.get('screenType') live_source_type = room_info.get('liveSourceType') stream_info_list = stream_data['data'][0]['gameStreamInfoList'] + if not stream_info_list: + raise ExtractorError('Video is offline', expected=True) formats = [] for stream_info in stream_info_list: stream_url = stream_info.get('sFlvUrl') @@ -99,8 +93,6 @@ class HuyaLiveIE(InfoExtractor): **self._RESOLUTION.get(si.get('sDisplayName'), {}), }) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/hypem.py b/hypervideo_dl/extractor/hypem.py index 9ca28d6..54db7b3 100644 --- a/hypervideo_dl/extractor/hypem.py +++ b/hypervideo_dl/extractor/hypem.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common 
import InfoExtractor from ..utils import int_or_none diff --git a/hypervideo_dl/extractor/hytale.py b/hypervideo_dl/extractor/hytale.py new file mode 100644 index 0000000..0f4dcc3 --- /dev/null +++ b/hypervideo_dl/extractor/hytale.py @@ -0,0 +1,58 @@ +import re + +from .common import InfoExtractor +from ..utils import traverse_obj + + +class HytaleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hytale\.com/news/\d+/\d+/(?P<id>[a-z0-9-]+)' + _TESTS = [{ + 'url': 'https://hytale.com/news/2021/07/summer-2021-development-update', + 'info_dict': { + 'id': 'summer-2021-development-update', + 'title': 'Summer 2021 Development Update', + }, + 'playlist_count': 4, + 'playlist': [{ + 'md5': '0854ebe347d233ee19b86ab7b2ead610', + 'info_dict': { + 'id': 'ed51a2609d21bad6e14145c37c334999', + 'ext': 'mp4', + 'title': 'Avatar Personalization', + 'thumbnail': r're:https://videodelivery\.net/\w+/thumbnails/thumbnail\.jpg', + } + }] + }, { + 'url': 'https://www.hytale.com/news/2019/11/hytale-graphics-update', + 'info_dict': { + 'id': 'hytale-graphics-update', + 'title': 'Hytale graphics update', + }, + 'playlist_count': 2, + }] + + def _real_initialize(self): + media_webpage = self._download_webpage( + 'https://hytale.com/media', None, note='Downloading list of media', fatal=False) or '' + + clips_json = traverse_obj( + self._search_json( + r'window\.__INITIAL_COMPONENTS_STATE__\s*=\s*\[', + media_webpage, 'clips json', None), + ('media', 'clips')) or [] + + self._titles = {clip.get('src'): clip.get('caption') for clip in clips_json} + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + entries = [ + self.url_result( + f'https://cloudflarestream.com/{video_hash}/manifest/video.mpd?parentOrigin=https%3A%2F%2Fhytale.com', + title=self._titles.get(video_hash), url_transparent=True) + for video_hash in re.findall( + r'<stream\s+class\s*=\s*"ql-video\s+cf-stream"\s+src\s*=\s*"([a-f0-9]{32})"', + webpage) + ] + + return self.playlist_result(entries, playlist_id, self._og_search_title(webpage)) diff --git a/hypervideo_dl/extractor/icareus.py b/hypervideo_dl/extractor/icareus.py new file mode 100644 index 0000000..d081cf4 --- /dev/null +++ b/hypervideo_dl/extractor/icareus.py @@ -0,0 +1,179 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + determine_ext, + get_element_by_class, + int_or_none, + merge_dicts, + parse_bitrate, + parse_resolution, + remove_end, + str_or_none, + url_or_none, + urlencode_postdata, +) + + +class IcareusIE(InfoExtractor): + _DOMAINS = '|'.join(map(re.escape, ( + 'asahitv.fi', + 'helsinkikanava.fi', + 'hyvinvointitv.fi', + 'inez.fi', + 'permanto.fi', + 'suite.icareus.com', + 'videos.minifiddlers.org', + ))) + _VALID_URL = rf'(?P<base_url>https?://(?:www\.)?(?:{_DOMAINS}))/[^?#]+/player/[^?#]+\?(?:[^#]+&)?(?:assetId|eventId)=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.helsinkikanava.fi/fi_FI/web/helsinkikanava/player/vod?assetId=68021894', + 'md5': 'ca0b62ffc814a5411dfa6349cf5adb8a', + 'info_dict': { + 'id': '68021894', + 'ext': 'mp4', + 'title': 'Perheiden parhaaksi', + 'description': 'md5:295785ea408e5ac00708766465cc1325', + 'thumbnail': 'https://www.helsinkikanava.fi/image/image_gallery?img_id=68022501', + 'upload_date': '20200924', + 'timestamp': 1600938300, + }, + }, { # Recorded livestream + 'url': 'https://www.helsinkikanava.fi/fi/web/helsinkikanava/player/event/view?eventId=76241489', + 'md5': '014327e69dfa7b949fcc861f6d162d6d', + 'info_dict': { + 'id': 
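IcareusIE fronts several white-labelled portals with one extractor: the domain whitelist is collapsed into a single alternation via re.escape and interpolated into _VALID_URL with an f-string. Checking the resulting pattern against the first test URL (domain list trimmed here for brevity):

import re

_DOMAINS = '|'.join(map(re.escape, (
    'asahitv.fi',
    'helsinkikanava.fi',
    'suite.icareus.com',
)))
_VALID_URL = (rf'(?P<base_url>https?://(?:www\.)?(?:{_DOMAINS}))'
              r'/[^?#]+/player/[^?#]+\?(?:[^#]+&)?(?:assetId|eventId)=(?P<id>\d+)')

m = re.match(_VALID_URL, 'https://www.helsinkikanava.fi/fi_FI/web/helsinkikanava/player/vod?assetId=68021894')
print(m.group('base_url'), m.group('id'))  # https://www.helsinkikanava.fi 68021894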
'76258304', + 'ext': 'mp4', + 'title': 'Helsingin kaupungin ja HUSin tiedotustilaisuus koronaepidemiatilanteesta 24.11.2020', + 'description': 'md5:3129d041c6fbbcdc7fe68d9a938fef1c', + 'thumbnail': 'https://icareus-suite.secure2.footprint.net/image/image_gallery?img_id=76288630', + 'upload_date': '20201124', + 'timestamp': 1606206600, + }, + }, { # Non-m3u8 stream + 'url': 'https://suite.icareus.com/fi/web/westend-indians/player/vod?assetId=47567389', + 'md5': '72fc04ee971bbedc44405cdf16c990b6', + 'info_dict': { + 'id': '47567389', + 'ext': 'mp4', + 'title': 'Omatoiminen harjoittelu - Laukominen', + 'description': '', + 'thumbnail': 'https://suite.icareus.com/image/image_gallery?img_id=47568162', + 'upload_date': '20200319', + 'timestamp': 1584658080, + }, + }, { + 'url': 'https://asahitv.fi/fi/web/asahi/player/vod?assetId=89415818', + 'only_matching': True + }, { + 'url': 'https://hyvinvointitv.fi/fi/web/hyvinvointitv/player/vod?assetId=89149730', + 'only_matching': True + }, { + 'url': 'https://inez.fi/fi/web/inez-media/player/vod?assetId=71328822', + 'only_matching': True + }, { + 'url': 'https://www.permanto.fi/fi/web/alfatv/player/vod?assetId=135497515', + 'only_matching': True + }, { + 'url': 'https://videos.minifiddlers.org/web/international-minifiddlers/player/vod?assetId=1982759', + 'only_matching': True + }] + + def _real_extract(self, url): + base_url, temp_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, temp_id) + + video_id = self._search_regex(r"_icareus\['itemId'\]\s*=\s*'(\d+)'", webpage, 'video_id') + organization_id = self._search_regex(r"_icareus\['organizationId'\]\s*=\s*'(\d+)'", webpage, 'organization_id') + + assets = self._download_json( + self._search_regex(r'var\s+publishingServiceURL\s*=\s*"(http[^"]+)";', webpage, 'api_base'), + video_id, data=urlencode_postdata({ + 'version': '03', + 'action': 'getAssetPlaybackUrls', + 'organizationId': organization_id, + 'assetId': video_id, + 'token': self._search_regex(r"_icareus\['token'\]\s*=\s*'([a-f0-9]+)'", webpage, 'icareus_token'), + })) + + subtitles = { + remove_end(sdesc.split(' ')[0], ':'): [{'url': url_or_none(surl)}] + for _, sdesc, surl in assets.get('subtitles') or [] + } + + formats = [{ + 'format': item.get('name'), + 'format_id': 'audio', + 'vcodec': 'none', + 'url': url_or_none(item['url']), + 'tbr': int_or_none(self._search_regex( + r'\((\d+)\s*k\)', item.get('name') or '', 'audio bitrate', default=None)), + } for item in assets.get('audio_urls') or [] if url_or_none(item.get('url'))] + + for item in assets.get('urls') or []: + video_url = url_or_none(item.get('url')) + if video_url is None: + continue + ext = determine_ext(video_url) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + fmt = item.get('name') + formats.append({ + 'url': video_url, + 'format': fmt, + 'tbr': parse_bitrate(fmt), + 'format_id': str_or_none(item.get('id')), + **parse_resolution(fmt), + }) + + info, token, live_title = self._search_json_ld(webpage, video_id, default={}), None, None + if not info: + token = self._search_regex( + r'data\s*:\s*{action:"getAsset".*?token:\'([a-f0-9]+)\'}', webpage, 'token', default=None) + if not token: + live_title = get_element_by_class('unpublished-info-item future-event-title', webpage) + + if token: + metadata = self._download_json( + f'{base_url}/icareus-suite-api-portlet/publishing', + video_id, 
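The subtitle entries in the assets payload are unpacked as three-element rows; the comprehension above keys them by language code, stripping the trailing colon from a description such as 'fi: Finnish'. In isolation (the row shape is an assumption inferred from the unpacking):

from hypervideo_dl.utils import remove_end, url_or_none

subtitle_rows = [  # assumed shape: [<id>, '<lang>: <label>', '<url>']
    [1, 'fi: Finnish', 'https://example.com/subs_fi.vtt'],
]
subtitles = {
    remove_end(sdesc.split(' ')[0], ':'): [{'url': url_or_none(surl)}]
    for _, sdesc, surl in subtitle_rows
}
print(subtitles)  # {'fi': [{'url': 'https://example.com/subs_fi.vtt'}]}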
fatal=False, data=urlencode_postdata({ + 'version': '03', + 'action': 'getAsset', + 'organizationId': organization_id, + 'assetId': video_id, + 'languageId': 'en_US', + 'userId': '0', + 'token': token, + })) or {} + info = { + 'title': metadata.get('name'), + 'description': metadata.get('description'), + 'timestamp': int_or_none(metadata.get('date'), scale=1000), + 'duration': int_or_none(metadata.get('duration')), + 'thumbnail': url_or_none(metadata.get('thumbnailMedium')), + } + elif live_title: # Recorded livestream + info = { + 'title': live_title, + 'description': get_element_by_class('unpublished-info-item future-event-description', webpage), + 'timestamp': int_or_none(self._search_regex( + r'var startEvent\s*=\s*(\d+);', webpage, 'uploadDate', fatal=False), scale=1000), + } + + thumbnails = info.get('thumbnails') or [{ + 'url': url_or_none(info.get('thumbnail') or assets.get('thumbnail')) + }] + + return merge_dicts({ + 'id': video_id, + 'title': None, + 'formats': formats, + 'subtitles': subtitles, + 'description': clean_html(info.get('description')), + 'thumbnails': thumbnails if thumbnails[0]['url'] else None, + }, info) diff --git a/hypervideo_dl/extractor/ichinanalive.py b/hypervideo_dl/extractor/ichinanalive.py index cb39f82..9d55ddc 100644 --- a/hypervideo_dl/extractor/ichinanalive.py +++ b/hypervideo_dl/extractor/ichinanalive.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ExtractorError, str_or_none, traverse_obj, unified_strdate from ..compat import compat_str @@ -76,8 +73,6 @@ class IchinanaLiveIE(InfoExtractor): 'acodec': 'aac', }) - self._sort_formats(formats) - return { 'id': video_id, 'title': uploader or video_id, @@ -150,8 +145,6 @@ class IchinanaLiveClipIE(InfoExtractor): 'http_headers': {'Referer': url}, }) - self._sort_formats(formats) - return { 'id': video_id, 'title': uploader or video_id, diff --git a/hypervideo_dl/extractor/ign.py b/hypervideo_dl/extractor/ign.py index c826eb3..d4797d3 100644 --- a/hypervideo_dl/extractor/ign.py +++ b/hypervideo_dl/extractor/ign.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -104,8 +102,6 @@ class IGNIE(IGNBaseIE): 'url': mezzanine_url, }) - self._sort_formats(formats) - thumbnails = [] for thumbnail in (video.get('thumbnails') or []): thumbnail_url = thumbnail.get('url') diff --git a/hypervideo_dl/extractor/iheart.py b/hypervideo_dl/extractor/iheart.py index b54c05e..2c6a5b6 100644 --- a/hypervideo_dl/extractor/iheart.py +++ b/hypervideo_dl/extractor/iheart.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( clean_html, diff --git a/hypervideo_dl/extractor/iltalehti.py b/hypervideo_dl/extractor/iltalehti.py new file mode 100644 index 0000000..0e7e82c --- /dev/null +++ b/hypervideo_dl/extractor/iltalehti.py @@ -0,0 +1,51 @@ +from .common import InfoExtractor +from ..utils import js_to_json, traverse_obj + + +class IltalehtiIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?iltalehti\.fi/[^/?#]+/a/(?P<id>[^/?#])' + _TESTS = [ + # jwplatform embed main_media + { + 'url': 'https://www.iltalehti.fi/ulkomaat/a/9fbd067f-94e4-46cd-8748-9d958eb4dae2', + 'md5': 'af12d42c539f1f49f0b62d231fe72dcd', + 'info_dict': { + 'id': 'gYjjaf1L', + 'ext': 'mp4', + 'title': 'Sensuroimaton Päivärinta, jakso 227: Vieraana Suomen Venäjän ex-suurlähettiläs René Nyberg ja Kenraalimajuri evp Pekka Toveri', + 'description': 
'', + 'upload_date': '20220928', + 'timestamp': 1664360878, + 'duration': 2089, + 'thumbnail': r're:^https?://.*\.jpg', + }, + }, + # jwplatform embed body + { + 'url': 'https://www.iltalehti.fi/politiikka/a/1ce49d85-1670-428b-8db8-d2479b9950a4', + 'md5': '9e50334b8f8330ce8828b567a82a3c65', + 'info_dict': { + 'id': '18R6zkLi', + 'ext': 'mp4', + 'title': 'Pekka Toverin arvio: Näin Nord Stream -kaasuputken räjäyttäminen on saatettu toteuttaa', + 'description': 'md5:3d1302c9e17e7ffd564143ff58f8de35', + 'upload_date': '20220929', + 'timestamp': 1664435867, + 'duration': 165.0, + 'thumbnail': r're:^https?://.*\.jpg', + }, + }, + ] + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + info = self._search_json( + r'<script>\s*window.App\s*=', webpage, 'json', article_id, + transform_source=js_to_json) + props = traverse_obj(info, ( + 'state', 'articles', ..., 'items', (('main_media', 'properties'), ('body', ..., 'properties')))) + video_ids = traverse_obj(props, (lambda _, v: v['provider'] == 'jwplayer', 'id')) + return self.playlist_from_matches( + video_ids, article_id, ie='JWPlatform', getter=lambda id: f'jwplatform:{id}', + title=traverse_obj(info, ('state', 'articles', ..., 'items', 'canonical_title'), get_all=False)) diff --git a/hypervideo_dl/extractor/imdb.py b/hypervideo_dl/extractor/imdb.py index 96cee2e..557a3b7 100644 --- a/hypervideo_dl/extractor/imdb.py +++ b/hypervideo_dl/extractor/imdb.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import base64 import json import re @@ -102,7 +100,6 @@ class ImdbIE(InfoExtractor): 'ext': ext, 'quality': quality(format_id), }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/imggaming.py b/hypervideo_dl/extractor/imggaming.py index ce7b21a..8e220fd 100644 --- a/hypervideo_dl/extractor/imggaming.py +++ b/hypervideo_dl/extractor/imggaming.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -106,7 +103,6 @@ class ImgGamingBaseIE(InfoExtractor): formats.extend(self._extract_mpd_formats( media_url, media_id, mpd_id='dash', fatal=False, headers=self._MANIFEST_HEADERS)) - self._sort_formats(formats) subtitles = {} for subtitle in video_data.get('subtitles', []): diff --git a/hypervideo_dl/extractor/imgur.py b/hypervideo_dl/extractor/imgur.py index c917cf1..061c4cc 100644 --- a/hypervideo_dl/extractor/imgur.py +++ b/hypervideo_dl/extractor/imgur.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -86,8 +84,6 @@ class ImgurIE(InfoExtractor): }, }) - self._sort_formats(formats) - return { 'id': video_id, 'formats': formats, @@ -140,7 +136,7 @@ class ImgurGalleryIE(InfoExtractor): return self.url_result('http://imgur.com/%s' % gallery_id, ImgurIE.ie_key(), gallery_id) -class ImgurAlbumIE(ImgurGalleryIE): +class ImgurAlbumIE(ImgurGalleryIE): # XXX: Do not subclass from concrete IE IE_NAME = 'imgur:album' _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P<id>[a-zA-Z0-9]+)' diff --git a/hypervideo_dl/extractor/ina.py b/hypervideo_dl/extractor/ina.py index b3b2683..857013d 100644 --- a/hypervideo_dl/extractor/ina.py +++ b/hypervideo_dl/extractor/ina.py @@ -1,26 +1,19 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor -from ..utils import ( - determine_ext, - int_or_none, - strip_or_none, - xpath_attr, - xpath_text, -) +from ..utils import unified_strdate class 
InaIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m)\.)?ina\.fr/(?:video|audio)/(?P<id>[A-Z0-9_]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?ina\.fr/(?:[^?#]+/)(?P<id>[\w-]+)' _TESTS = [{ - 'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', - 'md5': 'a667021bf2b41f8dc6049479d9bb38a3', + 'url': 'https://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', + 'md5': 'c5a09e5cb5604ed10709f06e7a377dda', 'info_dict': { 'id': 'I12055569', 'ext': 'mp4', 'title': 'François Hollande "Je crois que c\'est clair"', - 'description': 'md5:3f09eb072a06cb286b8f7e4f77109663', + 'description': 'md5:19f61e2b4844ed4bb2e3df9ab9f527ff', + 'upload_date': '20070712', + 'thumbnail': 'https://cdn-hub.ina.fr/notice/690x517/3c4/I12055569.jpeg', } }, { 'url': 'https://www.ina.fr/video/S806544_001/don-d-organes-des-avancees-mais-d-importants-besoins-video.html', @@ -34,53 +27,58 @@ class InaIE(InfoExtractor): }, { 'url': 'http://m.ina.fr/video/I12055569', 'only_matching': True, + }, { + 'url': 'https://www.ina.fr/ina-eclaire-actu/video/cpb8205116303/les-jeux-electroniques', + 'md5': '4b8284a9a3a184fdc7e744225b8251e7', + 'info_dict': { + 'id': 'CPB8205116303', + 'ext': 'mp4', + 'title': 'Les jeux électroniques', + 'description': 'md5:e09f7683dad1cc60b74950490127d233', + 'upload_date': '19821204', + 'duration': 657, + 'thumbnail': 'https://cdn-hub.ina.fr/notice/690x517/203/CPB8205116303.jpeg', + }, + }, { + 'url': 'https://www.ina.fr/ina-eclaire-actu/arletty-carriere-conseils-actrice-marcel-carne', + 'md5': '743d6f069a00e19dda0da166a54eeccb', + 'info_dict': { + 'id': 'I22203233', + 'ext': 'mp4', + 'title': 'Arletty sur le métier d\'actrice', + 'description': 'md5:3d89b5e419d8514c934f146045ccdbad', + 'upload_date': '19581128', + 'thumbnail': 'https://cdn-hub.ina.fr/notice/690x517/082/I22203233.jpeg', + }, + }, { + 'url': 'https://www.ina.fr/ina-eclaire-actu/chasse-croise-sncf-gare-d-austerlitz-vacances-d-ete', + 'md5': 'a96fb85e9ba3b5c5b2eeb0c5daa55f2f', + 'info_dict': { + 'id': 'CAF91038285', + 'ext': 'mp4', + 'title': 'Les grands départs : les trains', + 'description': 'md5:1630ee819d8d4da97df53459e99f72bb', + 'upload_date': '19740801', + 'thumbnail': 'https://cdn-hub.ina.fr/notice/690x517/2cf/CAF91038285.jpeg', + }, }] def _real_extract(self, url): - video_id = self._match_id(url) - info_doc = self._download_xml( - 'http://player.ina.fr/notices/%s.mrss' % video_id, video_id) - item = info_doc.find('channel/item') - title = xpath_text(item, 'title', fatal=True) - media_ns_xpath = lambda x: self._xpath_ns(x, 'http://search.yahoo.com/mrss/') - content = item.find(media_ns_xpath('content')) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) - get_furl = lambda x: xpath_attr(content, media_ns_xpath(x), 'url') - formats = [] - for q, w, h in (('bq', 400, 300), ('mq', 512, 384), ('hq', 768, 576)): - q_url = get_furl(q) - if not q_url: - continue - formats.append({ - 'format_id': q, - 'url': q_url, - 'width': w, - 'height': h, - }) - if not formats: - furl = get_furl('player') or content.attrib['url'] - ext = determine_ext(furl) - formats = [{ - 'url': furl, - 'vcodec': 'none' if ext == 'mp3' else None, - 'ext': ext, - }] + api_url = self._html_search_regex(r'asset-details-url\s*=\s*["\'](?P<api_url>[^"\']+)', webpage, 'api_url') + asset_id = self._search_regex(r'assets/([^?/]+)', api_url, 'asset_id') - thumbnails = [] - for thumbnail in content.findall(media_ns_xpath('thumbnail')): - thumbnail_url = 
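The rewritten InaIE no longer parses the MRSS feed: it scrapes the asset-details-url attribute from the page, pulls the asset id out of that URL, and requests the JSON variant by suffixing .json. The URL surgery in isolation (the host below is a hypothetical stand-in for whatever the page advertises):

import re

api_url = 'https://apipartner.ina.fr/assets/I12055569'  # hypothetical scraped value
asset_id = re.search(r'assets/([^?/]+)', api_url).group(1)
json_url = api_url.replace(asset_id, f'{asset_id}.json')
print(asset_id, json_url)  # I12055569 https://apipartner.ina.fr/assets/I12055569.json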
thumbnail.get('url') - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'height': int_or_none(thumbnail.get('height')), - 'width': int_or_none(thumbnail.get('width')), - }) + api_response = self._download_json(api_url.replace(asset_id, f'{asset_id}.json'), asset_id) return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': strip_or_none(xpath_text(item, 'description')), - 'thumbnails': thumbnails, + 'id': asset_id, + 'url': api_response['resourceUrl'], + 'ext': {'video': 'mp4', 'audio': 'mp3'}.get(api_response.get('type')), + 'title': api_response.get('title'), + 'description': api_response.get('description'), + 'upload_date': unified_strdate(api_response.get('dateOfBroadcast')), + 'duration': api_response.get('duration'), + 'thumbnail': api_response.get('resourceThumbnail'), } diff --git a/hypervideo_dl/extractor/inc.py b/hypervideo_dl/extractor/inc.py index d5b258a..9b3fe9a 100644 --- a/hypervideo_dl/extractor/inc.py +++ b/hypervideo_dl/extractor/inc.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from .kaltura import KalturaIE diff --git a/hypervideo_dl/extractor/indavideo.py b/hypervideo_dl/extractor/indavideo.py index 4c16243..4fa97d8 100644 --- a/hypervideo_dl/extractor/indavideo.py +++ b/hypervideo_dl/extractor/indavideo.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -15,6 +10,14 @@ from ..utils import ( class IndavideoEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P<id>[\da-f]+)' + # Some example URLs covered by generic extractor: + # http://indavideo.hu/video/Vicces_cica_1 + # http://index.indavideo.hu/video/2015_0728_beregszasz + # http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko + # http://erotika.indavideo.hu/video/Amator_tini_punci + # http://film.indavideo.hu/video/f_hrom_nagymamm_volt + # http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)'] _TESTS = [{ 'url': 'http://indavideo.hu/player/video/1bdc3c6d80/', 'md5': 'c8a507a1c7410685f83a06eaeeaafeab', @@ -40,20 +43,6 @@ class IndavideoEmbedIE(InfoExtractor): 'only_matching': True, }] - # Some example URLs covered by generic extractor: - # http://indavideo.hu/video/Vicces_cica_1 - # http://index.indavideo.hu/video/2015_0728_beregszasz - # http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko - # http://erotika.indavideo.hu/video/Amator_tini_punci - # http://film.indavideo.hu/video/f_hrom_nagymamm_volt - # http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)', - webpage) - def _real_extract(self, url): video_id = self._match_id(url) @@ -100,7 +89,6 @@ class IndavideoEmbedIE(InfoExtractor): 'url': video_url, 'height': height, }) - self._sort_formats(formats) timestamp = video.get('date') if timestamp: diff --git a/hypervideo_dl/extractor/infoq.py b/hypervideo_dl/extractor/infoq.py index 347cc51..192bcfe 100644 --- a/hypervideo_dl/extractor/infoq.py +++ b/hypervideo_dl/extractor/infoq.py @@ -1,15 +1,13 @@ -# coding: utf-8 - -from __future__ import unicode_literals - from 
..compat import ( compat_b64decode, compat_urllib_parse_unquote, compat_urlparse, ) from ..utils import ( + ExtractorError, determine_ext, update_url_query, + traverse_obj, ) from .bokecc import BokeCCBaseIE @@ -38,6 +36,7 @@ class InfoQIE(BokeCCBaseIE): 'ext': 'flv', 'description': 'md5:308d981fb28fa42f49f9568322c683ff', }, + 'skip': 'Sorry, the page you visited does not exist', }, { 'url': 'https://www.infoq.com/presentations/Simple-Made-Easy', 'md5': '0e34642d4d9ef44bf86f66f6399672db', @@ -90,8 +89,10 @@ class InfoQIE(BokeCCBaseIE): }] def _extract_http_audio(self, webpage, video_id): - fields = self._form_hidden_inputs('mp3Form', webpage) - http_audio_url = fields.get('filename') + try: + http_audio_url = traverse_obj(self._form_hidden_inputs('mp3Form', webpage), 'filename') + except ExtractorError: + http_audio_url = None if not http_audio_url: return [] @@ -127,8 +128,6 @@ class InfoQIE(BokeCCBaseIE): + self._extract_http_video(webpage) + self._extract_http_audio(webpage, video_id)) - self._sort_formats(formats) - return { 'id': video_id, 'title': video_title, diff --git a/hypervideo_dl/extractor/instagram.py b/hypervideo_dl/extractor/instagram.py index 970f2c8..0233513 100644 --- a/hypervideo_dl/extractor/instagram.py +++ b/hypervideo_dl/extractor/instagram.py @@ -1,19 +1,17 @@ -# coding: utf-8 - -import itertools import hashlib +import itertools import json import re import time +import urllib.error from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, -) from ..utils import ( ExtractorError, - format_field, + decode_base_n, + encode_base_n, float_or_none, + format_field, get_element_by_attribute, int_or_none, lowercase_escape, @@ -24,42 +22,59 @@ from ..utils import ( urlencode_postdata, ) +_ENCODING_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' + + +def _pk_to_id(id): + """Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id""" + return encode_base_n(int(id.split('_')[0]), table=_ENCODING_CHARS) + + +def _id_to_pk(shortcode): + """Covert a shortcode to a numeric value""" + return decode_base_n(shortcode[:11], table=_ENCODING_CHARS) + class InstagramBaseIE(InfoExtractor): _NETRC_MACHINE = 'instagram' _IS_LOGGED_IN = False + _API_BASE_URL = 'https://i.instagram.com/api/v1' + _LOGIN_URL = 'https://www.instagram.com/accounts/login' + _API_HEADERS = { + 'X-IG-App-ID': '936619743392459', + 'X-ASBD-ID': '198387', + 'X-IG-WWW-Claim': '0', + 'Origin': 'https://www.instagram.com', + 'Accept': '*/*', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36', + } + def _perform_login(self, username, password): if self._IS_LOGGED_IN: return login_webpage = self._download_webpage( - 'https://www.instagram.com/accounts/login/', None, - note='Downloading login webpage', errnote='Failed to download login webpage') + self._LOGIN_URL, None, note='Downloading login webpage', errnote='Failed to download login webpage') - shared_data = self._parse_json( - self._search_regex( - r'window\._sharedData\s*=\s*({.+?});', - login_webpage, 'shared data', default='{}'), - None) - - login = self._download_json('https://www.instagram.com/accounts/login/ajax/', None, note='Logging in', headers={ - 'Accept': '*/*', - 'X-IG-App-ID': '936619743392459', - 'X-ASBD-ID': '198387', - 'X-IG-WWW-Claim': '0', - 'X-Requested-With': 'XMLHttpRequest', - 'X-CSRFToken': shared_data['config']['csrf_token'], - 'X-Instagram-AJAX': shared_data['rollout_hash'], - 
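_pk_to_id and _id_to_pk above are plain base-64 conversions over Instagram's URL-safe alphabet, the same divmod arithmetic that the removed InstagramIOSIE._get_id spelled out by hand. A dependency-free sketch of both directions, round-tripped on a shortcode taken from the tests below:

_ENCODING_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'

def pk_to_shortcode(pk):
    # repeatedly divide by 64, mapping each remainder onto the alphabet
    n, out = int(str(pk).split('_')[0]), ''
    while n > 0:
        n, r = divmod(n, 64)
        out = _ENCODING_CHARS[r] + out
    return out

def shortcode_to_pk(shortcode):
    # convert a shortcode back to its numeric media pk (the first 11 chars carry it)
    n = 0
    for char in shortcode[:11]:
        n = n * 64 + _ENCODING_CHARS.index(char)
    return n

assert pk_to_shortcode(shortcode_to_pk('aye83DjauH')) == 'aye83DjauH'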
'Referer': 'https://www.instagram.com/', - }, data=urlencode_postdata({ - 'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}', - 'username': username, - 'queryParams': '{}', - 'optIntoOneTap': 'false', - 'stopDeletionNonce': '', - 'trustedDeviceRecords': '{}', - })) + shared_data = self._parse_json(self._search_regex( + r'window\._sharedData\s*=\s*({.+?});', login_webpage, 'shared data', default='{}'), None) + + login = self._download_json( + f'{self._LOGIN_URL}/ajax/', None, note='Logging in', headers={ + **self._API_HEADERS, + 'X-Requested-With': 'XMLHttpRequest', + 'X-CSRFToken': shared_data['config']['csrf_token'], + 'X-Instagram-AJAX': shared_data['rollout_hash'], + 'Referer': 'https://www.instagram.com/', + }, data=urlencode_postdata({ + 'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}', + 'username': username, + 'queryParams': '{}', + 'optIntoOneTap': 'false', + 'stopDeletionNonce': '', + 'trustedDeviceRecords': '{}', + })) if not login.get('authenticated'): if login.get('message'): @@ -124,7 +139,7 @@ class InstagramBaseIE(InfoExtractor): } def _extract_product_media(self, product_media): - media_id = product_media.get('code') or product_media.get('id') + media_id = product_media.get('code') or _pk_to_id(product_media.get('pk')) vcodec = product_media.get('video_codec') dash_manifest_raw = product_media.get('video_dash_manifest') videos_list = product_media.get('video_versions') @@ -140,7 +155,6 @@ class InstagramBaseIE(InfoExtractor): } for format in videos_list or []] if dash_manifest_raw: formats.extend(self._parse_mpd_formats(self._parse_xml(dash_manifest_raw, media_id), mpd_id='dash')) - self._sort_formats(formats) thumbnails = [{ 'url': thumbnail.get('url'), @@ -160,7 +174,7 @@ class InstagramBaseIE(InfoExtractor): user_info = product_info.get('user') or {} info_dict = { - 'id': product_info.get('code') or product_info.get('id'), + 'id': _pk_to_id(traverse_obj(product_info, 'pk', 'id', expected_type=str_or_none)[:19]), 'title': product_info.get('title') or f'Video by {user_info.get("username")}', 'description': traverse_obj(product_info, ('caption', 'text'), expected_type=str_or_none), 'timestamp': int_or_none(product_info.get('taken_at')), @@ -170,6 +184,7 @@ class InstagramBaseIE(InfoExtractor): 'view_count': int_or_none(product_info.get('view_count')), 'like_count': int_or_none(product_info.get('like_count')), 'comment_count': int_or_none(product_info.get('comment_count')), + '__post_extractor': self.extract_comments(_pk_to_id(product_info.get('pk'))), 'http_headers': { 'Referer': 'https://www.instagram.com/', } @@ -191,6 +206,23 @@ class InstagramBaseIE(InfoExtractor): **self._extract_product_media(product_info) } + def _get_comments(self, video_id): + comments_info = self._download_json( + f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/comments/?can_support_threading=true&permalink_enabled=false', video_id, + fatal=False, errnote='Comments extraction failed', note='Downloading comments info', headers=self._API_HEADERS) or {} + + comment_data = traverse_obj(comments_info, ('edge_media_to_parent_comment', 'edges'), 'comments') + for comment_dict in comment_data or []: + yield { + 'author': traverse_obj(comment_dict, ('node', 'owner', 'username'), ('user', 'username')), + 'author_id': traverse_obj(comment_dict, ('node', 'owner', 'id'), ('user', 'pk')), + 'author_thumbnail': traverse_obj(comment_dict, ('node', 'owner', 'profile_pic_url'), ('user', 'profile_pic_url'), expected_type=url_or_none), + 'id': 
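Every field in _get_comments lists two traverse_obj paths because the comments arrive either as GraphQL edges (node.owner...) or in the mobile-API shape (user...); traverse_obj returns the first path that resolves, so one loop body serves both. For instance, with invented sample dicts:

from hypervideo_dl.utils import traverse_obj

graphql_style = {'node': {'owner': {'username': 'naomipq', 'id': '2815873'}}}
api_style = {'user': {'username': 'naomipq', 'pk': '2815873'}}

for comment in (graphql_style, api_style):
    print(traverse_obj(comment, ('node', 'owner', 'username'), ('user', 'username')),
          traverse_obj(comment, ('node', 'owner', 'id'), ('user', 'pk')))
# naomipq 2815873 (printed for both shapes)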
traverse_obj(comment_dict, ('node', 'id'), 'pk'), + 'text': traverse_obj(comment_dict, ('node', 'text'), 'text'), + 'like_count': traverse_obj(comment_dict, ('node', 'edge_liked_by', 'count'), 'comment_like_count', expected_type=int_or_none), + 'timestamp': traverse_obj(comment_dict, ('node', 'created_at'), 'created_at', expected_type=int_or_none), + } + class InstagramIOSIE(InfoExtractor): IE_DESC = 'IOS instagram:// URL' @@ -216,27 +248,14 @@ class InstagramIOSIE(InfoExtractor): 'add_ie': ['Instagram'] }] - def _get_id(self, id): - """Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id""" - chrs = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' - media_id = int(id.split('_')[0]) - shortened_id = '' - while media_id > 0: - r = media_id % 64 - media_id = (media_id - r) // 64 - shortened_id = chrs[r] + shortened_id - return shortened_id - def _real_extract(self, url): - return { - '_type': 'url_transparent', - 'url': f'http://instagram.com/tv/{self._get_id(self._match_id(url))}/', - 'ie_key': 'Instagram', - } + video_id = _pk_to_id(self._match_id(url)) + return self.url_result(f'http://instagram.com/tv/{video_id}', InstagramIE, video_id) class InstagramIE(InstagramBaseIE): _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com(?:/[^/]+)?/(?:p|tv|reel)/(?P<id>[^/?#&]+))' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1'] _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'md5': '0d2da106a9d2631273e192b372806516', @@ -246,7 +265,7 @@ class InstagramIE(InstagramBaseIE): 'title': 'Video by naomipq', 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 0, + 'duration': 8.747, 'timestamp': 1371748545, 'upload_date': '20130620', 'uploader_id': '2815873', @@ -256,27 +275,34 @@ class InstagramIE(InstagramBaseIE): 'comment_count': int, 'comments': list, }, + 'expected_warnings': [ + 'General metadata extraction failed', + 'Main webpage is locked behind the login page', + ], }, { - # missing description - 'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears', + # reel + 'url': 'https://www.instagram.com/reel/Chunk8-jurw/', + 'md5': 'f6d8277f74515fa3ff9f5791426e42b1', 'info_dict': { - 'id': 'BA-pQFBG8HZ', + 'id': 'Chunk8-jurw', 'ext': 'mp4', - 'title': 'Video by britneyspears', + 'title': 'Video by instagram', + 'description': 'md5:c9cde483606ed6f80fbe9283a6a2b290', 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 0, - 'timestamp': 1453760977, - 'upload_date': '20160125', - 'uploader_id': '12246775', - 'uploader': 'Britney Spears', - 'channel': 'britneyspears', + 'duration': 5.016, + 'timestamp': 1661529231, + 'upload_date': '20220826', + 'uploader_id': '25025320', + 'uploader': 'Instagram', + 'channel': 'instagram', 'like_count': int, 'comment_count': int, 'comments': list, }, - 'params': { - 'skip_download': True, - }, + 'expected_warnings': [ + 'General metadata extraction failed', + 'Main webpage is locked behind the login page', + ], }, { # multi video post 'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/', @@ -285,18 +311,24 @@ class InstagramIE(InstagramBaseIE): 'id': 'BQ0dSaohpPW', 'ext': 'mp4', 'title': 'Video 1', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, }, }, { 'info_dict': { 'id': 'BQ0dTpOhuHT', 'ext': 'mp4', 'title': 'Video 2', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, }, }, { 'info_dict': { 'id': 'BQ0dT7RBFeF', 'ext': 'mp4', 'title': 
'Video 3', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, }, }], 'info_dict': { @@ -304,6 +336,10 @@ class InstagramIE(InstagramBaseIE): 'title': 'Post by instagram', 'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957', }, + 'expected_warnings': [ + 'General metadata extraction failed', + 'Main webpage is locked behind the login page', + ], }, { # IGTV 'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/', @@ -322,7 +358,11 @@ class InstagramIE(InstagramBaseIE): 'comment_count': int, 'comments': list, 'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.', - } + }, + 'expected_warnings': [ + 'General metadata extraction failed', + 'Main webpage is locked behind the login page', + ], }, { 'url': 'https://instagram.com/p/-Cmh1cukG2/', 'only_matching': True, @@ -340,59 +380,88 @@ class InstagramIE(InstagramBaseIE): 'only_matching': True, }] - @staticmethod - def _extract_embed_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1', - webpage) - if mobj: - return mobj.group('url') - - blockquote_el = get_element_by_attribute( - 'class', 'instagram-media', webpage) - if blockquote_el is None: - return + @classmethod + def _extract_embed_urls(cls, url, webpage): + res = tuple(super()._extract_embed_urls(url, webpage)) + if res: + return res - mobj = re.search( - r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el) + mobj = re.search(r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', + get_element_by_attribute('class', 'instagram-media', webpage) or '') if mobj: - return mobj.group('link') + return [mobj.group('link')] def _real_extract(self, url): video_id, url = self._match_valid_url(url).group('id', 'url') - webpage, urlh = self._download_webpage_handle(url, video_id) - if 'www.instagram.com/accounts/login' in urlh.geturl(): - self.report_warning('Main webpage is locked behind the login page. 
' - 'Retrying with embed webpage (Note that some metadata might be missing)') - webpage = self._download_webpage( - 'https://www.instagram.com/p/%s/embed/' % video_id, video_id, note='Downloading embed webpage') - - shared_data = self._parse_json( - self._search_regex( - r'window\._sharedData\s*=\s*({.+?});', - webpage, 'shared data', default='{}'), - video_id, fatal=False) - media = traverse_obj( - shared_data, - ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'), - ('entry_data', 'PostPage', 0, 'media'), - expected_type=dict) - - # _sharedData.entry_data.PostPage is empty when authenticated (see - # https://github.com/ytdl-org/youtube-dl/pull/22880) - if not media: - additional_data = self._parse_json( - self._search_regex( - r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\);', - webpage, 'additional data', default='{}'), - video_id, fatal=False) - product_item = traverse_obj(additional_data, ('items', 0), expected_type=dict) - if product_item: - return self._extract_product(product_item) - media = traverse_obj(additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {} - - if not media and 'www.instagram.com/accounts/login' in urlh.geturl(): - self.raise_login_required('You need to log in to access this content') + media, webpage = {}, '' + + if self._get_cookies(url).get('sessionid'): + info = traverse_obj(self._download_json( + f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/info/', video_id, + fatal=False, errnote='Video info extraction failed', + note='Downloading video info', headers=self._API_HEADERS), ('items', 0)) + if info: + media.update(info) + return self._extract_product(media) + + api_check = self._download_json( + f'{self._API_BASE_URL}/web/get_ruling_for_content/?content_type=MEDIA&target_id={_id_to_pk(video_id)}', + video_id, headers=self._API_HEADERS, fatal=False, note='Setting up session', errnote=False) or {} + csrf_token = self._get_cookies('https://www.instagram.com').get('csrftoken') + + if not csrf_token: + self.report_warning('No csrf token set by Instagram API', video_id) + else: + csrf_token = csrf_token.value if api_check.get('status') == 'ok' else None + if not csrf_token: + self.report_warning('Instagram API is not granting access', video_id) + + variables = { + 'shortcode': video_id, + 'child_comment_count': 3, + 'fetch_comment_count': 40, + 'parent_comment_count': 24, + 'has_threaded_comments': True, + } + general_info = self._download_json( + 'https://www.instagram.com/graphql/query/', video_id, fatal=False, errnote=False, + headers={ + **self._API_HEADERS, + 'X-CSRFToken': csrf_token or '', + 'X-Requested-With': 'XMLHttpRequest', + 'Referer': url, + }, query={ + 'query_hash': '9f8827793ef34641b2fb195d4d41151c', + 'variables': json.dumps(variables, separators=(',', ':')), + }) + media.update(traverse_obj(general_info, ('data', 'shortcode_media')) or {}) + + if not general_info: + self.report_warning('General metadata extraction failed (some metadata might be missing).', video_id) + webpage, urlh = self._download_webpage_handle(url, video_id) + shared_data = self._search_json( + r'window\._sharedData\s*=', webpage, 'shared data', video_id, fatal=False) or {} + + if shared_data and self._LOGIN_URL not in urlh.geturl(): + media.update(traverse_obj( + shared_data, ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'), + ('entry_data', 'PostPage', 0, 'media'), expected_type=dict) or {}) + else: + self.report_warning('Main webpage is locked behind the login page. 
Retrying with embed webpage (some metadata might be missing).') + webpage = self._download_webpage( + f'{url}/embed/', video_id, note='Downloading embed webpage', fatal=False) + additional_data = self._search_json( + r'window\.__additionalDataLoaded\s*\(\s*[^,]+,', webpage, 'additional data', video_id, fatal=False) + if not additional_data and not media: + self.raise_login_required('Requested content is not available, rate-limit reached or login required') + + product_item = traverse_obj(additional_data, ('items', 0), expected_type=dict) + if product_item: + media.update(product_item) + return self._extract_product(media) + + media.update(traverse_obj( + additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {}) username = traverse_obj(media, ('owner', 'username')) or self._search_regex( r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'username', fatal=False) @@ -412,7 +481,7 @@ class InstagramIE(InstagramBaseIE): if nodes: return self.playlist_result( self._extract_nodes(nodes, True), video_id, - format_field(username, template='Post by %s'), description) + format_field(username, None, 'Post by %s'), description) video_url = self._og_search_video_url(webpage, secure=False) @@ -424,7 +493,6 @@ class InstagramIE(InstagramBaseIE): dash = traverse_obj(media, ('dash_info', 'video_dash_manifest')) if dash: formats.extend(self._parse_mpd_formats(self._parse_xml(dash, video_id), mpd_id='dash')) - self._sort_formats(formats) comment_data = traverse_obj(media, ('edge_media_to_parent_comment', 'edges')) comments = [{ @@ -521,7 +589,7 @@ class InstagramPlaylistBaseIE(InstagramBaseIE): except ExtractorError as e: # if it's an error caused by a bad query, and there are # more GIS templates to try, ignore it and keep trying - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403: if gis_tmpl != gis_tmpls[-1]: continue raise @@ -631,41 +699,32 @@ class InstagramStoryIE(InstagramBaseIE): def _real_extract(self, url): username, story_id = self._match_valid_url(url).groups() - - story_info_url = f'{username}/{story_id}/?__a=1' if username == 'highlights' else f'{username}/?__a=1' - story_info = self._download_json(f'https://www.instagram.com/stories/{story_info_url}', story_id, headers={ - 'X-IG-App-ID': 936619743392459, - 'X-ASBD-ID': 198387, - 'X-IG-WWW-Claim': 0, - 'X-Requested-With': 'XMLHttpRequest', - 'Referer': url, - }) - user_id = story_info['user']['id'] - highlight_title = traverse_obj(story_info, ('highlight', 'title')) + story_info = self._download_webpage(url, story_id) + user_info = self._search_json(r'"user":', story_info, 'user info', story_id, fatal=False) + if not user_info: + self.raise_login_required('This content is unreachable') + user_id = user_info.get('id') story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}' - videos = self._download_json(f'https://i.instagram.com/api/v1/feed/reels_media/?reel_ids={story_info_url}', story_id, headers={ - 'X-IG-App-ID': 936619743392459, - 'X-ASBD-ID': 198387, - 'X-IG-WWW-Claim': 0, - })['reels'] - - full_name = traverse_obj(videos, ('user', 'full_name')) - - user_info = {} - if not (username and username != 'highlights' and full_name): - user_info = self._download_json( - f'https://i.instagram.com/api/v1/users/{user_id}/info/', story_id, headers={ - 'User-Agent': 'Mozilla/5.0 (Linux; Android 11; SM-A505F Build/RP1A.200720.012; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 
Chrome/96.0.4664.45 Mobile Safari/537.36 Instagram 214.1.0.29.120 Android (30/11; 450dpi; 1080x2122; samsung; SM-A505F; a50; exynos9610; en_US; 333717274)', - }, note='Downloading user info') + videos = traverse_obj(self._download_json( + f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}', + story_id, errnote=False, fatal=False, headers=self._API_HEADERS), 'reels') + if not videos: + self.raise_login_required('You need to log in to access this content') - username = traverse_obj(user_info, ('user', 'username')) or username - full_name = traverse_obj(user_info, ('user', 'full_name')) or full_name + full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (str(user_id), 'user', 'full_name')) + story_title = traverse_obj(videos, (f'highlight:{story_id}', 'title')) + if not story_title: + story_title = f'Story by {username}' highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (str(user_id), 'items')) - return self.playlist_result([{ - **self._extract_product(highlight), - 'title': f'Story by {username}', - 'uploader': full_name, - 'uploader_id': user_id, - } for highlight in highlights], playlist_id=story_id, playlist_title=highlight_title) + info_data = [] + for highlight in highlights: + highlight_data = self._extract_product(highlight) + if highlight_data.get('formats'): + info_data.append({ + **highlight_data, + 'uploader': full_name, + 'uploader_id': user_id, + }) + return self.playlist_result(info_data, playlist_id=story_id, playlist_title=story_title) diff --git a/hypervideo_dl/extractor/internazionale.py b/hypervideo_dl/extractor/internazionale.py index 45e2af6..1b1cb57 100644 --- a/hypervideo_dl/extractor/internazionale.py +++ b/hypervideo_dl/extractor/internazionale.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import unified_timestamp @@ -63,7 +60,6 @@ class InternazionaleIE(InfoExtractor): entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) formats.extend(self._extract_mpd_formats( video_base + 'mpd', display_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) timestamp = unified_timestamp(self._html_search_meta( 'article:published_time', webpage, 'timestamp')) diff --git a/hypervideo_dl/extractor/internetvideoarchive.py b/hypervideo_dl/extractor/internetvideoarchive.py index 880918c..9d2574c 100644 --- a/hypervideo_dl/extractor/internetvideoarchive.py +++ b/hypervideo_dl/extractor/internetvideoarchive.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json import re @@ -50,7 +48,6 @@ class InternetVideoArchiveIE(InfoExtractor): replace_url('.mpd'), video_id, mpd_id='dash', fatal=False)) formats.extend(self._extract_ism_formats( replace_url('Manifest'), video_id, ism_id='mss', fatal=False)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/iprima.py b/hypervideo_dl/extractor/iprima.py index 1a20384..1818205 100644 --- a/hypervideo_dl/extractor/iprima.py +++ b/hypervideo_dl/extractor/iprima.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re import time @@ -151,9 +148,8 @@ class IPrimaIE(InfoExtractor): elif manifest_type == 'DASH' or ext == 'mpd': formats += self._extract_mpd_formats( manifest_url, video_id, mpd_id='dash', fatal=False) - self._sort_formats(formats) - final_result = self._search_json_ld(webpage, video_id) or {} + final_result = self._search_json_ld(webpage, video_id, default={}) final_result.update({ 'id': video_id, 
'title': title, @@ -251,8 +247,6 @@ class IPrimaCNNIE(InfoExtractor): if not formats and '>GEO_IP_NOT_ALLOWED<' in playerpage: self.raise_geo_restricted(countries=['CZ'], metadata_available=True) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/iqiyi.py b/hypervideo_dl/extractor/iqiyi.py index d07b39d..c41f6db 100644 --- a/hypervideo_dl/extractor/iqiyi.py +++ b/hypervideo_dl/extractor/iqiyi.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import hashlib import itertools import re @@ -218,7 +215,6 @@ class IqiyiIE(InfoExtractor): self._sleep(5, video_id) - self._sort_formats(formats) title = (get_element_by_id('widget-videotitle', webpage) or clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)) or self._html_search_regex(r'<span[^>]+data-videochanged-title="word"[^>]*>([^<]+)</span>', webpage, 'title')) @@ -274,6 +270,7 @@ class IqIE(InfoExtractor): '1': 'zh_CN', '2': 'zh_TW', '3': 'en', + '4': 'kor', '18': 'th', '21': 'my', '23': 'vi', @@ -354,7 +351,7 @@ class IqIE(InfoExtractor): ''' def _extract_vms_player_js(self, webpage, video_id): - player_js_cache = self._downloader.cache.load('iq', 'player_js') + player_js_cache = self.cache.load('iq', 'player_js') if player_js_cache: return player_js_cache webpack_js_url = self._proto_relative_url(self._search_regex( @@ -367,7 +364,7 @@ class IqIE(InfoExtractor): f'https://stc.iqiyipic.com/_next/static/chunks/{webpack_map1.get(module_index, module_index)}.{webpack_map2[module_index]}.js', video_id, note=f'Downloading #{module_index} module JS', errnote='Unable to download module JS', fatal=False) or '' if 'vms request' in module_js: - self._downloader.cache.store('iq', 'player_js', module_js) + self.cache.store('iq', 'player_js', module_js) return module_js raise ExtractorError('Unable to extract player JS') @@ -420,8 +417,9 @@ class IqIE(InfoExtractor): ut_list = ['0'] # bid 0 as an initial format checker - dash_paths = self._parse_json(PhantomJSwrapper(self).get( - url, html='<!DOCTYPE html>', video_id=video_id, note2='Executing signature code', jscode=self._DASH_JS % { + dash_paths = self._parse_json(PhantomJSwrapper(self, timeout=120_000).get( + url, note2='Executing signature code (this may take a couple minutes)', + html='<!DOCTYPE html>', video_id=video_id, jscode=self._DASH_JS % { 'tvid': video_info['tvId'], 'vid': video_info['vid'], 'src': traverse_obj(next_props, ('initialProps', 'pageProps', 'ptid'), @@ -443,7 +441,7 @@ class IqIE(InfoExtractor): preview_time = traverse_obj( initial_format_data, ('boss_ts', (None, 'data'), ('previewTime', 'rtime')), expected_type=float_or_none, get_all=False) if traverse_obj(initial_format_data, ('boss_ts', 'data', 'prv'), expected_type=int_or_none): - self.report_warning('This preview video is limited%s' % format_field(preview_time, template=' to %s seconds')) + self.report_warning('This preview video is limited%s' % format_field(preview_time, None, ' to %s seconds')) # TODO: Extract audio-only formats for bid in set(traverse_obj(initial_format_data, ('program', 'video', ..., 'bid'), expected_type=str_or_none, default=[])): @@ -498,8 +496,6 @@ class IqIE(InfoExtractor): }) formats.extend(extracted_formats) - self._sort_formats(formats) - for sub_format in traverse_obj(initial_format_data, ('program', 'stl', ...), expected_type=dict, default=[]): lang = self._LID_TAGS.get(str_or_none(sub_format.get('lid')), sub_format.get('_name')) subtitles.setdefault(lang, []).extend([{ diff --git 
a/hypervideo_dl/extractor/ir90tv.py b/hypervideo_dl/extractor/ir90tv.py deleted file mode 100644 index d5a3f6f..0000000 --- a/hypervideo_dl/extractor/ir90tv.py +++ /dev/null @@ -1,42 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import remove_start - - -class Ir90TvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?90tv\.ir/video/(?P<id>[0-9]+)/.*' - _TESTS = [{ - 'url': 'http://90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218', - 'md5': '411dbd94891381960cb9e13daa47a869', - 'info_dict': { - 'id': '95719', - 'ext': 'mp4', - 'title': 'شایعات نقل و انتقالات مهم فوتبال اروپا 94/02/18', - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, { - 'url': 'http://www.90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = remove_start(self._html_search_regex( - r'<title>([^<]+)', webpage, 'title'), '90tv.ir :: ') - - video_url = self._search_regex( - r'<video[^>]+src="([^"]+)"', webpage, 'video url') - - thumbnail = self._search_regex(r'poster="([^"]+)"', webpage, 'thumbnail url', fatal=False) - - return { - 'url': video_url, - 'id': video_id, - 'title': title, - 'video_url': video_url, - 'thumbnail': thumbnail, - } diff --git a/hypervideo_dl/extractor/islamchannel.py b/hypervideo_dl/extractor/islamchannel.py new file mode 100644 index 0000000..253a846 --- /dev/null +++ b/hypervideo_dl/extractor/islamchannel.py @@ -0,0 +1,81 @@ +import re + +from .common import InfoExtractor +from ..utils import traverse_obj, urljoin + + +class IslamChannelIE(InfoExtractor): + _VALID_URL = r'https?://watch\.islamchannel\.tv/watch/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://watch.islamchannel.tv/watch/38604310', + 'info_dict': { + 'id': '38604310', + 'title': 'Omar - Young Omar', + 'description': 'md5:5cc7ddecef064ea7afe52eb5e0e33b55', + 'thumbnail': r're:https?://.+', + 'ext': 'mp4', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + thumbnail = self._search_regex( + r'data-poster="([^"]+)"', webpage, 'data poster', fatal=False) or \ + self._html_search_meta(('og:image', 'twitter:image'), webpage) + + headers = { + 'Token': self._search_regex(r'data-token="([^"]+)"', webpage, 'data token'), + 'Token-Expiry': self._search_regex(r'data-expiry="([^"]+)"', webpage, 'data expiry'), + 'Uvid': video_id, + } + show_stream = self._download_json( + f'https://v2-streams-elb.simplestreamcdn.com/api/show/stream/{video_id}', video_id, + query={ + 'key': self._search_regex(r'data-key="([^"]+)"', webpage, 'data key'), + 'platform': 'chrome', + }, headers=headers) + # TODO: show_stream['stream'] and show_stream['drm'] may contain something interesting + streams = self._download_json( + traverse_obj(show_stream, ('response', 'tokenization', 'url')), video_id, + headers=headers) + formats, subs = self._extract_m3u8_formats_and_subtitles(traverse_obj(streams, ('Streams', 'Adaptive')), video_id, 'mp4') + + return { + 'id': video_id, + 'title': self._html_search_meta(('og:title', 'twitter:title'),
webpage), + 'description': self._html_search_meta(('og:description', 'twitter:description', 'description'), webpage), + 'formats': formats, + 'subtitles': subs, + 'thumbnails': [{ + 'id': 'unscaled', + 'url': thumbnail.split('?')[0], + 'ext': 'jpg', + 'preference': 2, + }, { + 'id': 'orig', + 'url': thumbnail, + 'ext': 'jpg', + 'preference': 1, + }] if thumbnail else None, + } + + +class IslamChannelSeriesIE(InfoExtractor): + _VALID_URL = r'https?://watch\.islamchannel\.tv/series/(?P<id>[a-f\d-]+)' + _TESTS = [{ + 'url': 'https://watch.islamchannel.tv/series/a6cccef3-3ef1-11eb-bc19-06b69c2357cd', + 'info_dict': { + 'id': 'a6cccef3-3ef1-11eb-bc19-06b69c2357cd', + }, + 'playlist_mincount': 31, + }] + + def _real_extract(self, url): + pl_id = self._match_id(url) + webpage = self._download_webpage(url, pl_id) + + return self.playlist_from_matches( + re.finditer(r'<a\s+href="(/watch/\d+)"[^>]+?data-video-type="show">', webpage), + pl_id, getter=lambda x: urljoin(url, x.group(1)), ie=IslamChannelIE) diff --git a/hypervideo_dl/extractor/israelnationalnews.py b/hypervideo_dl/extractor/israelnationalnews.py new file mode 100644 index 0000000..35040f5 --- /dev/null +++ b/hypervideo_dl/extractor/israelnationalnews.py @@ -0,0 +1,50 @@ +from .common import InfoExtractor +from ..utils import ExtractorError, traverse_obj + + +class IsraelNationalNewsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?israelnationalnews\.com/news/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.israelnationalnews.com/news/354520', + 'info_dict': { + 'id': '354520' + }, + 'playlist': [{ + 'info_dict': { + 'id': 'jA84wQhVvg8', + 'title': 'Even CNN Host Is Shocked by How Bad Biden\'s Approval Ratings Have Gotten | DM CLIPS | Rubin Report', + 'ext': 'mp4', + 'description': 'md5:b7325a3d00c7596337dc3ae37e32d35c', + 'channel': 'The Rubin Report', + 'channel_follower_count': int, + 'comment_count': int, + 'categories': ['News & Politics'], + 'like_count': int, + 'uploader_url': 'http://www.youtube.com/user/RubinReport', + 'uploader_id': 'RubinReport', + 'availability': 'public', + 'view_count': int, + 'duration': 240, + 'thumbnail': 'https://i.ytimg.com/vi_webp/jA84wQhVvg8/maxresdefault.webp', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'age_limit': 0, + 'tags': 'count:29', + 'channel_id': 'UCJdKr0Bgd_5saZYqLCa9mng', + 'channel_url': 'https://www.youtube.com/channel/UCJdKr0Bgd_5saZYqLCa9mng', + 'upload_date': '20220606', + 'uploader': 'The Rubin Report', + } + }] + }] + + def _real_extract(self, url): + news_article_id = self._match_id(url) + article_json = self._download_json( + f'https://www.israelnationalnews.com/Generic/NewAPI/Item?type=0&Item={news_article_id}', news_article_id) + + urls = traverse_obj(article_json, ('Content2', ..., 'content', ..., 'attrs', 'src')) + if not urls: + raise ExtractorError('This article does not have any videos', expected=True) + + return self.playlist_from_matches(urls, news_article_id, ie='Youtube') diff --git a/hypervideo_dl/extractor/itprotv.py b/hypervideo_dl/extractor/itprotv.py index 64cb4e6..4ac1260 100644 --- a/hypervideo_dl/extractor/itprotv.py +++ b/hypervideo_dl/extractor/itprotv.py @@ -1,5 +1,3 @@ -# coding: utf-8 - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/itv.py b/hypervideo_dl/extractor/itv.py index 66705a2..0681050 100644 --- a/hypervideo_dl/extractor/itv.py +++ b/hypervideo_dl/extractor/itv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -175,7 +172,6 @@ class
ITVIE(InfoExtractor): formats.append({ 'url': href, }) - self._sort_formats(formats) info = self._search_json_ld(webpage, video_id, default={}) if not info: json_ld = self._parse_json(self._search_regex( diff --git a/hypervideo_dl/extractor/ivi.py b/hypervideo_dl/extractor/ivi.py index 098ab66..27a222a 100644 --- a/hypervideo_dl/extractor/ivi.py +++ b/hypervideo_dl/extractor/ivi.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re @@ -16,6 +13,7 @@ class IviIE(InfoExtractor): IE_DESC = 'ivi.ru' IE_NAME = 'ivi' _VALID_URL = r'https?://(?:www\.)?ivi\.(?:ru|tv)/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P<id>\d+)' + _EMBED_REGEX = [r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1'] _GEO_BYPASS = False _GEO_COUNTRIES = ['RU'] _LIGHT_KEY = b'\xf1\x02\x32\xb7\xbc\x5c\x7a\xe8\xf7\x96\xc1\x33\x2b\x27\xa1\x8c' @@ -168,7 +166,6 @@ class IviIE(InfoExtractor): 'quality': quality(content_format), 'filesize': int_or_none(f.get('size_in_bytes')), }) - self._sort_formats(formats) compilation = result.get('compilation') episode = title if compilation else None diff --git a/hypervideo_dl/extractor/ivideon.py b/hypervideo_dl/extractor/ivideon.py index 44b2208..7d1e554 100644 --- a/hypervideo_dl/extractor/ivideon.py +++ b/hypervideo_dl/extractor/ivideon.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import ( compat_urllib_parse_urlencode, @@ -71,7 +67,6 @@ class IvideonIE(InfoExtractor): 'ext': 'flv', 'quality': quality(format_id), } for format_id in self._QUALITIES] - self._sort_formats(formats) return { 'id': server_id, diff --git a/hypervideo_dl/extractor/iwara.py b/hypervideo_dl/extractor/iwara.py index c0e01e3..ec3e59c 100644 --- a/hypervideo_dl/extractor/iwara.py +++ b/hypervideo_dl/extractor/iwara.py @@ -1,21 +1,29 @@ -# coding: utf-8 -from __future__ import unicode_literals +import itertools import re +import urllib.parse from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse from ..utils import ( int_or_none, mimetype2ext, remove_end, - url_or_none, - unified_strdate, strip_or_none, + unified_strdate, + url_or_none, + urljoin, ) -class IwaraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos/(?P<id>[a-zA-Z0-9]+)' +class IwaraBaseIE(InfoExtractor): + _BASE_REGEX = r'(?P<base_url>https?://(?:www\.|ecchi\.)?iwara\.tv)' + + def _extract_playlist(self, base_url, webpage): + for path in re.findall(r'class="title">\s*<a[^<]+href="([^"]+)', webpage): + yield self.url_result(urljoin(base_url, path)) + + +class IwaraIE(IwaraBaseIE): + _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/videos/(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'http://iwara.tv/videos/amVwUl1EHpAD9RD', # md5 is unstable @@ -60,7 +68,7 @@ class IwaraIE(InfoExtractor): webpage, urlh = self._download_webpage_handle(url, video_id) - hostname = compat_urllib_parse_urlparse(urlh.geturl()).hostname + hostname = urllib.parse.urlparse(urlh.geturl()).hostname # ecchi is 'sexy' in Japanese age_limit = 18 if hostname.split('.')[0] == 'ecchi' else 0 @@ -108,8 +116,6 @@ class IwaraIE(InfoExtractor): 'quality': 1 if format_id == 'Source' else 0, }) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, @@ -120,3 +126,114 @@ class IwaraIE(InfoExtractor): 'upload_date': upload_date, 'description': description, } + +
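+# The playlist-style extractors below all reuse IwaraBaseIE._extract_playlist(),
+# which scans a listing page for 'class="title"' video links and yields
+# url_result entries. A minimal consumer looks like this (illustrative sketch
+# only, mirroring the real _real_extract methods further down):
+#
+#     def _real_extract(self, url):
+#         playlist_id, base_url = self._match_valid_url(url).group('id', 'base_url')
+#         webpage = self._download_webpage(url, playlist_id)
+#         return self.playlist_result(self._extract_playlist(base_url, webpage), playlist_id)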
+class IwaraPlaylistIE(IwaraBaseIE): + _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/playlist/(?P<id>[^/?#&]+)' + IE_NAME = 'iwara:playlist' + + _TESTS = [{ + 'url': 'https://ecchi.iwara.tv/playlist/best-enf', + 'info_dict': { + 'title': 'Best enf', + 'uploader': 'Jared98112', + 'id': 'best-enf', + }, + 'playlist_mincount': 1097, + }, { + # urlencoded + 'url': 'https://ecchi.iwara.tv/playlist/%E3%83%97%E3%83%AC%E3%82%A4%E3%83%AA%E3%82%B9%E3%83%88-2', + 'info_dict': { + 'id': 'プレイリスト-2', + 'title': 'プレイリスト', + 'uploader': 'mainyu', + }, + 'playlist_mincount': 91, + }] + + def _real_extract(self, url): + playlist_id, base_url = self._match_valid_url(url).group('id', 'base_url') + playlist_id = urllib.parse.unquote(playlist_id) + webpage = self._download_webpage(url, playlist_id) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': self._html_search_regex(r'class="title"[^>]*>([^<]+)', webpage, 'title', fatal=False), + 'uploader': self._html_search_regex(r'

    ([^<]+)', webpage, 'uploader', fatal=False), + 'entries': self._extract_playlist(base_url, webpage), + } + + +class IwaraUserIE(IwaraBaseIE): + _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/users/(?P<id>[^/?#&]+)' + IE_NAME = 'iwara:user' + + _TESTS = [{ + 'note': 'number of all videos page is just 1 page. less than 40 videos', + 'url': 'https://ecchi.iwara.tv/users/infinityyukarip', + 'info_dict': { + 'title': 'Uploaded videos from Infinity_YukariP', + 'id': 'infinityyukarip', + 'uploader': 'Infinity_YukariP', + 'uploader_id': 'infinityyukarip', + }, + 'playlist_mincount': 39, + }, { + 'note': 'no even all videos page. probably less than 10 videos', + 'url': 'https://ecchi.iwara.tv/users/mmd-quintet', + 'info_dict': { + 'title': 'Uploaded videos from mmd quintet', + 'id': 'mmd-quintet', + 'uploader': 'mmd quintet', + 'uploader_id': 'mmd-quintet', + }, + 'playlist_mincount': 6, + }, { + 'note': 'has paging. more than 40 videos', + 'url': 'https://ecchi.iwara.tv/users/theblackbirdcalls', + 'info_dict': { + 'title': 'Uploaded videos from TheBlackbirdCalls', + 'id': 'theblackbirdcalls', + 'uploader': 'TheBlackbirdCalls', + 'uploader_id': 'theblackbirdcalls', + }, + 'playlist_mincount': 420, + }, { + 'note': 'foreign chars in URL. there must be foreign characters in URL', + 'url': 'https://ecchi.iwara.tv/users/ぶた丼', + 'info_dict': { + 'title': 'Uploaded videos from ぶた丼', + 'id': 'ぶた丼', + 'uploader': 'ぶた丼', + 'uploader_id': 'ぶた丼', + }, + 'playlist_mincount': 170, + }] + + def _entries(self, playlist_id, base_url): + webpage = self._download_webpage( + f'{base_url}/users/{playlist_id}', playlist_id) + videos_url = self._search_regex(r'<a href="(/users/[^/]+/videos)(?:\?[^"]+)?">', webpage, 'all videos url', default=None) + if not videos_url: + yield from self._extract_playlist(base_url, webpage) + return + + videos_url = urljoin(base_url, videos_url) + + for n in itertools.count(1): + page = self._download_webpage( + videos_url, playlist_id, note=f'Downloading playlist page {n}', + query={'page': str(n - 1)} if n > 1 else {}) + yield from self._extract_playlist( + base_url, page) + + if f'page={n}' not in page: + break +
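+    # Note on the paging above: the site's 'page' query parameter is 0-based
+    # (UI page 2 is requested with page=1), while n counts from 1, so the
+    # f'page={n}' check simply looks for a link to the next page and stops
+    # the loop once no such link is present.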
+ def _real_extract(self, url): + playlist_id, base_url = self._match_valid_url(url).group('id', 'base_url') + playlist_id = urllib.parse.unquote(playlist_id) + + return self.playlist_result( + self._entries(playlist_id, base_url), playlist_id) diff --git a/hypervideo_dl/extractor/ixigua.py b/hypervideo_dl/extractor/ixigua.py new file mode 100644 index 0000000..1f086d2 --- /dev/null +++ b/hypervideo_dl/extractor/ixigua.py @@ -0,0 +1,83 @@ +import base64 + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + get_element_by_id, + int_or_none, + js_to_json, + str_or_none, + traverse_obj, +) + + +class IxiguaIE(InfoExtractor): + _VALID_URL = r'https?://(?:\w+\.)?ixigua\.com/(?:video/)?(?P<id>\d+).+' + _TESTS = [{ + 'url': 'https://www.ixigua.com/6996881461559165471', + 'info_dict': { + 'id': '6996881461559165471', + 'ext': 'mp4', + 'title': '盲目涉水风险大,亲身示范高水位行车注意事项', + 'description': 'md5:8c82f46186299add4a1c455430740229', + 'tags': ['video_car'], + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'uploader': '懂车帝原创', + 'uploader_id': '6480145787', + 'thumbnail': r're:^https?://.+\.(avif|webp)', + 'timestamp': 1629088414, + 'duration': 1030, + } + }] + + def _get_json_data(self, webpage, video_id): + js_data = get_element_by_id('SSR_HYDRATED_DATA', webpage) + if not js_data: + if self._cookies_passed: + raise ExtractorError('Failed to get SSR_HYDRATED_DATA') + raise ExtractorError('Cookies (not necessarily logged in) are needed', expected=True) + + return self._parse_json( + js_data.replace('window._SSR_HYDRATED_DATA=', ''), video_id, transform_source=js_to_json) + + def _media_selector(self, json_data): + for path, override in ( + (('video_list', ), {}), + (('dynamic_video', 'dynamic_video_list'), {'acodec': 'none'}), + (('dynamic_video', 'dynamic_audio_list'), {'vcodec': 'none', 'ext': 'm4a'}), + ): + for media in traverse_obj(json_data, (..., *path, lambda _, v: v['main_url'])): + yield { + 'url': base64.b64decode(media['main_url']).decode(), + 'width': int_or_none(media.get('vwidth')), + 'height': int_or_none(media.get('vheight')), + 'fps': int_or_none(media.get('fps')), + 'vcodec': media.get('codec_type'), + 'format_id': str_or_none(media.get('quality_type')), + 'filesize': int_or_none(media.get('size')), + 'ext': 'mp4', + **override, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + json_data = self._get_json_data(webpage, video_id)['anyVideo']['gidInformation']['packerData']['video'] + + formats = list(self._media_selector(json_data.get('videoResource'))) + return { + 'id': video_id, + 'title': json_data.get('title'), + 'description': json_data.get('video_abstract'), + 'formats': formats, + 'like_count': json_data.get('video_like_count'), + 'duration': int_or_none(json_data.get('duration')), + 'tags': [json_data.get('tag')], + 'uploader_id': traverse_obj(json_data, ('user_info', 'user_id')), + 'uploader': traverse_obj(json_data, ('user_info', 'name')), + 'view_count': json_data.get('video_watch_count'), + 'dislike_count': json_data.get('video_unlike_count'), + 'timestamp': int_or_none(json_data.get('video_publish_time')), + } diff --git a/hypervideo_dl/extractor/izlesene.py b/hypervideo_dl/extractor/izlesene.py index f8fca6c..5cdf870 100644 --- a/hypervideo_dl/extractor/izlesene.py +++ b/hypervideo_dl/extractor/izlesene.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_str, @@ -81,7 +78,6 @@ class IzleseneIE(InfoExtractor): 'ext': ext, 'height': height, }) - self._sort_formats(formats) description = self._og_search_description(webpage, default=None) thumbnail = video.get('posterURL') or self._proto_relative_url( diff --git a/hypervideo_dl/extractor/jable.py b/hypervideo_dl/extractor/jable.py new file mode 100644 index 0000000..84c3225 --- /dev/null +++ b/hypervideo_dl/extractor/jable.py @@ -0,0 +1,103 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + InAdvancePagedList, + int_or_none, + orderedSet, + unified_strdate, +) + + +class JableIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?jable.tv/videos/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://jable.tv/videos/pppd-812/', + 'md5': 'f1537283a9bc073c31ff86ca35d9b2a6', + 'info_dict': { + 'id': 'pppd-812', + 'ext': 'mp4', + 'title': 'PPPD-812 只要表現好巨乳女教師吉根柚莉愛就獎勵學生們在白虎穴內射出精液', + 'description': 'md5:5b6d4199a854f62c5e56e26ccad19967', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 18, + 'like_count': int, + 'view_count': int, + }, + }, { + 'url': 'https://jable.tv/videos/apak-220/', + 'md5': '71f9239d69ced58ab74a816908847cc1', + 'info_dict': { + 'id': 'apak-220', + 'ext': 'mp4', + 'title': 'md5:5c3861b7cf80112a6e2b70bccf170824', + 'description': '', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 18, + 'like_count': int, + 'view_count': int, + 'upload_date': '20220319', + }, + }] + + def _real_extract(self, url): +
video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + formats = self._extract_m3u8_formats( + self._search_regex(r'var\s+hlsUrl\s*=\s*\'([^\']+)', webpage, 'hls_url'), video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage, default=''), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'formats': formats, + 'age_limit': 18, + 'upload_date': unified_strdate(self._search_regex( + r'class="inactive-color">\D+\s+(\d{4}-\d+-\d+)', webpage, 'upload_date', default=None)), + 'view_count': int_or_none(self._search_regex( + r'#icon-eye">\n*([\d ]+)', + webpage, 'view_count', default='').replace(' ', '')), + 'like_count': int_or_none(self._search_regex( + r'#icon-heart">(\d+)', webpage, 'like_count', default=None)), + } + + +class JablePlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?jable.tv/(?:categories|models|tags)/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://jable.tv/models/kaede-karen/', + 'info_dict': { + 'id': 'kaede-karen', + 'title': '楓カレン', + }, + 'playlist_count': 34, + }, { + 'url': 'https://jable.tv/categories/roleplay/', + 'only_matching': True, + }, { + 'url': 'https://jable.tv/tags/girl/', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + def page_func(page_num): + return [ + self.url_result(player_url, JableIE) + for player_url in orderedSet(re.findall( + r'href="(https://jable.tv/videos/[\w-]+/?)"', + self._download_webpage(url, playlist_id, query={ + 'mode': 'async', + 'from': page_num + 1, + 'function': 'get_block', + 'block_id': 'list_videos_common_videos_list', + }, note=f'Downloading page {page_num + 1}')))] + + return self.playlist_result( + InAdvancePagedList(page_func, int_or_none(self._search_regex( + r'from:(\d+)">[^<]+\s*»', webpage, 'last page number', default=1)), 24), + playlist_id, self._search_regex( + r'

    ([^<]+)', webpage, 'playlist title', default=None)) diff --git a/hypervideo_dl/extractor/jamendo.py b/hypervideo_dl/extractor/jamendo.py index 755d970..a2bbba3 100644 --- a/hypervideo_dl/extractor/jamendo.py +++ b/hypervideo_dl/extractor/jamendo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import hashlib import random @@ -31,10 +28,11 @@ class JamendoIE(InfoExtractor): 'ext': 'flac', # 'title': 'Maya Filipič - Stories from Emona I', 'title': 'Stories from Emona I', - # 'artist': 'Maya Filipič', + 'artist': 'Maya Filipič', + 'album': 'Between two worlds', 'track': 'Stories from Emona I', 'duration': 210, - 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnail': 'https://usercontent.jamendo.com?type=album&id=29279&width=300&trackid=196219', 'timestamp': 1217438117, 'upload_date': '20080730', 'license': 'by-nc-nd', @@ -48,11 +46,11 @@ class JamendoIE(InfoExtractor): 'only_matching': True, }] - def _call_api(self, resource, resource_id): + def _call_api(self, resource, resource_id, fatal=True): path = '/api/%ss' % resource rand = compat_str(random.random()) return self._download_json( - 'https://www.jamendo.com' + path, resource_id, query={ + 'https://www.jamendo.com' + path, resource_id, fatal=fatal, query={ 'id[]': resource_id, }, headers={ 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand) @@ -74,6 +72,8 @@ class JamendoIE(InfoExtractor): # if artist_name: # title = '%s - %s' % (artist_name, title) # album = get_model('album') + artist = self._call_api("artist", track.get('artistId'), fatal=False) + album = self._call_api("album", track.get('albumId'), fatal=False) formats = [{ 'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294' @@ -87,7 +87,6 @@ class JamendoIE(InfoExtractor): ('ogg1', 'ogg', 'ogg'), ('flac', 'flac', 'flac'), ))] - self._sort_formats(formats) urls = [] thumbnails = [] @@ -121,9 +120,9 @@ class JamendoIE(InfoExtractor): 'title': title, 'description': track.get('description'), 'duration': int_or_none(track.get('duration')), - # 'artist': artist_name, + 'artist': artist.get('name'), 'track': track_name, - # 'album': album.get('name'), + 'album': album.get('name'), 'formats': formats, 'license': '-'.join(license) if license else None, 'timestamp': int_or_none(track.get('dateCreated')), @@ -134,7 +133,7 @@ class JamendoIE(InfoExtractor): } -class JamendoAlbumIE(JamendoIE): +class JamendoAlbumIE(JamendoIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P[0-9]+)' _TESTS = [{ 'url': 'https://www.jamendo.com/album/121486/duck-on-cover', @@ -148,22 +147,38 @@ class JamendoAlbumIE(JamendoIE): 'info_dict': { 'id': '1032333', 'ext': 'flac', - 'title': 'Shearer - Warmachine', + 'title': 'Warmachine', 'artist': 'Shearer', 'track': 'Warmachine', 'timestamp': 1368089771, 'upload_date': '20130509', + 'view_count': int, + 'thumbnail': 'https://usercontent.jamendo.com?type=album&id=121486&width=300&trackid=1032333', + 'duration': 190, + 'license': 'by', + 'album': 'Duck On Cover', + 'average_rating': 4, + 'tags': ['rock', 'drums', 'bass', 'world', 'punk', 'neutral'], + 'like_count': int, } }, { 'md5': '1f358d7b2f98edfe90fd55dac0799d50', 'info_dict': { 'id': '1032330', 'ext': 'flac', - 'title': 'Shearer - Without Your Ghost', + 'title': 'Without Your Ghost', 'artist': 'Shearer', 'track': 'Without Your Ghost', 'timestamp': 1368089771, 'upload_date': '20130509', + 'duration': 192, + 'tags': ['rock', 'drums', 'bass', 'world', 'punk'], + 'album': 'Duck On Cover', 
+ 'thumbnail': 'https://usercontent.jamendo.com?type=album&id=121486&width=300&trackid=1032330', + 'view_count': int, + 'average_rating': 4, + 'license': 'by', + 'like_count': int, } }], 'params': { diff --git a/hypervideo_dl/extractor/japandiet.py b/hypervideo_dl/extractor/japandiet.py new file mode 100644 index 0000000..6c65056 --- /dev/null +++ b/hypervideo_dl/extractor/japandiet.py @@ -0,0 +1,274 @@ +import re + +from ..utils import ( + ExtractorError, + clean_html, + int_or_none, + join_nonempty, + parse_qs, + smuggle_url, + traverse_obj, + try_call, + unsmuggle_url +) +from .common import InfoExtractor + + +def _parse_japanese_date(text): + if not text: + return None + ERA_TABLE = { + '明治': 1868, + '大正': 1912, + '昭和': 1926, + '平成': 1989, + '令和': 2019, + } + ERA_RE = '|'.join(map(re.escape, ERA_TABLE.keys())) + mobj = re.search(rf'({ERA_RE})?(\d+)年(\d+)月(\d+)日', re.sub(r'[\s\u3000]+', '', text)) + if not mobj: + return None + era, year, month, day = mobj.groups() + year, month, day = map(int, (year, month, day)) + if era: + # example input: 令和5年3月34日 + # even though each era has an end, don't check for it here + year += ERA_TABLE[era] - 1 + return '%04d%02d%02d' % (year, month, day) + + +def _parse_japanese_duration(text): + mobj = re.search(r'(?:(\d+)日間?)?(?:(\d+)時間?)?(?:(\d+)分)?(?:(\d+)秒)?', re.sub(r'[\s\u3000]+', '', text or '')) + if not mobj: + return + days, hours, mins, secs = [int_or_none(x, default=0) for x in mobj.groups()] + return secs + mins * 60 + hours * 60 * 60 + days * 24 * 60 * 60 + +
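+# Quick illustrations of the two helpers above (values picked for this note,
+# not taken from the test suite): 令和1 is 2019, so the Gregorian year is
+# era_year + ERA_TABLE[era] - 1, e.g.
+#     _parse_japanese_date('令和4年10月7日')  == '20221007'  # 4 + 2019 - 1 = 2022
+#     _parse_japanese_duration('1時間30分')   == 5400        # 1*3600 + 30*60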
+class ShugiinItvBaseIE(InfoExtractor): + _INDEX_ROOMS = None + + @classmethod + def _find_rooms(cls, webpage): + return [{ + '_type': 'url', + 'id': x.group(1), + 'title': clean_html(x.group(2)).strip(), + 'url': smuggle_url(f'https://www.shugiintv.go.jp/jp/index.php?room_id={x.group(1)}', {'g': x.groups()}), + 'ie_key': ShugiinItvLiveIE.ie_key(), + } for x in re.finditer(r'(?s)(.+?)', webpage)] + + def _fetch_rooms(self): + if not self._INDEX_ROOMS: + webpage = self._download_webpage( + 'https://www.shugiintv.go.jp/jp/index.php', None, + encoding='euc-jp', note='Downloading proceedings info') + ShugiinItvBaseIE._INDEX_ROOMS = self._find_rooms(webpage) + return self._INDEX_ROOMS + + +class ShugiinItvLiveIE(ShugiinItvBaseIE): + _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)(?:/index\.php)?$' + IE_DESC = '衆議院インターネット審議中継' + + _TESTS = [{ + 'url': 'https://www.shugiintv.go.jp/jp/index.php', + 'info_dict': { + '_type': 'playlist', + 'title': 'All proceedings for today', + }, + # expect at least one proceeding to be running + 'playlist_mincount': 1, + }] + + @classmethod + def suitable(cls, url): + return super().suitable(url) and not any(x.suitable(url) for x in (ShugiinItvLiveRoomIE, ShugiinItvVodIE)) + + def _real_extract(self, url): + self.to_screen( + 'Downloading all running proceedings. To specify one proceeding, use a direct link from the website') + return self.playlist_result(self._fetch_rooms(), playlist_title='All proceedings for today') + + +class ShugiinItvLiveRoomIE(ShugiinItvBaseIE): + _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?room_id=(?P<id>room\d+)' + IE_DESC = '衆議院インターネット審議中継 (中継)' + + _TESTS = [{ + 'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room01', + 'info_dict': { + 'id': 'room01', + 'title': '内閣委員会', + }, + 'skip': 'this runs for a time and not every day', + }, { + 'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room11', + 'info_dict': { + 'id': 'room11', + 'title': '外務委員会', + }, + 'skip': 'this runs for a time and not every day', + }] + + def _real_extract(self, url): + url, smug = unsmuggle_url(url, default={}) + if smug.get('g'): + room_id, title = smug['g'] + else: + room_id = self._match_id(url) + title = traverse_obj(self._fetch_rooms(), (lambda k, v: v['id'] == room_id, 'title'), get_all=False) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'https://hlslive.shugiintv.go.jp/{room_id}/amlst:{room_id}/playlist.m3u8', + room_id, ext='mp4') + + return { + 'id': room_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': True, + } + + +class ShugiinItvVodIE(ShugiinItvBaseIE): + _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?ex=VL(?:\&[^=]+=[^&]*)*\&deli_id=(?P<id>\d+)' + IE_DESC = '衆議院インターネット審議中継 (ビデオライブラリ)' + _TESTS = [{ + 'url': 'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id=53846', + 'info_dict': { + 'id': '53846', + 'title': 'ウクライナ大統領国会演説(オンライン)', + 'release_date': '20220323', + 'chapters': 'count:4', + } + }, { + 'url': 'https://www.shugiintv.go.jp/en/index.php?ex=VL&media_type=&deli_id=53846', + 'only_matching': True + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + f'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id={video_id}', video_id, + encoding='euc-jp') + + m3u8_url = self._search_regex( + r'id="vtag_src_base_vod"\s*value="(http.+?\.m3u8)"', webpage, 'm3u8 url') + m3u8_url = re.sub(r'^http://', 'https://', m3u8_url) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + m3u8_url, video_id, ext='mp4') + + title = self._html_search_regex( + (r'(.+)\s*\(\d+分\)', + r'(.+?)\s*\s*(.+?)', + webpage, 'title', fatal=False)) + + chapters = [] + for chp in re.finditer(r'(?i)(?!', webpage): + chapters.append({ + 'title': clean_html(chp.group(2)).strip(), + 'start_time': try_call(lambda: float(parse_qs(chp.group(1))['time'][0].strip())), + }) + # NOTE: there are blanks at the first and the end of the videos, + # so getting/providing the video duration is not possible + # also, the exact end_time for the last chapter is unknown (we can get at most minutes of granularity) + last_tr = re.findall(r'(?s)(.+?)', webpage)[-1] + if last_tr and chapters: + last_td = re.findall(r'', last_tr)[-1] + if last_td: + chapters[-1]['end_time'] = chapters[-1]['start_time'] + _parse_japanese_duration(clean_html(last_td)) + + return { + 'id': video_id, + 'title': title, + 'release_date': release_date, + 'chapters': chapters, + 'formats': formats, + 'subtitles': subtitles, + } + + +class SangiinInstructionIE(InfoExtractor): + _VALID_URL = r'^https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php' + IE_DESC = False # this shouldn't be listed as a supported site + + def _real_extract(self, url): + raise ExtractorError('Copy the link from the button below the video description or player, and use the link to download. If there is no button in the frame, get the URL of the frame showing the video.', expected=True)
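+# SangiinInstructionIE intentionally matches only the top-level player page and
+# never returns media: it exists solely to surface the actionable error above,
+# and IE_DESC = False keeps it out of the supported-sites listing.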
+ + +class SangiinIE(InfoExtractor): + _VALID_URL = r'https?://www\.webtv\.sangiin\.go\.jp/webtv/detail\.php\?sid=(?P<id>\d+)' + IE_DESC = '参議院インターネット審議中継 (archive)' + + _TESTS = [{ + 'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7052', + 'info_dict': { + 'id': '7052', + 'title': '2022年10月7日 本会議', + 'description': 'md5:0a5fed523f95c88105a0b0bf1dd71489', + 'upload_date': '20221007', + 'ext': 'mp4', + }, + }, { + 'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7037', + 'info_dict': { + 'id': '7037', + 'title': '2022年10月3日 開会式', + 'upload_date': '20221003', + 'ext': 'mp4', + }, + }, { + 'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7076', + 'info_dict': { + 'id': '7076', + 'title': '2022年10月27日 法務委員会', + 'upload_date': '20221027', + 'ext': 'mp4', + 'is_live': True, + }, + 'skip': 'this live is turned into archive after it ends', + }, ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + date = self._html_search_regex( + r'<dt[^>]*>\s*開会日\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage, + 'date', fatal=False) + upload_date = _parse_japanese_date(date) + + title = self._html_search_regex( + r'<dt[^>]*>\s*会議名\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage, + 'title', fatal=False) + + # some videos don't have the elements, so assume it's missing + description = self._html_search_regex( + r'会議の経過\s*</h3>\s*<span[^>]*>(.+?)</span>
', webpage, + 'description', default=None) + + # this row appears only when it's a livestream + is_live = bool(self._html_search_regex( + r'<dt[^>]*>\s*公報掲載時刻\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage, + 'is_live', default=None)) + + m3u8_url = self._search_regex( + r'var\s+videopath\s*=\s*(["\'])([^"\']+)\1', webpage, + 'm3u8 url', group=2) + + formats, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4') + + return { + 'id': video_id, + 'title': join_nonempty(date, title, delim=' '), + 'description': description, + 'upload_date': upload_date, + 'formats': formats, + 'subtitles': subs, + 'is_live': is_live, + } diff --git a/hypervideo_dl/extractor/jeuxvideo.py b/hypervideo_dl/extractor/jeuxvideo.py index 77c0f52..56ea15c 100644 --- a/hypervideo_dl/extractor/jeuxvideo.py +++ b/hypervideo_dl/extractor/jeuxvideo.py @@ -1,8 +1,3 @@ -# coding: utf-8 - -from __future__ import unicode_literals - - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/jixie.py b/hypervideo_dl/extractor/jixie.py new file mode 100644 index 0000000..4830e61 --- /dev/null +++ b/hypervideo_dl/extractor/jixie.py @@ -0,0 +1,47 @@ +from .common import InfoExtractor +from ..utils import clean_html, float_or_none, traverse_obj, try_call + + +class JixieBaseIE(InfoExtractor): + """ + API Reference: + https://jixie.atlassian.net/servicedesk/customer/portal/2/article/1339654214?src=-1456335525, + https://scripts.jixie.media/jxvideo.3.1.min.js + """ + + def _extract_data_from_jixie_id(self, display_id, video_id, webpage): + json_data = self._download_json( + 'https://apidam.jixie.io/api/public/stream', display_id, + query={'metadata': 'full', 'video_id': video_id})['data'] + + formats, subtitles = [], {} + for stream in json_data['streams']: + if stream.get('type') == 'HLS': + fmt, sub = self._extract_m3u8_formats_and_subtitles(stream.get('url'), display_id, ext='mp4') + if json_data.get('drm'): + for f in fmt: + f['has_drm'] = True + formats.extend(fmt) + self._merge_subtitles(sub, target=subtitles) + else: + formats.append({ + 'url': stream.get('url'), + 'width': stream.get('width'), + 'height': stream.get('height'), + 'ext': 'mp4', + }) + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': json_data.get('title') or self._html_search_meta(['og:title', 'twitter:title'], webpage), + 'description': (clean_html(traverse_obj(json_data, ('metadata', 'description'))) + or self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage)), + 'thumbnails': traverse_obj(json_data, ('metadata', 'thumbnails')), + 'duration': float_or_none(traverse_obj(json_data, ('metadata', 'duration'))), + 'tags': try_call(lambda: (json_data['metadata']['keywords'] or None).split(',')), + 'categories': try_call(lambda: (json_data['metadata']['categories'] or None).split(',')), + 'uploader_id': json_data.get('owner_id'), + }
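+# A site-specific extractor is expected to find the Jixie video id in the page
+# and hand off to the helper above. Hypothetical sketch (the URL pattern and
+# data-video-id attribute are illustrative assumptions, not a documented API):
+#
+# class SomeJixieSiteIE(JixieBaseIE):
+#     _VALID_URL = r'https?://example\.com/video/(?P<id>[\w-]+)'
+#
+#     def _real_extract(self, url):
+#         display_id = self._match_id(url)
+#         webpage = self._download_webpage(url, display_id)
+#         video_id = self._search_regex(r'data-video-id="([^"]+)"', webpage, 'video id')
+#         return self._extract_data_from_jixie_id(display_id, video_id, webpage)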
diff --git a/hypervideo_dl/extractor/joj.py b/hypervideo_dl/extractor/joj.py index 7350f53..9b62284 100644 --- a/hypervideo_dl/extractor/joj.py +++ b/hypervideo_dl/extractor/joj.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -21,6 +16,7 @@ class JojIE(InfoExtractor): ) (?P<id>[^/?#^]+) ''' + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1'] _TESTS = [{ 'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932', 'info_dict': { @@ -41,14 +37,6 @@ class JojIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1', - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) @@ -73,7 +61,7 @@ class JojIE(InfoExtractor): r'(\d+)[pP]\.', format_url, 'height', default=None) formats.append({ 'url': format_url, - 'format_id': format_field(height, template='%sp'), + 'format_id': format_field(height, None, '%sp'), 'height': int(height), }) if not formats: @@ -93,7 +81,6 @@ class JojIE(InfoExtractor): r'(\d+)[pP]', format_id or path, 'height', default=None)), }) - self._sort_formats(formats) thumbnail = self._og_search_thumbnail(webpage) diff --git a/hypervideo_dl/extractor/jove.py b/hypervideo_dl/extractor/jove.py index 4b7dfc5..245fe73 100644 --- a/hypervideo_dl/extractor/jove.py +++ b/hypervideo_dl/extractor/jove.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( ExtractorError, diff --git a/hypervideo_dl/extractor/jwplatform.py b/hypervideo_dl/extractor/jwplatform.py index 5aa508b..c949689 100644 --- a/hypervideo_dl/extractor/jwplatform.py +++ b/hypervideo_dl/extractor/jwplatform.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -8,7 +5,7 @@ from ..utils import unsmuggle_url class JWPlatformIE(InfoExtractor): - _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' + _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|manifest)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' _TESTS = [{ 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', @@ -25,21 +22,48 @@ class JWPlatformIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - urls = JWPlatformIE._extract_urls(webpage) - return urls[0] if urls else None + _WEBPAGE_TESTS = [{ + # JWPlatform iframe + 'url': 'https://www.covermagazine.co.uk/feature/2465255/business-protection-involved', + 'info_dict': { + 'id': 'AG26UQXM', + 'ext': 'mp4', + 'upload_date': '20160719', + 'timestamp': 1468923808, + 'title': '2016_05_18 Cover L&G Business Protection V1 FINAL.mp4', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/AG26UQXM/poster.jpg?width=720', + 'description': '', + 'duration': 294.0, + }, + }, { + # Player url not surrounded by quotes + 'url': 'https://www.deutsche-kinemathek.de/en/online/streaming/darling-berlin', + 'info_dict': { + 'id': 'R10NQdhY', + 'title': 'Playgirl', + 'ext': 'mp4', + 'upload_date': '20220624', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/R10NQdhY/poster.jpg?width=720', + 'timestamp': 1656064800, + 'description': 'BRD 1966, Will Tremper', + 'duration': 5146.0, + }, + 'params': {'allowed_extractors': ['generic', 'jwplatform']}, + }] - @staticmethod - def _extract_urls(webpage): + @classmethod + def _extract_embed_urls(cls, url, webpage): for tag, key in ((r'(?:script|iframe)', 'src'), ('input', 'value')): # <input value=...> is used by hyland.com # if we find